15ab88298277f4c822a77f667aa4d2f7def37b3c
tdreszer
  Tue Apr 23 13:35:05 2013 -0700
Changes in preparation for checking in haplotypes code.
diff --git src/inc/vcfBits.h src/inc/vcfBits.h
index 774cec7..1a91fd3 100644
--- src/inc/vcfBits.h
+++ src/inc/vcfBits.h
@@ -1,71 +1,107 @@
 /* vcfBits.c/.h: Variant Call Format, analysis by bit maps.
  * The routines found here are dependent upon vcf.c/.h for accessing vcf records.
  * They allow analysis of a set of vcf records by bit maps with one bit map per variant
  * location and where each haplotype covered by the vcf record is represented by a single
  * bit (or pair of bits). Additional analysis can be performed by creating haplotype based
  * bit maps from variant bit maps.  There is one haplotype bit map for each haplotype
  * (subject chromosome) with one (or two) bits for each variant location in the set of records. */
 
 #ifndef vcfBits_h
 #define vcfBits_h
 
 #include "vcf.h"
 #include "bits.h"
 #include "elmTree.h"
 
+enum variantType
+/* VCF header defines INFO column components; each component has one of these types: */
+    {
+    vtNoType    = 0,   // uninitialized value (0) or unrecognized type name
+    vtSNP       = 1,   // Single Nucleotide polymorphism (most)
+    vtInsertion = 2,   // Insertion relative to reference genome
+    vtDeletion  = 3,   // Insertion relative to reference genome
+    };
+
 struct variantBits
 // all genotypes/haplotypes/alleles for one record are converted to a bit map
 // One struct per variant record in vcff->records.  One slot per genotype containing
 // 2 slots for haplotypes and then 1 or 2 bits per allele.
     {
     struct variantBits *next;
     struct vcfRecord *record;     // keep track of record for later interpretation
+    enum variantType vType;       // Might as well know this up front
     int genotypeSlots;            // subjects covered in vcf file
     unsigned char haplotypeSlots; // 2 unless haploid or homozygous only
     unsigned char alleleSlots;    // 1 for 1 alt allele, 2 for 2 or 3 alt alleles >3 unsupported
     int bitsOn;                   // count of bits on.
     Bits *bits;                   // allele bits genotype x haplotype x allele
     Bits *unphased;               // unphased bits (1 bit per genotype) if requested, else NULL
     void **variants;              // special purposes array of variants filled and used by caller
     };
 
 #define genoIxFromGenoHapIx(vBits,genoHaploIx)  (genoHaploIx / vBits->haplotypeSlots)
 #define hapIxFromGenoHapIx(vBits,genoHaploIx)   (genoHaploIx % vBits->haplotypeSlots)
 #define genoHapIx(vBits,genoIx,hapIx)          ((genoIx * vBits->haplotypeSlots) + hapIx)
 #define vBitsSlot(vBits,genoIx,hapIx,variantIx) \
         ( (genoHapIx(vBits,genoIx,hapIx) * vBits->alleleSlots) + variantIx)
 #define vBitsSlotCount(vBits) \
         ((vBits)->genotypeSlots * (vBits)->haplotypeSlots * (vBits)->alleleSlots)
+#define vcfBitsSubjectCount(vBits) ((vBits)->genotypeSlots)
 
 struct variantBits *vcfRecordsToVariantBits(struct vcfFile *vcff, struct vcfRecord *records,
                                             boolean phasedOnly, boolean homozygousOnly,
                                             boolean unphasedBits);
 // Returns list of bit arrays covering all genotypes/haplotype/alleles per record for each record
 // provided.  If records is NULL will use vcff->records. Bit map has one slot per genotype
 // containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of
 // 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles
 // is represented by two bits per allele (>3 non-reference alleles unsupported).
 // If phasedOnly, unphased haplotype bits will be set only if both agree (00 is uninterpretable)
 // Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot.
 // If unphasedBits, then vBits->unphased will contain a bitmap with 1s for all unphased genotypes.
 // NOTE: allocated from vcff pool, so closing file or flushing reusePool will invalidate this.
 
-int vcfVariantBitsDropSparse(struct variantBits **vBitsList, int haploGenomeMin);
-// Drops vBits found in less than a minimum number of haplotype genomes.
+Bits *vcfRecordHaploidBits(struct vcfFile *vcff, struct vcfRecord *record);
+// Returns array (1 bit per genotype) for each haploid genotype.
+// This is useful for interpreting chrX.
+
+int vBitsHaploidCount(struct variantBits *vBits);
+// Returns the number of subjects in the VCF dataset that are haploid at this location
+
+//#define vBitsSubjectChromCount(vBits) ((vBits)->genotypeSlots * (vBits)->haplotypeSlots)
+int vBitsSubjectChromCount(struct variantBits *vBits);
+// Returns the chromosomes in the VCF dataset that are covered at this location
+
+int vcfVariantBitsDropSparse(struct variantBits **vBitsList, int haploGenomeMin,
+                             boolean dropRefErrors);
+// Drops vBits found in less than a minimum number of haplotype genomes.  Supplying 1 will
+// drop variants found in no haplotype genomes.  Declaring dropRefErrors will drop variants
+// in all haplotype genomes (variants where reference is wrong).
 // Returns count of vBits structure that were dropped.
 
+int vcfVariantBitsAlleleOccurs(struct variantBits *vBits, unsigned char alleleIx,
+                               boolean homozygousOrHaploid);
+// Counts the number of times a particular allele occurs in the subjects*chroms covered.
+// If homozygousOrHaploid then the allele must occur in both chroms to be counted
+
+int vcfVariantBitsReferenceOccurs(struct vcfFile *vcff, struct variantBits *vBitsList,
+                                  boolean homozygousOrHaploid);
+// For an entire list of vBits structs, counts the times the reference allele occurs.
+// If homozygousOrHaploid then the reference allele must occur in both chroms to be counted
+
+
 int vcfVariantMostPopularCmp(const void *va, const void *vb);
 // Compare to sort variantBits based upon how many genomes/chrom has the variant
 // This can be used to build haploBits in most populous order for tree building
 
 struct haploBits
 // all variants/haplotypes/genotypes for a set of records are converted to a bit map
 // One struct per haplotype genome covering vcff->records.  One slot per variant
 // and 1 or 2 bits per allele.  NOTE: variant slots will all be normalized to max.
     {
     struct haploBits *next;
     char *ids;                 // comma separated lists of genotype names and haplotypes
     int haploGenomes;          // count of haploid genomes this structure covers
     int genomeIx;              // genome sample index (allows later lookups)
     unsigned char haploidIx;   // haploid index [0,1] (allows later  lookups)
     int variantSlots;          // count of variants covered in set of vcf records