15ab88298277f4c822a77f667aa4d2f7def37b3c
tdreszer
  Tue Apr 23 13:35:05 2013 -0700
Changes in preparation for checking in haplotypes code.
diff --git src/lib/vcfBits.c src/lib/vcfBits.c
index dbe6736..0ed67b9 100644
--- src/lib/vcfBits.c
+++ src/lib/vcfBits.c
@@ -30,35 +30,60 @@
 if (record->genotypes == NULL)
     vcfParseGenotypes(record);
 assert(vcff->genotypeCount > 0);
 int ix = 0;
 
 // allocate vBits struct
 struct variantBits *vBits;
 struct lm *lm = vcfFileLm(vcff);
 lmAllocVar(lm,vBits);
 vBits->genotypeSlots  = vcff->genotypeCount;
 vBits->record         = record;
 vBits->haplotypeSlots = 2;  // assume diploid, but then check for complete haploid
 if (homozygousOnly)
     vBits->haplotypeSlots = 1;
 else
-    { // spin through all the subjects to see if any are non-haploid
+    { // spin through all the subjects to see if all are haploid
     for (ix = 0; ix < vcff->genotypeCount && record->genotypes[ix].isHaploid;ix++)
         ;
     if (ix == vcff->genotypeCount)
-        vBits->haplotypeSlots = 1;
+        vBits->haplotypeSlots = 1;  // All are haploid: chrY
+    assert(vBits->haplotypeSlots == 2 || record->chrom[strlen(record->chrom) - 1] == 'Y');
+    }
+
+// Type of variant?
+assert(record->alleles != NULL);
+if(record->alleles != NULL)
+    {
+    int nonRefLen = 0;
+    assert(record->alleles[0] != NULL);
+    int refLen = record->alleles[0] ? strlen(record->alleles[0]) : 0;
+    for (ix = 1; ix < record->alleleCount; ix++)
+        {
+        assert(record->alleles[ix] != NULL);
+        int thisLen = record->alleles[ix] && differentWord(record->alleles[ix],"<DEL>") ?
+                                                                  strlen(record->alleles[ix]) : 0;
+        if (nonRefLen < thisLen)
+            nonRefLen = thisLen;
+        }
+    if (refLen == 1 && nonRefLen == 1)
+        vBits->vType = vtSNP;
+    else if (refLen < nonRefLen)
+        vBits->vType = vtInsertion;
+    else if (refLen > nonRefLen)
+        vBits->vType = vtDeletion;
+    // else unknown
     }
 
 // allocate bits
 assert(record->alleleCount < 4);
 vBits->alleleSlots = (record->alleleCount <= 2 ? 1 : 2);
 int bitCount = vBitsSlotCount(vBits);
 Bits *bits = lmBitAlloc(lm,bitCount);
 vBits->bits = bits;
 
 
 // walk through genotypes and set the bit
 for (ix = 0; ix < vcff->genotypeCount;ix++)
     {
     struct vcfGenotype *gt = gt = &(record->genotypes[ix]);
 
@@ -101,77 +126,268 @@
 // parse genotypes if needed
 if (record->genotypes == NULL)
     vcfParseGenotypes(record);
 
 // walk through genotypes and set the bit
 int ix = 0;
 for ( ; ix < vcff->genotypeCount;ix++)
     {
     if (record->genotypes[ix].isPhased == FALSE
     && record->genotypes[ix].isHaploid == FALSE)
         bitSetOne(bits,ix);
     }
 return bits;
 }
 
+Bits *vcfRecordHaploidBits(struct vcfFile *vcff, struct vcfRecord *record)
+// Returns array (1 bit per genotype) for each haploid genotype.
+// This is useful for interpreting chrX.
+{
+// allocate bits
+Bits *bits = lmBitAlloc(vcfFileLm(vcff),vcff->genotypeCount);
+
+// parse genotypes if needed
+if (record->genotypes == NULL)
+    vcfParseGenotypes(record);
+
+// walk through genotypes and set the bit
+int ix = 0;
+for ( ; ix < vcff->genotypeCount;ix++)
+    {
+    if (record->genotypes[ix].isHaploid)
+        bitSetOne(bits,ix);
+    }
+return bits;
+}
+
+int vBitsHaploidCount(struct variantBits *vBits)
+// Returns the number of subjects in the VCF dataset that are haploid at this location
+{
+int haploid = 0;  // Most are not haploid
+
+char *chrom = vBits->record->chrom;
+if (chrom[strlen(chrom) - 1] == 'Y')
+    haploid = vcfBitsSubjectCount(vBits);
+else if (chrom[strlen(chrom) - 1] == 'X')  // chrX is tricky: males are haploid except in PAR
+    {
+    struct vcfRecord *record = vBits->record;
+    struct vcfFile *vcff = record->file;
+
+    // parse genotypes if needed
+    if (record->genotypes == NULL)
+        vcfParseGenotypes(record);
+
+    // walk through genotypes
+    int ix = 0;
+    for ( ; ix < vcff->genotypeCount;ix++)
+        {
+        if (record->genotypes[ix].isHaploid)
+            haploid++;
+        }
+    }
+return haploid;
+}
+
+int vBitsSubjectChromCount(struct variantBits *vBits)
+// Returns the chromosomes in the VCF dataset that are covered at this location
+{
+int haploid = vBitsHaploidCount(vBits);
+int diploid = vcfBitsSubjectCount(vBits) - haploid;
+return ((diploid * 2) + haploid);
+}
+
 struct variantBits *vcfRecordsToVariantBits(struct vcfFile *vcff, struct vcfRecord *records,
                                             boolean phasedOnly, boolean homozygousOnly,
                                             boolean unphasedBits)
 // Returns list of bit arrays covering all genotypes/haplotype/alleles per record for each record
 // provided.  If records is NULL will use vcff->records. Bit map has one slot per genotype
 // containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of
 // 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles
 // is represented by two bits per allele (>3 non-reference alleles unsupported).
 // If phasedOnly, unphased haplotype bits will be set only if both agree (00 is uninterpretable)
 // Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot.
 // If unphasedBits, then vBits->unphased will contain a bitmap with 1s for all unphased genotypes.
 // NOTE: allocated from vcff pool, so closing file or flushing reusePool will invalidate this.
 {
 struct variantBits *vBitsList = NULL;
 
 struct vcfRecord *record = records ? records : vcff->records;
 for (;record != NULL; record = record->next)
     {
     struct variantBits *vBits = vcfOneRecordToVariantBits(vcff, record, phasedOnly, homozygousOnly);
     if (unphasedBits)
         vBits->unphased = vcfRecordUnphasedBits(vcff, record);
     slAddHead(&vBitsList,vBits);
     }
 slReverse(&vBitsList);
 
 return vBitsList;
 }
 
-int vcfVariantBitsDropSparse(struct variantBits **vBitsList, int haploGenomeMin)
-// Drops vBits found in less than a minimum number of haplotype genomes.
+int vcfVariantBitsDropSparse(struct variantBits **vBitsList, int haploGenomeMin,
+                             boolean dropRefErrors)
+// Drops vBits found in less than a minimum number of haplotype genomes.  Supplying 1 will
+// drop variants found in no haplotype genomes.  Declaring dropRefErrors will drop variants
+// in all haplotype genomes (variants where reference is wrong).
 // Returns count of vBits structure that were dropped.
 {
 struct variantBits *vBitsKeep = NULL;
 struct variantBits *vBits = NULL;
 int dropped = 0;
 while ((vBits = slPopHead(vBitsList)) != NULL)
     {
     if (vBits->bitsOn >= haploGenomeMin)
         slAddHead(&vBitsKeep,vBits);
+    else
+        dropped++;  // if not demoted, drop memory, since this is on lm
+    }
+*vBitsList = vBitsKeep;
+
+if (dropRefErrors)
+    {
+    vBitsKeep = NULL;
+    while ((vBits = slPopHead(vBitsList)) != NULL)
+        {
+        boolean foundRef = FALSE;
+        int genoIx = 0;
+        for (; !foundRef && genoIx < vBits->genotypeSlots; genoIx++)
+            {
+            int haploIx = 0;
+            for (; !foundRef && haploIx < vBits->haplotypeSlots; haploIx++)
+                {
+                if (bitCountRange(vBits->bits,
+                                  vBitsSlot(vBits,genoIx,haploIx,0), vBits->alleleSlots) == 0)
+                    foundRef = TRUE;
+                }
+            }
+
+        if (foundRef)
+            slAddHead(&vBitsKeep,vBits);
     else  // drop memory, since this is on lm
-        dropped++;
+            dropped++;  // if not demoted, drop memory, since this is on lm
     }
-slReverse(&vBitsKeep);
 *vBitsList = vBitsKeep;
+    }
+else
+    slReverse(vBitsList);
+
 return dropped;
 }
 
+int vcfVariantBitsAlleleOccurs(struct variantBits *vBits, unsigned char alleleIx,
+                               boolean homozygousOrHaploid)
+// Counts the number of times a particular allele occurs in the subjects*chroms covered.
+// If homozygousOrHaploid then the allele must occur in both chroms to be counted
+{
+assert((vBits->alleleSlots == 1 && alleleIx <= 1)
+    || (vBits->alleleSlots == 2 && alleleIx <= 3));
+
+// To count ref, we need to know the number of subjects the bitmap covers
+// chrX is the culprit here since it is haploid/diploid
+int subjTotal = vBits->genotypeSlots * vBits->haplotypeSlots;
+if (alleleIx == 0)
+    subjTotal = vBitsSubjectChromCount(vBits);
+
+// Simple case can be taken care of easily
+if (vBits->alleleSlots == 1
+&&  (!homozygousOrHaploid || vBits->haplotypeSlots == 1))
+    {
+    if (alleleIx == 1)           // non-reference
+        return vBits->bitsOn;
+    else                         // reference
+        return subjTotal - vBits->bitsOn;
+    }
+
+// Otherwise, we have to walk the bitmap
+int occurs           = 0;
+int occursHomozygous = 0;
+int genoIx = 0;
+for (; genoIx < vBits->genotypeSlots; genoIx++)
+    {
+    int haploIx = 0;
+    int occursThisSubject = 0;
+    for (; haploIx < vBits->haplotypeSlots; haploIx++)
+        {
+        int bitSlot = vBitsSlot(vBits,genoIx,haploIx,0);
+        switch (alleleIx)
+            {
+            case 0: if (bitCountRange(vBits->bits,bitSlot, vBits->alleleSlots) != 0)
+                        occursThisSubject++; // Any non-reference bit
+                    break;
+            case 1: if (vBits->alleleSlots == 1)
+                        {
+                        if (bitReadOne(vBits->bits,bitSlot))
+                            occursThisSubject++;
+                        }
+                    else
+                        {
+                        if ( bitReadOne(vBits->bits,bitSlot)
+                        &&  !bitReadOne(vBits->bits,bitSlot + 1) )
+                            occursThisSubject++;
+                        }
+                    break;
+            case 2: if (!bitReadOne(vBits->bits,bitSlot)
+                    &&   bitReadOne(vBits->bits,bitSlot + 1) )
+                        occursThisSubject++;
+                    break;
+            case 3: if (bitCountRange(vBits->bits,bitSlot, 2) == 2)
+                        occursThisSubject++;
+                    break;
+            default:
+                    break;
+            }
+        }
+    occurs += occursThisSubject;
+    if ((alleleIx == 0 && occursThisSubject == 0)
+    ||  (alleleIx >  0 && vBits->haplotypeSlots == occursThisSubject))
+        occursHomozygous++;
+    }
+if (alleleIx == 0)
+    occurs = subjTotal - occurs;
+
+return (homozygousOrHaploid ? occursHomozygous : occurs);
+}
+
+int vcfVariantBitsReferenceOccurs(struct vcfFile *vcff, struct variantBits *vBitsList,
+                                  boolean homozygousOrHaploid)
+// For an entire list of vBits structs, counts the times the reference allele occurs.
+// If homozygousOrHaploid then the reference allele must occur in both chroms to be counted
+{
+assert(vBitsList != NULL);
+struct variantBits *vBits = vBitsList;
+// Allocate temporary vBits struct which can hold the 'OR' version of all bitmaps in list
+struct lm *lm = vcfFileLm(vcff);
+struct variantBits *vBitsTemp;
+lmAllocVar(lm,vBitsTemp);
+vBitsTemp->genotypeSlots  = vBits->genotypeSlots;
+vBitsTemp->haplotypeSlots = vBits->haplotypeSlots;
+vBitsTemp->alleleSlots    = vBits->alleleSlots;
+vBitsTemp->record         = vBits->record;
+
+int bitSlots = vBitsSlotCount(vBits);
+vBitsTemp->bits = lmBitClone(lm,vBits->bits, bitSlots);
+
+
+for (vBits = vBits->next; vBits != NULL; vBits = vBits->next)
+    bitOr(vBitsTemp->bits, vBits->bits, bitSlots);
+
+vBitsTemp->bitsOn = bitCountRange(vBitsTemp->bits,0,bitSlots);
+
+// Note the memory will be leaked but it is lm
+return vcfVariantBitsAlleleOccurs(vBitsTemp, 0, homozygousOrHaploid);
+}
+
 int vcfVariantMostPopularCmp(const void *va, const void *vb)
 // Compare to sort variantBits based upon how many genomes/chrom has the variant
 // This can be used to build haploBits in most populous order for tree building
 {
 const struct variantBits *a = *((struct variantBits **)va);
 const struct variantBits *b = *((struct variantBits **)vb);
 //int ret = memcmp(a->bits, b->bits, bitToByteSize(vBitsSlotCount(a)));
 int ret = (b->bitsOn - a->bitsOn); // higher bits first (descending order)
 if (ret == 0)
     { // sort deterministically
     ret = (a->record->chromStart - b->record->chromStart);
     if (ret == 0)
         ret = (a->record->chromEnd - b->record->chromEnd);
     }
 return ret;
@@ -226,76 +442,82 @@
             if (bitReadOne(vBits->bits,vBitsSlot(vBits,genoIx,haploIx,0)))
                 {
                 bitSetOne(        bits,hBitsSlot(hBits,variantIx,     0));
                 hBits->bitsOn++;
                 }
 
             if (vBits->alleleSlots == 2) // (hBit->alleleSlot will also == 2)
                 {
                 if (bitReadOne(vBits->bits,vBitsSlot(vBits,genoIx,haploIx,1)))
                     {
                     bitSetOne(        bits,hBitsSlot(hBits,variantIx,     1));
                     hBits->bitsOn++;
                     }
                 }
             }
-        if (!ignoreReference || hBits->bitsOn > 0) // gonna keep it so flesh it out
+
+        // gonna keep it? so flesh it out (ignoreRef means > 0 && < all)
+        if (!ignoreReference || hBits->bitsOn)
             {
             hBits->genomeIx  = genoIx;
             hBits->haploidIx = haploIx;
             struct vcfRecord *record = vBitsList->record; // any will do
             struct vcfGenotype *gt = &(record->genotypes[genoIx]);
             if (gt->isHaploid || vBitsList->haplotypeSlots == 1)
                 { // chrX will have haplotypeSlots==2 but be haploid for this subject.
                   // Meanwhile if vBits were for homozygous only,  haplotypeSlots==1
                 //assert(haploIx == 0);
                 hBits->ids = lmCloneString(lm,gt->id);
                 }
             else
                 {
                 int sz = strlen(gt->id) + 3;
                 hBits->ids = lmAlloc(lm,sz);
                 safef(hBits->ids,sz,"%s-%c",gt->id,'a' + haploIx);
                 }
             slAddHead(&hBitsList,hBits);
             }
         //else
         //    haploBitsFree(&hBits); // lm so just abandon
         }
     }
 slReverse(&hBitsList);
 
 return hBitsList;
 }
 
+/*
 int vcfHaploBitsOnCmp(const void *va, const void *vb)
 // Compare to sort haploBits based upon how many bits are on
 {
 const struct haploBits *a = *((struct haploBits **)va);
 const struct haploBits *b = *((struct haploBits **)vb);
 int ret = (a->bitsOn - b->bitsOn);
 if (ret == 0)
     { // sort deterministically
     ret = (a->genomeIx - b->genomeIx);
     if (ret == 0)
         ret = (a->haploidIx - b->haploidIx);
     }
 return ret;
 }
+*/
 
-int vcfHaploMemCmp(const void *va, const void *vb)
-// Compare to sort haploBits based upon how many bits are on
+int vcfHaploBitsMemCmp(const void *va, const void *vb)
+// Compare to sort haploBits based on bit by bit memory compare
+// When hBits has been created from vBits sorted by popularity,
+// a mem sort will then allow elmTreeGrow() to add popular variants first.
 {
 const struct haploBits *a = *((struct haploBits **)va);
 const struct haploBits *b = *((struct haploBits **)vb);
 int ret = memcmp(a->bits, b->bits, bitToByteSize(hBitsSlotCount(a)));
 if (ret == 0)
     { // sort deterministically
     ret = (a->genomeIx - b->genomeIx);
     if (ret == 0)
         ret = (a->haploidIx - b->haploidIx);
     }
 return ret;
 }
 
 static struct haploBits *hBitsCollapsePair(struct lm *lm, struct haploBits *hBits,
                                            struct haploBits *dupMap)
@@ -313,31 +535,31 @@
 
 int vcfHaploBitsListCollapseIdentical(struct vcfFile *vcff, struct haploBits **phBitsList,
                                       int haploGenomeMin)
 // Collapses a list of haploBits based upon identical bit arrays.
 // If haploGenomeMin > 1, will drop all hBits structs covering less than N haploGenomes.
 // Returns count of hBit structs removed.
 {
 struct lm *lm = vcfFileLm(vcff);
 int collapsed = 0;
 struct haploBits *hBitsPassed = NULL;
 struct haploBits *hBitsToTest = *phBitsList;
 if (hBitsToTest->next == NULL)
     return 0;
 
 // Sort early bits first
-slSort(&hBitsToTest, vcfHaploMemCmp);
+slSort(&hBitsToTest, vcfHaploBitsMemCmp);
 
 // Walk through those of matching size and compare
 int bitCount = hBitsSlotCount(hBitsToTest);
 int memSize = bitToByteSize(bitCount);
 struct haploBits *hBitsQ = NULL;
 while ((hBitsQ = slPopHead(&hBitsToTest)) != NULL)
     {
     struct haploBits *hBitsT = hBitsToTest; //hBitsQ->next; is already null.
     for (;hBitsT != NULL;hBitsT = hBitsT->next)
         {
         // If the 2 bitmaps do not have the same number of bits on, they are not identical
         if (hBitsT->bitsOn != hBitsQ->bitsOn)
             break;
 
         // If 2 bitmaps are identical, then collapse into one