f766ff086ef77a4730d568e9343a990c0033cf4b
tdreszer
  Mon May 20 12:58:37 2013 -0700
Fixed accounting bug where chrX reference haplotypes could have >100 percent frequency.
diff --git src/lib/vcfBits.c src/lib/vcfBits.c
index d0c5da6..c1dc612 100644
--- src/lib/vcfBits.c
+++ src/lib/vcfBits.c
@@ -1,688 +1,693 @@
 /* vcfBits.c/.h: Variant Call Format, analysis by bit maps.
  * The routines found here are dependent upon vcf.c/.h for accessing vcf records.
  * They allow analysis of a set of vcf records by bit maps with one bit map per variant
  * location and where each haplotype covered by the vcf record is represented by a single
  * bit (or pair of bits). Additional analysis can be performed by creating haplotype based
  * bit maps from variant bit maps.  There is one haplotype bit map for each haplotype
  * (subject chromosome) with one (or two) bits for each variant location in the set of records. */
 
 #include "common.h"
 #include "dnautil.h"
 #include "errabort.h"
 #include <limits.h>
 #include "localmem.h"
 #include "net.h"
 #include "regexHelper.h"
 #include "vcf.h"
 #include "vcfBits.h"
 
 
 static struct variantBits *vcfOneRecordToVariantBits(struct vcfFile *vcff,struct vcfRecord *record,
                                                      boolean phasedOnly, boolean homozygousOnly)
 // Returns bit array covering all genotypes/haplotype/alleles per record.  One slot per genotype
 // containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of
 // 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles
 // is represented by two bits per allele (>3 alternate alleles is unsupported).
 // If phasedOnly, unphased haplotype bits will all be set only if all agree (00 is uninterpretable)
 // Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot.
 {
 // parse genotypes if needed
 if (record->genotypes == NULL)
     vcfParseGenotypes(record);
 assert(vcff->genotypeCount > 0);
 int ix = 0;
 
 // allocate vBits struct
 struct variantBits *vBits;
 struct lm *lm = vcfFileLm(vcff);
 lmAllocVar(lm,vBits);
 vBits->genotypeSlots  = vcff->genotypeCount;
 vBits->record         = record;
 vBits->haplotypeSlots = 2;  // assume diploid, but then check for complete haploid
 if (homozygousOnly)
     vBits->haplotypeSlots = 1;
 else
     { // spin through all the subjects to see if all are haploid
     for (ix = 0; ix < vcff->genotypeCount && record->genotypes[ix].isHaploid;ix++)
         ;
     if (ix == vcff->genotypeCount)
         vBits->haplotypeSlots = 1;  // All are haploid: chrY
     assert(vBits->haplotypeSlots == 2 || record->chrom[strlen(record->chrom) - 1] == 'Y');
     }
 
 // Type of variant?
 assert(record->alleles != NULL);
 if(record->alleles != NULL)
     {
     int nonRefLen = 0;
     assert(record->alleles[0] != NULL);
     int refLen = record->alleles[0] ? strlen(record->alleles[0]) : 0;
     for (ix = 1; ix < record->alleleCount; ix++)
         {
         assert(record->alleles[ix] != NULL);
         int thisLen = record->alleles[ix] && differentWord(record->alleles[ix],"<DEL>") ?
                                                                   strlen(record->alleles[ix]) : 0;
         if (nonRefLen < thisLen)
             nonRefLen = thisLen;
         }
     if (refLen == 1 && nonRefLen == 1)
         vBits->vType = vtSNP;
     else if (refLen < nonRefLen)
         vBits->vType = vtInsertion;
     else if (refLen > nonRefLen)
         vBits->vType = vtDeletion;
     // else unknown
     }
 
 // allocate bits
 assert(record->alleleCount < 4);
 vBits->alleleSlots = (record->alleleCount <= 2 ? 1 : 2);
 int bitCount = vBitsSlotCount(vBits);
 Bits *bits = lmBitAlloc(lm,bitCount);
 vBits->bits = bits;
 
 
 // walk through genotypes and set the bit
 for (ix = 0; ix < vcff->genotypeCount;ix++)
     {
     struct vcfGenotype *gt = gt = &(record->genotypes[ix]);
 
     boolean homozygous = (gt->isHaploid || gt->hapIxA == gt->hapIxB);
 
     if ((!phasedOnly || gt->isPhased || homozygous))
         {
         if ((!homozygousOnly || homozygous) && gt->hapIxA > 0)
             {
             switch (gt->hapIxA)
                 {
                 case 1: bitSetOne(bits,  vBitsSlot(vBits,ix,0,0));   vBits->bitsOn++;    break;
                 case 2: bitSetOne(bits,  vBitsSlot(vBits,ix,0,1));   vBits->bitsOn++;    break;
                 case 3: bitSetRange(bits,vBitsSlot(vBits,ix,0,0),2); vBits->bitsOn += 2; break;
                 default:                                             break;
                 }
             }
         if (!gt->isHaploid && !homozygousOnly && gt->hapIxB > 0)
             {
             switch (gt->hapIxB)
                 {
                 case 1: bitSetOne(bits,  vBitsSlot(vBits,ix,1,0));   vBits->bitsOn++;    break;
                 case 2: bitSetOne(bits,  vBitsSlot(vBits,ix,1,1));   vBits->bitsOn++;    break;
                 case 3: bitSetRange(bits,vBitsSlot(vBits,ix,1,0),2); vBits->bitsOn += 2; break;
                 default:                                             break;
                 }
             }
         }
     }
 
 return vBits;
 }
 
 static Bits *vcfRecordUnphasedBits(struct vcfFile *vcff, struct vcfRecord *record)
 // Returns array (1 bit per genotype) for each unphased genotype.
 {
 // allocate bits
 Bits *bits = lmBitAlloc(vcfFileLm(vcff),vcff->genotypeCount);
 
 // parse genotypes if needed
 if (record->genotypes == NULL)
     vcfParseGenotypes(record);
 
 // walk through genotypes and set the bit
 int ix = 0;
 for ( ; ix < vcff->genotypeCount;ix++)
     {
     if (record->genotypes[ix].isPhased == FALSE
     && record->genotypes[ix].isHaploid == FALSE)
         bitSetOne(bits,ix);
     }
 return bits;
 }
 
 Bits *vcfRecordHaploidBits(struct vcfFile *vcff, struct vcfRecord *record)
 // Returns array (1 bit per genotype) for each haploid genotype.
 // This is useful for interpreting chrX.
 {
 // allocate bits
 Bits *bits = lmBitAlloc(vcfFileLm(vcff),vcff->genotypeCount);
 
 // parse genotypes if needed
 if (record->genotypes == NULL)
     vcfParseGenotypes(record);
 
 // walk through genotypes and set the bit
 int ix = 0;
 for ( ; ix < vcff->genotypeCount;ix++)
     {
     if (record->genotypes[ix].isHaploid)
         bitSetOne(bits,ix);
     }
 return bits;
 }
 
 int vBitsHaploidCount(struct variantBits *vBits)
 // Returns the number of subjects in the VCF dataset that are haploid at this location
 {
 int haploid = 0;  // Most are not haploid
 
 char *chrom = vBits->record->chrom;
 if (chrom[strlen(chrom) - 1] == 'Y')
     haploid = vcfBitsSubjectCount(vBits);
 else if (chrom[strlen(chrom) - 1] == 'X')  // chrX is tricky: males are haploid except in PAR
     {
     struct vcfRecord *record = vBits->record;
     struct vcfFile *vcff = record->file;
 
     // parse genotypes if needed
     if (record->genotypes == NULL)
         vcfParseGenotypes(record);
 
     // walk through genotypes
     int ix = 0;
     for ( ; ix < vcff->genotypeCount;ix++)
         {
         if (record->genotypes[ix].isHaploid)
             haploid++;
         }
     }
 return haploid;
 }
 
 int vBitsSubjectChromCount(struct variantBits *vBits)
 // Returns the chromosomes in the VCF dataset that are covered at this location
 {
 int haploid = vBitsHaploidCount(vBits);
 int diploid = vcfBitsSubjectCount(vBits) - haploid;
 return ((diploid * 2) + haploid);
 }
 
 struct variantBits *vcfRecordsToVariantBits(struct vcfFile *vcff, struct vcfRecord *records,
                                             boolean phasedOnly, boolean homozygousOnly,
                                             boolean unphasedBits)
 // Returns list of bit arrays covering all genotypes/haplotype/alleles per record for each record
 // provided.  If records is NULL will use vcff->records. Bit map has one slot per genotype
 // containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of
 // 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles
 // is represented by two bits per allele (>3 non-reference alleles unsupported).
 // If phasedOnly, unphased haplotype bits will be set only if both agree (00 is uninterpretable)
 // Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot.
 // If unphasedBits, then vBits->unphased will contain a bitmap with 1s for all unphased genotypes.
 // NOTE: allocated from vcff pool, so closing file or flushing reusePool will invalidate this.
 {
 struct variantBits *vBitsList = NULL;
 
 struct vcfRecord *record = records ? records : vcff->records;
 for (;record != NULL; record = record->next)
     {
     struct variantBits *vBits = vcfOneRecordToVariantBits(vcff, record, phasedOnly, homozygousOnly);
     if (unphasedBits)
         vBits->unphased = vcfRecordUnphasedBits(vcff, record);
     slAddHead(&vBitsList,vBits);
     }
 slReverse(&vBitsList);
 
 return vBitsList;
 }
 
 int vcfVariantBitsDropSparse(struct variantBits **vBitsList, int haploGenomeMin,
                              boolean dropRefMissing)
 // Drops vBits found in less than a minimum number of haplotype genomes.  Supplying 1 will
 // drop variants found in no haplotype genomes.  Declaring dropRefMissing will drop variants
 // in all haplotype genomes (variants where reference is not represented in dataset and
 // *might* be in error). Returns count of vBits structure that were dropped.
 {
 struct variantBits *vBitsKeep = NULL;
 struct variantBits *vBits = NULL;
 int dropped = 0;
 while ((vBits = slPopHead(vBitsList)) != NULL)
     {
     if (vBits->bitsOn >= haploGenomeMin)
         slAddHead(&vBitsKeep,vBits);
     else
         dropped++;  // if not demoted, drop memory, since this is on lm
     }
 *vBitsList = vBitsKeep;
 
 if (dropRefMissing)
     {
     vBitsKeep = NULL;
     while ((vBits = slPopHead(vBitsList)) != NULL)
         {
         boolean foundRef = FALSE;
         int genoIx = 0;
         for (; !foundRef && genoIx < vBits->genotypeSlots; genoIx++)
             {
             int haploIx = 0;
             for (; !foundRef && haploIx < vBits->haplotypeSlots; haploIx++)
                 {
                 if (bitCountRange(vBits->bits,
                                   vBitsSlot(vBits,genoIx,haploIx,0), vBits->alleleSlots) == 0)
                     foundRef = TRUE;
                 }
             }
 
         if (foundRef)
             slAddHead(&vBitsKeep,vBits);
         else  // drop memory, since this is on lm
             dropped++;  // if not demoted, drop memory, since this is on lm
         }
     *vBitsList = vBitsKeep;
     }
 else
     slReverse(vBitsList);
 
 return dropped;
 }
 
 int vcfVariantBitsAlleleOccurs(struct variantBits *vBits, unsigned char alleleIx,
                                boolean homozygousOrHaploid)
 // Counts the number of times a particular allele occurs in the subjects*chroms covered.
 // If homozygousOrHaploid then the allele must occur in both chroms to be counted
 {
 assert((vBits->alleleSlots == 1 && alleleIx <= 1)
     || (vBits->alleleSlots == 2 && alleleIx <= 3));
 
 // To count ref, we need to know the number of subjects the bitmap covers
 // chrX is the culprit here since it is haploid/diploid
 int subjTotal = vBits->genotypeSlots * vBits->haplotypeSlots;
 if (alleleIx == 0)
     subjTotal = vBitsSubjectChromCount(vBits);
 
 // Simple case can be taken care of easily
 if (vBits->alleleSlots == 1
 &&  (!homozygousOrHaploid || vBits->haplotypeSlots == 1))
     {
     if (alleleIx == 1)           // non-reference
         return vBits->bitsOn;
     else                         // reference
         return subjTotal - vBits->bitsOn;
     }
 
 // Otherwise, we have to walk the bitmap
 int occurs           = 0;
 int occursHomozygous = 0;
 int genoIx = 0;
 for (; genoIx < vBits->genotypeSlots; genoIx++)
     {
     int haploIx = 0;
     int occursThisSubject = 0;
     for (; haploIx < vBits->haplotypeSlots; haploIx++)
         {
         int bitSlot = vBitsSlot(vBits,genoIx,haploIx,0);
         switch (alleleIx)
             {
             case 0: if (bitCountRange(vBits->bits,bitSlot, vBits->alleleSlots) != 0)
                         occursThisSubject++; // Any non-reference bit
                     break;
             case 1: if (vBits->alleleSlots == 1)
                         {
                         if (bitReadOne(vBits->bits,bitSlot))
                             occursThisSubject++;
                         }
                     else
                         {
                         if ( bitReadOne(vBits->bits,bitSlot)
                         &&  !bitReadOne(vBits->bits,bitSlot + 1) )
                             occursThisSubject++;
                         }
                     break;
             case 2: if (!bitReadOne(vBits->bits,bitSlot)
                     &&   bitReadOne(vBits->bits,bitSlot + 1) )
                         occursThisSubject++;
                     break;
             case 3: if (bitCountRange(vBits->bits,bitSlot, 2) == 2)
                         occursThisSubject++;
                     break;
             default:
                     break;
             }
         }
     occurs += occursThisSubject;
     if ((alleleIx == 0 && occursThisSubject == 0)
     ||  (alleleIx >  0 && vBits->haplotypeSlots == occursThisSubject))
         occursHomozygous++;
     }
 if (alleleIx == 0)
     occurs = subjTotal - occurs;
 
 return (homozygousOrHaploid ? occursHomozygous : occurs);
 }
 
 int vcfVariantBitsReferenceOccurs(struct vcfFile *vcff, struct variantBits *vBitsList,
                                   boolean homozygousOrHaploid)
 // For an entire list of vBits structs, counts the times the reference allele occurs.
 // If homozygousOrHaploid then the reference allele must occur in both chroms to be counted
 {
 assert(vBitsList != NULL);
 struct variantBits *vBits = vBitsList;
 // Allocate temporary vBits struct which can hold the 'OR' version of all bitmaps in list
 struct lm *lm = vcfFileLm(vcff);
 struct variantBits *vBitsTemp;
 lmAllocVar(lm,vBitsTemp);
 vBitsTemp->genotypeSlots  = vBits->genotypeSlots;
 vBitsTemp->haplotypeSlots = vBits->haplotypeSlots;
 vBitsTemp->alleleSlots    = vBits->alleleSlots;
 vBitsTemp->record         = vBits->record;
 
 int bitSlots = vBitsSlotCount(vBits);
 vBitsTemp->bits = lmBitClone(lm,vBits->bits, bitSlots);
 
 
 for (vBits = vBits->next; vBits != NULL; vBits = vBits->next)
     bitOr(vBitsTemp->bits, vBits->bits, bitSlots);
 
 vBitsTemp->bitsOn = bitCountRange(vBitsTemp->bits,0,bitSlots);
 
 // Note the memory will be leaked but it is lm
 return vcfVariantBitsAlleleOccurs(vBitsTemp, 0, homozygousOrHaploid);
 }
 
 int vcfVariantMostPopularCmp(const void *va, const void *vb)
 // Compare to sort variantBits based upon how many genomes/chrom has the variant
 // This can be used to build haploBits in most populous order for tree building
 {
 const struct variantBits *a = *((struct variantBits **)va);
 const struct variantBits *b = *((struct variantBits **)vb);
 //int ret = memcmp(a->bits, b->bits, bitToByteSize(vBitsSlotCount(a)));
 int ret = (b->bitsOn - a->bitsOn); // higher bits first (descending order)
 if (ret == 0)
     { // sort deterministically
     ret = (a->record->chromStart - b->record->chromStart);
     if (ret == 0)
         ret = (a->record->chromEnd - b->record->chromEnd);
     }
 return ret;
 }
 
 struct haploBits *vcfVariantBitsToHaploBits(struct vcfFile *vcff, struct variantBits *vBitsList,
                                             boolean ignoreReference)
 // Converts a set of variant bits to haplotype bits, resulting in one bit struct
 // per haplotype genome that has non-reference variations.  If ignoreReference, only
 // haplotype genomes with at least one non-reference variant are returned.
 // A haploBit array has one variant slot per vBit struct and one or more bits per allele.
 // NOTE: allocated from vcff pool, so closing file or flushing reusePool will invalidate this.
 {
 // First determine dimensions
 int variantSlots = 0;
 unsigned char alleleSlots  = 1;
 struct variantBits *vBits = vBitsList;
 for ( ; vBits != NULL; vBits = vBits->next)
     {
     variantSlots++;
     if (vBits->alleleSlots == 2)
         alleleSlots = 2; // Will normalize allele slots across all variants.
     // This should catch any problems where chrX is haploid/diploid
     assert(vBitsList->haplotypeSlots == vBits->haplotypeSlots);
     }
 
 
 // Now create a struct per haplotype
 struct lm *lm = vcfFileLm(vcff);
 struct haploBits *hBitsList = NULL;
 int genoIx = 0;
 for (; genoIx < vBitsList->genotypeSlots; genoIx++)
     {
     int haploIx = 0;
     for (; haploIx < vBitsList->haplotypeSlots; haploIx++)
         {
         // allocate struct
         struct haploBits *hBits;
         lmAllocVar(lm,hBits);
         hBits->variantSlots = variantSlots;
         hBits->alleleSlots  = alleleSlots;
         hBits->haploGenomes = 1;
         int bitCount = hBitsSlotCount(hBits);
 
         // allocate bits
         Bits *bits = lmBitAlloc(lm,bitCount);
         hBits->bits = bits;
 
         int variantIx = 0;
         for (vBits =  vBitsList; vBits != NULL; vBits = vBits->next, variantIx++)
             {
             if (bitReadOne(vBits->bits,vBitsSlot(vBits,genoIx,haploIx,0)))
                 {
                 bitSetOne(        bits,hBitsSlot(hBits,variantIx,     0));
                 hBits->bitsOn++;
                 }
 
             if (vBits->alleleSlots == 2) // (hBit->alleleSlot will also == 2)
                 {
                 if (bitReadOne(vBits->bits,vBitsSlot(vBits,genoIx,haploIx,1)))
                     {
                     bitSetOne(        bits,hBitsSlot(hBits,variantIx,     1));
                     hBits->bitsOn++;
                     }
                 }
             }
 
         // gonna keep it? so flesh it out (ignoreRef means > 0 && < all)
         if (!ignoreReference || hBits->bitsOn)
             {
             hBits->genomeIx  = genoIx;
             hBits->haploidIx = haploIx;
             struct vcfRecord *record = vBitsList->record; // any will do
             struct vcfGenotype *gt = &(record->genotypes[genoIx]);
+
+            if (hBits->bitsOn                   // if including reference, then chrX could
+            ||  haploIx == 0 || !gt->isHaploid) // have unused diploid positions!
+                {
             if (gt->isHaploid || vBitsList->haplotypeSlots == 1)
                 { // chrX will have haplotypeSlots==2 but be haploid for this subject.
                   // Meanwhile if vBits were for homozygous only,  haplotypeSlots==1
                 //assert(haploIx == 0);
                 hBits->ids = lmCloneString(lm,gt->id);
                 }
             else
                 {
                 int sz = strlen(gt->id) + 3;
                 hBits->ids = lmAlloc(lm,sz);
                 safef(hBits->ids,sz,"%s-%c",gt->id,'a' + haploIx);
                 }
             slAddHead(&hBitsList,hBits);
             }
+            }
         //else
         //    haploBitsFree(&hBits); // lm so just abandon
         }
     }
 slReverse(&hBitsList);
 
 return hBitsList;
 }
 
 /*
 int vcfHaploBitsOnCmp(const void *va, const void *vb)
 // Compare to sort haploBits based upon how many bits are on
 {
 const struct haploBits *a = *((struct haploBits **)va);
 const struct haploBits *b = *((struct haploBits **)vb);
 int ret = (a->bitsOn - b->bitsOn);
 if (ret == 0)
     { // sort deterministically
     ret = (a->genomeIx - b->genomeIx);
     if (ret == 0)
         ret = (a->haploidIx - b->haploidIx);
     }
 return ret;
 }
 */
 
 int vcfHaploBitsMemCmp(const void *va, const void *vb)
 // Compare to sort haploBits based on bit by bit memory compare
 // When hBits has been created from vBits sorted by popularity,
 // a mem sort will then allow elmTreeGrow() to add popular variants first.
 {
 const struct haploBits *a = *((struct haploBits **)va);
 const struct haploBits *b = *((struct haploBits **)vb);
 int ret = memcmp(a->bits, b->bits, bitToByteSize(hBitsSlotCount(a)));
 if (ret == 0)
     { // sort deterministically
     ret = (a->genomeIx - b->genomeIx);
     if (ret == 0)
         ret = (a->haploidIx - b->haploidIx);
     }
 return ret;
 }
 
 static struct haploBits *hBitsCollapsePair(struct lm *lm, struct haploBits *hBits,
                                            struct haploBits *dupMap)
 // Adds Ids of duplicate bitmap to hBits and abandons the duplicate.
 {
 char *ids = lmAlloc(lm,strlen(dupMap->ids) + strlen(hBits->ids) + 2);
 // NOTE: Putting Dup Ids first could leave these in lowest genomeIx/haploidIx order
 sprintf(ids,"%s,%s",dupMap->ids,hBits->ids);
 hBits->ids = ids;
 hBits->haploGenomes++;
 hBits->genomeIx  = dupMap->genomeIx;  // Arbitrary but could leave lowest first
 hBits->haploidIx = dupMap->haploidIx;
 return hBits;
 }
 
 int vcfHaploBitsListCollapseIdentical(struct vcfFile *vcff, struct haploBits **phBitsList,
                                       int haploGenomeMin)
 // Collapses a list of haploBits based upon identical bit arrays.
 // If haploGenomeMin > 1, will drop all hBits structs covering less than N haploGenomes.
 // Returns count of hBit structs removed.
 {
 struct lm *lm = vcfFileLm(vcff);
 int collapsed = 0;
 struct haploBits *hBitsPassed = NULL;
 struct haploBits *hBitsToTest = *phBitsList;
 if (hBitsToTest->next == NULL)
     return 0;
 
 // Sort early bits first
 slSort(&hBitsToTest, vcfHaploBitsMemCmp);
 
 // Walk through those of matching size and compare
 int bitCount = hBitsSlotCount(hBitsToTest);
 int memSize = bitToByteSize(bitCount);
 struct haploBits *hBitsQ = NULL;
 while ((hBitsQ = slPopHead(&hBitsToTest)) != NULL)
     {
     struct haploBits *hBitsT = hBitsToTest; //hBitsQ->next; is already null.
     for (;hBitsT != NULL;hBitsT = hBitsT->next)
         {
         // If the 2 bitmaps do not have the same number of bits on, they are not identical
         if (hBitsT->bitsOn != hBitsQ->bitsOn)
             break;
 
         // If 2 bitmaps are identical, then collapse into one
         if (memcmp(hBitsT->bits,hBitsQ->bits,memSize) == 0)
             {     // collapse
             hBitsCollapsePair(lm,hBitsT,hBitsQ);
             hBitsQ = NULL; // abandon lm memory
             collapsed++;
             break;  // hBitsQ used up so make hBitsT the new Q.
             }
         }
 
     // If hBitsQ survived the collapsing, then don't loose it.
     if (hBitsQ != NULL)
         slAddHead(&hBitsPassed,hBitsQ);
     }
 
 // Now that we have collapsed hBits, we can drop any that are not found in enough subjects
 if (haploGenomeMin > 1)
     {
     hBitsToTest = hBitsPassed;
     hBitsPassed = NULL;
     while ((hBitsQ = slPopHead(hBitsToTest)) != NULL)
         {
         if (hBitsQ->haploGenomes >= haploGenomeMin)
             slAddHead(&hBitsPassed,hBitsQ);
         else  // drop memory, since this is on lm
             collapsed++;
         }
     // double pop passes means no reverse required.
     }
 else
     slReverse(&hBitsPassed);
 
 *phBitsList = hBitsPassed;
 
 return collapsed;
 }
 
 unsigned char vcfHaploBitsToVariantAlleleIx(struct haploBits *hBits,int bitIx)
 // Given a hBits struct and bitIx, what is the actual variant allele ix
 // to use when accessing the vcfRecord?  NOTE: here 0 is reference allele
 {
 if (hBits->alleleSlots == 1)
     return bitReadOne(hBits->bits,bitIx); // 0 or 1
 else
     {
     assert(hBits->alleleSlots == 2); // Only supports 1-3 variants encoded as 2 bits
     unsigned char varIx = 0;
     int bit = variantSlotFromBitIx(hBits,bitIx);
     if (bitReadOne(hBits->bits,bit++))
         varIx = 1;  // Note that bitmaps represent 1,2,3 as 10,01,11
     if (bitReadOne(hBits->bits,bit))
         varIx += 2;
     return varIx;
     }
 }
 
 enum elmNodeOverlap vcfHaploBitsCmp(const struct slList *elA, const struct slList *elB,
                                     int *matchWeight, void *extra)
 // HaploBits compare routine for building tree of relations using elmTreeGrow().
 {
 const struct haploBits *a = (struct haploBits *)elA;
 const struct haploBits *b = (struct haploBits *)elB;
 
 int bitCount = hBitsSlotCount(a);
 int and = bitAndCount(a->bits,b->bits,bitCount);
 if (matchWeight)
     *matchWeight = and;
 if (and == 0)// && a->bitsOn > 0)
     return enoExcluding; // nothing in common
 if (a->bitsOn == and)
     {
     if (b->bitsOn == and)
         return enoEqual; // perfect match
     else
         return enoSubset;
     }
 // super and mixed?
 if (b->bitsOn == and)
     return enoSuperset;
 return enoMixed;
 }
 
 struct slList *vcfHaploBitsMatching(const struct slList *elA, const struct slList *elB,
                                     void *extra)
 // Returns a HaploBits structure representing the common parts of elements A and B.
 // Used with elmTreeGrow() to create nodes that are the common parts between leaves/branches.
 {
 struct haploBits *a = (struct haploBits *)elA;
 struct haploBits *b = (struct haploBits *)elB;
 struct lm *lm = extra;
 
 // If the 2 bitmaps are equal, then collapse into a single struct
 if (enoEqual == vcfHaploBitsCmp(elA,elB,NULL,extra))
     {
     if (hBitsIsReal(a)) // bitmap represents variants on at least one subject chromosome
         {
         assert(!hBitsIsReal(b));  // Won't always work but hBits have been precollapsed
         if (hBitsIsReal(b)) // Should not be if these have already been collapsed.
             {
             hBitsCollapsePair(lm,a,b);    // a is real, b is real so collapse into one
             //haploBitsFreeList(&b);   // lm so just abandon
             return (struct slList *)a;
             }
         else
             return (struct slList *)a; // a is real, b is not so just use a
         }
     else if (hBitsIsReal(b))
         return (struct slList *)b;    // b is real, a is not so just use b
     }
 
 // Must make non "real" bitmap which is the common bits of a and b
 struct haploBits *match;
 lmAllocVar(lm,match);
 match->alleleSlots  = a->alleleSlots;
 match->variantSlots = a->variantSlots;
 match->haploGenomes = 0;  // Marking this as a subset
 
 int bitCount  = hBitsSlotCount(match);
 match->bits = lmBitClone(lm,a->bits, bitCount);
 bitAnd(match->bits, b->bits, bitCount);
 match->bitsOn = bitCountRange(match->bits, 0, bitCount);
 
 return (struct slList *)match;
 }