src/lib/vcf.c 752bae9723eeeabb02d950157ab0f9512fd689b3

752bae9723eeeabb02d950157ab0f9512fd689b3
tdreszer
  Mon Jan 28 15:05:59 2013 -0800
Checking in 'elmTree' which uses neighbor joining to build a tree in local memory.  Also checking in 2 sets of changes to vcf.  First, added support for a 'reuse' local memory pool which aid traversing giant files.  Second, added routines for bitmap based analysis of variants.
diff --git src/lib/vcf.c src/lib/vcf.c
index 9acc900..4eb8f5b 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -156,74 +156,76 @@
 static bool vcfFileStopDueToErrors(struct vcfFile *vcff)
 /* determine if we should stop due to the number of errors */
 {
 return vcff->errCnt > vcff->maxErr;
 }
 
 static void vcfFileErr(struct vcfFile *vcff, char *format, ...)
 #if defined(__GNUC__)
 __attribute__((format(printf, 2, 3)))
 #endif
 ;
 
 static void vcfFileErr(struct vcfFile *vcff, char *format, ...)
 /* Send error message to errabort stack's warn handler and abort */
 {
+vcff->errCnt++;
+if (vcff->maxErr == VCF_IGNORE_ERRS)
+    return;
 va_list args;
 va_start(args, format);
 char formatPlus[1024];
 if (vcff->lf != NULL)
     sprintf(formatPlus, "%s:%d: %s", vcff->lf->fileName, vcff->lf->lineIx, format);
 else
     strcpy(formatPlus, format);
 vaWarn(formatPlus, args);
 va_end(args);
-vcff->errCnt++;
 if (vcfFileStopDueToErrors(vcff))
     errAbort("VCF: %d parser errors, quitting", vcff->errCnt);
 }
 
 static void *vcfFileAlloc(struct vcfFile *vcff, size_t size)
 /* Use vcff's local mem to allocate memory. */
 {
-return lmAlloc(vcff->pool->lm, size);
+return lmAlloc( vcfFileLm(vcff), size);
 }
 
-static char *vcfFileCloneStrZ(struct vcfFile *vcff, char *str, size_t size)
+inline char *vcfFileCloneStrZ(struct vcfFile *vcff, char *str, size_t size)
 /* Use vcff's local mem to allocate memory for a string and copy it. */
 {
-return lmCloneStringZ(vcff->pool->lm, str, size);
+return lmCloneStringZ( vcfFileLm(vcff), str, size);
 }
 
-static char *vcfFileCloneStr(struct vcfFile *vcff, char *str)
+inline char *vcfFileCloneStr(struct vcfFile *vcff, char *str)
 /* Use vcff's local mem to allocate memory for a string and copy it. */
 {
 return vcfFileCloneStrZ(vcff, str, strlen(str));
 }
 
-static char *vcfFileCloneSubstr(struct vcfFile *vcff, char *line, regmatch_t substr)
+inline char *vcfFileCloneSubstr(struct vcfFile *vcff, char *line, regmatch_t substr)
 /* Allocate memory for and copy a substring of line. */
 {
 return vcfFileCloneStrZ(vcff, line+substr.rm_so, (substr.rm_eo - substr.rm_so));
 }
 
-#define vcfFileCloneVar(var) lmCloneMem(vcff->pool->lm, var, sizeof(var));
+#define vcfFileCloneVar(vcff, var) lmCloneMem( vcfFileLm(vcff), var, sizeof(var))
 
 char *vcfFilePooledStr(struct vcfFile *vcff, char *str)
 /* Allocate memory for a string from vcff's shared string pool. */
 {
-return hashStoreName(vcff->pool, str);
+return hashStoreName(vcff->pool, str);  // Always stored in main pool, not reuse pool
 }
 
 static enum vcfInfoType vcfInfoTypeFromSubstr(struct vcfFile *vcff, char *line, regmatch_t substr)
 /* Translate substring of line into vcfInfoType or complain. */
 {
 char typeWord[16];
 int substrLen = substr.rm_eo - substr.rm_so;
 if (substrLen > sizeof(typeWord) - 1)
     {
     vcfFileErr(vcff, "substring passed to vcfInfoTypeFromSubstr is too long.");
     return vcfInfoNoType;
     }
 safencpy(typeWord, sizeof(typeWord), line + substr.rm_so, substrLen);
 if (sameString("Integer", typeWord))
     return vcfInfoInteger;
@@ -402,37 +404,74 @@
 	vcfFileErr(vcff, "FORMAT column is given, but no sample IDs for genotype columns...?");
     vcff->genotypeCount = (wordCount - 9);
     vcff->genotypeIds = vcfFileAlloc(vcff, vcff->genotypeCount * sizeof(char *));
     int i;
     for (i = 9;  i < wordCount;  i++)
 	vcff->genotypeIds[i-9] = vcfFileCloneStr(vcff, words[i]);
     }
 }
 
 static struct vcfFile *vcfFileNew()
 /* Return a new, empty vcfFile object. */
 {
 struct vcfFile *vcff = NULL;
 AllocVar(vcff);
 vcff->pool = hashNew(0);
+vcff->reusePool = NULL;  // Must explicitly request a separate record pool
 return vcff;
 }
 
+void vcfFileMakeReusePool(struct vcfFile *vcff, int initialSize)
+// Creates a separate memory pool for records.  Establishing this pool allows
+// using vcfFileFlushRecords to abandon previously read records and free
+// the associated memory. Very useful when reading an entire file in batches.
+{
+assert(vcff->reusePool == NULL); // don't duplicate this
+vcff->reusePool = lmInit(initialSize);
+}
+
+void vcfFileAbandonReusePool(struct vcfFile *vcff)
+// Abandons all previously allocated data from the reuse pool and reverts to
+// common pool. The vcf->records set will also be abandoned as pointers are invalid.
+// USE WITH CAUTION.  All previously allocated pointers from this pool are now invalid.
+{
+assert(vcff->reusePool != NULL); // don't duplicate this
+//printf("Reuse pool %ld of %ld unused\n",lmAvailable(vcff->reusePool),lmSize(vcff->reusePool));
+lmCleanup(&vcff->reusePool);
+vcff->records = NULL;
+}
+
+void vcfFileFlushRecords(struct vcfFile *vcff)
+// Abandons all previously read vcff->records and flushes the reuse pool (if it exists).
+// USE WITH CAUTION.  All previously allocated record pointers are now invalid.
+{
+if (vcff->reusePool != NULL)
+    {
+    size_t poolSize = lmSize(vcff->reusePool);
+    //if (poolSize > (48 * 1024 * 1024))
+    //    printf("\nReuse pool %ld of %ld unused\n",lmAvailable(vcff->reusePool),poolSize);
+    lmCleanup(&vcff->reusePool);
+    vcff->reusePool = lmInit(poolSize);
+    }
+vcff->records = NULL;
+}
+
 static struct vcfFile *vcfFileHeaderFromLineFile(struct lineFile *lf, int maxErr)
 /* Parse a VCF file into a vcfFile object.  If maxErr not zero, then
  * continue to parse until this number of error have been reached.  A maxErr
- * less than zero does not stop and reports all errors. */
+ * less than zero does not stop and reports all errors.
+ * Set maxErr to VCF_IGNORE_ERRS for silence */
 {
 initVcfSpecInfoDefs();
 initVcfSpecGtFormatDefs();
 if (lf == NULL)
     return NULL;
 struct vcfFile *vcff = vcfFileNew();
 vcff->lf = lf;
 vcff->fileOrUrl = vcfFileCloneStr(vcff, lf->fileName);
 vcff->maxErr = (maxErr < 0) ? INT_MAX : maxErr;
 
 struct dyString *dyHeader = dyStringNew(1024);
 char *line = NULL;
 // First, metadata lines beginning with "##":
 while (lineFileNext(lf, &line, NULL) && startsWith("##", line))
     {
@@ -694,131 +733,198 @@
     if (allSameFirstBase)
 	{
 	rec->chromStart++;
 	for (i = 0;  i < rec->alleleCount;  i++)
 	    {
 	    if (rec->alleles[i][1] == '\0')
 		rec->alleles[i] = vcfFilePooledStr(vcff, "-");
 	    else
 		rec->alleles[i] = vcfFilePooledStr(vcff, rec->alleles[i]+1);
 	    }
 	}
     }
 return chromStartOrig;
 }
 
-static void vcfParseData(struct vcfFile *vcff, int maxRecords)
-/* Given a vcfFile into which the header has been parsed, and whose lineFile is positioned
- * at the beginning of a data row, parse and store all data rows from lineFile. */
+static struct vcfRecord *vcfParseData(struct vcfFile *vcff, int maxRecords)
+// Given a vcfFile into which the header has been parsed, and whose
+// lineFile is positioned at the beginning of a data row, parse and
+// return all data rows from lineFile.
 {
 if (vcff == NULL)
-    return;
+    return NULL;
 int recCount = 0;
+struct vcfRecord *records = NULL;
 struct vcfRecord *record;
 while ((record = vcfNextRecord(vcff)) != NULL)
     {
     if (maxRecords >= 0 && recCount >= maxRecords)
 	break;
-    slAddHead(&(vcff->records), record);
+    slAddHead(&records, record);
     recCount++;
     }
-slReverse(&(vcff->records));
-lineFileClose(&(vcff->lf));
+slReverse(&records);
+
+return records;
 }
 
 struct vcfFile *vcfFileMayOpen(char *fileOrUrl, int maxErr, int maxRecords, boolean parseAll)
 /* Open fileOrUrl and parse VCF header; return NULL if unable.
  * If parseAll, then read in all lines, parse and store in
  * vcff->records; if maxErr >= zero, then continue to parse until
  * there are maxErr+1 errors.  A maxErr less than zero does not stop
- * and reports all errors. */
+ * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence */
 {
 struct lineFile *lf = NULL;
 if (startsWith("http://", fileOrUrl) || startsWith("ftp://", fileOrUrl) ||
     startsWith("https://", fileOrUrl))
     lf = netLineFileOpen(fileOrUrl);
 else
     lf = lineFileMayOpen(fileOrUrl, TRUE);
 struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr);
 if (parseAll)
-    vcfParseData(vcff, maxRecords);
+    {
+    vcff->records = vcfParseData(vcff, maxRecords);
+    lineFileClose(&(vcff->lf)); // Not sure why it is closed.  Angie?
+    }
 return vcff;
 }
 
 struct vcfFile *vcfTabixFileMayOpen(char *fileOrUrl, char *chrom, int start, int end,
 				    int maxErr, int maxRecords)
 /* Open a VCF file that has been compressed and indexed by tabix and
  * parse VCF header, or return NULL if unable.  If chrom is non-NULL,
  * seek to the position range and parse all lines in range into
  * vcff->records.  If maxErr >= zero, then continue to parse until
  * there are maxErr+1 errors.  A maxErr less than zero does not stop
- * and reports all errors. */
+ * and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence */
 {
 struct lineFile *lf = lineFileTabixMayOpen(fileOrUrl, TRUE);
 struct vcfFile *vcff = vcfFileHeaderFromLineFile(lf, maxErr);
 if (vcff == NULL)
     return NULL;
 if (isNotEmpty(chrom) && start != end)
     {
     if (lineFileSetTabixRegion(lf, chrom, start, end))
-	vcfParseData(vcff, maxRecords);
+        {
+        vcff->records = vcfParseData(vcff, maxRecords);
+        lineFileClose(&(vcff->lf)); // Not sure why it is closed.  Angie?
+        }
     }
 return vcff;
 }
 
+int vcfRecordCmp(const void *va, const void *vb)
+/* Compare to sort based on position. */
+{
+const struct vcfRecord *a = *((struct vcfRecord **)va);
+const struct vcfRecord *b = *((struct vcfRecord **)vb);
+int dif;
+dif = strcmp(a->chrom, b->chrom);
+if (dif == 0)
+    dif = a->chromStart - b->chromStart;
+if (dif == 0)
+    dif = a->chromEnd - b->chromEnd; // shortest first
+if (dif == 0)
+    dif = strcmp(a->name, b->name);  // finally by name
+return dif;
+}
+
+int vcfTabixBatchRead(struct vcfFile *vcff, char *chrom, int start, int end,
+                      int maxErr, int maxRecords)
+// Reads a batch of records from an opened and indexed VCF file, returning number
+// of records in batch.  Seeks to the start position and parses all lines in range,
+// adding them to vcff->records.  Note: vcff->records will continue to be sorted,
+// even if batches are loaded out of order.  If maxErr >= zero, then continue to
+// parse until there are maxErr+1 errors.  A maxErr less than zero does not stop
+// and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence
+{
+int count = 0;
+
+if (lineFileSetTabixRegion(vcff->lf, chrom, start, end))
+    {
+    struct vcfRecord *records = vcfParseData(vcff, maxRecords);
+    if (records)
+        {
+        struct vcfRecord *lastRec = vcff->records;
+        count = slCount(records);
+        if (lastRec == NULL)
+            vcff->records = records;
+        else
+            {
+            for (; lastRec->next != NULL; lastRec = lastRec->next)
+                ;
+            lastRec->next = records;
+            // Do we need to sort?
+            if (vcfRecordCmp(&lastRec,&records) > 0)
+                slSort(&(vcff->records), vcfRecordCmp);
+            }
+        }
+    }
+
+return count;
+}
+
 void vcfFileFree(struct vcfFile **pVcff)
 /* Free a vcfFile object. */
 {
 if (pVcff == NULL || *pVcff == NULL)
     return;
 struct vcfFile *vcff = *pVcff;
+if (vcff->maxErr == VCF_IGNORE_ERRS && vcff->errCnt > 0)
+    {
+    vcff->maxErr++; // Never completely silent!  Go ahead and report how many errs were detected
+    vcfFileErr(vcff,"Closing with %d errors.",vcff->errCnt);
+    }
 freez(&(vcff->headerString));
 hashFree(&(vcff->pool));
+if (vcff->reusePool)
+    vcfFileAbandonReusePool(vcff);
 hashFree(&(vcff->byName));
 lineFileClose(&(vcff->lf));
 freez(pVcff);
 }
 
 const struct vcfRecord *vcfFileFindVariant(struct vcfFile *vcff, char *variantId)
 /* Return all records with name=variantId, or NULL if not found. */
 {
 struct vcfRecord *varList = NULL;
 if (vcff->byName == NULL)
     {
     vcff->byName = hashNew(0);
     struct vcfRecord *rec;
     for (rec = vcff->records;  rec != NULL;  rec = rec->next)
 	{
 	if (sameString (rec->name, variantId))
 	    {
 	    // Make shallow copy of rec so we can alter ->next:
-	    struct vcfRecord *newRec = vcfFileCloneVar(rec);
+	    struct vcfRecord *newRec = vcfFileCloneVar(vcff,rec);
 	    slAddHead(&varList, newRec);
 	    }
 	hashAdd(vcff->byName, rec->name, rec);
 	}
     slReverse(&varList);
     }
 else
     {
     struct hashEl *hel = hashLookup(vcff->byName, variantId);
     while (hel != NULL)
 	{
 	if (sameString(hel->name, variantId))
 	    {
 	    struct vcfRecord *rec = hel->val;
-	    struct vcfRecord *newRec = vcfFileCloneVar(rec);
+	    struct vcfRecord *newRec = vcfFileCloneVar(vcff,rec);
 	    slAddHead(&varList, newRec);
 	    }
 	hel = hel->next;
 	}
     // Don't reverse varList -- hash element list was already reversed
     }
 return varList;
 }
 
 const struct vcfInfoElement *vcfRecordFindInfo(const struct vcfRecord *record, char *key)
 /* Find an INFO element, or NULL. */
 {
 int i;
 for (i = 0;  i < record->infoCount;  i++)
     {
@@ -977,15 +1083,457 @@
         "    string info;       \"Additional information encoded as a semicolon-separated series "
                                  "of short keys with optional comma-separated values\""
         "    string format;     \"If genotype columns are specified in header, a "
                                  "semicolon-separated list of of short keys starting with GT\""
         "    string genotypes;  \"If genotype columns are specified in header, a tab-separated "
                                  "set of genotype column values; each value is a colon-separated "
                                  "list of values corresponding to keys in the format column\""
         "    )";
 
 struct asObject *vcfAsObj()
 // Return asObject describing fields of VCF
 {
 return asParseText(vcfDataLineAutoSqlString);
 }
 
+// - - - - - - Support for bit map based analysis of variants - - - - - -
+
+static struct variantBits *vcfOneRecordToVariantBits(struct vcfFile *vcff,struct vcfRecord *record,
+                                                     boolean phasedOnly, boolean homozygousOnly)
+// Returns bit array covering all genotypes/haplotype/alleles per record.  One slot per genotype
+// containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of
+// 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles
+// is represented by two bits per allele (>3 alternate alleles is unsupported).
+// If phasedOnly, unphased haplotype bits will all be set only if all agree (00 is uninterpretable)
+// Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot.
+{
+// parse genotypes if needed
+if (record->genotypes == NULL)
+    vcfParseGenotypes(record);
+struct vcfGenotype *gt = &(record->genotypes[0]);
+assert(vcff->genotypeCount > 0 && gt != NULL);
+
+// allocate vBits struct
+struct variantBits *vBits;
+struct lm *lm = vcfFileLm(vcff);
+lmAllocVar(lm,vBits);
+vBits->genotypeSlots = vcff->genotypeCount;
+vBits->haplotypeSlots = (gt->isHaploid || homozygousOnly ? 1 : 2);
+vBits->record = record;
+
+// allocate bits
+assert(record->alleleCount < 4);
+vBits->alleleSlots = (record->alleleCount <= 2 ? 1 : 2);
+int bitCount = vBitsSlotCount(vBits);
+Bits *bits = lmBitAlloc(lm,bitCount);
+vBits->bits = bits;
+
+
+// walk through genotypes and set the bit
+int ix = 0;
+for ( ; ix < vcff->genotypeCount;ix++)
+    {
+    gt = &(record->genotypes[ix]);
+
+    boolean homozygous = (gt->isHaploid || gt->hapIxA == gt->hapIxB);
+
+    if ((!phasedOnly || gt->isPhased || homozygous))
+        {
+        if ((!homozygousOnly || homozygous) && gt->hapIxA > 0)
+            {
+            switch (gt->hapIxA)
+                {
+                case 1: bitSetOne(bits,  vBitsSlot(vBits,ix,0,0));   vBits->bitsOn++;    break;
+                case 2: bitSetOne(bits,  vBitsSlot(vBits,ix,0,1));   vBits->bitsOn++;    break;
+                case 3: bitSetRange(bits,vBitsSlot(vBits,ix,0,0),2); vBits->bitsOn += 2; break;
+                default:                                             break;
+                }
+            }
+        if (!gt->isHaploid && !homozygousOnly && gt->hapIxB > 0)
+            {
+            switch (gt->hapIxB)
+                {
+                case 1: bitSetOne(bits,  vBitsSlot(vBits,ix,1,0));   vBits->bitsOn++;    break;
+                case 2: bitSetOne(bits,  vBitsSlot(vBits,ix,1,1));   vBits->bitsOn++;    break;
+                case 3: bitSetRange(bits,vBitsSlot(vBits,ix,1,0),2); vBits->bitsOn += 2; break;
+                default:                                             break;
+                }
+            }
+        }
+    }
+
+return vBits;
+}
+
+static Bits *vcfRecordUnphasedBits(struct vcfFile *vcff, struct vcfRecord *record)
+// Returns array (1 bit per genotype) for each unphased genotype.
+{
+// allocate bits
+Bits *bits = lmBitAlloc(vcfFileLm(vcff),vcff->genotypeCount);
+
+// parse genotypes if needed
+if (record->genotypes == NULL)
+    vcfParseGenotypes(record);
+
+// walk through genotypes and set the bit
+int ix = 0;
+for ( ; ix < vcff->genotypeCount;ix++)
+    {
+    if (record->genotypes[ix].isPhased == FALSE)
+        bitSetOne(bits,ix);
+    }
+return bits;
+}
+
+struct variantBits *vcfRecordsToVariantBits(struct vcfFile *vcff, struct vcfRecord *records,
+                                            boolean phasedOnly, boolean homozygousOnly,
+                                            boolean unphasedBits)
+// Returns list of bit arrays covering all genotypes/haplotype/alleles per record for each record
+// provided.  If records is NULL will use vcff->records. Bit map has one slot per genotype
+// containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of
+// 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles
+// is represented by two bits per allele (>3 non-reference alleles unsupported).
+// If phasedOnly, unphased haplotype bits will be set only if both agree (00 is uninterpretable)
+// Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot.
+// If unphasedBits, then vBits->unphased will contain a bitmap with 1s for all unphased genotypes.
+// NOTE: allocated from vcff pool, so closing file or flushing reusePool will invalidate this.
+{
+struct variantBits *vBitsList = NULL;
+
+struct vcfRecord *record = records ? records : vcff->records;
+for (;record != NULL; record = record->next)
+    {
+    struct variantBits *vBits = vcfOneRecordToVariantBits(vcff, record, phasedOnly, homozygousOnly);
+    if (unphasedBits)
+        vBits->unphased = vcfRecordUnphasedBits(vcff, record);
+    slAddHead(&vBitsList,vBits);
+    }
+slReverse(&vBitsList);
+
+return vBitsList;
+}
+
+int vcfVariantBitsDropSparse(struct variantBits **vBitsList, int haploGenomeMin)
+// Drops vBits found in less than a minimum number of haplotype genomes.
+// Returns count of vBits structure that were dropped.
+{
+struct variantBits *vBitsKeep = NULL;
+struct variantBits *vBits = NULL;
+int dropped = 0;
+while ((vBits = slPopHead(vBitsList)) != NULL)
+    {
+    if (vBits->bitsOn >= haploGenomeMin)
+        slAddHead(&vBitsKeep,vBits);
+    else  // drop memory, since this is on lm
+        dropped++;
+    }
+slReverse(&vBitsKeep);
+*vBitsList = vBitsKeep;
+return dropped;
+}
+
+int vcfVariantMostPopularCmp(const void *va, const void *vb)
+// Compare to sort variantBits based upon how many genomes/chrom has the variant
+// This can be used to build haploBits in most populous order for tree building
+{
+const struct variantBits *a = *((struct variantBits **)va);
+const struct variantBits *b = *((struct variantBits **)vb);
+//int ret = memcmp(a->bits, b->bits, bitToByteSize(vBitsSlotCount(a)));
+int ret = (b->bitsOn - a->bitsOn); // higher bits first (descending order)
+if (ret == 0)
+    { // sort deterministically
+    ret = (a->record->chromStart - b->record->chromStart);
+    if (ret == 0)
+        ret = (a->record->chromEnd - b->record->chromEnd);
+    }
+return ret;
+}
+
+struct haploBits *vcfVariantBitsToHaploBits(struct vcfFile *vcff, struct variantBits *vBitsList,
+                                            boolean ignoreReference)
+// Converts a set of variant bits to haplotype bits, resulting in one bit struct
+// per haplotype genome that has non-reference variations.  If ignoreReference, only
+// haplotype genomes with at least one non-reference variant are returned.
+// A haploBit array has one variant slot per vBit struct and one or more bits per allele.
+// NOTE: allocated from vcff pool, so closing file or flushing reusePool will invalidate this.
+{
+// First determine dimensions
+int variantSlots = 0;
+unsigned char alleleSlots  = 1;
+struct variantBits *vBits = vBitsList;
+for ( ; vBits != NULL; vBits = vBits->next)
+    {
+    variantSlots++;
+    if (vBits->alleleSlots == 2)
+        alleleSlots = 2; // Will normalize allele slots across all variants.
+    // This should catch any problems where chrX is haploid/diploid
+    assert(vBitsList->haplotypeSlots == vBits->haplotypeSlots);
+    }
+
+
+// Now create a struct per haplotype
+struct lm *lm = vcfFileLm(vcff);
+struct haploBits *hBitsList = NULL;
+int genoIx = 0;
+for (; genoIx < vBitsList->genotypeSlots; genoIx++)
+    {
+    int haploIx = 0;
+    for (; haploIx < vBitsList->haplotypeSlots; haploIx++)
+        {
+        // allocate struct
+        struct haploBits *hBits;
+        lmAllocVar(lm,hBits);
+        hBits->variantSlots = variantSlots;
+        hBits->alleleSlots  = alleleSlots;
+        hBits->haploGenomes = 1;
+        int bitCount = hBitsSlotCount(hBits);
+
+        // allocate bits
+        Bits *bits = lmBitAlloc(lm,bitCount);
+        hBits->bits = bits;
+
+        int variantIx = 0;
+        for (vBits =  vBitsList; vBits != NULL; vBits = vBits->next, variantIx++)
+            {
+            if (bitReadOne(vBits->bits,vBitsSlot(vBits,genoIx,haploIx,0)))
+                {
+                bitSetOne(        bits,hBitsSlot(hBits,variantIx,     0));
+                hBits->bitsOn++;
+                }
+
+            if (vBits->alleleSlots == 2) // (hBit->alleleSlot will also == 2)
+                {
+                if (bitReadOne(vBits->bits,vBitsSlot(vBits,genoIx,haploIx,1)))
+                    {
+                    bitSetOne(        bits,hBitsSlot(hBits,variantIx,     1));
+                    hBits->bitsOn++;
+                    }
+                }
+            }
+        if (!ignoreReference || hBits->bitsOn > 0) // gonna keep it so flesh it out
+            {
+            hBits->genomeIx  = genoIx;
+            hBits->haploidIx = haploIx;
+            struct vcfRecord *record = vBitsList->record; // any will do
+            struct vcfGenotype *gt = &(record->genotypes[genoIx]);
+            if (vBitsList->haplotypeSlots == 1)
+                hBits->ids = lmCloneString(lm,gt->id);
+            else
+                {
+                int sz = strlen(gt->id) + 3;
+                hBits->ids = lmAlloc(lm,sz);
+                safef(hBits->ids,sz,"%s-%c",gt->id,'a' + haploIx);
+                }
+            slAddHead(&hBitsList,hBits);
+            }
+        //else
+        //    haploBitsFree(&hBits); // lm so just abandon
+        }
+    }
+slReverse(&hBitsList);
+
+return hBitsList;
+}
+
+int vcfHaploBitsOnCmp(const void *va, const void *vb)
+// Compare to sort haploBits based upon how many bits are on
+{
+const struct haploBits *a = *((struct haploBits **)va);
+const struct haploBits *b = *((struct haploBits **)vb);
+int ret = (a->bitsOn - b->bitsOn);
+if (ret == 0)
+    { // sort deterministically
+    ret = (a->genomeIx - b->genomeIx);
+    if (ret == 0)
+        ret = (a->haploidIx - b->haploidIx);
+    }
+return ret;
+}
+
+int vcfHaploMemCmp(const void *va, const void *vb)
+// Compare to sort haploBits based upon how many bits are on
+{
+const struct haploBits *a = *((struct haploBits **)va);
+const struct haploBits *b = *((struct haploBits **)vb);
+int ret = memcmp(a->bits, b->bits, bitToByteSize(hBitsSlotCount(a)));
+if (ret == 0)
+    { // sort deterministically
+    ret = (a->genomeIx - b->genomeIx);
+    if (ret == 0)
+        ret = (a->haploidIx - b->haploidIx);
+    }
+return ret;
+}
+
+static struct haploBits *hBitsCollapsePair(struct lm *lm, struct haploBits *hBits,
+                                           struct haploBits *dupMap)
+// Adds Ids of duplicate bitmap to hBits and abandons the duplicate.
+{
+char *ids = lmAlloc(lm,strlen(dupMap->ids) + strlen(hBits->ids) + 2);
+// NOTE: Putting Dup Ids first could leave these in lowest genomeIx/haploidIx order
+sprintf(ids,"%s,%s",dupMap->ids,hBits->ids);
+hBits->ids = ids;
+hBits->haploGenomes++;
+hBits->genomeIx  = dupMap->genomeIx;  // Arbitrary but could leave lowest first
+hBits->haploidIx = dupMap->haploidIx;
+return hBits;
+}
+
+int vcfHaploBitsListCollapseIdentical(struct vcfFile *vcff, struct haploBits **phBitsList,
+                                      int haploGenomeMin)
+// Collapses a list of haploBits based upon identical bit arrays.
+// If haploGenomeMin > 1, will drop all hBits structs covering less than N haploGenomes.
+// Returns count of hBit structs removed.
+{
+struct lm *lm = vcfFileLm(vcff);
+int collapsed = 0;
+struct haploBits *hBitsPassed = NULL;
+struct haploBits *hBitsToTest = *phBitsList;
+if (hBitsToTest->next == NULL)
+    return 0;
+
+// Sort early bits first
+slSort(&hBitsToTest, vcfHaploMemCmp);
+
+// Walk through those of matching size and compare
+int bitCount = hBitsSlotCount(hBitsToTest);
+int memSize = bitToByteSize(bitCount);
+struct haploBits *hBitsQ = NULL;
+while ((hBitsQ = slPopHead(&hBitsToTest)) != NULL)
+    {
+    struct haploBits *hBitsT = hBitsToTest; //hBitsQ->next; is already null.
+    for (;hBitsT != NULL;hBitsT = hBitsT->next)
+        {
+        // If the 2 bitmaps do not have the same number of bits on, they are not identical
+        if (hBitsT->bitsOn != hBitsQ->bitsOn)
+            break;
+
+        // If 2 bitmaps are identical, then collapse into one
+        if (memcmp(hBitsT->bits,hBitsQ->bits,memSize) == 0)
+            {     // collapse
+            hBitsCollapsePair(lm,hBitsT,hBitsQ);
+            hBitsQ = NULL; // abandon lm memory
+            collapsed++;
+            break;  // hBitsQ used up so make hBitsT the new Q.
+            }
+        }
+
+    // If hBitsQ survived the collapsing, then don't loose it.
+    if (hBitsQ != NULL)
+        slAddHead(&hBitsPassed,hBitsQ);
+    }
+
+// Now that we have collapsed hBits, we can drop any that are not found in enough subjects
+if (haploGenomeMin > 1)
+    {
+    hBitsToTest = hBitsPassed;
+    hBitsPassed = NULL;
+    while ((hBitsQ = slPopHead(hBitsToTest)) != NULL)
+        {
+        if (hBitsQ->haploGenomes >= haploGenomeMin)
+            slAddHead(&hBitsPassed,hBitsQ);
+        else  // drop memory, since this is on lm
+            collapsed++;
+        }
+    // double pop passes means no reverse required.
+    }
+else
+    slReverse(&hBitsPassed);
+
+*phBitsList = hBitsPassed;
+
+return collapsed;
+}
+
+unsigned char vcfHaploBitsToVariantIx(struct haploBits *hBits,int bitIx)
+// Given a hBits struct and bitIx, what is the actual variant allele ix
+// to use when accessing the vcfRecord?  NOTE: here 0 is reference allele
+{
+if (hBits->alleleSlots == 1)
+    return bitReadOne(hBits->bits,bitIx); // 0 or 1
+else
+    {
+    assert(hBits->alleleSlots == 2); // Only supports 1-3 variants encoded as 2 bits
+    unsigned char varIx = 0;
+    int bit = variantSlotFromBitIx(hBits,bitIx);
+    if (bitReadOne(hBits->bits,bit++))
+        varIx = 1;  // Note that bitmaps represent 1,2,3 as 10,01,11
+    if (bitReadOne(hBits->bits,bit))
+        varIx += 2;
+    return varIx;
+    }
+}
+
+enum elmNodeOverlap vcfHaploBitsCmp(const struct slList *elA, const struct slList *elB,
+                                    int *matchWeight, void *extra)
+// HaploBits compare routine for building tree of relations using elmTreeGrow().
+{
+const struct haploBits *a = (struct haploBits *)elA;
+const struct haploBits *b = (struct haploBits *)elB;
+
+int bitCount = hBitsSlotCount(a);
+int and = bitAndCount(a->bits,b->bits,bitCount);
+if (matchWeight)
+    *matchWeight = and;
+if (and == 0)// && a->bitsOn > 0)
+    return enoExcluding; // nothing in common
+if (a->bitsOn == and)
+    {
+    if (b->bitsOn == and)
+        return enoEqual; // perfect match
+    else
+        return enoSubset;
+    }
+// super and mixed?
+if (b->bitsOn == and)
+    return enoSuperset;
+return enoMixed;
+}
+
+struct slList *vcfHaploBitsMatching(const struct slList *elA, const struct slList *elB,
+                                    void *extra)
+// Returns a HaploBits structure representing the common parts of elements A and B.
+// Used with elmTreeGrow() to create nodes that are the common parts between leaves/branches.
+{
+struct haploBits *a = (struct haploBits *)elA;
+struct haploBits *b = (struct haploBits *)elB;
+struct lm *lm = extra;
+
+// If the 2 bitmaps are equal, then collapse into a single struct
+if (enoEqual == vcfHaploBitsCmp(elA,elB,NULL,extra))
+    {
+    if (hBitsIsReal(a)) // bitmap represents variants on at least one subject chromosome
+        {
+        assert(!hBitsIsReal(b));  // Won't always work but hBits have been precollapsed
+        if (hBitsIsReal(b)) // Should not be if these have already been collapsed.
+            {
+            hBitsCollapsePair(lm,a,b);    // a is real, b is real so collapse into one
+            //haploBitsFreeList(&b);   // lm so just abandon
+            return (struct slList *)a;
+            }
+        else
+            return (struct slList *)a; // a is real, b is not so just use a
+        }
+    else if (hBitsIsReal(b))
+        return (struct slList *)b;    // b is real, a is not so just use b
+    }
+
+// Must make non "real" bitmap which is the common bits of a and b
+struct haploBits *match;
+lmAllocVar(lm,match);
+match->alleleSlots  = a->alleleSlots;
+match->variantSlots = a->variantSlots;
+match->haploGenomes = 0;  // Marking this as a subset
+
+int bitCount  = hBitsSlotCount(match);
+match->bits = lmBitClone(lm,a->bits, bitCount);
+bitAnd(match->bits, b->bits, bitCount);
+match->bitsOn = bitCountRange(match->bits, 0, bitCount);
+
+return (struct slList *)match;
+}
+
+
+
+
+
+