f53f70b9f131634b0eb32706a98f525fefa55cce
tdreszer
  Tue Feb 5 17:36:46 2013 -0800
Some changes requested by Angie in code review, along with a bug fix.  Next checkin will break out the vcfBits routines into separate .c/.h files
diff --git src/lib/vcf.c src/lib/vcf.c
index 4eb8f5b..3687fbb 100644
--- src/lib/vcf.c
+++ src/lib/vcf.c
@@ -417,41 +417,30 @@
 AllocVar(vcff);
 vcff->pool = hashNew(0);
 vcff->reusePool = NULL;  // Must explicitly request a separate record pool
 return vcff;
 }
 
 void vcfFileMakeReusePool(struct vcfFile *vcff, int initialSize)
 // Creates a separate memory pool for records.  Establishing this pool allows
 // using vcfFileFlushRecords to abandon previously read records and free
 // the associated memory. Very useful when reading an entire file in batches.
 {
 assert(vcff->reusePool == NULL); // don't duplicate this
 vcff->reusePool = lmInit(initialSize);
 }
 
-void vcfFileAbandonReusePool(struct vcfFile *vcff)
-// Abandons all previously allocated data from the reuse pool and reverts to
-// common pool. The vcf->records set will also be abandoned as pointers are invalid.
-// USE WITH CAUTION.  All previously allocated pointers from this pool are now invalid.
-{
-assert(vcff->reusePool != NULL); // don't duplicate this
-//printf("Reuse pool %ld of %ld unused\n",lmAvailable(vcff->reusePool),lmSize(vcff->reusePool));
-lmCleanup(&vcff->reusePool);
-vcff->records = NULL;
-}
-
 void vcfFileFlushRecords(struct vcfFile *vcff)
 // Abandons all previously read vcff->records and flushes the reuse pool (if it exists).
 // USE WITH CAUTION.  All previously allocated record pointers are now invalid.
 {
 if (vcff->reusePool != NULL)
     {
     size_t poolSize = lmSize(vcff->reusePool);
     //if (poolSize > (48 * 1024 * 1024))
     //    printf("\nReuse pool %ld of %ld unused\n",lmAvailable(vcff->reusePool),poolSize);
     lmCleanup(&vcff->reusePool);
     vcff->reusePool = lmInit(poolSize);
     }
 vcff->records = NULL;
 }
 
@@ -819,78 +808,76 @@
 const struct vcfRecord *a = *((struct vcfRecord **)va);
 const struct vcfRecord *b = *((struct vcfRecord **)vb);
 int dif;
 dif = strcmp(a->chrom, b->chrom);
 if (dif == 0)
     dif = a->chromStart - b->chromStart;
 if (dif == 0)
     dif = a->chromEnd - b->chromEnd; // shortest first
 if (dif == 0)
     dif = strcmp(a->name, b->name);  // finally by name
 return dif;
 }
 
 int vcfTabixBatchRead(struct vcfFile *vcff, char *chrom, int start, int end,
                       int maxErr, int maxRecords)
-// Reads a batch of records from an opened and indexed VCF file, returning number
-// of records in batch.  Seeks to the start position and parses all lines in range,
-// adding them to vcff->records.  Note: vcff->records will continue to be sorted,
-// even if batches are loaded out of order.  If maxErr >= zero, then continue to
-// parse until there are maxErr+1 errors.  A maxErr less than zero does not stop
-// and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence
+// Reads a batch of records from an opened and indexed VCF file, adding them to
+// vcff->records and returning the count of new records added in this batch.
+// Note: vcff->records will continue to be sorted, even if batches are loaded
+// out of order.  Additionally, resulting vcff->records will contain no duplicates
+// so returned count refects only the new records added, as opposed to all records
+// in range.  If maxErr >= zero, then continue to parse until there are maxErr+1
+// errors.  A maxErr less than zero does not stop and reports all errors.  Set
+// maxErr to VCF_IGNORE_ERRS for silence.
 {
-int count = 0;
+int oldCount = slCount(vcff->records);
 
 if (lineFileSetTabixRegion(vcff->lf, chrom, start, end))
     {
     struct vcfRecord *records = vcfParseData(vcff, maxRecords);
     if (records)
         {
         struct vcfRecord *lastRec = vcff->records;
-        count = slCount(records);
         if (lastRec == NULL)
             vcff->records = records;
         else
             {
-            for (; lastRec->next != NULL; lastRec = lastRec->next)
-                ;
-            lastRec->next = records;
-            // Do we need to sort?
-            if (vcfRecordCmp(&lastRec,&records) > 0)
-                slSort(&(vcff->records), vcfRecordCmp);
+            // Considered just asserting the batches were in order, but a problem may
+            // result when non-overlapping location windows pick up the same long variant.
+            slSortMergeUniq(&(vcff->records), records, vcfRecordCmp, NULL);
             }
         }
     }
 
-return count;
+return slCount(vcff->records) - oldCount;
 }
 
 void vcfFileFree(struct vcfFile **pVcff)
 /* Free a vcfFile object. */
 {
 if (pVcff == NULL || *pVcff == NULL)
     return;
 struct vcfFile *vcff = *pVcff;
 if (vcff->maxErr == VCF_IGNORE_ERRS && vcff->errCnt > 0)
     {
     vcff->maxErr++; // Never completely silent!  Go ahead and report how many errs were detected
     vcfFileErr(vcff,"Closing with %d errors.",vcff->errCnt);
     }
 freez(&(vcff->headerString));
 hashFree(&(vcff->pool));
 if (vcff->reusePool)
-    vcfFileAbandonReusePool(vcff);
+    lmCleanup(&vcff->reusePool);
 hashFree(&(vcff->byName));
 lineFileClose(&(vcff->lf));
 freez(pVcff);
 }
 
 const struct vcfRecord *vcfFileFindVariant(struct vcfFile *vcff, char *variantId)
 /* Return all records with name=variantId, or NULL if not found. */
 {
 struct vcfRecord *varList = NULL;
 if (vcff->byName == NULL)
     {
     vcff->byName = hashNew(0);
     struct vcfRecord *rec;
     for (rec = vcff->records;  rec != NULL;  rec = rec->next)
 	{
@@ -1097,54 +1084,62 @@
 
 // - - - - - - Support for bit map based analysis of variants - - - - - -
 
 static struct variantBits *vcfOneRecordToVariantBits(struct vcfFile *vcff,struct vcfRecord *record,
                                                      boolean phasedOnly, boolean homozygousOnly)
 // Returns bit array covering all genotypes/haplotype/alleles per record.  One slot per genotype
 // containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of
 // 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles
 // is represented by two bits per allele (>3 alternate alleles is unsupported).
 // If phasedOnly, unphased haplotype bits will all be set only if all agree (00 is uninterpretable)
 // Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot.
 {
 // parse genotypes if needed
 if (record->genotypes == NULL)
     vcfParseGenotypes(record);
-struct vcfGenotype *gt = &(record->genotypes[0]);
-assert(vcff->genotypeCount > 0 && gt != NULL);
+assert(vcff->genotypeCount > 0);
+int ix = 0;
 
 // allocate vBits struct
 struct variantBits *vBits;
 struct lm *lm = vcfFileLm(vcff);
 lmAllocVar(lm,vBits);
 vBits->genotypeSlots = vcff->genotypeCount;
-vBits->haplotypeSlots = (gt->isHaploid || homozygousOnly ? 1 : 2);
 vBits->record = record;
+vBits->haplotypeSlots = 2;  // assume diploid, but then check for complete haploid
+if (homozygousOnly)
+    vBits->haplotypeSlots = 1;
+else
+    { // spin through all the subjects to see if any are non-haploid
+    for (ix = 0; ix < vcff->genotypeCount && record->genotypes[ix].isHaploid;ix++)
+        ;
+    if (ix == vcff->genotypeCount)
+        vBits->haplotypeSlots = 1;
+    }
 
 // allocate bits
 assert(record->alleleCount < 4);
 vBits->alleleSlots = (record->alleleCount <= 2 ? 1 : 2);
 int bitCount = vBitsSlotCount(vBits);
 Bits *bits = lmBitAlloc(lm,bitCount);
 vBits->bits = bits;
 
 
 // walk through genotypes and set the bit
-int ix = 0;
-for ( ; ix < vcff->genotypeCount;ix++)
+for (ix = 0; ix < vcff->genotypeCount;ix++)
     {
-    gt = &(record->genotypes[ix]);
+    struct vcfGenotype *gt = gt = &(record->genotypes[ix]);
 
     boolean homozygous = (gt->isHaploid || gt->hapIxA == gt->hapIxB);
 
     if ((!phasedOnly || gt->isPhased || homozygous))
         {
         if ((!homozygousOnly || homozygous) && gt->hapIxA > 0)
             {
             switch (gt->hapIxA)
                 {
                 case 1: bitSetOne(bits,  vBitsSlot(vBits,ix,0,0));   vBits->bitsOn++;    break;
                 case 2: bitSetOne(bits,  vBitsSlot(vBits,ix,0,1));   vBits->bitsOn++;    break;
                 case 3: bitSetRange(bits,vBitsSlot(vBits,ix,0,0),2); vBits->bitsOn += 2; break;
                 default:                                             break;
                 }
             }
@@ -1166,31 +1161,32 @@
 
 static Bits *vcfRecordUnphasedBits(struct vcfFile *vcff, struct vcfRecord *record)
 // Returns array (1 bit per genotype) for each unphased genotype.
 {
 // allocate bits
 Bits *bits = lmBitAlloc(vcfFileLm(vcff),vcff->genotypeCount);
 
 // parse genotypes if needed
 if (record->genotypes == NULL)
     vcfParseGenotypes(record);
 
 // walk through genotypes and set the bit
 int ix = 0;
 for ( ; ix < vcff->genotypeCount;ix++)
     {
-    if (record->genotypes[ix].isPhased == FALSE)
+    if (record->genotypes[ix].isPhased == FALSE
+    && record->genotypes[ix].isHaploid == FALSE)
         bitSetOne(bits,ix);
     }
 return bits;
 }
 
 struct variantBits *vcfRecordsToVariantBits(struct vcfFile *vcff, struct vcfRecord *records,
                                             boolean phasedOnly, boolean homozygousOnly,
                                             boolean unphasedBits)
 // Returns list of bit arrays covering all genotypes/haplotype/alleles per record for each record
 // provided.  If records is NULL will use vcff->records. Bit map has one slot per genotype
 // containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of
 // 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles
 // is represented by two bits per allele (>3 non-reference alleles unsupported).
 // If phasedOnly, unphased haplotype bits will be set only if both agree (00 is uninterpretable)
 // Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot.
@@ -1303,32 +1299,36 @@
             if (vBits->alleleSlots == 2) // (hBit->alleleSlot will also == 2)
                 {
                 if (bitReadOne(vBits->bits,vBitsSlot(vBits,genoIx,haploIx,1)))
                     {
                     bitSetOne(        bits,hBitsSlot(hBits,variantIx,     1));
                     hBits->bitsOn++;
                     }
                 }
             }
         if (!ignoreReference || hBits->bitsOn > 0) // gonna keep it so flesh it out
             {
             hBits->genomeIx  = genoIx;
             hBits->haploidIx = haploIx;
             struct vcfRecord *record = vBitsList->record; // any will do
             struct vcfGenotype *gt = &(record->genotypes[genoIx]);
-            if (vBitsList->haplotypeSlots == 1)
+            if (gt->isHaploid || vBitsList->haplotypeSlots == 1)
+                { // chrX will have haplotypeSlots==2 but be haploid for this subject.
+                  // Meanwhile if vBits were for homozygous only,  haplotypeSlots==1
+                //assert(haploIx == 0);
                 hBits->ids = lmCloneString(lm,gt->id);
+                }
             else
                 {
                 int sz = strlen(gt->id) + 3;
                 hBits->ids = lmAlloc(lm,sz);
                 safef(hBits->ids,sz,"%s-%c",gt->id,'a' + haploIx);
                 }
             slAddHead(&hBitsList,hBits);
             }
         //else
         //    haploBitsFree(&hBits); // lm so just abandon
         }
     }
 slReverse(&hBitsList);
 
 return hBitsList;
@@ -1432,31 +1432,31 @@
         if (hBitsQ->haploGenomes >= haploGenomeMin)
             slAddHead(&hBitsPassed,hBitsQ);
         else  // drop memory, since this is on lm
             collapsed++;
         }
     // double pop passes means no reverse required.
     }
 else
     slReverse(&hBitsPassed);
 
 *phBitsList = hBitsPassed;
 
 return collapsed;
 }
 
-unsigned char vcfHaploBitsToVariantIx(struct haploBits *hBits,int bitIx)
+unsigned char vcfHaploBitsToVariantAlleleIx(struct haploBits *hBits,int bitIx)
 // Given a hBits struct and bitIx, what is the actual variant allele ix
 // to use when accessing the vcfRecord?  NOTE: here 0 is reference allele
 {
 if (hBits->alleleSlots == 1)
     return bitReadOne(hBits->bits,bitIx); // 0 or 1
 else
     {
     assert(hBits->alleleSlots == 2); // Only supports 1-3 variants encoded as 2 bits
     unsigned char varIx = 0;
     int bit = variantSlotFromBitIx(hBits,bitIx);
     if (bitReadOne(hBits->bits,bit++))
         varIx = 1;  // Note that bitmaps represent 1,2,3 as 10,01,11
     if (bitReadOne(hBits->bits,bit))
         varIx += 2;
     return varIx;
@@ -1519,21 +1519,15 @@
 
 // Must make non "real" bitmap which is the common bits of a and b
 struct haploBits *match;
 lmAllocVar(lm,match);
 match->alleleSlots  = a->alleleSlots;
 match->variantSlots = a->variantSlots;
 match->haploGenomes = 0;  // Marking this as a subset
 
 int bitCount  = hBitsSlotCount(match);
 match->bits = lmBitClone(lm,a->bits, bitCount);
 bitAnd(match->bits, b->bits, bitCount);
 match->bitsOn = bitCountRange(match->bits, 0, bitCount);
 
 return (struct slList *)match;
 }
-
-
-
-
-
-