f53f70b9f131634b0eb32706a98f525fefa55cce tdreszer Tue Feb 5 17:36:46 2013 -0800 Some changes requested by Angie in code review, along with a bug fix. Next checkin will break out the vcfBits routines into separate .c/.h files diff --git src/lib/vcf.c src/lib/vcf.c index 4eb8f5b..3687fbb 100644 --- src/lib/vcf.c +++ src/lib/vcf.c @@ -417,41 +417,30 @@ AllocVar(vcff); vcff->pool = hashNew(0); vcff->reusePool = NULL; // Must explicitly request a separate record pool return vcff; } void vcfFileMakeReusePool(struct vcfFile *vcff, int initialSize) // Creates a separate memory pool for records. Establishing this pool allows // using vcfFileFlushRecords to abandon previously read records and free // the associated memory. Very useful when reading an entire file in batches. { assert(vcff->reusePool == NULL); // don't duplicate this vcff->reusePool = lmInit(initialSize); } -void vcfFileAbandonReusePool(struct vcfFile *vcff) -// Abandons all previously allocated data from the reuse pool and reverts to -// common pool. The vcf->records set will also be abandoned as pointers are invalid. -// USE WITH CAUTION. All previously allocated pointers from this pool are now invalid. -{ -assert(vcff->reusePool != NULL); // don't duplicate this -//printf("Reuse pool %ld of %ld unused\n",lmAvailable(vcff->reusePool),lmSize(vcff->reusePool)); -lmCleanup(&vcff->reusePool); -vcff->records = NULL; -} - void vcfFileFlushRecords(struct vcfFile *vcff) // Abandons all previously read vcff->records and flushes the reuse pool (if it exists). // USE WITH CAUTION. All previously allocated record pointers are now invalid. { if (vcff->reusePool != NULL) { size_t poolSize = lmSize(vcff->reusePool); //if (poolSize > (48 * 1024 * 1024)) // printf("\nReuse pool %ld of %ld unused\n",lmAvailable(vcff->reusePool),poolSize); lmCleanup(&vcff->reusePool); vcff->reusePool = lmInit(poolSize); } vcff->records = NULL; } @@ -819,78 +808,76 @@ const struct vcfRecord *a = *((struct vcfRecord **)va); const struct vcfRecord *b = *((struct vcfRecord **)vb); int dif; dif = strcmp(a->chrom, b->chrom); if (dif == 0) dif = a->chromStart - b->chromStart; if (dif == 0) dif = a->chromEnd - b->chromEnd; // shortest first if (dif == 0) dif = strcmp(a->name, b->name); // finally by name return dif; } int vcfTabixBatchRead(struct vcfFile *vcff, char *chrom, int start, int end, int maxErr, int maxRecords) -// Reads a batch of records from an opened and indexed VCF file, returning number -// of records in batch. Seeks to the start position and parses all lines in range, -// adding them to vcff->records. Note: vcff->records will continue to be sorted, -// even if batches are loaded out of order. If maxErr >= zero, then continue to -// parse until there are maxErr+1 errors. A maxErr less than zero does not stop -// and reports all errors. Set maxErr to VCF_IGNORE_ERRS for silence +// Reads a batch of records from an opened and indexed VCF file, adding them to +// vcff->records and returning the count of new records added in this batch. +// Note: vcff->records will continue to be sorted, even if batches are loaded +// out of order. Additionally, resulting vcff->records will contain no duplicates +// so returned count refects only the new records added, as opposed to all records +// in range. If maxErr >= zero, then continue to parse until there are maxErr+1 +// errors. A maxErr less than zero does not stop and reports all errors. Set +// maxErr to VCF_IGNORE_ERRS for silence. { -int count = 0; +int oldCount = slCount(vcff->records); if (lineFileSetTabixRegion(vcff->lf, chrom, start, end)) { struct vcfRecord *records = vcfParseData(vcff, maxRecords); if (records) { struct vcfRecord *lastRec = vcff->records; - count = slCount(records); if (lastRec == NULL) vcff->records = records; else { - for (; lastRec->next != NULL; lastRec = lastRec->next) - ; - lastRec->next = records; - // Do we need to sort? - if (vcfRecordCmp(&lastRec,&records) > 0) - slSort(&(vcff->records), vcfRecordCmp); + // Considered just asserting the batches were in order, but a problem may + // result when non-overlapping location windows pick up the same long variant. + slSortMergeUniq(&(vcff->records), records, vcfRecordCmp, NULL); } } } -return count; +return slCount(vcff->records) - oldCount; } void vcfFileFree(struct vcfFile **pVcff) /* Free a vcfFile object. */ { if (pVcff == NULL || *pVcff == NULL) return; struct vcfFile *vcff = *pVcff; if (vcff->maxErr == VCF_IGNORE_ERRS && vcff->errCnt > 0) { vcff->maxErr++; // Never completely silent! Go ahead and report how many errs were detected vcfFileErr(vcff,"Closing with %d errors.",vcff->errCnt); } freez(&(vcff->headerString)); hashFree(&(vcff->pool)); if (vcff->reusePool) - vcfFileAbandonReusePool(vcff); + lmCleanup(&vcff->reusePool); hashFree(&(vcff->byName)); lineFileClose(&(vcff->lf)); freez(pVcff); } const struct vcfRecord *vcfFileFindVariant(struct vcfFile *vcff, char *variantId) /* Return all records with name=variantId, or NULL if not found. */ { struct vcfRecord *varList = NULL; if (vcff->byName == NULL) { vcff->byName = hashNew(0); struct vcfRecord *rec; for (rec = vcff->records; rec != NULL; rec = rec->next) { @@ -1097,54 +1084,62 @@ // - - - - - - Support for bit map based analysis of variants - - - - - - static struct variantBits *vcfOneRecordToVariantBits(struct vcfFile *vcff,struct vcfRecord *record, boolean phasedOnly, boolean homozygousOnly) // Returns bit array covering all genotypes/haplotype/alleles per record. One slot per genotype // containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of // 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles // is represented by two bits per allele (>3 alternate alleles is unsupported). // If phasedOnly, unphased haplotype bits will all be set only if all agree (00 is uninterpretable) // Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot. { // parse genotypes if needed if (record->genotypes == NULL) vcfParseGenotypes(record); -struct vcfGenotype *gt = &(record->genotypes[0]); -assert(vcff->genotypeCount > 0 && gt != NULL); +assert(vcff->genotypeCount > 0); +int ix = 0; // allocate vBits struct struct variantBits *vBits; struct lm *lm = vcfFileLm(vcff); lmAllocVar(lm,vBits); vBits->genotypeSlots = vcff->genotypeCount; -vBits->haplotypeSlots = (gt->isHaploid || homozygousOnly ? 1 : 2); vBits->record = record; +vBits->haplotypeSlots = 2; // assume diploid, but then check for complete haploid +if (homozygousOnly) + vBits->haplotypeSlots = 1; +else + { // spin through all the subjects to see if any are non-haploid + for (ix = 0; ix < vcff->genotypeCount && record->genotypes[ix].isHaploid;ix++) + ; + if (ix == vcff->genotypeCount) + vBits->haplotypeSlots = 1; + } // allocate bits assert(record->alleleCount < 4); vBits->alleleSlots = (record->alleleCount <= 2 ? 1 : 2); int bitCount = vBitsSlotCount(vBits); Bits *bits = lmBitAlloc(lm,bitCount); vBits->bits = bits; // walk through genotypes and set the bit -int ix = 0; -for ( ; ix < vcff->genotypeCount;ix++) +for (ix = 0; ix < vcff->genotypeCount;ix++) { - gt = &(record->genotypes[ix]); + struct vcfGenotype *gt = gt = &(record->genotypes[ix]); boolean homozygous = (gt->isHaploid || gt->hapIxA == gt->hapIxB); if ((!phasedOnly || gt->isPhased || homozygous)) { if ((!homozygousOnly || homozygous) && gt->hapIxA > 0) { switch (gt->hapIxA) { case 1: bitSetOne(bits, vBitsSlot(vBits,ix,0,0)); vBits->bitsOn++; break; case 2: bitSetOne(bits, vBitsSlot(vBits,ix,0,1)); vBits->bitsOn++; break; case 3: bitSetRange(bits,vBitsSlot(vBits,ix,0,0),2); vBits->bitsOn += 2; break; default: break; } } @@ -1166,31 +1161,32 @@ static Bits *vcfRecordUnphasedBits(struct vcfFile *vcff, struct vcfRecord *record) // Returns array (1 bit per genotype) for each unphased genotype. { // allocate bits Bits *bits = lmBitAlloc(vcfFileLm(vcff),vcff->genotypeCount); // parse genotypes if needed if (record->genotypes == NULL) vcfParseGenotypes(record); // walk through genotypes and set the bit int ix = 0; for ( ; ix < vcff->genotypeCount;ix++) { - if (record->genotypes[ix].isPhased == FALSE) + if (record->genotypes[ix].isPhased == FALSE + && record->genotypes[ix].isHaploid == FALSE) bitSetOne(bits,ix); } return bits; } struct variantBits *vcfRecordsToVariantBits(struct vcfFile *vcff, struct vcfRecord *records, boolean phasedOnly, boolean homozygousOnly, boolean unphasedBits) // Returns list of bit arrays covering all genotypes/haplotype/alleles per record for each record // provided. If records is NULL will use vcff->records. Bit map has one slot per genotype // containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of // 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles // is represented by two bits per allele (>3 non-reference alleles unsupported). // If phasedOnly, unphased haplotype bits will be set only if both agree (00 is uninterpretable) // Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot. @@ -1303,32 +1299,36 @@ if (vBits->alleleSlots == 2) // (hBit->alleleSlot will also == 2) { if (bitReadOne(vBits->bits,vBitsSlot(vBits,genoIx,haploIx,1))) { bitSetOne( bits,hBitsSlot(hBits,variantIx, 1)); hBits->bitsOn++; } } } if (!ignoreReference || hBits->bitsOn > 0) // gonna keep it so flesh it out { hBits->genomeIx = genoIx; hBits->haploidIx = haploIx; struct vcfRecord *record = vBitsList->record; // any will do struct vcfGenotype *gt = &(record->genotypes[genoIx]); - if (vBitsList->haplotypeSlots == 1) + if (gt->isHaploid || vBitsList->haplotypeSlots == 1) + { // chrX will have haplotypeSlots==2 but be haploid for this subject. + // Meanwhile if vBits were for homozygous only, haplotypeSlots==1 + //assert(haploIx == 0); hBits->ids = lmCloneString(lm,gt->id); + } else { int sz = strlen(gt->id) + 3; hBits->ids = lmAlloc(lm,sz); safef(hBits->ids,sz,"%s-%c",gt->id,'a' + haploIx); } slAddHead(&hBitsList,hBits); } //else // haploBitsFree(&hBits); // lm so just abandon } } slReverse(&hBitsList); return hBitsList; @@ -1432,31 +1432,31 @@ if (hBitsQ->haploGenomes >= haploGenomeMin) slAddHead(&hBitsPassed,hBitsQ); else // drop memory, since this is on lm collapsed++; } // double pop passes means no reverse required. } else slReverse(&hBitsPassed); *phBitsList = hBitsPassed; return collapsed; } -unsigned char vcfHaploBitsToVariantIx(struct haploBits *hBits,int bitIx) +unsigned char vcfHaploBitsToVariantAlleleIx(struct haploBits *hBits,int bitIx) // Given a hBits struct and bitIx, what is the actual variant allele ix // to use when accessing the vcfRecord? NOTE: here 0 is reference allele { if (hBits->alleleSlots == 1) return bitReadOne(hBits->bits,bitIx); // 0 or 1 else { assert(hBits->alleleSlots == 2); // Only supports 1-3 variants encoded as 2 bits unsigned char varIx = 0; int bit = variantSlotFromBitIx(hBits,bitIx); if (bitReadOne(hBits->bits,bit++)) varIx = 1; // Note that bitmaps represent 1,2,3 as 10,01,11 if (bitReadOne(hBits->bits,bit)) varIx += 2; return varIx; @@ -1519,21 +1519,15 @@ // Must make non "real" bitmap which is the common bits of a and b struct haploBits *match; lmAllocVar(lm,match); match->alleleSlots = a->alleleSlots; match->variantSlots = a->variantSlots; match->haploGenomes = 0; // Marking this as a subset int bitCount = hBitsSlotCount(match); match->bits = lmBitClone(lm,a->bits, bitCount); bitAnd(match->bits, b->bits, bitCount); match->bitsOn = bitCountRange(match->bits, 0, bitCount); return (struct slList *)match; } - - - - - -