716be62e7d41dc55f37642a85c7868bc6023c470 tdreszer Tue Feb 5 18:24:43 2013 -0800 Split vcfBits.c/.h off from vcf.c/.h. No code changes made. diff --git src/lib/vcfBits.c src/lib/vcfBits.c new file mode 100644 index 0000000..358e60c --- /dev/null +++ src/lib/vcfBits.c @@ -0,0 +1,463 @@ +/* VCF: Variant Call Format, version 4.0 / 4.1 + * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-40 + * http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41 + */ + +#include "common.h" +#include "dnautil.h" +#include "errabort.h" +#include +#include "localmem.h" +#include "net.h" +#include "regexHelper.h" +#include "vcf.h" +#include "vcfBits.h" + + +static struct variantBits *vcfOneRecordToVariantBits(struct vcfFile *vcff,struct vcfRecord *record, + boolean phasedOnly, boolean homozygousOnly) +// Returns bit array covering all genotypes/haplotype/alleles per record. One slot per genotype +// containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of +// 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles +// is represented by two bits per allele (>3 alternate alleles is unsupported). +// If phasedOnly, unphased haplotype bits will all be set only if all agree (00 is uninterpretable) +// Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot. +{ +// parse genotypes if needed +if (record->genotypes == NULL) + vcfParseGenotypes(record); +assert(vcff->genotypeCount > 0); +int ix = 0; + +// allocate vBits struct +struct variantBits *vBits; +struct lm *lm = vcfFileLm(vcff); +lmAllocVar(lm,vBits); +vBits->genotypeSlots = vcff->genotypeCount; +vBits->record = record; +vBits->haplotypeSlots = 2; // assume diploid, but then check for complete haploid +if (homozygousOnly) + vBits->haplotypeSlots = 1; +else + { // spin through all the subjects to see if any are non-haploid + for (ix = 0; ix < vcff->genotypeCount && record->genotypes[ix].isHaploid;ix++) + ; + if (ix == vcff->genotypeCount) + vBits->haplotypeSlots = 1; + } + +// allocate bits +assert(record->alleleCount < 4); +vBits->alleleSlots = (record->alleleCount <= 2 ? 1 : 2); +int bitCount = vBitsSlotCount(vBits); +Bits *bits = lmBitAlloc(lm,bitCount); +vBits->bits = bits; + + +// walk through genotypes and set the bit +for (ix = 0; ix < vcff->genotypeCount;ix++) + { + struct vcfGenotype *gt = gt = &(record->genotypes[ix]); + + boolean homozygous = (gt->isHaploid || gt->hapIxA == gt->hapIxB); + + if ((!phasedOnly || gt->isPhased || homozygous)) + { + if ((!homozygousOnly || homozygous) && gt->hapIxA > 0) + { + switch (gt->hapIxA) + { + case 1: bitSetOne(bits, vBitsSlot(vBits,ix,0,0)); vBits->bitsOn++; break; + case 2: bitSetOne(bits, vBitsSlot(vBits,ix,0,1)); vBits->bitsOn++; break; + case 3: bitSetRange(bits,vBitsSlot(vBits,ix,0,0),2); vBits->bitsOn += 2; break; + default: break; + } + } + if (!gt->isHaploid && !homozygousOnly && gt->hapIxB > 0) + { + switch (gt->hapIxB) + { + case 1: bitSetOne(bits, vBitsSlot(vBits,ix,1,0)); vBits->bitsOn++; break; + case 2: bitSetOne(bits, vBitsSlot(vBits,ix,1,1)); vBits->bitsOn++; break; + case 3: bitSetRange(bits,vBitsSlot(vBits,ix,1,0),2); vBits->bitsOn += 2; break; + default: break; + } + } + } + } + +return vBits; +} + +static Bits *vcfRecordUnphasedBits(struct vcfFile *vcff, struct vcfRecord *record) +// Returns array (1 bit per genotype) for each unphased genotype. +{ +// allocate bits +Bits *bits = lmBitAlloc(vcfFileLm(vcff),vcff->genotypeCount); + +// parse genotypes if needed +if (record->genotypes == NULL) + vcfParseGenotypes(record); + +// walk through genotypes and set the bit +int ix = 0; +for ( ; ix < vcff->genotypeCount;ix++) + { + if (record->genotypes[ix].isPhased == FALSE + && record->genotypes[ix].isHaploid == FALSE) + bitSetOne(bits,ix); + } +return bits; +} + +struct variantBits *vcfRecordsToVariantBits(struct vcfFile *vcff, struct vcfRecord *records, + boolean phasedOnly, boolean homozygousOnly, + boolean unphasedBits) +// Returns list of bit arrays covering all genotypes/haplotype/alleles per record for each record +// provided. If records is NULL will use vcff->records. Bit map has one slot per genotype +// containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of +// 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles +// is represented by two bits per allele (>3 non-reference alleles unsupported). +// If phasedOnly, unphased haplotype bits will be set only if both agree (00 is uninterpretable) +// Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot. +// If unphasedBits, then vBits->unphased will contain a bitmap with 1s for all unphased genotypes. +// NOTE: allocated from vcff pool, so closing file or flushing reusePool will invalidate this. +{ +struct variantBits *vBitsList = NULL; + +struct vcfRecord *record = records ? records : vcff->records; +for (;record != NULL; record = record->next) + { + struct variantBits *vBits = vcfOneRecordToVariantBits(vcff, record, phasedOnly, homozygousOnly); + if (unphasedBits) + vBits->unphased = vcfRecordUnphasedBits(vcff, record); + slAddHead(&vBitsList,vBits); + } +slReverse(&vBitsList); + +return vBitsList; +} + +int vcfVariantBitsDropSparse(struct variantBits **vBitsList, int haploGenomeMin) +// Drops vBits found in less than a minimum number of haplotype genomes. +// Returns count of vBits structure that were dropped. +{ +struct variantBits *vBitsKeep = NULL; +struct variantBits *vBits = NULL; +int dropped = 0; +while ((vBits = slPopHead(vBitsList)) != NULL) + { + if (vBits->bitsOn >= haploGenomeMin) + slAddHead(&vBitsKeep,vBits); + else // drop memory, since this is on lm + dropped++; + } +slReverse(&vBitsKeep); +*vBitsList = vBitsKeep; +return dropped; +} + +int vcfVariantMostPopularCmp(const void *va, const void *vb) +// Compare to sort variantBits based upon how many genomes/chrom has the variant +// This can be used to build haploBits in most populous order for tree building +{ +const struct variantBits *a = *((struct variantBits **)va); +const struct variantBits *b = *((struct variantBits **)vb); +//int ret = memcmp(a->bits, b->bits, bitToByteSize(vBitsSlotCount(a))); +int ret = (b->bitsOn - a->bitsOn); // higher bits first (descending order) +if (ret == 0) + { // sort deterministically + ret = (a->record->chromStart - b->record->chromStart); + if (ret == 0) + ret = (a->record->chromEnd - b->record->chromEnd); + } +return ret; +} + +struct haploBits *vcfVariantBitsToHaploBits(struct vcfFile *vcff, struct variantBits *vBitsList, + boolean ignoreReference) +// Converts a set of variant bits to haplotype bits, resulting in one bit struct +// per haplotype genome that has non-reference variations. If ignoreReference, only +// haplotype genomes with at least one non-reference variant are returned. +// A haploBit array has one variant slot per vBit struct and one or more bits per allele. +// NOTE: allocated from vcff pool, so closing file or flushing reusePool will invalidate this. +{ +// First determine dimensions +int variantSlots = 0; +unsigned char alleleSlots = 1; +struct variantBits *vBits = vBitsList; +for ( ; vBits != NULL; vBits = vBits->next) + { + variantSlots++; + if (vBits->alleleSlots == 2) + alleleSlots = 2; // Will normalize allele slots across all variants. + // This should catch any problems where chrX is haploid/diploid + assert(vBitsList->haplotypeSlots == vBits->haplotypeSlots); + } + + +// Now create a struct per haplotype +struct lm *lm = vcfFileLm(vcff); +struct haploBits *hBitsList = NULL; +int genoIx = 0; +for (; genoIx < vBitsList->genotypeSlots; genoIx++) + { + int haploIx = 0; + for (; haploIx < vBitsList->haplotypeSlots; haploIx++) + { + // allocate struct + struct haploBits *hBits; + lmAllocVar(lm,hBits); + hBits->variantSlots = variantSlots; + hBits->alleleSlots = alleleSlots; + hBits->haploGenomes = 1; + int bitCount = hBitsSlotCount(hBits); + + // allocate bits + Bits *bits = lmBitAlloc(lm,bitCount); + hBits->bits = bits; + + int variantIx = 0; + for (vBits = vBitsList; vBits != NULL; vBits = vBits->next, variantIx++) + { + if (bitReadOne(vBits->bits,vBitsSlot(vBits,genoIx,haploIx,0))) + { + bitSetOne( bits,hBitsSlot(hBits,variantIx, 0)); + hBits->bitsOn++; + } + + if (vBits->alleleSlots == 2) // (hBit->alleleSlot will also == 2) + { + if (bitReadOne(vBits->bits,vBitsSlot(vBits,genoIx,haploIx,1))) + { + bitSetOne( bits,hBitsSlot(hBits,variantIx, 1)); + hBits->bitsOn++; + } + } + } + if (!ignoreReference || hBits->bitsOn > 0) // gonna keep it so flesh it out + { + hBits->genomeIx = genoIx; + hBits->haploidIx = haploIx; + struct vcfRecord *record = vBitsList->record; // any will do + struct vcfGenotype *gt = &(record->genotypes[genoIx]); + if (gt->isHaploid || vBitsList->haplotypeSlots == 1) + { // chrX will have haplotypeSlots==2 but be haploid for this subject. + // Meanwhile if vBits were for homozygous only, haplotypeSlots==1 + //assert(haploIx == 0); + hBits->ids = lmCloneString(lm,gt->id); + } + else + { + int sz = strlen(gt->id) + 3; + hBits->ids = lmAlloc(lm,sz); + safef(hBits->ids,sz,"%s-%c",gt->id,'a' + haploIx); + } + slAddHead(&hBitsList,hBits); + } + //else + // haploBitsFree(&hBits); // lm so just abandon + } + } +slReverse(&hBitsList); + +return hBitsList; +} + +int vcfHaploBitsOnCmp(const void *va, const void *vb) +// Compare to sort haploBits based upon how many bits are on +{ +const struct haploBits *a = *((struct haploBits **)va); +const struct haploBits *b = *((struct haploBits **)vb); +int ret = (a->bitsOn - b->bitsOn); +if (ret == 0) + { // sort deterministically + ret = (a->genomeIx - b->genomeIx); + if (ret == 0) + ret = (a->haploidIx - b->haploidIx); + } +return ret; +} + +int vcfHaploMemCmp(const void *va, const void *vb) +// Compare to sort haploBits based upon how many bits are on +{ +const struct haploBits *a = *((struct haploBits **)va); +const struct haploBits *b = *((struct haploBits **)vb); +int ret = memcmp(a->bits, b->bits, bitToByteSize(hBitsSlotCount(a))); +if (ret == 0) + { // sort deterministically + ret = (a->genomeIx - b->genomeIx); + if (ret == 0) + ret = (a->haploidIx - b->haploidIx); + } +return ret; +} + +static struct haploBits *hBitsCollapsePair(struct lm *lm, struct haploBits *hBits, + struct haploBits *dupMap) +// Adds Ids of duplicate bitmap to hBits and abandons the duplicate. +{ +char *ids = lmAlloc(lm,strlen(dupMap->ids) + strlen(hBits->ids) + 2); +// NOTE: Putting Dup Ids first could leave these in lowest genomeIx/haploidIx order +sprintf(ids,"%s,%s",dupMap->ids,hBits->ids); +hBits->ids = ids; +hBits->haploGenomes++; +hBits->genomeIx = dupMap->genomeIx; // Arbitrary but could leave lowest first +hBits->haploidIx = dupMap->haploidIx; +return hBits; +} + +int vcfHaploBitsListCollapseIdentical(struct vcfFile *vcff, struct haploBits **phBitsList, + int haploGenomeMin) +// Collapses a list of haploBits based upon identical bit arrays. +// If haploGenomeMin > 1, will drop all hBits structs covering less than N haploGenomes. +// Returns count of hBit structs removed. +{ +struct lm *lm = vcfFileLm(vcff); +int collapsed = 0; +struct haploBits *hBitsPassed = NULL; +struct haploBits *hBitsToTest = *phBitsList; +if (hBitsToTest->next == NULL) + return 0; + +// Sort early bits first +slSort(&hBitsToTest, vcfHaploMemCmp); + +// Walk through those of matching size and compare +int bitCount = hBitsSlotCount(hBitsToTest); +int memSize = bitToByteSize(bitCount); +struct haploBits *hBitsQ = NULL; +while ((hBitsQ = slPopHead(&hBitsToTest)) != NULL) + { + struct haploBits *hBitsT = hBitsToTest; //hBitsQ->next; is already null. + for (;hBitsT != NULL;hBitsT = hBitsT->next) + { + // If the 2 bitmaps do not have the same number of bits on, they are not identical + if (hBitsT->bitsOn != hBitsQ->bitsOn) + break; + + // If 2 bitmaps are identical, then collapse into one + if (memcmp(hBitsT->bits,hBitsQ->bits,memSize) == 0) + { // collapse + hBitsCollapsePair(lm,hBitsT,hBitsQ); + hBitsQ = NULL; // abandon lm memory + collapsed++; + break; // hBitsQ used up so make hBitsT the new Q. + } + } + + // If hBitsQ survived the collapsing, then don't loose it. + if (hBitsQ != NULL) + slAddHead(&hBitsPassed,hBitsQ); + } + +// Now that we have collapsed hBits, we can drop any that are not found in enough subjects +if (haploGenomeMin > 1) + { + hBitsToTest = hBitsPassed; + hBitsPassed = NULL; + while ((hBitsQ = slPopHead(hBitsToTest)) != NULL) + { + if (hBitsQ->haploGenomes >= haploGenomeMin) + slAddHead(&hBitsPassed,hBitsQ); + else // drop memory, since this is on lm + collapsed++; + } + // double pop passes means no reverse required. + } +else + slReverse(&hBitsPassed); + +*phBitsList = hBitsPassed; + +return collapsed; +} + +unsigned char vcfHaploBitsToVariantAlleleIx(struct haploBits *hBits,int bitIx) +// Given a hBits struct and bitIx, what is the actual variant allele ix +// to use when accessing the vcfRecord? NOTE: here 0 is reference allele +{ +if (hBits->alleleSlots == 1) + return bitReadOne(hBits->bits,bitIx); // 0 or 1 +else + { + assert(hBits->alleleSlots == 2); // Only supports 1-3 variants encoded as 2 bits + unsigned char varIx = 0; + int bit = variantSlotFromBitIx(hBits,bitIx); + if (bitReadOne(hBits->bits,bit++)) + varIx = 1; // Note that bitmaps represent 1,2,3 as 10,01,11 + if (bitReadOne(hBits->bits,bit)) + varIx += 2; + return varIx; + } +} + +enum elmNodeOverlap vcfHaploBitsCmp(const struct slList *elA, const struct slList *elB, + int *matchWeight, void *extra) +// HaploBits compare routine for building tree of relations using elmTreeGrow(). +{ +const struct haploBits *a = (struct haploBits *)elA; +const struct haploBits *b = (struct haploBits *)elB; + +int bitCount = hBitsSlotCount(a); +int and = bitAndCount(a->bits,b->bits,bitCount); +if (matchWeight) + *matchWeight = and; +if (and == 0)// && a->bitsOn > 0) + return enoExcluding; // nothing in common +if (a->bitsOn == and) + { + if (b->bitsOn == and) + return enoEqual; // perfect match + else + return enoSubset; + } +// super and mixed? +if (b->bitsOn == and) + return enoSuperset; +return enoMixed; +} + +struct slList *vcfHaploBitsMatching(const struct slList *elA, const struct slList *elB, + void *extra) +// Returns a HaploBits structure representing the common parts of elements A and B. +// Used with elmTreeGrow() to create nodes that are the common parts between leaves/branches. +{ +struct haploBits *a = (struct haploBits *)elA; +struct haploBits *b = (struct haploBits *)elB; +struct lm *lm = extra; + +// If the 2 bitmaps are equal, then collapse into a single struct +if (enoEqual == vcfHaploBitsCmp(elA,elB,NULL,extra)) + { + if (hBitsIsReal(a)) // bitmap represents variants on at least one subject chromosome + { + assert(!hBitsIsReal(b)); // Won't always work but hBits have been precollapsed + if (hBitsIsReal(b)) // Should not be if these have already been collapsed. + { + hBitsCollapsePair(lm,a,b); // a is real, b is real so collapse into one + //haploBitsFreeList(&b); // lm so just abandon + return (struct slList *)a; + } + else + return (struct slList *)a; // a is real, b is not so just use a + } + else if (hBitsIsReal(b)) + return (struct slList *)b; // b is real, a is not so just use b + } + +// Must make non "real" bitmap which is the common bits of a and b +struct haploBits *match; +lmAllocVar(lm,match); +match->alleleSlots = a->alleleSlots; +match->variantSlots = a->variantSlots; +match->haploGenomes = 0; // Marking this as a subset + +int bitCount = hBitsSlotCount(match); +match->bits = lmBitClone(lm,a->bits, bitCount); +bitAnd(match->bits, b->bits, bitCount); +match->bitsOn = bitCountRange(match->bits, 0, bitCount); + +return (struct slList *)match; +}