15ab88298277f4c822a77f667aa4d2f7def37b3c tdreszer Tue Apr 23 13:35:05 2013 -0700 Changes in preparation for checking in haplotypes code. diff --git src/lib/vcfBits.c src/lib/vcfBits.c index dbe6736..0ed67b9 100644 --- src/lib/vcfBits.c +++ src/lib/vcfBits.c @@ -30,35 +30,60 @@ if (record->genotypes == NULL) vcfParseGenotypes(record); assert(vcff->genotypeCount > 0); int ix = 0; // allocate vBits struct struct variantBits *vBits; struct lm *lm = vcfFileLm(vcff); lmAllocVar(lm,vBits); vBits->genotypeSlots = vcff->genotypeCount; vBits->record = record; vBits->haplotypeSlots = 2; // assume diploid, but then check for complete haploid if (homozygousOnly) vBits->haplotypeSlots = 1; else - { // spin through all the subjects to see if any are non-haploid + { // spin through all the subjects to see if all are haploid for (ix = 0; ix < vcff->genotypeCount && record->genotypes[ix].isHaploid;ix++) ; if (ix == vcff->genotypeCount) - vBits->haplotypeSlots = 1; + vBits->haplotypeSlots = 1; // All are haploid: chrY + assert(vBits->haplotypeSlots == 2 || record->chrom[strlen(record->chrom) - 1] == 'Y'); + } + +// Type of variant? +assert(record->alleles != NULL); +if(record->alleles != NULL) + { + int nonRefLen = 0; + assert(record->alleles[0] != NULL); + int refLen = record->alleles[0] ? strlen(record->alleles[0]) : 0; + for (ix = 1; ix < record->alleleCount; ix++) + { + assert(record->alleles[ix] != NULL); + int thisLen = record->alleles[ix] && differentWord(record->alleles[ix],"<DEL>") ? + strlen(record->alleles[ix]) : 0; + if (nonRefLen < thisLen) + nonRefLen = thisLen; + } + if (refLen == 1 && nonRefLen == 1) + vBits->vType = vtSNP; + else if (refLen < nonRefLen) + vBits->vType = vtInsertion; + else if (refLen > nonRefLen) + vBits->vType = vtDeletion; + // else unknown } // allocate bits assert(record->alleleCount < 4); vBits->alleleSlots = (record->alleleCount <= 2 ? 1 : 2); int bitCount = vBitsSlotCount(vBits); Bits *bits = lmBitAlloc(lm,bitCount); vBits->bits = bits; // walk through genotypes and set the bit for (ix = 0; ix < vcff->genotypeCount;ix++) { struct vcfGenotype *gt = gt = &(record->genotypes[ix]); @@ -101,77 +126,268 @@ // parse genotypes if needed if (record->genotypes == NULL) vcfParseGenotypes(record); // walk through genotypes and set the bit int ix = 0; for ( ; ix < vcff->genotypeCount;ix++) { if (record->genotypes[ix].isPhased == FALSE && record->genotypes[ix].isHaploid == FALSE) bitSetOne(bits,ix); } return bits; } +Bits *vcfRecordHaploidBits(struct vcfFile *vcff, struct vcfRecord *record) +// Returns array (1 bit per genotype) for each haploid genotype. +// This is useful for interpreting chrX. +{ +// allocate bits +Bits *bits = lmBitAlloc(vcfFileLm(vcff),vcff->genotypeCount); + +// parse genotypes if needed +if (record->genotypes == NULL) + vcfParseGenotypes(record); + +// walk through genotypes and set the bit +int ix = 0; +for ( ; ix < vcff->genotypeCount;ix++) + { + if (record->genotypes[ix].isHaploid) + bitSetOne(bits,ix); + } +return bits; +} + +int vBitsHaploidCount(struct variantBits *vBits) +// Returns the number of subjects in the VCF dataset that are haploid at this location +{ +int haploid = 0; // Most are not haploid + +char *chrom = vBits->record->chrom; +if (chrom[strlen(chrom) - 1] == 'Y') + haploid = vcfBitsSubjectCount(vBits); +else if (chrom[strlen(chrom) - 1] == 'X') // chrX is tricky: males are haploid except in PAR + { + struct vcfRecord *record = vBits->record; + struct vcfFile *vcff = record->file; + + // parse genotypes if needed + if (record->genotypes == NULL) + vcfParseGenotypes(record); + + // walk through genotypes + int ix = 0; + for ( ; ix < vcff->genotypeCount;ix++) + { + if (record->genotypes[ix].isHaploid) + haploid++; + } + } +return haploid; +} + +int vBitsSubjectChromCount(struct variantBits *vBits) +// Returns the chromosomes in the VCF dataset that are covered at this location +{ +int haploid = vBitsHaploidCount(vBits); +int diploid = vcfBitsSubjectCount(vBits) - haploid; +return ((diploid * 2) + haploid); +} + struct variantBits *vcfRecordsToVariantBits(struct vcfFile *vcff, struct vcfRecord *records, boolean phasedOnly, boolean homozygousOnly, boolean unphasedBits) // Returns list of bit arrays covering all genotypes/haplotype/alleles per record for each record // provided. If records is NULL will use vcff->records. Bit map has one slot per genotype // containing 2 slots for haplotypes and 1 or 2 bits per allele. The normal (simple) case of // 1 reference and 1 alternate allele results in 1 allele bit with 0:ref. Two or three alt alleles // is represented by two bits per allele (>3 non-reference alleles unsupported). // If phasedOnly, unphased haplotype bits will be set only if both agree (00 is uninterpretable) // Haploid genotypes (e.g. chrY) and homozygousOnly bitmaps contain 1 haplotype slot. // If unphasedBits, then vBits->unphased will contain a bitmap with 1s for all unphased genotypes. // NOTE: allocated from vcff pool, so closing file or flushing reusePool will invalidate this. { struct variantBits *vBitsList = NULL; struct vcfRecord *record = records ? records : vcff->records; for (;record != NULL; record = record->next) { struct variantBits *vBits = vcfOneRecordToVariantBits(vcff, record, phasedOnly, homozygousOnly); if (unphasedBits) vBits->unphased = vcfRecordUnphasedBits(vcff, record); slAddHead(&vBitsList,vBits); } slReverse(&vBitsList); return vBitsList; } -int vcfVariantBitsDropSparse(struct variantBits **vBitsList, int haploGenomeMin) -// Drops vBits found in less than a minimum number of haplotype genomes. +int vcfVariantBitsDropSparse(struct variantBits **vBitsList, int haploGenomeMin, + boolean dropRefErrors) +// Drops vBits found in less than a minimum number of haplotype genomes. Supplying 1 will +// drop variants found in no haplotype genomes. Declaring dropRefErrors will drop variants +// in all haplotype genomes (variants where reference is wrong). // Returns count of vBits structure that were dropped. { struct variantBits *vBitsKeep = NULL; struct variantBits *vBits = NULL; int dropped = 0; while ((vBits = slPopHead(vBitsList)) != NULL) { if (vBits->bitsOn >= haploGenomeMin) slAddHead(&vBitsKeep,vBits); + else + dropped++; // if not demoted, drop memory, since this is on lm + } +*vBitsList = vBitsKeep; + +if (dropRefErrors) + { + vBitsKeep = NULL; + while ((vBits = slPopHead(vBitsList)) != NULL) + { + boolean foundRef = FALSE; + int genoIx = 0; + for (; !foundRef && genoIx < vBits->genotypeSlots; genoIx++) + { + int haploIx = 0; + for (; !foundRef && haploIx < vBits->haplotypeSlots; haploIx++) + { + if (bitCountRange(vBits->bits, + vBitsSlot(vBits,genoIx,haploIx,0), vBits->alleleSlots) == 0) + foundRef = TRUE; + } + } + + if (foundRef) + slAddHead(&vBitsKeep,vBits); else // drop memory, since this is on lm - dropped++; + dropped++; // if not demoted, drop memory, since this is on lm } -slReverse(&vBitsKeep); *vBitsList = vBitsKeep; + } +else + slReverse(vBitsList); + return dropped; } +int vcfVariantBitsAlleleOccurs(struct variantBits *vBits, unsigned char alleleIx, + boolean homozygousOrHaploid) +// Counts the number of times a particular allele occurs in the subjects*chroms covered. +// If homozygousOrHaploid then the allele must occur in both chroms to be counted +{ +assert((vBits->alleleSlots == 1 && alleleIx <= 1) + || (vBits->alleleSlots == 2 && alleleIx <= 3)); + +// To count ref, we need to know the number of subjects the bitmap covers +// chrX is the culprit here since it is haploid/diploid +int subjTotal = vBits->genotypeSlots * vBits->haplotypeSlots; +if (alleleIx == 0) + subjTotal = vBitsSubjectChromCount(vBits); + +// Simple case can be taken care of easily +if (vBits->alleleSlots == 1 +&& (!homozygousOrHaploid || vBits->haplotypeSlots == 1)) + { + if (alleleIx == 1) // non-reference + return vBits->bitsOn; + else // reference + return subjTotal - vBits->bitsOn; + } + +// Otherwise, we have to walk the bitmap +int occurs = 0; +int occursHomozygous = 0; +int genoIx = 0; +for (; genoIx < vBits->genotypeSlots; genoIx++) + { + int haploIx = 0; + int occursThisSubject = 0; + for (; haploIx < vBits->haplotypeSlots; haploIx++) + { + int bitSlot = vBitsSlot(vBits,genoIx,haploIx,0); + switch (alleleIx) + { + case 0: if (bitCountRange(vBits->bits,bitSlot, vBits->alleleSlots) != 0) + occursThisSubject++; // Any non-reference bit + break; + case 1: if (vBits->alleleSlots == 1) + { + if (bitReadOne(vBits->bits,bitSlot)) + occursThisSubject++; + } + else + { + if ( bitReadOne(vBits->bits,bitSlot) + && !bitReadOne(vBits->bits,bitSlot + 1) ) + occursThisSubject++; + } + break; + case 2: if (!bitReadOne(vBits->bits,bitSlot) + && bitReadOne(vBits->bits,bitSlot + 1) ) + occursThisSubject++; + break; + case 3: if (bitCountRange(vBits->bits,bitSlot, 2) == 2) + occursThisSubject++; + break; + default: + break; + } + } + occurs += occursThisSubject; + if ((alleleIx == 0 && occursThisSubject == 0) + || (alleleIx > 0 && vBits->haplotypeSlots == occursThisSubject)) + occursHomozygous++; + } +if (alleleIx == 0) + occurs = subjTotal - occurs; + +return (homozygousOrHaploid ? occursHomozygous : occurs); +} + +int vcfVariantBitsReferenceOccurs(struct vcfFile *vcff, struct variantBits *vBitsList, + boolean homozygousOrHaploid) +// For an entire list of vBits structs, counts the times the reference allele occurs. +// If homozygousOrHaploid then the reference allele must occur in both chroms to be counted +{ +assert(vBitsList != NULL); +struct variantBits *vBits = vBitsList; +// Allocate temporary vBits struct which can hold the 'OR' version of all bitmaps in list +struct lm *lm = vcfFileLm(vcff); +struct variantBits *vBitsTemp; +lmAllocVar(lm,vBitsTemp); +vBitsTemp->genotypeSlots = vBits->genotypeSlots; +vBitsTemp->haplotypeSlots = vBits->haplotypeSlots; +vBitsTemp->alleleSlots = vBits->alleleSlots; +vBitsTemp->record = vBits->record; + +int bitSlots = vBitsSlotCount(vBits); +vBitsTemp->bits = lmBitClone(lm,vBits->bits, bitSlots); + + +for (vBits = vBits->next; vBits != NULL; vBits = vBits->next) + bitOr(vBitsTemp->bits, vBits->bits, bitSlots); + +vBitsTemp->bitsOn = bitCountRange(vBitsTemp->bits,0,bitSlots); + +// Note the memory will be leaked but it is lm +return vcfVariantBitsAlleleOccurs(vBitsTemp, 0, homozygousOrHaploid); +} + int vcfVariantMostPopularCmp(const void *va, const void *vb) // Compare to sort variantBits based upon how many genomes/chrom has the variant // This can be used to build haploBits in most populous order for tree building { const struct variantBits *a = *((struct variantBits **)va); const struct variantBits *b = *((struct variantBits **)vb); //int ret = memcmp(a->bits, b->bits, bitToByteSize(vBitsSlotCount(a))); int ret = (b->bitsOn - a->bitsOn); // higher bits first (descending order) if (ret == 0) { // sort deterministically ret = (a->record->chromStart - b->record->chromStart); if (ret == 0) ret = (a->record->chromEnd - b->record->chromEnd); } return ret; @@ -226,76 +442,82 @@ if (bitReadOne(vBits->bits,vBitsSlot(vBits,genoIx,haploIx,0))) { bitSetOne( bits,hBitsSlot(hBits,variantIx, 0)); hBits->bitsOn++; } if (vBits->alleleSlots == 2) // (hBit->alleleSlot will also == 2) { if (bitReadOne(vBits->bits,vBitsSlot(vBits,genoIx,haploIx,1))) { bitSetOne( bits,hBitsSlot(hBits,variantIx, 1)); hBits->bitsOn++; } } } - if (!ignoreReference || hBits->bitsOn > 0) // gonna keep it so flesh it out + + // gonna keep it? so flesh it out (ignoreRef means > 0 && < all) + if (!ignoreReference || hBits->bitsOn) { hBits->genomeIx = genoIx; hBits->haploidIx = haploIx; struct vcfRecord *record = vBitsList->record; // any will do struct vcfGenotype *gt = &(record->genotypes[genoIx]); if (gt->isHaploid || vBitsList->haplotypeSlots == 1) { // chrX will have haplotypeSlots==2 but be haploid for this subject. // Meanwhile if vBits were for homozygous only, haplotypeSlots==1 //assert(haploIx == 0); hBits->ids = lmCloneString(lm,gt->id); } else { int sz = strlen(gt->id) + 3; hBits->ids = lmAlloc(lm,sz); safef(hBits->ids,sz,"%s-%c",gt->id,'a' + haploIx); } slAddHead(&hBitsList,hBits); } //else // haploBitsFree(&hBits); // lm so just abandon } } slReverse(&hBitsList); return hBitsList; } +/* int vcfHaploBitsOnCmp(const void *va, const void *vb) // Compare to sort haploBits based upon how many bits are on { const struct haploBits *a = *((struct haploBits **)va); const struct haploBits *b = *((struct haploBits **)vb); int ret = (a->bitsOn - b->bitsOn); if (ret == 0) { // sort deterministically ret = (a->genomeIx - b->genomeIx); if (ret == 0) ret = (a->haploidIx - b->haploidIx); } return ret; } +*/ -int vcfHaploMemCmp(const void *va, const void *vb) -// Compare to sort haploBits based upon how many bits are on +int vcfHaploBitsMemCmp(const void *va, const void *vb) +// Compare to sort haploBits based on bit by bit memory compare +// When hBits has been created from vBits sorted by popularity, +// a mem sort will then allow elmTreeGrow() to add popular variants first. { const struct haploBits *a = *((struct haploBits **)va); const struct haploBits *b = *((struct haploBits **)vb); int ret = memcmp(a->bits, b->bits, bitToByteSize(hBitsSlotCount(a))); if (ret == 0) { // sort deterministically ret = (a->genomeIx - b->genomeIx); if (ret == 0) ret = (a->haploidIx - b->haploidIx); } return ret; } static struct haploBits *hBitsCollapsePair(struct lm *lm, struct haploBits *hBits, struct haploBits *dupMap) @@ -313,31 +535,31 @@ int vcfHaploBitsListCollapseIdentical(struct vcfFile *vcff, struct haploBits **phBitsList, int haploGenomeMin) // Collapses a list of haploBits based upon identical bit arrays. // If haploGenomeMin > 1, will drop all hBits structs covering less than N haploGenomes. // Returns count of hBit structs removed. { struct lm *lm = vcfFileLm(vcff); int collapsed = 0; struct haploBits *hBitsPassed = NULL; struct haploBits *hBitsToTest = *phBitsList; if (hBitsToTest->next == NULL) return 0; // Sort early bits first -slSort(&hBitsToTest, vcfHaploMemCmp); +slSort(&hBitsToTest, vcfHaploBitsMemCmp); // Walk through those of matching size and compare int bitCount = hBitsSlotCount(hBitsToTest); int memSize = bitToByteSize(bitCount); struct haploBits *hBitsQ = NULL; while ((hBitsQ = slPopHead(&hBitsToTest)) != NULL) { struct haploBits *hBitsT = hBitsToTest; //hBitsQ->next; is already null. for (;hBitsT != NULL;hBitsT = hBitsT->next) { // If the 2 bitmaps do not have the same number of bits on, they are not identical if (hBitsT->bitsOn != hBitsQ->bitsOn) break; // If 2 bitmaps are identical, then collapse into one