c843f30cccc3610e26a26142ad0622f29402254b angie Wed Jan 11 09:54:24 2012 -0800 Bug #6529 (VCF internal rep doesn't do missing data): add flags for missingdata values to vcfInfoElement, so we know the difference between "0" and ".". This applies to both INFO column values and genotype (FORMAT) values. Also, vcfGenotype's hapIxA and hapIxB are now signed, with negative values indicating missing data. User reported bug in MLQ #6462. diff --git src/hg/hgTracks/vcfTrack.c src/hg/hgTracks/vcfTrack.c index 7851f89..8b8bdbe 100644 --- src/hg/hgTracks/vcfTrack.c +++ src/hg/hgTracks/vcfTrack.c @@ -92,50 +92,55 @@ * tiny minor frequencies). */ { struct vcfFile *vcff = record->file; boolean gotInfo = FALSE; double refFreq = 1.0; double maxAltFreq = 0.0; int i; const struct vcfInfoElement *afEl = vcfRecordFindInfo(record, "AF"); const struct vcfInfoDef *afDef = vcfInfoDefForKey(vcff, "AF"); if (afEl != NULL && afDef != NULL && afDef->type == vcfInfoFloat) { // If INFO includes alt allele freqs, use them directly. gotInfo = TRUE; for (i = 0; i < afEl->count; i++) { + if (afEl->missingData[i]) + continue; double altFreq = afEl->values[i].datFloat; refFreq -= altFreq; if (altFreq > maxAltFreq) maxAltFreq = altFreq; } } else { // Calculate alternate allele freqs from AC and AN: const struct vcfInfoElement *acEl = vcfRecordFindInfo(record, "AC"); const struct vcfInfoDef *acDef = vcfInfoDefForKey(vcff, "AC"); const struct vcfInfoElement *anEl = vcfRecordFindInfo(record, "AN"); const struct vcfInfoDef *anDef = vcfInfoDefForKey(vcff, "AN"); if (acEl != NULL && acDef != NULL && acDef->type == vcfInfoInteger && - anEl != NULL && anDef != NULL && anDef->type == vcfInfoInteger && anEl->count == 1) + anEl != NULL && anDef != NULL && anDef->type == vcfInfoInteger && anEl->count == 1 && + anEl->missingData[0] == FALSE) { gotInfo = TRUE; int totalCount = anEl->values[0].datFloat; for (i = 0; i < acEl->count; i++) { + if (acEl->missingData[i]) + continue; int altCount = acEl->values[i].datFloat; double altFreq = (double)altCount / totalCount; refFreq -= altFreq; if (altFreq < maxAltFreq) maxAltFreq = altFreq; } } } if (gotInfo) { double majorAlFreq = max(refFreq, maxAltFreq); if (majorAlFreq > (1.0 - minFreq)) return TRUE; } return FALSE; @@ -218,31 +223,31 @@ double alpha; // weighting factor for distance from center struct lm *localMem; }; // This is the representation of a cluster of up to 65,535 haplotypes of equal length, // where each variant's alleles are specified as 0 (reference) or 1 (alternate) // [or possibly 2 for second alternate, but those are rare so I'll ignore them]. // When an individual is heterozygous and unphased for some variant, we need to // account for missing data. struct hapCluster { struct hapCluster *next; // hacTree wants slList of items unsigned short *refCounts; // per-variant count of reference alleles observed unsigned short *unkCounts; // per-variant count of unknown (or unphased het) alleles unsigned short leafCount; // number of leaves under this node (or 1 if leaf) - unsigned short gtHapIx; // if leaf, (genotype index << 1) + hapIx (0 or 1 for diploid) + unsigned short gtHapIx; // if leaf, (genotype index << 1) + hap (0 or 1 for diploid) }; INLINE boolean isRef(const struct hapCluster *c, int varIx) // Return TRUE if the leaves of cluster have at least as many reference alleles // as alternate alleles for variant varIx. { unsigned short altCount = c->leafCount - c->refCounts[varIx] - c->unkCounts[varIx]; return (c->refCounts[varIx] >= altCount); } INLINE boolean hasUnk(const struct hapCluster *c, int varIx) // Return TRUE if at least one haplotype in this cluster has an unknown/unphased value at varIx. { return (c->unkCounts[varIx] > 0); } @@ -392,45 +397,49 @@ { if (varIx < startIx) continue; int countIx = varIx - startIx; int gtIx; for (gtIx=0; gtIx < gtCount; gtIx++) { struct vcfGenotype *gt = &(rec->genotypes[gtIx]); struct hapCluster *c1 = hapArray[gtIx]; struct hapCluster *c2 = hapArray[gtCount + gtIx]; // hardwired ploidy=2 if (gt->isPhased || gt->isHaploid || (gt->hapIxA == gt->hapIxB)) { // first chromosome: c1->leafCount = 1; c1->gtHapIx = gtIx << 1; - if (gt->hapIxA == 0) + if (gt->hapIxA < 0) + c1->unkCounts[countIx] = 1; + else if (gt->hapIxA == 0) c1->refCounts[countIx] = 1; if (gt->isHaploid) haveHaploid = TRUE; else { c2->leafCount = 1; c2->gtHapIx = (gtIx << 1) | 1; - if (gt->hapIxB == 0) + if (gt->hapIxB < 0) + c2->unkCounts[countIx] = 1; + else if (gt->hapIxB == 0) c2->refCounts[countIx] = 1; } } else { - // Unphased heterozygote, don't use haplotype info for clustering + // Missing data or unphased heterozygote, don't use haplotype info for clustering c1->leafCount = c2->leafCount = 1; c1->gtHapIx = gtIx << 1; c2->gtHapIx = (gtIx << 1) | 1; c1->unkCounts[countIx] = c2->unkCounts[countIx] = 1; } } if (haveHaploid) { // Some array items will have an empty cluster for missing hap2 -- // trim those from the linked list. struct hapCluster *c = hapArray[0]; while (c != NULL && c->next != NULL) { if (c->next->leafCount == 0) c->next = c->next->next; @@ -615,31 +624,33 @@ if (gtHapOrderIxEnd == gtHapOrderIxStart) gtHapOrderIxEnd++; int unks = 0, refs = 0, alts = 0; int gtHapOrderIx; for (gtHapOrderIx = gtHapOrderIxStart; gtHapOrderIx < gtHapOrderIxEnd; gtHapOrderIx++) { int gtHapIx = gtHapOrder[gtHapOrderIx]; int hapIx = gtHapIx & 1; int gtIx = gtHapIx >>1; struct vcfGenotype *gt = &(rec->genotypes[gtIx]); if (!gt->isPhased && gt->hapIxA != gt->hapIxB) unks++; else { int alIx = hapIx ? gt->hapIxB : gt->hapIxA; - if (alIx) + if (alIx < 0) + unks++; + else if (alIx > 0) alts++; else refs++; } } int y = yOff + extraPixel + pixIx; Color col; if (colorMode == baseMode) col = colorByBase(refs, alts, unks, rec->alleles[0], rec->alleles[1]); else if (colorMode == refAltMode) col = colorByRefAlt(refs, alts, unks); else col = colorByAltOnly(refs, alts, unks); if (col != MG_WHITE) hvGfxLine(hvg, x1, y, x2, y, col);