0d3b8feb6074f6caccc2d4aadd020961fbd7eb82 chmalee Wed Jun 10 14:39:38 2020 -0700 Add drag reorder of VCF samples to VCF trio display. Also fixes bug in haplotype sort that I didn't discover until adding drag reorder. Sometimes when sorting haplotypes it is the case that the initial best match to a child allele is the same for each parent. Previously I would just advance the first drawn parent to the next best match but now I also check whether advancing the first drawn is actually the best idea and potentially advance the other parent instead, refs #25582 diff --git src/hg/hgTracks/vcfTrack.c src/hg/hgTracks/vcfTrack.c index d513525..e2d02f6 100644 --- src/hg/hgTracks/vcfTrack.c +++ src/hg/hgTracks/vcfTrack.c @@ -157,31 +157,32 @@ { double majorAlFreq = max(refFreq, maxAltFreq); if (majorAlFreq > (1.0 - minFreq)) return TRUE; } return FALSE; } static void filterRefOnlyAlleles(struct vcfFile *vcff, struct trackDb *tdb) /* Drop items from VCF that don't differ from the reference for any of the * samples specified in trackDb */ { struct vcfRecord *rec, *nextRecord, *retList = NULL; const struct vcfGenotype *gt; -struct slPair *sample, *sampleOrder = vcfPhasedGetSampleOrder(cart, tdb, FALSE); +boolean hideOtherSamples = cartUsualBooleanClosestToHome(cart, tdb, FALSE, VCF_PHASED_HIDE_OTHER_VAR, FALSE); +struct slPair *sample, *sampleOrder = vcfPhasedGetSampleOrder(cart, tdb, FALSE, hideOtherSamples); for (rec = vcff->records; rec != NULL; rec = nextRecord) { nextRecord = rec->next; boolean allRef = TRUE; for (sample = sampleOrder; sample != NULL; sample = sample->next) { gt = vcfRecordFindGenotype(rec, sample->name); if (gt && !(gt->hapIxA == 0 && gt->hapIxB == 0) ) allRef = FALSE; } if (!allRef) slAddHead(&retList, rec); } slReverse(&retList); vcff->records = retList; @@ -220,31 +221,34 @@ unsigned int vcfStart; unsigned int vcfEnd; }; static struct pgSnp *vcfFileToPgSnp(struct vcfFile *vcff, struct trackDb *tdb) /* Convert vcff's records to pgSnp; don't free vcff until you're done with pgSnp * because it contains pointers into vcff's records' chrom. If the trackDb setting * sampleName is present, then check whether all the records are phased or not */ { struct pgSnp *pgsList = NULL; struct vcfRecord *rec; int maxLen = 33; int maxAlCount = 5; struct slPair *sample = NULL, *phasedSamples = NULL; if (sameString(tdb->type, "vcfPhasedTrio")) - phasedSamples = vcfPhasedGetSampleOrder(cart, tdb, FALSE); + { + boolean hideOtherSamples = cartUsualBooleanClosestToHome(cart, tdb, FALSE, VCF_PHASED_HIDE_OTHER_VAR, FALSE); + phasedSamples = vcfPhasedGetSampleOrder(cart, tdb, FALSE, hideOtherSamples); + } vcff->allPhased = TRUE; for (rec = vcff->records; rec != NULL; rec = rec->next) { struct pgSnpVcfStartEnd *psvs = needMem(sizeof(*psvs)); psvs->vcfStart = vcfRecordTrimIndelLeftBase(rec); psvs->vcfEnd = vcfRecordTrimAllelesRight(rec); for (sample = phasedSamples; sample != NULL; sample = sample->next) { const struct vcfGenotype *gt = vcfRecordFindGenotype(rec, sample->name); if (!gt->isPhased) vcff->allPhased = FALSE; } struct pgSnp *pgs = pgSnpFromVcfRecord(rec); memcpy(&(psvs->pgs), pgs, sizeof(*pgs)); @@ -1711,108 +1715,134 @@ tmp->dist = cell->dist; slAddHead(&cellList, tmp); } } } slSort(&cellList, hapDistanceMatrixCellCmp); return cellList; } static unsigned int toggleInt(unsigned int s) /* Add or subtract one */ { return s & 1 ? s - 1 : s + 1; } -static void fillOutHapOrder(unsigned int *hapOrder, unsigned int hapCount, struct hapDistanceMatrixCell *c1, struct hapDistanceMatrixCell *c2) +static void fillOutHapOrder(unsigned int *hapOrder, unsigned int hapCount, struct hapDistanceMatrixCell *c1, struct hapDistanceMatrixCell *c2, char **sampleDrawOrder) /* Assign indices to hapOrder in the order we should draw the alleles. Allows for the second parent * cell to be NULL */ { if (c1 == NULL) errAbort("Error: fillOutHapOrder passed NULL parent"); int numSamplesToDraw = hapCount / 2; +char *childSampleName = c1->otherId; +int childIx = stringArrayIx(childSampleName, sampleDrawOrder, numSamplesToDraw); int i; for (i = 0; i < numSamplesToDraw; i++) { - int hapIx = 2*i; - if (i == 0) // top set of lines + char *thisSample = sampleDrawOrder[i]; + struct hapDistanceMatrixCell *thisCell = c2 != NULL && sameString(c2->sampleId,thisSample) ? c2 : c1; + short hapIx = 2*i; + if (i != childIx) // fill out the parent indices, which may be above or below the child { - hapOrder[hapIx] = toggleInt(c1->alleleIx); - hapOrder[hapIx+1] = c1->alleleIx; - } - else if (i + 1 == numSamplesToDraw) // last set of lines - { - if (c2) + if (i < childIx) { - hapOrder[hapIx] = c2->alleleIx; - hapOrder[hapIx+1] = toggleInt(c2->alleleIx); + hapOrder[hapIx] = toggleInt(thisCell->alleleIx); + hapOrder[hapIx+1] = thisCell->alleleIx; } - else + else // below the child { - hapOrder[hapIx] = c1->otherAlleleIx; - hapOrder[hapIx+1] = toggleInt(c1->otherAlleleIx); + hapOrder[hapIx] = thisCell->alleleIx; + hapOrder[hapIx+1] = toggleInt(thisCell->alleleIx); } } - else // orient based on above and below + else // fill out the child indices { hapOrder[hapIx] = c1->otherAlleleIx; + if (c2) hapOrder[hapIx+1] = c2->otherAlleleIx; + else + hapOrder[hapIx+1] = toggleInt(c1->otherAlleleIx); + } + } } + +static void maybeRollBackCell(struct hapDistanceMatrixCell *c1, struct hapDistanceMatrixCell *c2, + struct hapDistanceMatrixCell *c1Copy, struct hapDistanceMatrixCell *c2Copy) +/* At this point c1->otherAlleleIx != c2->otherAlleleIx, however, it may be the case that c2 has a + * better scoring match to c1->otherAlleleIx, so try to find it and reset the values */ +{ +struct hapDistanceMatrixCell *tmp = c2; +while (tmp->otherAlleleIx != c1->otherAlleleIx) + tmp = tmp->next; +if (tmp->dist < c1->dist) + { + *c2 = *tmp; + *c1 = *c1Copy; } } static void setHapOrderFromMatrix(unsigned int *hapOrder, unsigned int hapCount, struct hapDistanceMatrix *matrix, struct hapCluster **hapArray, struct vcfFile *vcff, char *childName, char **sampleDrawOrder) /* Given a matrix where each row is an allele of the child, and each column * is the distance from the child allele to a parent allele, fill out the order * in which the haplotypes will be drawn. The order are the indexes into rec->genotypes. */ { char *parentNames[(hapCount/2) - 1]; struct hapDistanceMatrixCell *c1 = NULL; struct hapDistanceMatrixCell *c2 = NULL; int i, ix = 0; if (hapCount > 2) // is there a trio or at least part of one? { for (i = 0; i < hapCount/2 ; i++) if (!sameString(sampleDrawOrder[i], childName)) parentNames[ix++] = sampleDrawOrder[i]; // For a parent/child combo, find the most likely set of transmitted // alleles. Depending on the number of variants in the window and the // haplotypes in question, both parents can tie as the person who // contributed a given allele c1 = findClosestChildAlleleToParent(parentNames[0], childName, matrix); c2 = findClosestChildAlleleToParent(parentNames[1], childName, matrix); // NULL if only one parent if (c1 && c2) { if (c1->otherAlleleIx == c2->otherAlleleIx) { - // first choose the lowest score, if a tie then just advance c1 + struct hapDistanceMatrixCell *c1Copy = c1; + struct hapDistanceMatrixCell *c2Copy = c2; if (c1->dist >= c2->dist) { while (c1->dist >= c2->dist && c1->otherAlleleIx == c2->otherAlleleIx) c1 = c1->next; + // at this point c1->otherAlleleIx != c2>otherAlleleIx, BUT c2 may have a better + // scoring match to this allele and we shouldn't have advanced c1 to begin with! + // This case is exercised at the following location (chr1:53896598-53897933) in + // the following file: /gbdb/hg38/1000Genomes/trio/NA12878_1463_CEU/NA12878_1463_CEUTrio.chr1.vcf.gz + // where the child's haplotypes are identical to both of the mother's haplotypes! + maybeRollBackCell(c1, c2, c1Copy, c2Copy); } else { while (c1->dist < c2->dist && c1->otherAlleleIx == c2->otherAlleleIx) c2 = c2->next; + // similar to above case but swap c1 and c2 + maybeRollBackCell(c2, c1, c2Copy, c1Copy); } } } - fillOutHapOrder(hapOrder, hapCount, c1, c2); + fillOutHapOrder(hapOrder, hapCount, c1, c2, sampleDrawOrder); } else { hapOrder[0] = matrix->alleleIx; hapOrder[1] = matrix->next->alleleIx; } } static void assignHapArrayIx(int *ret, struct hapCluster **hapArray, struct vcfFile *vcff, char *sample, boolean doChild) { int i; struct vcfRecord *rec = vcff->records; int ix = 0; // index into ret for (i = 0; i < slCount(hapArray); i++) { @@ -2104,72 +2134,78 @@ vcfPhasedAddLabel(track, hvg, label->string, yOff, round(((y1 + y2) / 2) - (track->lineHeight / 2)), font, MG_BLACK); } } static void vcfPhasedDrawItems(struct track *track, int seqStart, int seqEnd, struct hvGfx *hvg, int xOff, int yOff, int width, MgFont *font, Color color, enum trackVisibility vis) /* Split samples' chromosomes (haplotypes), cluster them by parents, and * draw them all along a line representing each chromosome*/ { struct vcfFile *vcff = track->extraUiData; if (vcff->records == NULL) return; const double scale = scaleForPixels(width); -struct slPair *pair, *sampleNames = vcfPhasedGetSampleOrder(cart, track->tdb, FALSE); +boolean hideOtherSamples = cartUsualBooleanClosestToHome(cart, track->tdb, FALSE, VCF_PHASED_HIDE_OTHER_VAR, FALSE); +struct slPair *pair, *sampleNames = vcfPhasedGetSampleOrder(cart, track->tdb, FALSE, hideOtherSamples); int gtCount = slCount(sampleNames); int yOffsets[gtCount * 2]; // y offsets of each haplotype line char *sampleOrder[gtCount]; // order of sampleName lines int i; for (pair = sampleNames, i = 0; pair != NULL && i < gtCount; pair = pair->next, i++) sampleOrder[i] = pair->name; -// child sample is required to be in the middle/bottom of the trio -char *childSample = gtCount > 2 ? sampleOrder[1] : sampleOrder[0]; + +char *childSample = cloneString(trackDbSetting(track->tdb, VCF_PHASED_CHILD_SAMPLE_SETTING)); +char *pt = strchr(childSample, '|'); +if (pt != NULL) + *pt = '\0'; // set up the "haplotype" lines and the transparent yellow box to delineate samples vcfPhasedSetupHaplotypesLines(track, hvg, xOff, yOff, width, yOffsets, sampleNames, font); -// sort the variants by haplotype then draw ticks +// maybe sort the variants by haplotype then draw ticks +unsigned int *hapOrder = needMem(sizeof(short) * gtCount * 2); int nRecords = slCount(vcff->records); int centerIx = getCenterVariantIx(track, seqStart, seqEnd, vcff->records); int startIx = 0; int endIx = nRecords; -unsigned int *hapOrder = computeHapDist(vcff, centerIx, startIx, endIx, childSample, gtCount, sampleOrder); +hapOrder = computeHapDist(vcff, centerIx, startIx, endIx, childSample, gtCount, sampleOrder); boolean highlightChildDiffs = cartUsualBooleanClosestToHome(cart, track->tdb, FALSE, VCF_PHASED_HIGHLIGHT_INCONSISTENT, FALSE); struct vcfRecord *rec = NULL; struct slList *item = NULL; for (rec = vcff->records, item = track->items; rec != NULL && item != NULL; rec = rec->next, item = item->next) { vcfPhasedDrawOneRecord(track, hvg, rec, item, hapOrder, gtCount * 2, xOff, yOffsets, sampleOrder, childSample, highlightChildDiffs, scale); } } static int vcfPhasedItemHeight(struct track *tg, void *item) { return tg->heightPer; } int vcfPhasedTrackHeight(struct track *tg, enum trackVisibility vis) { const struct vcfFile *vcff = tg->extraUiData; // when doing composite track height, vcfPhasedLoadItems won't have been called yet! if (vis == tvDense) return pgSnpHeight(tg, vis); if (!vcff || vcff->records == NULL) return 0; -int totalSamples = slCount(vcfPhasedGetSampleOrder(cart, tg->tdb, FALSE)); +boolean hideOtherSamples = cartUsualBooleanClosestToHome(cart, tg->tdb, FALSE, VCF_PHASED_HIDE_OTHER_VAR, FALSE); +int totalSamples = slCount(vcfPhasedGetSampleOrder(cart, tg->tdb, FALSE, hideOtherSamples)); tg->lineHeight = tl.fontHeight + 1; tg->heightPer = tl.fontHeight; // if all variants in view are phased, then 3 lines per sample, // else 4 lines. The extra 2 is for clear separation int heightPerSample; if (vcff->allPhased) heightPerSample = (3 * tg->lineHeight) + 2; else heightPerSample = (4 * tg->lineHeight) + 2; tg->height = totalSamples * heightPerSample; tg->itemHeight = vcfPhasedItemHeight; // custom int is reserved for doing pgSnp coloring but as far as I can tell is // never actually used? tg->customInt = totalSamples; return tg->height;