380a1b308bd3bb4f4e52d89ef9e1ccb962892bab angie Tue Oct 3 14:10:37 2017 -0700 Major changes to annoGratorGpVar, annoFormatVep and gpFx.c with the addition of functional effect prediction to variantProjector using PSL+CDS from annoStreamDbPslPlus, which enables accurate predictions even when the genome and transcript have indel differences. struct gpFx includes new members exonCount, txRef and txAlt so that gpFx and variantProjector can compute those and send them forward to annoFormatVep, instead of annoFormatVep computing them assuming that genome and transcript match perfectly. annoGratorGpVar passes forward the new gpFx members in output columns and, when input is PSL+CDS instead of genePred, uses variantProjector instead of gpFx to do functional predictions. diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index fd6ad07..7f70293 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -2454,30 +2454,31 @@ { dyStringPrintf(dy, "delins%s", altAbbr); } } } char *hgvsGFromVariant(struct seqWindow *gSeqWin, struct bed3 *variantBed, char *alt, char *acc, boolean breakDelIns) /* Return an HGVS g. string representing the genomic variant at the position of variantBed with * reference allele from gSeqWin and alternate allele alt. 3'-shift indels if applicable. * If acc is non-NULL it is used instead of variantBed->chrom. * If breakDelIns, then show deleted bases (eg show 'delAGinsTT' instead of 'delinsTT'). */ { struct dyString *dy = dyStringCreate("%s:g.", acc ? acc : variantBed->chrom); uint vStart = variantBed->chromStart, vEnd = variantBed->chromEnd; +assert (vEnd >= vStart); gSeqWin->fetch(gSeqWin, variantBed->chrom, vStart, vEnd); int refLen = vEnd - vStart; char ref[refLen+1]; seqWindowCopy(gSeqWin, vStart, refLen, ref, sizeof(ref)); int altLen = strlen(alt); char altCpy[altLen+1]; safecpy(altCpy, sizeof(altCpy), alt); if (differentString(ref, altCpy)) // Trim identical bases from start and end -- unless this is an assertion that there is // no change, in which case it's good to keep the range on which that assertion was made. trimRefAlt(ref, altCpy, &vStart, &vEnd, &refLen, &altLen); if (indelShiftIsApplicable(refLen, altLen) && indelShift(gSeqWin, &vStart, &vEnd, altCpy, INDEL_SHIFT_NO_MAX, isdRight)) // update ref seqWindowCopy(gSeqWin, vStart, refLen, ref, sizeof(ref)); @@ -2705,80 +2706,69 @@ else { exonAnchor = txPos->intron3TxOffset; direction = '-'; intronOffset = txPos->intron3Distance + closedEnd; anchorIsStart = TRUE; } exonAnchor = cds ? hgvsTxToCds(exonAnchor, cds, anchorIsStart, cdsPrefix) : exonAnchor + (anchorIsStart ? oneBased : 0); dyStringPrintf(dy, "%s%u%c%u", cdsPrefix, exonAnchor, direction, intronOffset); } else errAbort("appendHgvsNucPos: unrecognized vpTxRegion value %d", txPos->region); } -static char *refFromVpTx(struct vpTx *vpTx) -/* If vpTx->txRef is non-NULL and both start & end are exonic, return txRef; - * otherwise return genomic. For example, if a deletion spans exon/intron boundary, use genomic - * ref because it includes the intron bases. Do not free the returned value. */ -{ -if (vpTx->txRef != NULL && - vpTx->start.region == vpExon && vpTx->end.region == vpExon) - return vpTx->txRef; -return vpTx->gRef; -} - char *hgvsNFromVpTx(struct vpTx *vpTx, struct seqWindow *gSeqWin, struct psl *txAli, struct dnaSeq *txSeq, boolean breakDelIns) /* Return an HGVS n. (noncoding transcript) term for a variant projected onto a transcript. * gSeqWin must already have at least the correct seqName if not the surrounding sequence. * If breakDelIns, then show deleted bases (eg show 'delAGinsTT' instead of 'delinsTT'). */ { struct dyString *dy = dyStringCreate("%s:n.", vpTx->txName); // Make local copies of vpTx->{start,end} -- we may need to modify them for HGVS ins/dup. struct vpTxPosition startPos = vpTx->start, endPos = vpTx->end; int dupLen = tweakInsDup(&startPos, &endPos, vpTx->txAlt, gSeqWin, txAli, txSeq); appendHgvsNucPos(dy, &startPos, TRUE, NULL); if (!vpTxPosRangeIsSingleBase(&startPos, &endPos)) { dyStringAppendC(dy, '_'); appendHgvsNucPos(dy, &endPos, FALSE, NULL); } -char *ref = refFromVpTx(vpTx); +char *ref = vpTxGetRef(vpTx); hgvsAppendChangesFromNucRefAlt(dy, ref, vpTx->txAlt, dupLen, breakDelIns); return dyStringCannibalize(&dy); } char *hgvsCFromVpTx(struct vpTx *vpTx, struct seqWindow *gSeqWin, struct psl *txAli, struct genbankCds *cds, struct dnaSeq *txSeq, boolean breakDelIns) /* Return an HGVS c. (coding transcript) term for a variant projected onto a transcript w/cds. * gSeqWin must already have at least the correct seqName if not the surrounding sequence. * If breakDelIns, then show deleted bases (eg show 'delAGinsTT' instead of 'delinsTT'). */ { struct dyString *dy = dyStringCreate("%s:c.", vpTx->txName); // Make local copies of vpTx->{start,end} -- we may need to modify them for HGVS ins/dup. struct vpTxPosition startPos = vpTx->start, endPos = vpTx->end; int dupLen = tweakInsDup(&startPos, &endPos, vpTx->txAlt, gSeqWin, txAli, txSeq); appendHgvsNucPos(dy, &startPos, TRUE, cds); if (!vpTxPosRangeIsSingleBase(&startPos, &endPos)) { dyStringAppendC(dy, '_'); appendHgvsNucPos(dy, &endPos, FALSE, cds); } -char *ref = refFromVpTx(vpTx); +char *ref = vpTxGetRef(vpTx); hgvsAppendChangesFromNucRefAlt(dy, ref, vpTx->txAlt, dupLen, breakDelIns); return dyStringCannibalize(&dy); } static boolean isStartLoss(struct vpPep *vpPep) /* Return TRUE if vpPep shows that the start codon has been lost. */ { return (vpPep->start == 0 && isNotEmpty(vpPep->ref) && vpPep->ref[0] == 'M' && (isEmpty(vpPep->alt) || vpPep->alt[0] != 'M')); } char *hgvsPFromVpPep(struct vpPep *vpPep, struct dnaSeq *protSeq, boolean addParens) /* Return an HGVS p. (protein) term for a variant projected into protein space. * Strict HGVS compliance requires parentheses around predicted protein changes, but