b0039dec371d7665e25e2e39b0e79d3e33ba50a1 angie Tue Nov 28 13:54:07 2023 -0800 Fix amino acid translation to support transcipts that have UTR portions and skip noncoding transcripts. diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c index 96961ec..3616afa 100644 --- src/hg/hgPhyloPlace/treeToAuspiceJson.c +++ src/hg/hgPhyloPlace/treeToAuspiceJson.c @@ -1,21 +1,22 @@ /* Convert a (sub)tree with condensed nodes to JSON for Nextstrain to display, adding in sample * mutations, protein changes and metadata. */ /* Copyright (C) 2020 The Regents of the University of California */ #include "common.h" +#include "dnaseq.h" #include "errCatch.h" #include "hash.h" #include "hui.h" #include "jsonWrite.h" #include "linefile.h" #include "parsimonyProto.h" #include "phyloPlace.h" #include "phyloTree.h" #include "variantProjector.h" static void auspiceMetaColoringCategoricalStart(struct jsonWrite *jw, char *key, char *title) /* Write key, title and type of a "categorical" coloring spec, but leave it open in case a * scale list needs to be added. */ { @@ -413,107 +414,107 @@ { return (pslOrientation(psl) > 0) ? base : ntCompTable[(int)base]; } static struct slName *codonVpTxToAaChange(struct vpTx *codonVpTxList, struct singleNucChange *ancestorMuts, struct geneInfo *gi) /* Given a list of vpTx from the same codon, combine their changes with inherited mutations * in the same codon to get the amino acid change at this node. * Note: this assumes there is no UTR in transcript, only CDS. True so far for pathogens... */ { struct slName *aaChange = NULL; if (codonVpTxList != NULL) { struct vpTx *vpTx = codonVpTxList; - int firstAaStart = vpTx->start.txOffset / 3; + int firstAaStart = (vpTx->start.txOffset - gi->cds->start) / 3; int codonStart = firstAaStart * 3; - int codonOffset = vpTx->start.txOffset - codonStart; + int codonOffset = vpTx->start.txOffset - gi->cds->start - codonStart; char oldCodon[4]; - safencpy(oldCodon, sizeof oldCodon, gi->txSeq->dna + codonStart, 3); + safencpy(oldCodon, sizeof oldCodon, gi->txSeq->dna + gi->cds->start + codonStart, 3); touppers(oldCodon); boolean isRc = (pslOrientation(gi->psl) < 0); int codonStartG = isRc ? vpTx->start.gOffset + codonOffset : vpTx->start.gOffset - codonOffset; struct singleNucChange *anc; for (anc = ancestorMuts; anc != NULL; anc = anc->next) { int ancCodonOffset = isRc ? codonStartG - (anc->chromStart + 1) : anc->chromStart - codonStartG; if (ancCodonOffset >= 0 && ancCodonOffset < 3) { char parBase = isRc ? ntCompTable[(int)anc->parBase] : anc->parBase; if (parBase != oldCodon[ancCodonOffset]) errAbort("codonVpTxToAaChange: expected parBase for ancestral mutation at %d " "(%s codon %d offset %d) to be '%c' but it's '%c'%s", anc->chromStart+1, gi->psl->qName, firstAaStart, ancCodonOffset, oldCodon[ancCodonOffset], parBase, isRc ? " (rev-comp'd)" : ""); oldCodon[ancCodonOffset] = maybeComplement(anc->newBase, gi->psl); } } char oldAa = lookupCodon(oldCodon); if (oldAa == '\0') oldAa = '*'; char newCodon[4]; safecpy(newCodon, sizeof newCodon, oldCodon); for (vpTx = codonVpTxList; vpTx != NULL; vpTx = vpTx->next) { - int aaStart = vpTx->start.txOffset / 3; + int aaStart = (vpTx->start.txOffset - gi->cds->start) / 3; if (aaStart != firstAaStart) errAbort("codonVpTxToAaChange: program error: firstAaStart %d != aaStart %d", firstAaStart, aaStart); - int codonOffset = vpTx->start.txOffset - codonStart; + int codonOffset = vpTx->start.txOffset - gi->cds->start - codonStart; // vpTx->txRef[0] is always the reference base, not like singleNucChange parBase, // so we can't compare it to expected value as we could for ancMuts above. newCodon[codonOffset] = vpTx->txAlt[0]; } char newAa = lookupCodon(newCodon); if (newAa == '\0') newAa = '*'; if (newAa != oldAa) { char aaChangeStr[32]; safef(aaChangeStr, sizeof aaChangeStr, "%c%d%c", oldAa, firstAaStart+1, newAa); aaChange = slNameNew(aaChangeStr); } } return aaChange; } struct slPair *getAaMutations(struct singleNucChange *sncList, struct singleNucChange *ancestorMuts, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin) -/* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. - * Note: this assumes there is no UTR in transcript, only CDS. True so far for pathogens... */ +/* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. */ { struct slPair *geneChangeList = NULL; struct geneInfo *gi; for (gi = geneInfoList; gi != NULL; gi = gi->next) { struct slName *aaChangeList = NULL; int prevAaStart = -1; struct vpTx *codonVpTxList = NULL; struct singleNucChange *snc; for (snc = sncList; snc != NULL; snc = snc->next) { if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart) { struct bed3 gBed3 = { NULL, gSeqWin->seqName, snc->chromStart, snc->chromStart+1 }; char gAlt[2]; safef(gAlt, sizeof(gAlt), "%c", snc->newBase); struct vpTx *vpTx = vpGenomicToTranscript(gSeqWin, &gBed3, gAlt, gi->psl, gi->txSeq); - if (vpTx->start.region == vpExon) + if (vpTx->start.region == vpExon && vpTx->start.txOffset < gi->cds->end && + vpTx->end.txOffset > gi->cds->start) { - int aaStart = vpTx->start.txOffset / 3; + int aaStart = (vpTx->start.txOffset - gi->cds->start) / 3; // Accumulate vpTxs in the same codon if (aaStart == prevAaStart || prevAaStart == -1) { slAddHead(&codonVpTxList, vpTx); } else { // Done with previous codon; find change from accumulated mutations struct slName *aaChange = codonVpTxToAaChange(codonVpTxList, ancestorMuts, gi); if (aaChange) slAddHead(&aaChangeList, aaChange); slFreeListWithFunc(&codonVpTxList, vpTxFree); codonVpTxList = vpTx; } prevAaStart = aaStart; @@ -792,32 +793,39 @@ char *seq = needMem(txLen+1); int txOffset = 0; for (ex = 0; ex < gp->exonCount; ex++) { int exonLen = gp->exonEnds[ex] - gp->exonStarts[ex]; safencpy(seq+txOffset, txLen+1-txOffset, refGenome->dna+gp->exonStarts[ex], exonLen); txOffset += exonLen; } if (sameString(gp->strand, "-")) reverseComplement(seq, txLen); struct geneInfo *gi; AllocVar(gi); gi->psl = genePredToPsl((struct genePred *)gp, refGenome->size, txLen); gi->psl->qName = cloneString(gp->name2); gi->txSeq = newDnaSeq(seq, txLen, gp->name2); + AllocVar(gi->cds); + genePredToCds((struct genePred *)gp, gi->cds); + int cdsLen = gi->cds->end - gi->cds->start; + // Skip genes with no CDS (like RNA genes) or obviously incomplete/incorrect CDS. + if (cdsLen > 0 && (cdsLen % 3) == 0) + { slAddHead(&geneInfoList, gi); } + } lmCleanup(&lm); bigBedFileClose(&bbi); } slReverse(&geneInfoList); return geneInfoList; } void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin, struct hash *sampleMetadata, struct hash *sampleUrls, struct hash *samplePlacements, char *jsonFile, char *source) /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */ { struct phyloTree *tree = sti->subtree;