b0039dec371d7665e25e2e39b0e79d3e33ba50a1
angie
  Tue Nov 28 13:54:07 2023 -0800
Fix amino acid translation to support transcipts that have UTR portions and skip noncoding transcripts.

diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c
index 96961ec..3616afa 100644
--- src/hg/hgPhyloPlace/treeToAuspiceJson.c
+++ src/hg/hgPhyloPlace/treeToAuspiceJson.c
@@ -1,21 +1,22 @@
 /* Convert a (sub)tree with condensed nodes to JSON for Nextstrain to display, adding in sample
  * mutations, protein changes and metadata. */
 
 /* Copyright (C) 2020 The Regents of the University of California */
 
 #include "common.h"
+#include "dnaseq.h"
 #include "errCatch.h"
 #include "hash.h"
 #include "hui.h"
 #include "jsonWrite.h"
 #include "linefile.h"
 #include "parsimonyProto.h"
 #include "phyloPlace.h"
 #include "phyloTree.h"
 #include "variantProjector.h"
 
 
 static void auspiceMetaColoringCategoricalStart(struct jsonWrite *jw, char *key, char *title)
 /* Write key, title and type of a "categorical" coloring spec, but leave it open in case a
  * scale list needs to be added. */
 {
@@ -413,107 +414,107 @@
 {
 return (pslOrientation(psl) > 0) ? base : ntCompTable[(int)base];
 }
 
 static struct slName *codonVpTxToAaChange(struct vpTx *codonVpTxList,
                                           struct singleNucChange *ancestorMuts,
                                           struct geneInfo *gi)
 /* Given a list of vpTx from the same codon, combine their changes with inherited mutations
  * in the same codon to get the amino acid change at this node.
  * Note: this assumes there is no UTR in transcript, only CDS.  True so far for pathogens... */
 {
 struct slName *aaChange = NULL;
 if (codonVpTxList != NULL)
     {
     struct vpTx *vpTx = codonVpTxList;
-    int firstAaStart = vpTx->start.txOffset / 3;
+    int firstAaStart = (vpTx->start.txOffset - gi->cds->start) / 3;
     int codonStart = firstAaStart * 3;
-    int codonOffset = vpTx->start.txOffset - codonStart;
+    int codonOffset = vpTx->start.txOffset - gi->cds->start - codonStart;
     char oldCodon[4];
-    safencpy(oldCodon, sizeof oldCodon, gi->txSeq->dna + codonStart, 3);
+    safencpy(oldCodon, sizeof oldCodon, gi->txSeq->dna + gi->cds->start + codonStart, 3);
     touppers(oldCodon);
     boolean isRc = (pslOrientation(gi->psl) < 0);
     int codonStartG = isRc ? vpTx->start.gOffset + codonOffset : vpTx->start.gOffset - codonOffset;
     struct singleNucChange *anc;
     for (anc = ancestorMuts;  anc != NULL;  anc = anc->next)
         {
         int ancCodonOffset = isRc ? codonStartG - (anc->chromStart + 1) : anc->chromStart - codonStartG;
         if (ancCodonOffset >= 0 && ancCodonOffset < 3)
             {
             char parBase = isRc ? ntCompTable[(int)anc->parBase] : anc->parBase;
             if (parBase != oldCodon[ancCodonOffset])
                 errAbort("codonVpTxToAaChange: expected parBase for ancestral mutation at %d "
                          "(%s codon %d offset %d) to be '%c' but it's '%c'%s",
                          anc->chromStart+1, gi->psl->qName, firstAaStart, ancCodonOffset,
                          oldCodon[ancCodonOffset], parBase,
                          isRc ? " (rev-comp'd)" : "");
             oldCodon[ancCodonOffset] = maybeComplement(anc->newBase, gi->psl);
             }
         }
     char oldAa = lookupCodon(oldCodon);
     if (oldAa == '\0')
         oldAa = '*';
     char newCodon[4];
     safecpy(newCodon, sizeof newCodon, oldCodon);
     for (vpTx = codonVpTxList;  vpTx != NULL;  vpTx = vpTx->next)
         {
-        int aaStart = vpTx->start.txOffset / 3;
+        int aaStart = (vpTx->start.txOffset - gi->cds->start) / 3;
         if (aaStart != firstAaStart)
             errAbort("codonVpTxToAaChange: program error: firstAaStart %d != aaStart %d",
                      firstAaStart, aaStart);
-        int codonOffset = vpTx->start.txOffset - codonStart;
+        int codonOffset = vpTx->start.txOffset - gi->cds->start - codonStart;
         // vpTx->txRef[0] is always the reference base, not like singleNucChange parBase,
         // so we can't compare it to expected value as we could for ancMuts above.
         newCodon[codonOffset] = vpTx->txAlt[0];
         }
     char newAa = lookupCodon(newCodon);
     if (newAa == '\0')
         newAa = '*';
     if (newAa != oldAa)
         {
         char aaChangeStr[32];
         safef(aaChangeStr, sizeof aaChangeStr, "%c%d%c", oldAa, firstAaStart+1, newAa);
         aaChange = slNameNew(aaChangeStr);
         }
     }
 return aaChange;
 }
 
 struct slPair *getAaMutations(struct singleNucChange *sncList, struct singleNucChange *ancestorMuts,
                               struct geneInfo *geneInfoList, struct seqWindow *gSeqWin)
-/* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }.
- * Note: this assumes there is no UTR in transcript, only CDS.  True so far for pathogens... */
+/* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. */
 {
 struct slPair *geneChangeList = NULL;
 struct geneInfo *gi;
 for (gi = geneInfoList;  gi != NULL;  gi = gi->next)
     {
     struct slName *aaChangeList = NULL;
     int prevAaStart = -1;
     struct vpTx *codonVpTxList = NULL;
     struct singleNucChange *snc;
     for (snc = sncList;  snc != NULL;  snc = snc->next)
         {
         if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart)
             {
             struct bed3 gBed3 = { NULL, gSeqWin->seqName, snc->chromStart, snc->chromStart+1 };
             char gAlt[2];
             safef(gAlt, sizeof(gAlt), "%c", snc->newBase);
             struct vpTx *vpTx = vpGenomicToTranscript(gSeqWin, &gBed3, gAlt, gi->psl, gi->txSeq);
-            if (vpTx->start.region == vpExon)
+            if (vpTx->start.region == vpExon && vpTx->start.txOffset < gi->cds->end &&
+                vpTx->end.txOffset > gi->cds->start)
                 {
-                int aaStart = vpTx->start.txOffset / 3;
+                int aaStart = (vpTx->start.txOffset - gi->cds->start) / 3;
                 // Accumulate vpTxs in the same codon
                 if (aaStart == prevAaStart || prevAaStart == -1)
                     {
                     slAddHead(&codonVpTxList, vpTx);
                     }
                 else
                     {
                     // Done with previous codon; find change from accumulated mutations
                     struct slName *aaChange = codonVpTxToAaChange(codonVpTxList, ancestorMuts, gi);
                     if (aaChange)
                         slAddHead(&aaChangeList, aaChange);
                     slFreeListWithFunc(&codonVpTxList, vpTxFree);
                     codonVpTxList = vpTx;
                     }
                 prevAaStart = aaStart;
@@ -792,32 +793,39 @@
         char *seq = needMem(txLen+1);
         int txOffset = 0;
         for (ex = 0;  ex < gp->exonCount;  ex++)
             {
             int exonLen = gp->exonEnds[ex] - gp->exonStarts[ex];
             safencpy(seq+txOffset, txLen+1-txOffset, refGenome->dna+gp->exonStarts[ex], exonLen);
             txOffset += exonLen;
             }
         if (sameString(gp->strand, "-"))
             reverseComplement(seq, txLen);
         struct geneInfo *gi;
         AllocVar(gi);
         gi->psl = genePredToPsl((struct genePred *)gp, refGenome->size, txLen);
         gi->psl->qName = cloneString(gp->name2);
         gi->txSeq = newDnaSeq(seq, txLen, gp->name2);
+        AllocVar(gi->cds);
+        genePredToCds((struct genePred *)gp, gi->cds);
+        int cdsLen = gi->cds->end - gi->cds->start;
+        // Skip genes with no CDS (like RNA genes) or obviously incomplete/incorrect CDS.
+        if (cdsLen > 0 && (cdsLen % 3) == 0)
+            {
             slAddHead(&geneInfoList, gi);
             }
+        }
     lmCleanup(&lm);
     bigBedFileClose(&bbi);
     }
 slReverse(&geneInfoList);
 return geneInfoList;
 }
 
 void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList,
                        struct seqWindow *gSeqWin, struct hash *sampleMetadata,
                        struct hash *sampleUrls, struct hash *samplePlacements,
                        char *jsonFile, char *source)
 /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format
  * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */
 {
 struct phyloTree *tree = sti->subtree;