4dc2ce3ec067b4ccb7987cbda16289d165954971
angie
  Thu Mar 9 20:04:39 2023 -0800
Translate multiple changes to the same codon correctly, collecting all mutations on the path from root to node.

diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c
index 4dc7614..29efa08 100644
--- src/hg/hgPhyloPlace/treeToAuspiceJson.c
+++ src/hg/hgPhyloPlace/treeToAuspiceJson.c
@@ -368,125 +368,156 @@
     jsonWriteObjectValue(jw, "userOrOld", userOrOld);
 if (nClade)
     jsonWriteObjectValue(jw, (isRsv ? "goya_nextclade" : "Nextstrain_clade"), nClade);
 if (gClade)
     jsonWriteObjectValue(jw, (isRsv ? "ramaekers_tableS1" : "GISAID_clade"), gClade);
 if (lineage)
     jsonWriteObjectValue(jw, (isRsv ? "ramaekers_nextclade" : "pango_lineage"), lineage);
 if (nLineage)
     jsonWriteObjectValue(jw, "Nextstrain_lineage", lineage);
 if (nCladeUsher)
     jsonWriteObjectValue(jw, (isRsv ? "goya_usher" : "Nextstrain_clade_usher"), nCladeUsher);
 if (lineageUsher)
     jsonWriteObjectValue(jw, (isRsv ? "ramaekers_usher" : "pango_lineage_usher"), lineageUsher);
 }
 
-static boolean changesProtein(struct singleNucChange *snc, struct geneInfo *gi,
-                              struct seqWindow *gSeqWin,
-                              int *retAaStart, char *retOldAa, char *retNewAa)
-/* If snc changes the coding sequence of gene, return TRUE and set ret values accordingly
- * (note amino acid values are single-base not strings). */
+INLINE char maybeComplement(char base, struct psl *psl)
+/* If psl is on '+' strand, return base, otherwise return the complement of base. */
 {
-boolean isCodingChange = FALSE;
-if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart)
+return (pslOrientation(psl) > 0) ? base : ntCompTable[(int)base];
+}
+
+static struct slName *codonVpTxToAaChange(struct vpTx *codonVpTxList,
+                                          struct singleNucChange *ancestorMuts,
+                                          struct geneInfo *gi)
+/* Given a list of vpTx from the same codon, combine their changes with inherited mutations
+ * in the same codon to get the amino acid change at this node.
+ * Note: this assumes there is no UTR in transcript, only CDS.  True so far for pathogens... */
+{
+struct slName *aaChange = NULL;
+if (codonVpTxList != NULL)
+    {
+    struct vpTx *vpTx = codonVpTxList;
+    int firstAaStart = vpTx->start.txOffset / 3;
+    int codonStart = firstAaStart * 3;
+    int codonOffset = vpTx->start.txOffset - codonStart;
+    char oldCodon[4];
+    safencpy(oldCodon, sizeof oldCodon, gi->txSeq->dna + codonStart, 3);
+    int codonStartG = vpTx->start.gOffset - codonOffset;
+    struct singleNucChange *anc;
+    for (anc = ancestorMuts;  anc != NULL;  anc = anc->next)
         {
-    struct bed3 gBed3 = { NULL, gSeqWin->seqName, snc->chromStart, snc->chromStart+1 };
-    char gAlt[2];
-    safef(gAlt, sizeof(gAlt), "%c", snc->newBase);
-    struct vpTx *vpTx = vpGenomicToTranscript(gSeqWin, &gBed3, gAlt, gi->psl, gi->txSeq);
-    if (vpTx->start.region == vpExon)
+        int ancCodonOffset = anc->chromStart - codonStartG;
+        if (ancCodonOffset >= 0 && ancCodonOffset < 3)
+            oldCodon[ancCodonOffset] = maybeComplement(anc->newBase, gi->psl);
+        }
+    char oldAa = lookupCodon(oldCodon);
+    if (oldAa == '\0')
+        oldAa = '*';
+    char newCodon[4];
+    safecpy(newCodon, sizeof newCodon, oldCodon);
+    for (vpTx = codonVpTxList;  vpTx != NULL;  vpTx = vpTx->next)
         {
         int aaStart = vpTx->start.txOffset / 3;
-        int codonStart = aaStart * 3;
-        char newCodon[4];
-        safencpy(newCodon, sizeof newCodon, gi->txSeq->dna + codonStart, 3);
+        if (aaStart != firstAaStart)
+            errAbort("codonVpTxToAaChange: program error: firstAaStart %d != aaStart %d",
+                     firstAaStart, aaStart);
         int codonOffset = vpTx->start.txOffset - codonStart;
-        assert(codonOffset < sizeof newCodon);
-        char newNt = (pslOrientation(gi->psl) > 0) ? snc->newBase : ntCompTable[(int)snc->newBase];
-        newCodon[codonOffset] = newNt;
-        char newAa = lookupCodon(newCodon);
-        char oldAa;
-        if (snc->parBase == snc->refBase)
-            oldAa = lookupCodon(gi->txSeq->dna + codonStart);
-        else
-            {
-            char oldCodon[4];
-            safencpy(oldCodon, sizeof oldCodon, gi->txSeq->dna + codonStart, 3);
-            char oldNt = (pslOrientation(gi->psl) > 0) ? snc->parBase : ntCompTable[(int)snc->parBase];
-            oldCodon[codonOffset] = oldNt;
-            oldAa = lookupCodon(oldCodon);
+        newCodon[codonOffset] = vpTx->txAlt[0];
         }
-        // Watch out for lookupCodon's null-character return value for stop codon; we want '*'.
+    char newAa = lookupCodon(newCodon);
     if (newAa == '\0')
         newAa = '*';
-        if (oldAa == '\0')
-            oldAa = '*';
     if (newAa != oldAa)
         {
-            isCodingChange = TRUE;
-            *retAaStart = aaStart;
-            *retOldAa = oldAa;
-            *retNewAa = newAa;
+        char aaChangeStr[32];
+        safef(aaChangeStr, sizeof aaChangeStr, "%c%d%c", oldAa, firstAaStart+1, newAa);
+        aaChange = slNameNew(aaChangeStr);
         }
     }
-    vpTxFree(&vpTx);
-    }
-return isCodingChange;
+return aaChange;
 }
 
-struct slPair *getAaMutations(struct singleNucChange *sncList, struct geneInfo *geneInfoList,
-                              struct seqWindow *gSeqWin)
-/* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. */
+struct slPair *getAaMutations(struct singleNucChange *sncList, struct singleNucChange *ancestorMuts,
+                              struct geneInfo *geneInfoList, struct seqWindow *gSeqWin)
+/* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }.
+ * Note: this assumes there is no UTR in transcript, only CDS.  True so far for pathogens... */
 {
 struct slPair *geneChangeList = NULL;
 struct geneInfo *gi;
 for (gi = geneInfoList;  gi != NULL;  gi = gi->next)
     {
     struct slName *aaChangeList = NULL;
+    int prevAaStart = -1;
+    struct vpTx *codonVpTxList = NULL;
     struct singleNucChange *snc;
     for (snc = sncList;  snc != NULL;  snc = snc->next)
         {
         if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart)
             {
-            int aaStart;
-            char oldAa, newAa;
-            if (changesProtein(snc, gi, gSeqWin, &aaStart, &oldAa, &newAa))
+            struct bed3 gBed3 = { NULL, gSeqWin->seqName, snc->chromStart, snc->chromStart+1 };
+            char gAlt[2];
+            safef(gAlt, sizeof(gAlt), "%c", snc->newBase);
+            struct vpTx *vpTx = vpGenomicToTranscript(gSeqWin, &gBed3, gAlt, gi->psl, gi->txSeq);
+            if (vpTx->start.region == vpExon)
                 {
-                char aaChange[32];
-                safef(aaChange, sizeof aaChange, "%c%d%c", oldAa, aaStart+1, newAa);
-                slNameAddHead(&aaChangeList, aaChange);
+                int aaStart = vpTx->start.txOffset / 3;
+                // Accumulate vpTxs in the same codon
+                if (aaStart == prevAaStart || prevAaStart == -1)
+                    {
+                    slAddHead(&codonVpTxList, vpTx);
+                    }
+                else
+                    {
+                    // Done with previous codon; find change from accumulated mutations
+                    struct slName *aaChange = codonVpTxToAaChange(codonVpTxList, ancestorMuts, gi);
+                    if (aaChange)
+                        slAddHead(&aaChangeList, aaChange);
+                    slFreeListWithFunc(&codonVpTxList, vpTxFree);
+                    codonVpTxList = vpTx;
+                    }
+                prevAaStart = aaStart;
                 }
             }
         }
+    // Find change from final set of accumulated mutations
+    if (codonVpTxList != NULL)
+        {
+        struct slName *aaChange = codonVpTxToAaChange(codonVpTxList, ancestorMuts, gi);
+        if (aaChange)
+            slAddHead(&aaChangeList, aaChange);
+        slFreeListWithFunc(&codonVpTxList, vpTxFree);
+        }
     if (aaChangeList != NULL)
         {
         slReverse(&aaChangeList);
         slAddHead(&geneChangeList, slPairNew(gi->psl->qName, aaChangeList));
         }
     }
 slReverse(&geneChangeList);
 return geneChangeList;
 }
 
 static void jsonWriteBranchAttrs(struct jsonWrite *jw, struct phyloTree *node,
+                                 struct singleNucChange *ancestorMuts,
                                  struct geneInfo *geneInfoList, struct seqWindow *gSeqWin)
 /* Write mutations (if any).  If node is not a leaf, write label for mutations at this node. */
 {
 if (node->priv != NULL)
     {
     struct singleNucChange *sncList = node->priv;
-    struct slPair *geneAaMutations = getAaMutations(sncList, geneInfoList, gSeqWin);
+    struct slPair *geneAaMutations = getAaMutations(sncList, ancestorMuts, geneInfoList, gSeqWin);
     jsonWriteObjectStart(jw, "branch_attrs");
     if (node->numEdges > 0)
         {
         jsonWriteObjectStart(jw, "labels");
         if (node->ident->name)
             jsonWriteString(jw, "id", node->ident->name);
         struct singleNucChange *snc = sncList;
         struct dyString *dy = dyStringCreate("%c%d%c",
                                              snc->parBase, snc->chromStart+1, snc->newBase);
         for (snc = sncList->next;  snc != NULL;  snc = snc->next)
             dyStringPrintf(dy, ",%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase);
         jsonWriteString(jw, "nuc mutations", dy->string);
         dyStringClear(dy);
         for (snc = sncList;  snc != NULL;  snc = snc->next)
             {
@@ -591,86 +622,94 @@
         {
         runLength++;
         if (runLength > maxRunLength)
             {
             maxRunLength = runLength;
             maxRunVal = array[i];
             }
         }
     else
         runLength = 1;
     }
 return (maxRunLength > (arraySize >> 1)) ? maxRunVal : NULL;
 }
 
 static void rTreeToAuspiceJson(struct phyloTree *node, int depth, struct auspiceJsonInfo *aji,
-                               boolean isRsv,
+                               struct singleNucChange *ancestorMuts, boolean isRsv,
                                char **retUserOrOld, char **retNClade, char **retGClade,
                                char **retLineage, char **retNLineage,
                                char **retNCladeUsher, char **retLineageUsher)
 /* Write Augur/Auspice V2 JSON for tree.  Enclosing object start and end are written by caller. */
 {
-if (node->priv)
-    {
 struct singleNucChange *sncList = node->priv;
+if (sncList)
+    {
     depth += slCount(sncList);
     }
 boolean isUserSample = FALSE;
 if (node->ident->name)
     isUserSample = slNameInList(aji->subtreeUserSampleIds, node->ident->name);
 char *name = node->ident->name;
 struct sampleMetadata *met = name ? metadataForSample(aji->sampleMetadata, name) : NULL;
 if (name)
     jsonWriteString(aji->jw, "name", name);
 else
     jsonWriteStringf(aji->jw, "name", "NODE%d", aji->nodeNum++);
-jsonWriteBranchAttrs(aji->jw, node, aji->geneInfoList, aji->gSeqWin);
+jsonWriteBranchAttrs(aji->jw, node, ancestorMuts, aji->geneInfoList, aji->gSeqWin);
 if (node->numEdges > 0)
     {
+    struct singleNucChange *allMuts = ancestorMuts;
+    struct singleNucChange *ancLast = slLastEl(ancestorMuts);
+    if (ancLast != NULL)
+        ancLast->next = sncList;
+    else
+        allMuts = sncList;
     jsonWriteListStart(aji->jw, "children");
     char *kidUserOrOld[node->numEdges];
     char *kidNClade[node->numEdges];
     char *kidGClade[node->numEdges];
     char *kidLineage[node->numEdges];
     char *kidNCladeUsher[node->numEdges];
     char *kidLineageUsher[node->numEdges];
     char *kidNLineage[node->numEdges];
     // Step through children in reverse order because nextstrain/Auspice draws upside-down. :)
     int i;
     for (i = node->numEdges - 1;  i >= 0;  i--)
         {
         jsonWriteObjectStart(aji->jw, NULL);
-        rTreeToAuspiceJson(node->edges[i], depth, aji, isRsv,
+        rTreeToAuspiceJson(node->edges[i], depth, aji, allMuts, isRsv,
                            &kidUserOrOld[i], &kidNClade[i], &kidGClade[i], &kidLineage[i],
                            &kidNLineage[i], &kidNCladeUsher[i], &kidLineageUsher[i]);
         jsonWriteObjectEnd(aji->jw);
         }
     jsonWriteListEnd(aji->jw);
     if (retUserOrOld)
         *retUserOrOld = majorityMaybe(kidUserOrOld, node->numEdges);
     if (retNClade)
         *retNClade = majorityMaybe(kidNClade, node->numEdges);
     if (retGClade)
         *retGClade = majorityMaybe(kidGClade, node->numEdges);
     if (retLineage)
         *retLineage = majorityMaybe(kidLineage, node->numEdges);
     if (retNCladeUsher)
         *retNCladeUsher = majorityMaybe(kidNCladeUsher, node->numEdges);
     if (retLineageUsher)
         *retLineageUsher = majorityMaybe(kidLineageUsher, node->numEdges);
     if (retNLineage)
         *retNLineage = majorityMaybe(kidNLineage, node->numEdges);
+    if (ancLast)
+        ancLast->next = NULL;
     }
 jsonWriteObjectStart(aji->jw, "node_attrs");
 jsonWriteDouble(aji->jw, "div", depth);
 if (node->numEdges == 0)
     jsonWriteLeafNodeAttributes(aji->jw, name, met, isUserSample, aji->source, aji->sampleUrls,
                                 aji->samplePlacements, isRsv,
                                 retUserOrOld, retNClade, retGClade, retLineage, retNLineage,
                                 retNCladeUsher, retLineageUsher);
 else if (retUserOrOld && retGClade && retLineage)
     jsonWriteBranchNodeAttributes(aji->jw, isRsv, *retUserOrOld, *retNClade, *retGClade, *retLineage,
                                   *retNLineage, *retNCladeUsher, *retLineageUsher);
 jsonWriteObjectEnd(aji->jw);
 }
 
 struct phyloTree *phyloTreeNewNode(char *name)
@@ -772,23 +811,23 @@
     slNameAddHead(&colorFields, "Nextstrain_lineage");
     }
 //#*** END FIXME
 writeAuspiceMeta(jw, sti->subtreeUserSampleIds, source, db, colorFields, geneInfoList,
                  gSeqWin->end);
 jsonWriteObjectStart(jw, "tree");
 int nodeNum = 10000; // Auspice.us starting node number for newick -> json
 int depth = 0;
 
 // Add an extra root node because otherwise Auspice won't draw branch from big tree root to subtree
 struct phyloTree *root = phyloTreeNewNode("wrapper");
 phyloAddEdge(root, tree);
 tree = root;
 struct auspiceJsonInfo aji = { jw, sti->subtreeUserSampleIds, geneInfoList, gSeqWin,
                                sampleMetadata, sampleUrls, samplePlacements, nodeNum, source };
-rTreeToAuspiceJson(tree, depth, &aji, isRsv, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+rTreeToAuspiceJson(tree, depth, &aji, NULL, isRsv, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
 jsonWriteObjectEnd(jw); // tree
 jsonWriteObjectEnd(jw); // top-level object
 fputs(jw->dy->string, outF);
 jsonWriteFree(&jw);
 carefulClose(&outF);
 }