7e58340888377874edaad1dbc5174e20295f890c
angie
  Mon Feb 22 14:17:33 2021 -0800
Support upload of more sequences, add TSV file summarizing sample variants and placements.
Requested by Joe de Risi (UCSF).  Increase timeout to 10 minutes; make TSV with each sample's
ID, nuc muts, AA muts, imputed bases and path from root to sample.  Also use Yatish's new
-K subtree algorithm in usher: one subtree encompassing all uploaded samples, plus the
specified number of samples randomly selected from the rest of the tree.
Don't show every single sample name in the title because there can be 1000 samples in the
same subtree now.  :)

diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c
index 78c9355..f1ef1ed 100644
--- src/hg/hgPhyloPlace/treeToAuspiceJson.c
+++ src/hg/hgPhyloPlace/treeToAuspiceJson.c
@@ -11,33 +11,39 @@
 #include "parsimonyProto.h"
 #include "phyloPlace.h"
 #include "phyloTree.h"
 #include "variantProjector.h"
 
 // Globals
 extern char *chrom;
 extern int chromSize;
 
 static void writeAuspiceMeta(FILE *outF, struct slName *subtreeUserSampleIds, char *source)
 /* Write metadata to configure Auspice display. */
 {
 fprintf(outF,
         "\"meta\": { "
         "\"title\": \"Subtree with %s", subtreeUserSampleIds->name);
+int sampleCount = slCount(subtreeUserSampleIds);
+if (sampleCount > 10)
+    fprintf(outF, " and %d other uploaded samples", sampleCount - 1);
+else
+    {
     struct slName *sln;
     for (sln = subtreeUserSampleIds->next;  sln != NULL;  sln = sln->next)
         fprintf(outF, ", %s", sln->name);
+    }
 fputs("\", "
       "\"panels\": [ \"tree\"] , "
       "\"colorings\": [ "
       "  { \"key\": \"pangolin_lineage\", "
       "    \"title\": \"Pangolin lineage\", \"type\": \"categorical\" },"
       "  { \"key\": \"Nextstrain_clade\","
       "    \"scale\": [ [ \"19B\", \"#EC676D\" ], [ \"19A\", \"#F79E43\" ],"
       "        [ \"20A\", \"#B6D77A\" ], [ \"20C\", \"#8FD4ED\" ],"
       "        [ \"20B\", \"#A692C3\" ] ],"
       "    \"title\": \"Nextstrain Clade\", \"type\": \"categorical\" },"
       "  { \"key\": \"GISAID_clade\","
       "    \"scale\": [ [ \"S\", \"#EC676D\" ], [ \"L\", \"#F79E43\" ], [ \"O\", \"#F9D136\" ],"
       "        [ \"V\", \"#FAEA95\" ], [ \"G\", \"#B6D77A\" ], [ \"GH\", \"#8FD4ED\" ],"
       "        [ \"GR\", \"#A692C3\" ] ],"
       "    \"title\": \"GISAID Clade\", \"type\": \"categorical\" },"
@@ -127,38 +133,30 @@
 
 static void jsonWriteBranchNodeAttributes(struct jsonWrite *jw, char *userOrOld,
                                           char *nClade, char *gClade, char *lineage)
 /* Write elements of node_attrs for a branch. */
 {
 if (userOrOld)
     jsonWriteObjectValue(jw, "userOrOld", userOrOld);
 if (nClade)
     jsonWriteObjectValue(jw, "Nextstrain_clade", nClade);
 if (gClade)
     jsonWriteObjectValue(jw, "GISAID_clade", gClade);
 if (lineage)
     jsonWriteObjectValue(jw, "pangolin_lineage", lineage);
 }
 
-struct geneInfo
-/* Information sufficient to determine whether a genome change causes a coding change. */
-    {
-    struct geneInfo *next;
-    struct psl *psl;        // Alignment of transcript to genome
-    struct dnaSeq *txSeq;   // Transcript sequence
-    };
-
 static boolean changesProtein(struct singleNucChange *snc, struct geneInfo *gi,
                               struct seqWindow *gSeqWin,
                               int *retAaStart, char *retOldAa, char *retNewAa)
 /* If snc changes the coding sequence of gene, return TRUE and set ret values accordingly
  * (note amino acid values are single-base not strings). */
 {
 boolean isCodingChange = FALSE;
 if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart)
     {
     struct bed3 gBed3 = { NULL, chrom, snc->chromStart, snc->chromStart+1 };
     char gAlt[2];
     safef(gAlt, sizeof(gAlt), "%c", snc->newBase);
     if (!sameString(gi->psl->strand, "+"))
         errAbort("changesProtein: only worlds for psl->strand='+', but gene '%s' has strand '%s'",
                  gi->psl->qName, gi->psl->strand);
@@ -175,31 +173,31 @@
         char oldAa = lookupCodon(gi->txSeq->dna + codonStart);
         char newAa = lookupCodon(newCodon);
         if (newAa != oldAa)
             {
             isCodingChange = TRUE;
             *retAaStart = aaStart;
             *retOldAa = oldAa;
             *retNewAa = newAa;
             }
         }
     vpTxFree(&vpTx);
     }
 return isCodingChange;
 }
 
-static struct slPair *getAaMutations(struct singleNucChange *sncList, struct geneInfo *geneInfoList,
+struct slPair *getAaMutations(struct singleNucChange *sncList, struct geneInfo *geneInfoList,
                               struct seqWindow *gSeqWin)
 /* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. */
 {
 struct slPair *geneChangeList = NULL;
 struct geneInfo *gi;
 for (gi = geneInfoList;  gi != NULL;  gi = gi->next)
     {
     struct slName *aaChangeList = NULL;
     struct singleNucChange *snc;
     for (snc = sncList;  snc != NULL;  snc = snc->next)
         {
         if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart)
             {
             int aaStart;
             char oldAa, newAa;
@@ -421,49 +419,46 @@
             txOffset += exonLen;
             }
         struct geneInfo *gi;
         AllocVar(gi);
         gi->psl = genePredToPsl((struct genePred *)gp, chromSize, txLen);
         gi->txSeq = newDnaSeq(seq, txLen, gp->name2);
         slAddHead(&geneInfoList, gi);
         }
     lmCleanup(&lm);
     bigBedFileClose(&bbi);
     }
 slReverse(&geneInfoList);
 return geneInfoList;
 }
 
-void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct dnaSeq *ref,
-                       char *bigGenePredFile, struct hash *sampleMetadata, char *jsonFile,
+void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList,
+                       struct seqWindow *gSeqWin, struct hash *sampleMetadata, char *jsonFile,
                        char *source)
 /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format
  * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */
 {
 struct phyloTree *tree = sti->subtree;
 FILE *outF = mustOpen(jsonFile, "w");
 fputs("{ \"version\": \"v2\", ", outF);
 writeAuspiceMeta(outF, sti->subtreeUserSampleIds, source);
 // The meta part is mostly constant & easier to just write out, but jsonWrite is better for the
 // nested tree structure.
 struct jsonWrite *jw = jsonWriteNew();
 jsonWriteObjectStart(jw, "tree");
 int nodeNum = 10000; // Auspice.us starting node number for newick -> json
 int depth = 0;
 
 // Add an extra root node because otherwise Auspice won't draw branch from big tree root to subtree
 struct phyloTree *root = phyloTreeNewNode("wrapper");
 phyloAddEdge(root, tree);
 tree = root;
-struct geneInfo *geneInfoList = getGeneInfoList(bigGenePredFile, ref);
-struct seqWindow *gSeqWin = chromSeqWindowNew(db, chrom, 0, chromSize);
 struct auspiceJsonInfo aji = { jw, sti->subtreeUserSampleIds, geneInfoList, gSeqWin,
                                sampleMetadata, nodeNum, source };
 rTreeToAuspiceJson(tree, depth, &aji, NULL, NULL, NULL, NULL);
-chromSeqWindowFree(&gSeqWin);
 jsonWriteObjectEnd(jw);
 fputs(jw->dy->string, outF);
 jsonWriteFree(&jw);
 fputs("}", outF);
 carefulClose(&outF);
 }