9c4b7225d91f5180fcddbecd843b71c4e6503688
angie
  Tue Mar 30 10:39:22 2021 -0700
Add subtree JSON URLs to single-subtree JSON, using Auspice's new url attribute.  Allow up to 1000 VCF samples and extend timeout to 15 minutes.

diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c
index 6fdd045..a84e777 100644
--- src/hg/hgPhyloPlace/treeToAuspiceJson.c
+++ src/hg/hgPhyloPlace/treeToAuspiceJson.c
@@ -23,145 +23,157 @@
 fprintf(outF,
         "\"meta\": { "
         "\"title\": \"Subtree with %s", subtreeUserSampleIds->name);
 int sampleCount = slCount(subtreeUserSampleIds);
 if (sampleCount > 10)
     fprintf(outF, " and %d other uploaded samples", sampleCount - 1);
 else
     {
     struct slName *sln;
     for (sln = subtreeUserSampleIds->next;  sln != NULL;  sln = sln->next)
         fprintf(outF, ", %s", sln->name);
     }
 fputs("\", "
       "\"panels\": [ \"tree\"] , "
       "\"colorings\": [ "
-      "  { \"key\": \"pangolin_lineage\", "
-      "    \"title\": \"Pangolin lineage\", \"type\": \"categorical\" },"
+      "  { \"key\": \"pango_lineage\", "
+      "    \"title\": \"Pango lineage\", \"type\": \"categorical\" },"
       "  { \"key\": \"Nextstrain_clade\","
       "    \"scale\": [ [ \"19B\", \"#EC676D\" ], [ \"19A\", \"#F79E43\" ],"
       "        [ \"20A\", \"#B6D77A\" ], [ \"20C\", \"#8FD4ED\" ],"
       "        [ \"20B\", \"#A692C3\" ], [ \"20D\", \"#8020A0\" ],"
       "        [ \"20E (EU1)\", \"#44CC44\" ], [ \"20F\", \"#8822AA\" ],"
       "        [ \"20G\", \"#8888FF\" ], [ \"20H/501Y.V2\", \"#6666FF\" ],"
       "        [ \"20I/501Y.V1\", \"#CC44EE\" ], [ \"20A.EU2\", \"#448844\"  ], "
       "        [ \"uploaded sample\", \"#FF0000\" ] ],"
       "    \"title\": \"Nextstrain Clade\", \"type\": \"categorical\" },"
       , outF);
 if (sameString(source, "GISAID"))
     fputs("  { \"key\": \"GISAID_clade\","
       "    \"scale\": [ [ \"S\", \"#EC676D\" ], [ \"L\", \"#F79E43\" ], [ \"O\", \"#F9D136\" ],"
       "        [ \"V\", \"#FAEA95\" ], [ \"G\", \"#B6D77A\" ], [ \"GH\", \"#8FD4ED\" ],"
       "        [ \"GR\", \"#A692C3\" ] ],"
       "    \"title\": \"GISAID Clade\", \"type\": \"categorical\" },"
           , outF);
 fprintf(outF, "  { \"key\": \"userOrOld\", "
         "    \"scale\": [ [ \"uploaded sample\", \"#CC0000\"] , [ \"%s\", \"#000000\"] ],"
         "    \"title\": \"Sample type\", \"type\": \"categorical\" }"
         , source);
 fputs("  ] , "
 //#*** Filters didn't seem to work... maybe something about the new fetch feature, or do I need to spcify in some other way?
 //#***      "\"filters\": [ \"GISAID_clade\", \"region\", \"country\", \"division\", \"author\" ], "
+      "\"filters\": [ ], "
       "\"display_defaults\": { "
       "  \"branch_label\": \"none\", "
       "  \"color_by\": \"userOrOld\" "
       "}, "
       , outF);
 fprintf(outF,
         "\"description\": \"Dataset generated by [UShER web interface]"
         "(%shgPhyloPlace) using the "
         "[usher](https://github.com/yatisht/usher/) program.  "
 //#*** TODO: describe input from which tree was generated: user sample, version of tree, etc.
         , hLocalHostCgiBinUrl());
 fputs("If you have metadata you wish to display, you can now drag on a CSV file and it will be "
       "added into this view, [see here]("NEXTSTRAIN_DRAG_DROP_DOC") "
       "for more info.\"} ,"
       , outF);
 }
 
-static void jsonWriteObjectValue(struct jsonWrite *jw, char *name, char *value)
-/* Write an object with one member, "value", set to value, as most Auspice node attributes are
- * formatted. */
+static void jsonWriteObjectValueUrl(struct jsonWrite *jw, char *name, char *value, char *url)
+/* Write an object with member "value" set to value, and if url is non-empty, "url" set to url. */
 {
 jsonWriteObjectStart(jw, name);
 jsonWriteString(jw, "value", value);
+if (isNotEmpty(url))
+    jsonWriteString(jw, "url", url);
 jsonWriteObjectEnd(jw);
 }
 
+static void jsonWriteObjectValue(struct jsonWrite *jw, char *name, char *value)
+/* Write an object with one member, "value", set to value, as most Auspice node attributes are
+ * formatted. */
+{
+jsonWriteObjectValueUrl(jw, name, value, NULL);
+}
+
 static void jsonWriteLeafNodeAttributes(struct jsonWrite *jw, char *name,
                                         struct sampleMetadata *met, boolean isUserSample,
-                                        char *source,
+                                        char *source, struct hash *sampleUrls,
                                         char **retUserOrOld, char **retNClade, char **retGClade,
                                         char **retLineage)
 /* Write elements of node_attrs for a sample which may be preexisting and in our metadata hash,
  * or may be a new sample from the user.  Set rets for color categories so parent branches can
  * determine their color categories. */
 {
 *retUserOrOld = isUserSample ? "uploaded sample" : source;
 jsonWriteObjectValue(jw, "userOrOld", *retUserOrOld);
 if (met && met->date)
     jsonWriteObjectValue(jw, "date", met->date);
 if (met && met->author)
     {
     jsonWriteObjectValue(jw, "author", met->author);
     // Note: Nextstrain adds paper_url and title when available; they also add author and use
     // a uniquified value (e.g. "author": "Wenjie Tan et al" / "value": "Wenjie Tan et al A")
     }
 *retNClade = isUserSample ? "uploaded sample" : (met && met->nClade) ? met->nClade : NULL;
 if (isNotEmpty(*retNClade))
     jsonWriteObjectValue(jw, "Nextstrain_clade", *retNClade);
 *retGClade = isUserSample ? "uploaded sample" : (met && met->gClade) ? met->gClade : NULL;
 if (isNotEmpty(*retGClade))
     jsonWriteObjectValue(jw, "GISAID_clade", *retGClade);
 *retLineage = isUserSample ? "uploaded sample" :
                              (met && met->lineage) ? met->lineage : NULL;
 if (isNotEmpty(*retLineage))
-    jsonWriteObjectValue(jw, "pangolin_lineage", *retLineage);
+    jsonWriteObjectValue(jw, "pango_lineage", *retLineage);
 if (met && met->epiId)
     jsonWriteObjectValue(jw, "gisaid_epi_isl", met->epiId);
 if (met && met->gbAcc)
     jsonWriteObjectValue(jw, "genbank_accession", met->gbAcc);
 if (met && met->country)
     jsonWriteObjectValue(jw, "country", met->country);
 if (met && met->division)
     jsonWriteObjectValue(jw, "division", met->division);
 if (met && met->location)
     jsonWriteObjectValue(jw, "location", met->location);
 if (met && met->countryExp)
     jsonWriteObjectValue(jw, "country_exposure", met->countryExp);
 if (met && met->divExp)
     jsonWriteObjectValue(jw, "division_exposure", met->divExp);
 if (met && met->origLab)
     jsonWriteObjectValue(jw, "originating_lab", met->origLab);
 if (met && met->subLab)
     jsonWriteObjectValue(jw, "submitting_lab", met->subLab);
 if (met && met->region)
     jsonWriteObjectValue(jw, "region", met->region);
+char *sampleUrl = (sampleUrls && name) ? hashFindVal(sampleUrls, name) : NULL;
+if (isNotEmpty(sampleUrl))
+    jsonWriteObjectValueUrl(jw, "subtree", sampleUrl, sampleUrl);
 }
 
 static void jsonWriteBranchNodeAttributes(struct jsonWrite *jw, char *userOrOld,
                                           char *nClade, char *gClade, char *lineage)
 /* Write elements of node_attrs for a branch. */
 {
 if (userOrOld)
     jsonWriteObjectValue(jw, "userOrOld", userOrOld);
 if (nClade)
     jsonWriteObjectValue(jw, "Nextstrain_clade", nClade);
 if (gClade)
     jsonWriteObjectValue(jw, "GISAID_clade", gClade);
 if (lineage)
-    jsonWriteObjectValue(jw, "pangolin_lineage", lineage);
+    jsonWriteObjectValue(jw, "pango_lineage", lineage);
 }
 
 static boolean changesProtein(struct singleNucChange *snc, struct geneInfo *gi,
                               struct seqWindow *gSeqWin,
                               int *retAaStart, char *retOldAa, char *retNewAa)
 /* If snc changes the coding sequence of gene, return TRUE and set ret values accordingly
  * (note amino acid values are single-base not strings). */
 {
 boolean isCodingChange = FALSE;
 if (snc->chromStart < gi->psl->tEnd && snc->chromStart >= gi->psl->tStart)
     {
     struct bed3 gBed3 = { NULL, chrom, snc->chromStart, snc->chromStart+1 };
     char gAlt[2];
     safef(gAlt, sizeof(gAlt), "%c", snc->newBase);
     if (!sameString(gi->psl->strand, "+"))
@@ -277,30 +289,31 @@
     jsonWriteListEnd(jw);
     jsonWriteObjectEnd(jw);  // mutations
     jsonWriteObjectEnd(jw); // branch_attrs
     }
 }
 
 struct auspiceJsonInfo
 /* Collection of a bunch of things used when writing out auspice JSON for a subtree, so the
  * recursive function doesn't need a dozen args. */
     {
     struct jsonWrite *jw;
     struct slName *subtreeUserSampleIds;  // Subtree node names for user samples (not from big tree)
     struct geneInfo *geneInfoList;        // Transcript seq & alignment for predicting AA change
     struct seqWindow *gSeqWin;            // Reference genome seq for predicting AA change
     struct hash *sampleMetadata;          // Sample metadata for decorating tree
+    struct hash *sampleUrls;              // URLs for samples, if applicable
     int nodeNum;                          // For generating sequential node ID (in absence of name)
     char *source;                         // Source of non-user sequences in tree (GISAID or public)
     };
 
 static int cmpstringp(const void *p1, const void *p2)
 /* strcmp on pointers to strings, as in 'man qsort' but tolerate NULLs */
 {
 char *s1 = *(char * const *)p1;
 char *s2 = *(char * const *)p2;
 if (s1 && s2)
     return strcmp(s1, s2);
 else if (s1 && !s2)
     return 1;
 else if (s2 && !s1)
     return -1;
@@ -369,31 +382,31 @@
         jsonWriteObjectEnd(aji->jw);
         }
     jsonWriteListEnd(aji->jw);
     if (retUserOrOld)
         *retUserOrOld = majorityMaybe(kidUserOrOld, node->numEdges);
     if (retNClade)
         *retNClade = majorityMaybe(kidNClade, node->numEdges);
     if (retGClade)
         *retGClade = majorityMaybe(kidGClade, node->numEdges);
     if (retLineage)
         *retLineage = majorityMaybe(kidLineage, node->numEdges);
     }
 jsonWriteObjectStart(aji->jw, "node_attrs");
 jsonWriteDouble(aji->jw, "div", depth);
 if (node->numEdges == 0)
-    jsonWriteLeafNodeAttributes(aji->jw, name, met, isUserSample, aji->source,
+    jsonWriteLeafNodeAttributes(aji->jw, name, met, isUserSample, aji->source, aji->sampleUrls,
                                 retUserOrOld, retNClade, retGClade, retLineage);
 else if (retUserOrOld && retGClade && retLineage)
     jsonWriteBranchNodeAttributes(aji->jw, *retUserOrOld, *retNClade, *retGClade, *retLineage);
 jsonWriteObjectEnd(aji->jw);
 }
 
 struct phyloTree *phyloTreeNewNode(char *name)
 /* Alloc & return a new node with no children. */
 {
 struct phyloTree *node;
 AllocVar(node);
 AllocVar(node->ident);
 node->ident->name = cloneString(name);
 return node;
 }
@@ -427,45 +440,45 @@
             }
         struct geneInfo *gi;
         AllocVar(gi);
         gi->psl = genePredToPsl((struct genePred *)gp, chromSize, txLen);
         gi->txSeq = newDnaSeq(seq, txLen, gp->name2);
         slAddHead(&geneInfoList, gi);
         }
     lmCleanup(&lm);
     bigBedFileClose(&bbi);
     }
 slReverse(&geneInfoList);
 return geneInfoList;
 }
 
 void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList,
-                       struct seqWindow *gSeqWin, struct hash *sampleMetadata, char *jsonFile,
-                       char *source)
+                       struct seqWindow *gSeqWin, struct hash *sampleMetadata,
+                       struct hash *sampleUrls, char *jsonFile, char *source)
 /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format
  * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */
 {
 struct phyloTree *tree = sti->subtree;
 FILE *outF = mustOpen(jsonFile, "w");
 fputs("{ \"version\": \"v2\", ", outF);
 writeAuspiceMeta(outF, sti->subtreeUserSampleIds, source);
 // The meta part is mostly constant & easier to just write out, but jsonWrite is better for the
 // nested tree structure.
 struct jsonWrite *jw = jsonWriteNew();
 jsonWriteObjectStart(jw, "tree");
 int nodeNum = 10000; // Auspice.us starting node number for newick -> json
 int depth = 0;
 
 // Add an extra root node because otherwise Auspice won't draw branch from big tree root to subtree
 struct phyloTree *root = phyloTreeNewNode("wrapper");
 phyloAddEdge(root, tree);
 tree = root;
 struct auspiceJsonInfo aji = { jw, sti->subtreeUserSampleIds, geneInfoList, gSeqWin,
-                               sampleMetadata, nodeNum, source };
+                               sampleMetadata, sampleUrls, nodeNum, source };
 rTreeToAuspiceJson(tree, depth, &aji, NULL, NULL, NULL, NULL);
 jsonWriteObjectEnd(jw);
 fputs(jw->dy->string, outF);
 jsonWriteFree(&jw);
 fputs("}", outF);
 carefulClose(&outF);
 }