7e58340888377874edaad1dbc5174e20295f890c
angie
  Mon Feb 22 14:17:33 2021 -0800
Support upload of more sequences, add TSV file summarizing sample variants and placements.
Requested by Joe de Risi (UCSF).  Increase timeout to 10 minutes; make TSV with each sample's
ID, nuc muts, AA muts, imputed bases and path from root to sample.  Also use Yatish's new
-K subtree algorithm in usher: one subtree encompassing all uploaded samples, plus the
specified number of samples randomly selected from the rest of the tree.
Don't show every single sample name in the title because there can be 1000 samples in the
same subtree now.  :)

diff --git src/hg/hgPhyloPlace/phyloPlace.h src/hg/hgPhyloPlace/phyloPlace.h
index 1a22d68..481c747 100644
--- src/hg/hgPhyloPlace/phyloPlace.h
+++ src/hg/hgPhyloPlace/phyloPlace.h
@@ -1,30 +1,37 @@
 /* Place SARS-CoV-2 sequences in phylogenetic tree using add_missing_samples program. */
 
 #ifndef _PHYLO_PLACE_H_
 #define _PHYLO_PLACE_H_
 
 #include "common.h"
 #include "dnaseq.h"
 #include "hash.h"
 #include "linefile.h"
 #include "parsimonyProto.h"
 #include "phyloTree.h"
+#include "seqWindow.h"
 #include "trashDir.h"
 
 #define PHYLOPLACE_DATA_DIR "hgPhyloPlaceData"
 
+// Allow users to upload a lot of sequences, but put limits on how much detail we'll show and
+// how many custom tracks we'll create.
+#define MAX_SUBTREE_BUTTONS 50
+#define MAX_SEQ_DETAILS 100
+#define MAX_SUBTREE_CTS 10
+
 #define NEXTSTRAIN_DRAG_DROP_DOC "https://docs.nextstrain.org/projects/auspice/en/latest/advanced-functionality/drag-drop-csv-tsv.html"
 
 struct treeChoices
 /* Phylogenetic tree versions for the user to choose from. */
 {
     char **protobufFiles;      // Mutation annotated tree files in protobuf format for UShER
     char **metadataFiles;      // Sample metadata a la GISAID's nextmeta download option
     char **sources;            // GISAID or public
     char **descriptions;       // Menu labels to describe the options to the user
     int count;                 // Number of choices (and size of each array)
 };
 
 struct seqInfo
 /* User sequences, alignments and statistics */
 {
@@ -104,55 +111,71 @@
     char *date;         // Sample collection date
     char *author;       // Author(s) to credit
     char *nClade;       // Nextstrain year-letter clade
     char *gClade;       // GISAID amino acid change clade
     char *lineage;      // Pangolin lineage
     char *country;      // Country in which sample was collected
     char *division;     // Administrative division in which sample was collected (country or state)
     char *location;     // Location in which sample was collected (city)
     char *countryExp;   // Country in which host was exposed to virus
     char *divExp;       // Administrative division in which host was exposed to virus
     char *origLab;      // Originating lab
     char *subLab;       // Submitting lab
     char *region;       // Continent on which sample was collected
     };
 
+struct geneInfo
+/* Information sufficient to determine whether a genome change causes a coding change. */
+    {
+    struct geneInfo *next;
+    struct psl *psl;        // Alignment of transcript to genome
+    struct dnaSeq *txSeq;   // Transcript sequence
+    };
+
 struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome,
                               boolean *informativeBases, struct slName **maskSites,
                               struct slName **retSampleIds, struct seqInfo **retSeqInfo,
                               struct slPair **retFailedSeqs, struct slPair **retFailedPsls,
                               int *pStartTime);
 /* Read in FASTA from lf and make sure each item has a reasonable size and not too high
  * percentage of N's.  Align to reference, extract SNVs from alignment, and save as VCF
  * with sample genotype columns. */
 
 struct usherResults *runUsher(char *usherPath, char *usherAssignmentsPath, char *vcfFile,
                               int subtreeSize, struct slName *userSampleIds,
                               struct hash *condensedNodes, int *pStartTime);
 /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and
  * subtrees to trash files, return list of slRef to struct tempName for the trash files
  * and parse other results out of stderr output. */
 
-void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct dnaSeq *ref,
-                       char *bigGenePredFile, struct hash *sampleMetadata, char *jsonFile,
+struct slPair *getAaMutations(struct singleNucChange *sncList, struct geneInfo *geneInfoList,
+                              struct seqWindow *gSeqWin);
+/* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. */
+
+struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome);
+/* Return the gene list from bigGenePredFile, the gene annotation source (if any) from config.ra. */
+
+void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList,
+                       struct seqWindow *gSeqWin, struct hash *sampleMetadata, char *jsonFile,
                        char *source);
 /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format
  * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */
 
 struct tempName *writeCustomTracks(struct tempName *vcfTn, struct usherResults *ur,
                                    struct slName *sampleIds, struct phyloTree *bigTree,
-                                   char *source, int fontHeight, int *pStartTime);
+                                   char *source, int fontHeight, struct phyloTree **retSampleTree,
+                                   int *pStartTime);
 /* Write one custom track per subtree, and one custom track with just the user's uploaded samples. */
 
 
 struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId);
 /* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */
 
 struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds);
 /* Prune all descendants of node that have no leaf descendants in sampleIds. */
 
 char *phyloPlaceDbSetting(char *db, char *settingName);
 /* Return a setting from hgPhyloPlaceData/<db>/config.ra or NULL if not found. */
 
 char *phyloPlaceDbSettingPath(char *db, char *settingName);
 /* Return path to a file named by a setting from hgPhyloPlaceData/<db>/config.ra,
  * or NULL if not found.  (Append hgPhyloPlaceData/<db>/ to the beginning of relative path) */