be4311c07e14feb728abc6425ee606ffaa611a58
markd
  Fri Jan 22 06:46:58 2021 -0800
merge with master

diff --git src/hg/hgPhyloPlace/phyloPlace.h src/hg/hgPhyloPlace/phyloPlace.h
index 9408a7f..1a22d68 100644
--- src/hg/hgPhyloPlace/phyloPlace.h
+++ src/hg/hgPhyloPlace/phyloPlace.h
@@ -1,161 +1,175 @@
 /* Place SARS-CoV-2 sequences in phylogenetic tree using add_missing_samples program. */
 
 #ifndef _PHYLO_PLACE_H_
 #define _PHYLO_PLACE_H_
 
 #include "common.h"
 #include "dnaseq.h"
 #include "hash.h"
 #include "linefile.h"
 #include "parsimonyProto.h"
 #include "phyloTree.h"
 #include "trashDir.h"
 
 #define PHYLOPLACE_DATA_DIR "hgPhyloPlaceData"
 
 #define NEXTSTRAIN_DRAG_DROP_DOC "https://docs.nextstrain.org/projects/auspice/en/latest/advanced-functionality/drag-drop-csv-tsv.html"
 
+struct treeChoices
+/* Phylogenetic tree versions for the user to choose from, as parallel arrays of size count. */
+{
+    char **protobufFiles;      // Mutation annotated tree files in protobuf format for UShER
+    char **metadataFiles;      // Sample metadata files a la GISAID's nextmeta download option
+    char **sources;            // Source per choice: GISAID or public
+    char **descriptions;       // Menu labels to describe the options to the user
+    int count;                 // Number of choices (and size of each array above)
+};
+
 struct seqInfo
 /* User sequences, alignments and statistics */
 {
     struct seqInfo *next;
     struct dnaSeq *seq;
     struct psl *psl;
     struct singleNucChange *sncList;
     struct singleNucChange *maskedSncList;
     struct slRef *maskedReasonsList;
     uint nCountStart;
     uint nCountMiddle;
     uint nCountEnd;
     uint ambigCount;
 };
 
 struct variantPathNode
 /* A list of these gives a path from phylo tree root to some node (usually user seq leaf). */
     {
     struct variantPathNode *next;
     char *nodeName;                   // Either a numeric internal node ID or the user seq name
     struct singleNucChange *sncList;  // One or more single nucleotide changes associated with node
     };
 
 struct bestNodeInfo
     {
     struct bestNodeInfo *next;
     char *name;                           // Node name
     struct variantPathNode *variantPath;  // Mutations assigned to nodes along path from root->node
     boolean isSibling;                    // Placement would be as sibling of node (not child)
     };
 
 struct baseVal
 /* List of imputed base positions and values */
     {
     struct baseVal *next;
     int chromStart;         // 0-based position
     char *val;              // nucleotide base(s)
     };
 
 struct placementInfo
 /* Info about a sample's mutations and placement in the phylo tree */
     {
     char *sampleId;                       // Sample name from FASTA or VCF header
     struct slName *sampleMuts;            // Differences with the reference genome
     struct variantPathNode *variantPath;  // Mutations assigned to nodes along path from root
     struct bestNodeInfo *bestNodes;       // Other nodes identified as equally parsimonious
     struct baseVal *imputedBases;         // Ambiguous bases imputed to ref/alt [ACGT]
     int parsimonyScore;                   // Parsimony cost of placing sample
     int bestNodeCount;                    // Number of equally parsimonious placements
     };
 
 struct subtreeInfo
 /* Parsed subtree from usher and derivative products. */
     {
     struct subtreeInfo *next;
     struct tempName *subtreeTn;           // Newick file from usher (may have condensed nodes)
     struct phyloTree *subtree;            // Parsed subtree (#*** with annotated muts?  shoudl we?_
     struct hash *subtreeIdToIx;           // Map of subtree nodes to VCF output sample order
     struct slName *subtreeUserSampleIds;  // List of user-uploaded samples in this subtree
     struct slName *subtreeNameList;       // List of leaf names with nicer names for cond. nodes
     };
 
 struct usherResults
 /* Tree+samples download file, sample placements, and subtrees parsed from usher output. */
     {
     struct tempName *bigTreePlusTn;      // Newick file: original tree plus user's samples
     struct hash *samplePlacements;       // Info about each sample's placement in the tree
     struct subtreeInfo *subtreeInfoList; // For each subtree: tree, file, node info etc.
     };
 
 struct sampleMetadata
 /* Information about a virus sample. */
     {
     char *strain;       // Strain name, usually of the form Country/ArbitraryId/YYYY-MM-DD
     char *epiId;        // GISAID EPI_ISL_[0-9]+ ID
     char *gbAcc;        // GenBank accession
     char *date;         // Sample collection date
     char *author;       // Author(s) to credit
     char *nClade;       // Nextstrain year-letter clade
     char *gClade;       // GISAID amino acid change clade
     char *lineage;      // Pangolin lineage
     char *country;      // Country in which sample was collected
     char *division;     // Administrative division in which sample was collected (country or state)
     char *location;     // Location in which sample was collected (city)
     char *countryExp;   // Country in which host was exposed to virus
     char *divExp;       // Administrative division in which host was exposed to virus
     char *origLab;      // Originating lab
     char *subLab;       // Submitting lab
     char *region;       // Continent on which sample was collected
     };
 
 struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome,
                               boolean *informativeBases, struct slName **maskSites,
                               struct slName **retSampleIds, struct seqInfo **retSeqInfo,
                               struct slPair **retFailedSeqs, struct slPair **retFailedPsls,
                               int *pStartTime);
 /* Read in FASTA from lf and make sure each item has a reasonable size and not too high
  * percentage of N's.  Align to reference, extract SNVs from alignment, and save as VCF
  * with sample genotype columns. */
 
 struct usherResults *runUsher(char *usherPath, char *usherAssignmentsPath, char *vcfFile,
                               int subtreeSize, struct slName *userSampleIds,
                               struct hash *condensedNodes, int *pStartTime);
 /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and
  * subtrees to trash files, return list of slRef to struct tempName for the trash files
  * and parse other results out of stderr output. */
 
 void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct dnaSeq *ref,
-                       char *bigGenePredFile, struct hash *sampleMetadata, char *jsonFile);
+                       char *bigGenePredFile, struct hash *sampleMetadata, char *jsonFile,
+                       char *source);
 /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format
  * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */
 
 struct tempName *writeCustomTracks(struct tempName *vcfTn, struct usherResults *ur,
                                    struct slName *sampleIds, struct phyloTree *bigTree,
-                                   int fontHeight, int *pStartTime);
+                                   char *source, int fontHeight, int *pStartTime);
 /* Write one custom track per subtree, and one custom track with just the user's uploaded samples. */
 
 
 struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId);
 /* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */
 
 struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds);
 /* Prune all descendants of node that have no leaf descendants in sampleIds. */
 
 char *phyloPlaceDbSetting(char *db, char *settingName);
 /* Return a setting from hgPhyloPlaceData/<db>/config.ra or NULL if not found. */
 
 char *phyloPlaceDbSettingPath(char *db, char *settingName);
 /* Return path to a file named by a setting from hgPhyloPlaceData/<db>/config.ra,
  * or NULL if not found.  (Append hgPhyloPlaceData/<db>/ to the beginning of relative path) */
 
+struct treeChoices *loadTreeChoices(char *db);
+/* If hgPhyloPlaceData/<db>/config.ra specifies a treeChoices file, load it; else return NULL. */
+
 void reportTiming(int *pStartTime, char *message);
 /* Print out a report to stderr of how much time something took. */
 
 boolean hgPhyloPlaceEnabled();
 /* Return TRUE if hgPhyloPlace is enabled in hg.conf and db wuhCor1 exists. */
 
-char *phyloPlaceSamples(struct lineFile *lf, char *db, boolean doMeasureTiming, int subtreeSize,
-                        int fontHeight);
-/* Given a lineFile that contains either FASTA or VCF, prepare VCF for add_missing_samples;
- * if that goes well then run add_missing_samples, report results, make custom track files
+char *phyloPlaceSamples(struct lineFile *lf, char *db, char *defaultProtobuf,
+                        boolean doMeasureTiming, int subtreeSize, int fontHeight);
+/* Given a lineFile that contains either FASTA or VCF, prepare VCF for usher; if that goes well
+ * then run usher (tree presumably from defaultProtobuf), report results, make custom track files
  * and return the top-level custom track file; otherwise return NULL. */
 
 #endif //_PHYLO_PLACE_H_