7c0d73b5b9eb8303ee04ba236d31a6703552581c
angie
  Wed Dec 2 16:32:15 2020 -0800
hgPhyloPlace metadata: add Nextstrain_clade and genbank_accession columns, refactor in prep for public data.
* Move the metadata-file-reading code into phyloPlace.c so the metadata file can be used for looking up lineage (no need for a separate idToLineage file).
* Support looking up metadata by GenBank ID in addition to EPI_ISL ID, in preparation for using protobuf and metadata from public sequences instead of GISAID.

diff --git src/hg/hgPhyloPlace/phyloPlace.h src/hg/hgPhyloPlace/phyloPlace.h
index 3a9ba84..9408a7f 100644
--- src/hg/hgPhyloPlace/phyloPlace.h
+++ src/hg/hgPhyloPlace/phyloPlace.h
@@ -73,62 +73,80 @@
     struct tempName *subtreeTn;           // Newick file from usher (may have condensed nodes)
     struct phyloTree *subtree;            // Parsed subtree (#*** with annotated muts?  should we?)
     struct hash *subtreeIdToIx;           // Map of subtree nodes to VCF output sample order
     struct slName *subtreeUserSampleIds;  // List of user-uploaded samples in this subtree
     struct slName *subtreeNameList;       // List of leaf names with nicer names for cond. nodes
     };
 
 struct usherResults
 /* Tree+samples download file, sample placements, and subtrees parsed from usher output. */
     {
     struct tempName *bigTreePlusTn;      // Newick file: original tree plus user's samples
     struct hash *samplePlacements;       // Info about each sample's placement in the tree
     struct subtreeInfo *subtreeInfoList; // For each subtree: tree, file, node info etc.
     };
 
+struct sampleMetadata
+/* Metadata about one virus sample (all fields are strings); returned by metadataForSample. */
+    {
+    char *strain;       // Strain name, usually of the form Country/ArbitraryId/YYYY-MM-DD
+    char *epiId;        // GISAID EPI_ISL_[0-9]+ ID
+    char *gbAcc;        // GenBank accession
+    char *date;         // Sample collection date
+    char *author;       // Author(s) to credit
+    char *nClade;       // Nextstrain year-letter clade
+    char *gClade;       // GISAID amino acid change clade
+    char *lineage;      // Pangolin lineage
+    char *country;      // Country in which sample was collected
+    char *division;     // Administrative division in which sample was collected (country or state)
+    char *location;     // Location in which sample was collected (city)
+    char *countryExp;   // Country in which host was exposed to virus
+    char *divExp;       // Administrative division in which host was exposed to virus
+    char *origLab;      // Originating lab
+    char *subLab;       // Submitting lab
+    char *region;       // Continent on which sample was collected
+    };
+
 struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome,
                               boolean *informativeBases, struct slName **maskSites,
                               struct slName **retSampleIds, struct seqInfo **retSeqInfo,
                               struct slPair **retFailedSeqs, struct slPair **retFailedPsls,
                               int *pStartTime);
 /* Read in FASTA from lf and make sure each item has a reasonable size and not too high
  * percentage of N's.  Align to reference, extract SNVs from alignment, and save as VCF
  * with sample genotype columns. */
 
 struct usherResults *runUsher(char *usherPath, char *usherAssignmentsPath, char *vcfFile,
                               int subtreeSize, struct slName *userSampleIds,
                               struct hash *condensedNodes, int *pStartTime);
 /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and
  * subtrees to trash files, and return a struct usherResults with the trash file
  * tempNames and other results parsed out of stderr output. */
 
 void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct dnaSeq *ref,
-                       char *bigGenePredFile, char *metadataFile, char *jsonFile);
+                       char *bigGenePredFile, struct hash *sampleMetadata, char *jsonFile);
 /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format
  * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */
 
 struct tempName *writeCustomTracks(struct tempName *vcfTn, struct usherResults *ur,
                                    struct slName *sampleIds, struct phyloTree *bigTree,
                                    int fontHeight, int *pStartTime);
 /* Write one custom track per subtree, and one custom track with just the user's uploaded samples. */
 
 
-char *epiIdFromSampleName(char *sampleId);
-/* If an EPI_ISL_# ID is present somewhere in sampleId, extract and return it, otherwise NULL. */
-
-char *lineageForSample(char *db, char *sampleId);
-/* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */
+struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId);
+/* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */
 
 struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds);
 /* Prune all descendants of node that have no leaf descendants in sampleIds. */
 
 char *phyloPlaceDbSetting(char *db, char *settingName);
 /* Return a setting from hgPhyloPlaceData/<db>/config.ra or NULL if not found. */
 
 char *phyloPlaceDbSettingPath(char *db, char *settingName);
 /* Return path to a file named by a setting from hgPhyloPlaceData/<db>/config.ra,
  * or NULL if not found.  (hgPhyloPlaceData/<db>/ is prepended to a relative path.) */
 
 void reportTiming(int *pStartTime, char *message);
 /* Print out a report to stderr of how much time something took. */
 
 boolean hgPhyloPlaceEnabled();