08ec019f344fc52bd83ea3a98bedccd0048d732c
angie
  Mon May 24 00:04:08 2021 -0700
Let the user upload a file containing names or IDs of sequences already in the selected tree; run matUtils extract to get subtrees that include those sequences.  protobufs.tab gets a new optional column to specify a file that maps alias to tree name/ID (e.g. for mapping EPI_ISL to public names/IDs).

diff --git src/hg/hgPhyloPlace/phyloPlace.h src/hg/hgPhyloPlace/phyloPlace.h
index 722441d..b2a803c 100644
--- src/hg/hgPhyloPlace/phyloPlace.h
+++ src/hg/hgPhyloPlace/phyloPlace.h
@@ -20,30 +20,31 @@
 #define MAX_SEQ_DETAILS 100
 #define MAX_SUBTREE_CTS 10
 
 // For usher's -K option (single subtree):
 #define SINGLE_SUBTREE_SIZE "1000"
 
 #define NEXTSTRAIN_DRAG_DROP_DOC "https://docs.nextstrain.org/projects/auspice/en/latest/advanced-functionality/drag-drop-csv-tsv.html"
 
 struct treeChoices
 /* Phylogenetic tree versions for the user to choose from. */
 {
     char **protobufFiles;      // Mutation annotated tree files in protobuf format for UShER
     char **metadataFiles;      // Sample metadata a la GISAID's nextmeta download option
     char **sources;            // GISAID or public
     char **descriptions;       // Menu labels to describe the options to the user
+    char **aliasFiles;         // Two-column files associating IDs/aliases with full tree names
     int count;                 // Number of choices (and size of each array)
 };
 
 struct seqInfo
 /* User sequences, alignments and statistics */
 {
     struct seqInfo *next;
     struct dnaSeq *seq;                     // Uploaded sequence
     struct psl *psl;                        // Alignment to reference (if FASTA uploaded)
     struct singleNucChange *sncList;        // SNVs in seq
     struct singleNucChange *maskedSncList;  // SNVs that were masked (not used for placement)
     struct slRef *maskedReasonsList;        // Reason (from Problematic Sites) for masking each SNV
     uint nCountStart;                       // #Ns at beginning of seq
     uint nCountMiddle;                      // #Ns not at beginning or end of seq
     uint nCountEnd;                         // #Ns at end of seq
@@ -148,30 +149,37 @@
                               boolean *informativeBases, struct slName **maskSites,
                               struct slName **retSampleIds, struct seqInfo **retSeqInfo,
                               struct slPair **retFailedSeqs, struct slPair **retFailedPsls,
                               int *pStartTime);
 /* Read in FASTA from lf and make sure each item has a reasonable size and not too high
  * percentage of N's.  Align to reference, extract SNVs from alignment, and save as VCF
  * with sample genotype columns. */
 
 struct usherResults *runUsher(char *usherPath, char *usherAssignmentsPath, char *vcfFile,
                               int subtreeSize, struct slName *userSampleIds,
                               struct hash *condensedNodes, int *pStartTime);
 /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and
  * subtrees to trash files, return list of slRef to struct tempName for the trash files
  * and parse other results out of stderr output. */
 
+struct usherResults *runMatUtilsExtractSubtrees(char *matUtilsPath, char *protobufPath,
+                                                int subtreeSize, struct slName *sampleIds,
+                                                struct hash *condensedNodes, int *pStartTime);
+/* Open a pipe from Yatish Turakhia and Jakob McBroome's matUtils extract to extract subtrees
+ * containing sampleIds, save resulting subtrees to trash files, return subtree results.
+ * Caller must ensure that sampleIds are names of leaves in the protobuf tree. */
+
 struct slPair *getAaMutations(struct singleNucChange *sncList, struct geneInfo *geneInfoList,
                               struct seqWindow *gSeqWin);
 /* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. */
 
 struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome);
 /* If config.ra has a source of gene annotations, then return the gene list. */
 
 void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList,
                        struct seqWindow *gSeqWin, struct hash *sampleMetadata,
                        struct hash *sampleUrls, char *jsonFile, char *source);
 /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format
  * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */
 
 struct tempName *writeCustomTracks(struct tempName *vcfTn, struct usherResults *ur,
                                    struct slName *sampleIds, struct phyloTree *bigTree,
@@ -191,21 +199,25 @@
 
 char *phyloPlaceDbSettingPath(char *db, char *settingName);
 /* Return path to a file named by a setting from hgPhyloPlaceData/<db>/config.ra,
  * or NULL if not found.  (Append hgPhyloPlaceData/<db>/ to the beginning of relative path) */
 
 struct treeChoices *loadTreeChoices(char *db);
 /* If <db>/config.ra specifies a treeChoices file, load it up, else return NULL. */
 
 void reportTiming(int *pStartTime, char *message);
 /* Print out a report to stderr of how much time something took. */
 
 boolean hgPhyloPlaceEnabled();
 /* Return TRUE if hgPhyloPlace is enabled in hg.conf and db wuhCor1 exists. */
 
 char *phyloPlaceSamples(struct lineFile *lf, char *db, char *defaultProtobuf,
-                        boolean doMeasureTiming, int subtreeSize, int fontHeight);
-/* Given a lineFile that contains either FASTA or VCF, prepare VCF for usher;
- * if that goes well then run usher, report results, make custom track files
- * and return the top-level custom track file; otherwise return NULL. */
+                        boolean doMeasureTiming, int subtreeSize, int fontHeight,
+                        boolean *retSuccess);
+/* Given a lineFile that contains FASTA, VCF, or a list of sequence names/IDs:
+ * If FASTA/VCF, then prepare VCF for usher; if that goes well then run usher, report results,
+ * make custom track files and return the top-level custom track file.
+ * If list of seq names/IDs, then attempt to find their full names in the protobuf, run matUtils
+ * to make subtrees, show subtree results, and return NULL.  Set *retSuccess to TRUE if we were
+ * able to get at least some results for the user's input. */
 
 #endif //_PHYLO_PLACE_H_