src/hg/hgPhyloPlace/phyloPlace.h 0fffa3c31de4845a9bd3f06c0992f971e4d8a7a3

0fffa3c31de4845a9bd3f06c0992f971e4d8a7a3
angie
  Fri Oct 28 15:08:06 2022 -0700
Performance improvements for trees with millions of sequences:
* Use @yceh's usher-sampled-server if configured; it preloads protobufs and can start placing sequences immediately using usher-sampled, a faster version of usher
* Use usher-sampled instead of usher if server is not configured but usher-sampled is available
* Load sample metadata file in a pthread while usher(-sampled(-server)) or matUtils is running
* Skip checking for sample name clashes in uploaded fasta when using usher-sampled(-server)'s new --no-ignore-prefix option (but look for the prefix when parsing results)
* Avoid parsing the protobuf and traversing the big tree unless absolutely necessary
** Subtrees from usher/matUtils have not included condensed nodes in a long time; remove lots of condensedNodes/summarization code from phyloPlace.c, runUsher.c, writeCustomTracks.c
** Use subtrees instead of big tree when possible (in findNearestNeighbor, treeToBaseAlleles, uploadedSamplesTree)
** Skip the informativeBases stuff that inhibits masking of sites from Problematic Sites set when the tree was built with an earlier version -- that pretty much never applies anymore now that only daily-updated trees are offered, not a range from old to new.
** Allow config.ra to specify a flat file of sample names (needed for searching user's uploaded names/IDs before calling matUtils) instead of getting names from the big tree

diff --git src/hg/hgPhyloPlace/phyloPlace.h src/hg/hgPhyloPlace/phyloPlace.h
index cd0fb2a..118a331 100644
--- src/hg/hgPhyloPlace/phyloPlace.h
+++ src/hg/hgPhyloPlace/phyloPlace.h
@@ -12,47 +12,50 @@
 #include "phyloTree.h"
 #include "seqWindow.h"
 #include "trashDir.h"
 
 #define PHYLOPLACE_DATA_DIR "hgPhyloPlaceData"
 
 // Allow users to upload a lot of sequences, but put limits on how much detail we'll show and
 // how many custom tracks we'll create.
 #define MAX_SUBTREE_BUTTONS 5
 #define MAX_SEQ_DETAILS 100
 #define MAX_SUBTREE_CTS 10
 
 // For usher's -K option (single subtree):
 #define SINGLE_SUBTREE_SIZE "2000"
 #define USHER_NUM_THREADS "16"
+#define USHER_SERVER_CHILD_TIMEOUT "600"
+#define USHER_DEDUP_PREFIX "uploaded_"
 
 #define NEXTSTRAIN_DRAG_DROP_DOC "https://docs.nextstrain.org/projects/auspice/en/latest/advanced-functionality/drag-drop-csv-tsv.html"
 #define OUTBREAK_INFO_URLBASE "https://outbreak.info/situation-reports?pango="
 #define PANGO_DESIGNATION_ISSUE_URLBASE "https://github.com/cov-lineages/pango-designation/issues/"
 
 // usher now preprends "node_" to node numbers when parsing protobuf, although they're still stored
 // numeric in the protobuf.
 #define USHER_NODE_PREFIX "node_"
 
 struct treeChoices
 /* Phylogenetic tree versions for the user to choose from. */
 {
     char **protobufFiles;      // Mutation annotated tree files in protobuf format for UShER
     char **metadataFiles;      // Sample metadata a la GISAID's nextmeta download option
     char **sources;            // GISAID or public
     char **descriptions;       // Menu labels to describe the options to the user
     char **aliasFiles;         // Two-column files associating IDs/aliases with full tree names
+    char **sampleNameFiles;    // One-column files with full tree names
     int count;                 // Number of choices (and size of each array)
 };
 
 struct seqInfo
 /* User sequences, alignments and statistics */
 {
     struct seqInfo *next;
     struct dnaSeq *seq;                     // Uploaded sequence
     struct psl *psl;                        // Alignment to reference (if FASTA uploaded)
     struct singleNucChange *sncList;        // SNVs in seq
     struct singleNucChange *maskedSncList;  // SNVs that were masked (not used for placement)
     struct slRef *maskedReasonsList;        // Reason (from Problematic Sites) for masking each SNV
     uint nCountStart;                       // #Ns at beginning of seq
     uint nCountMiddle;                      // #Ns not at beginning or end of seq
     uint nCountEnd;                         // #Ns at end of seq
@@ -141,69 +144,89 @@
     char *lineageUsher; // Pango lineage according to annotated tree
     char *authors;      // Sequence submitters/authors
     char *pubs;         // PubMed ID numbers of publications associated with sequences
     char *nLineage;     // Nextstrain letter-dot-numbers lineage assigned by nextclade
     };
 
 struct geneInfo
 /* Information sufficient to determine whether a genome change causes a coding change. */
     {
     struct geneInfo *next;
     struct psl *psl;        // Alignment of transcript to genome
     struct dnaSeq *txSeq;   // Transcript sequence
     };
 
 struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome,
-                              boolean *informativeBases, struct slName **maskSites,
-                              struct hash *treeNames,
+                              struct slName **maskSites, struct hash *treeNames,
                               struct slName **retSampleIds, struct seqInfo **retSeqInfo,
                               struct slPair **retFailedSeqs, struct slPair **retFailedPsls,
                               int *pStartTime);
 /* Read in FASTA from lf and make sure each item has a reasonable size and not too high
  * percentage of N's.  Align to reference, extract SNVs from alignment, and save as VCF
  * with sample genotype columns. */
 
-struct usherResults *runUsher(char *usherPath, char *usherAssignmentsPath, char *vcfFile,
-                              int subtreeSize, struct slName *userSampleIds,
-                              struct hash *condensedNodes, int *pStartTime);
+struct usherResults *runUsher(char *db, char *usherPath, char *usherAssignmentsPath, char *vcfFile,
+                              int subtreeSize, struct slName **pUserSampleIds,
+                              struct treeChoices *treeChoices, int *pStartTime);
 /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and
  * subtrees to trash files, return list of slRef to struct tempName for the trash files
- * and parse other results out of stderr output. */
+ * and parse other results out of stderr output.  The usher-sampled version of usher might
+ * modify userSampleIds, adding a prefix if a sample with the same name is already in the tree. */
 
 struct usherResults *runMatUtilsExtractSubtrees(char *matUtilsPath, char *protobufPath,
                                                 int subtreeSize, struct slName *sampleIds,
-                                                struct hash *condensedNodes, int *pStartTime);
+                                                int *pStartTime);
 /* Open a pipe from Yatish Turakhia and Jakob McBroome's matUtils extract to extract subtrees
  * containing sampleIds, save resulting subtrees to trash files, return subtree results.
  * Caller must ensure that sampleIds are names of leaves in the protobuf tree. */
 
+boolean serverIsConfigured(char *db);
+/* Return TRUE if all necessary configuration settings are in place to run usher-sampled-server. */
+
+boolean serverIsRunning(char *db, FILE *errFile);
+/* Return TRUE if we can find a PID for server and that PID looks alive according to /proc. */
+
+boolean startServer(char *db, struct treeChoices *treeChoices, FILE *errFile);
+/* Start up an usher-sampled-server process to run in the background. */
+
+void serverReloadProtobufs(char *db, struct treeChoices *treeChoices);
+/* Send a reload command and list of protobufs for db to usher server. */
+
+void serverStop(char *db);
+/* Send stop command to usher server. */
+
+void serverSetThreadCount(char *db, int val);
+/* Send thread command and value to usher server. */
+
+void serverSetTimeout(char *db, int val);
+/* Send timeout command and value (in seconds) to usher server. */
+
 struct slPair *getAaMutations(struct singleNucChange *sncList, struct geneInfo *geneInfoList,
                               struct seqWindow *gSeqWin);
 /* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. */
 
 struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome);
 /* If config.ra has a source of gene annotations, then return the gene list. */
 
 void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList,
                        struct seqWindow *gSeqWin, struct hash *sampleMetadata,
                        struct hash *sampleUrls, char *jsonFile, char *source);
 /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format
  * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */
 
 struct tempName *writeCustomTracks(char *db, struct tempName *vcfTn, struct usherResults *ur,
-                                   struct slName *sampleIds, struct mutationAnnotatedTree *bigTree,
-                                   char *source, int fontHeight,
+                                   struct slName *sampleIds, char *source, int fontHeight,
                                    struct phyloTree **retSampleTree, int *pStartTime);
 /* Write one custom track per subtree, and one custom track with just the user's uploaded samples. */
 
 
 struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId);
 /* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */
 
 struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds);
 /* Prune all descendants of node that have no leaf descendants in sampleIds. */
 
 struct slName *phyloPlaceDbList(struct cart *cart);
 /* Each subdirectory of PHYLOPLACE_DATA_DIR that contains a config.ra file is a supported db
  * or track hub name (without the hub_number_ prefix).  Return a list of them, or NULL if none
  * are found. */