0fffa3c31de4845a9bd3f06c0992f971e4d8a7a3 angie Fri Oct 28 15:08:06 2022 -0700 Performance improvements for trees with millions of sequences: * Use @yceh's usher-sampled-server if configured; it preloads protobufs and can start placing sequences immediately using usher-sampled, a faster version of usher * Use usher-sampled instead of usher if server is not configured but usher-sampled is available * Load sample metadata file in a pthread while usher(-sampled(-server)) or matUtils is running * Skip checking for sample name clashes in uploaded fasta when using usher-sampled(-server)'s new --no-ignore-prefix option (but look for the prefix when parsing results) * Avoid parsing the protobuf and traversing the big tree unless absolutely necessary ** Subtrees from usher/matUtils have not included condensed nodes in a long time; remove lots of condensedNodes/summarization code from phyloPlace.c, runUsher.c, writeCustomTracks.c ** Use subtrees instead of big tree when possible (in findNearestNeighbor, treeToBaseAlleles, uploadedSamplesTree) ** Skip the informativeBases stuff that inhibits masking of sites from Problematic Sites set when the tree was built with an earlier version -- that pretty much never applies anymore now that only daily-updated trees are offered, not a range from old to new. ** Allow config.ra to specify a flat file of sample names (needed for searching user's uploaded names/IDs before calling matUtils) instead of getting names from the big tree diff --git src/hg/hgPhyloPlace/phyloPlace.h src/hg/hgPhyloPlace/phyloPlace.h index cd0fb2a..118a331 100644 --- src/hg/hgPhyloPlace/phyloPlace.h +++ src/hg/hgPhyloPlace/phyloPlace.h @@ -12,47 +12,50 @@ #include "phyloTree.h" #include "seqWindow.h" #include "trashDir.h" #define PHYLOPLACE_DATA_DIR "hgPhyloPlaceData" // Allow users to upload a lot of sequences, but put limits on how much detail we'll show and // how many custom tracks we'll create. 
#define MAX_SUBTREE_BUTTONS 5 #define MAX_SEQ_DETAILS 100 #define MAX_SUBTREE_CTS 10 // For usher's -K option (single subtree): #define SINGLE_SUBTREE_SIZE "2000" #define USHER_NUM_THREADS "16" +#define USHER_SERVER_CHILD_TIMEOUT "600" +#define USHER_DEDUP_PREFIX "uploaded_" #define NEXTSTRAIN_DRAG_DROP_DOC "https://docs.nextstrain.org/projects/auspice/en/latest/advanced-functionality/drag-drop-csv-tsv.html" #define OUTBREAK_INFO_URLBASE "https://outbreak.info/situation-reports?pango=" #define PANGO_DESIGNATION_ISSUE_URLBASE "https://github.com/cov-lineages/pango-designation/issues/" // usher now prepends "node_" to node numbers when parsing protobuf, although they're still stored // numeric in the protobuf. #define USHER_NODE_PREFIX "node_" struct treeChoices /* Phylogenetic tree versions for the user to choose from. */ { char **protobufFiles; // Mutation annotated tree files in protobuf format for UShER char **metadataFiles; // Sample metadata a la GISAID's nextmeta download option char **sources; // GISAID or public char **descriptions; // Menu labels to describe the options to the user char **aliasFiles; // Two-column files associating IDs/aliases with full tree names + char **sampleNameFiles; // One-column files with full tree names int count; // Number of choices (and size of each array) }; struct seqInfo /* User sequences, alignments and statistics */ { struct seqInfo *next; struct dnaSeq *seq; // Uploaded sequence struct psl *psl; // Alignment to reference (if FASTA uploaded) struct singleNucChange *sncList; // SNVs in seq struct singleNucChange *maskedSncList; // SNVs that were masked (not used for placement) struct slRef *maskedReasonsList; // Reason (from Problematic Sites) for masking each SNV uint nCountStart; // #Ns at beginning of seq uint nCountMiddle; // #Ns not at beginning or end of seq uint nCountEnd; // #Ns at end of seq @@ -141,69 +144,89 @@ char *lineageUsher; // Pango lineage according to annotated tree char *authors; // Sequence 
submitters/authors char *pubs; // PubMed ID numbers of publications associated with sequences char *nLineage; // Nextstrain letter-dot-numbers lineage assigned by nextclade }; struct geneInfo /* Information sufficient to determine whether a genome change causes a coding change. */ { struct geneInfo *next; struct psl *psl; // Alignment of transcript to genome struct dnaSeq *txSeq; // Transcript sequence }; struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome, - boolean *informativeBases, struct slName **maskSites, - struct hash *treeNames, + struct slName **maskSites, struct hash *treeNames, struct slName **retSampleIds, struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs, struct slPair **retFailedPsls, int *pStartTime); /* Read in FASTA from lf and make sure each item has a reasonable size and not too high * percentage of N's. Align to reference, extract SNVs from alignment, and save as VCF * with sample genotype columns. */ -struct usherResults *runUsher(char *usherPath, char *usherAssignmentsPath, char *vcfFile, - int subtreeSize, struct slName *userSampleIds, - struct hash *condensedNodes, int *pStartTime); +struct usherResults *runUsher(char *db, char *usherPath, char *usherAssignmentsPath, char *vcfFile, + int subtreeSize, struct slName **pUserSampleIds, + struct treeChoices *treeChoices, int *pStartTime); /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and * subtrees to trash files, return list of slRef to struct tempName for the trash files - * and parse other results out of stderr output. */ + * and parse other results out of stderr output. The usher-sampled version of usher might + * modify userSampleIds, adding a prefix if a sample with the same name is already in the tree. 
*/ struct usherResults *runMatUtilsExtractSubtrees(char *matUtilsPath, char *protobufPath, int subtreeSize, struct slName *sampleIds, - struct hash *condensedNodes, int *pStartTime); + int *pStartTime); /* Open a pipe from Yatish Turakhia and Jakob McBroome's matUtils extract to extract subtrees * containing sampleIds, save resulting subtrees to trash files, return subtree results. * Caller must ensure that sampleIds are names of leaves in the protobuf tree. */ +boolean serverIsConfigured(char *db); +/* Return TRUE if all necessary configuration settings are in place to run usher-sampled-server. */ + +boolean serverIsRunning(char *db, FILE *errFile); +/* Return TRUE if we can find a PID for server and that PID looks alive according to /proc. */ + +boolean startServer(char *db, struct treeChoices *treeChoices, FILE *errFile); +/* Start up an usher-sampled-server process to run in the background. */ + +void serverReloadProtobufs(char *db, struct treeChoices *treeChoices); +/* Send a reload command and list of protobufs for db to usher server. */ + +void serverStop(char *db); +/* Send stop command to usher server. */ + +void serverSetThreadCount(char *db, int val); +/* Send thread command and value to usher server. */ + +void serverSetTimeout(char *db, int val); +/* Send timeout command and value (in seconds) to usher server. */ + struct slPair *getAaMutations(struct singleNucChange *sncList, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin); /* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. */ struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome); /* If config.ra has a source of gene annotations, then return the gene list. 
*/ void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin, struct hash *sampleMetadata, struct hash *sampleUrls, char *jsonFile, char *source); /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */ struct tempName *writeCustomTracks(char *db, struct tempName *vcfTn, struct usherResults *ur, - struct slName *sampleIds, struct mutationAnnotatedTree *bigTree, - char *source, int fontHeight, + struct slName *sampleIds, char *source, int fontHeight, struct phyloTree **retSampleTree, int *pStartTime); /* Write one custom track per subtree, and one custom track with just the user's uploaded samples. */ struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId); /* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */ struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds); /* Prune all descendants of node that have no leaf descendants in sampleIds. */ struct slName *phyloPlaceDbList(struct cart *cart); /* Each subdirectory of PHYLOPLACE_DATA_DIR that contains a config.ra file is a supported db * or track hub name (without the hub_number_ prefix). Return a list of them, or NULL if none * are found. */