afafa0301ea4b14fbe1fbd5aa379c5351ecd640d angie Tue Aug 2 19:41:44 2022 -0700 Add support for non-wuhCor1 genomes (e.g. monkeypox GenArk hub). * Search in hgPhyloPlaceData for config.ra files, taking assembly name (minus hub prefix) from directory name. * Add a menu input to the main page for switching between supported genomes if there are more than one. * Replace hardcoded values or global vars with dnaSeq attributes, assembly metadata queries or new config.ra settings. * Separate out SARS-CoV-2-specific help text like GISAID/CNCB descriptions. * Support metadata columns for GenBank-specific stuff & Nextstrain lineages (for MPXV). * also a little refactoring in runUsher in preparation for supporting usher server mode: parse new placement info file so we don't have to parse that data from usher stderr output. TODO: update Nextstrain/Auspice JSON output to use appropriate metadata columns and support monkeypox genes. diff --git src/hg/hgPhyloPlace/phyloPlace.h src/hg/hgPhyloPlace/phyloPlace.h index 8d0d10d..399edee 100644 --- src/hg/hgPhyloPlace/phyloPlace.h +++ src/hg/hgPhyloPlace/phyloPlace.h @@ -10,33 +10,35 @@ #include "parsimonyProto.h" #include "phyloTree.h" #include "seqWindow.h" #include "trashDir.h" #define PHYLOPLACE_DATA_DIR "hgPhyloPlaceData" // Allow users to upload a lot of sequences, but put limits on how much detail we'll show and // how many custom tracks we'll create. 
#define MAX_SUBTREE_BUTTONS 5 #define MAX_SEQ_DETAILS 100 #define MAX_SUBTREE_CTS 10 // For usher's -K option (single subtree): #define SINGLE_SUBTREE_SIZE "2000" +#define USHER_NUM_THREADS "16" #define NEXTSTRAIN_DRAG_DROP_DOC "https://docs.nextstrain.org/projects/auspice/en/latest/advanced-functionality/drag-drop-csv-tsv.html" #define OUTBREAK_INFO_URLBASE "https://outbreak.info/situation-reports?pango=" +#define PANGO_DESIGNATION_ISSUE_URLBASE "https://github.com/cov-lineages/pango-designation/issues/" // usher now preprends "node_" to node numbers when parsing protobuf, although they're still stored // numeric in the protobuf. #define USHER_NODE_PREFIX "node_" struct treeChoices /* Phylogenetic tree versions for the user to choose from. */ { char **protobufFiles; // Mutation annotated tree files in protobuf format for UShER char **metadataFiles; // Sample metadata a la GISAID's nextmeta download option char **sources; // GISAID or public char **descriptions; // Menu labels to describe the options to the user char **aliasFiles; // Two-column files associating IDs/aliases with full tree names int count; // Number of choices (and size of each array) }; @@ -124,30 +126,33 @@ char *date; // Sample collection date char *author; // Author(s) to credit char *nClade; // Nextstrain year-letter clade assigned by nextclade char *gClade; // GISAID amino acid change clade char *lineage; // Pango lineage assigned by pangolin char *country; // Country in which sample was collected char *division; // Administrative division in which sample was collected (country or state) char *location; // Location in which sample was collected (city) char *countryExp; // Country in which host was exposed to virus char *divExp; // Administrative division in which host was exposed to virus char *origLab; // Originating lab char *subLab; // Submitting lab char *region; // Continent on which sample was collected char *nCladeUsher; // Nextstrain clade according to annotated tree char 
*lineageUsher; // Pango lineage according to annotated tree + char *authors; // Sequence submitters/authors + char *pubs; // PubMed ID numbers of publications associated with sequences + char *nLineage; // Nextstrain letter-dot-numbers lineage assigned by nextclade }; struct geneInfo /* Information sufficient to determine whether a genome change causes a coding change. */ { struct geneInfo *next; struct psl *psl; // Alignment of transcript to genome struct dnaSeq *txSeq; // Transcript sequence }; struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome, boolean *informativeBases, struct slName **maskSites, struct hash *treeNames, struct slName **retSampleIds, struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs, struct slPair **retFailedPsls, @@ -171,43 +176,48 @@ * Caller must ensure that sampleIds are names of leaves in the protobuf tree. */ struct slPair *getAaMutations(struct singleNucChange *sncList, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin); /* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. */ struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome); /* If config.ra has a source of gene annotations, then return the gene list. */ void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin, struct hash *sampleMetadata, struct hash *sampleUrls, char *jsonFile, char *source); /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). 
*/ -struct tempName *writeCustomTracks(struct tempName *vcfTn, struct usherResults *ur, +struct tempName *writeCustomTracks(char *db, struct tempName *vcfTn, struct usherResults *ur, struct slName *sampleIds, struct mutationAnnotatedTree *bigTree, - char *source, int fontHeight, struct phyloTree **retSampleTree, - int *pStartTime); + char *source, int fontHeight, + struct phyloTree **retSampleTree, int *pStartTime); /* Write one custom track per subtree, and one custom track with just the user's uploaded samples. */ struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId); /* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */ struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds); /* Prune all descendants of node that have no leaf descendants in sampleIds. */ +struct slName *phyloPlaceDbList(); +/* Each subdirectory of PHYLOPLACE_DATA_DIR that contains a config.ra file is a supported db + * or track hub name (without the hub_number_ prefix). Return a list of them, or NULL if none + * are found. */ + char *phyloPlaceDbSetting(char *db, char *settingName); /* Return a setting from hgPhyloPlaceData/<db>/config.ra or NULL if not found. */ char *phyloPlaceDbSettingPath(char *db, char *settingName); /* Return path to a file named by a setting from hgPhyloPlaceData/<db>/config.ra, * or NULL if not found. (Append hgPhyloPlaceData/<db>/ to the beginning of relative path) */ struct treeChoices *loadTreeChoices(char *db); /* If <db>/config.ra specifies a treeChoices file, load it up, else return NULL. */ boolean isInternalNodeName(char *nodeName, int minNewNode); /* Return TRUE if nodeName looks like an internal node ID from the protobuf tree, i.e. is numeric * or <USHER_NODE_PREFIX>_<number> and, if minNewNode > 0, number is less than minNewNode. */ void reportTiming(int *pStartTime, char *message);