705420e703b067fbcad43ab67ed3e131552e7ac8 angie Wed Apr 24 14:23:03 2024 -0700 Pathogen drop-down choices can now be groups of references/trees, for example 'Dengue (types 1 - 4)' instead of a separate choice for each type. Instead of config.ra, each group has an organism.ra and subdirectories named after reference accessions that contain reference.ra files. nextclade sort is used to match the user's uploaded sequences against available references for the selected pathogen. SARS-CoV-2, M. tuberculosis and hMPXV still have only one reference and still use config.ra, but RSV, Dengue and Influenza will become groups. Presentation is still kinda rough, just a loop on the original results output. The server commands part needs testing and will not work yet for groups (currently used only for SARS-CoV-2). diff --git src/hg/hgPhyloPlace/phyloPlace.h src/hg/hgPhyloPlace/phyloPlace.h index f0edb7e..d09bfd3 100644 --- src/hg/hgPhyloPlace/phyloPlace.h +++ src/hg/hgPhyloPlace/phyloPlace.h @@ -1,28 +1,29 @@ /* Place SARS-CoV-2 sequences in phylogenetic tree using add_missing_samples program. */ #ifndef _PHYLO_PLACE_H_ #define _PHYLO_PLACE_H_ #include "common.h" #include "cart.h" #include "dnaseq.h" #include "hash.h" #include "linefile.h" #include "parsimonyProto.h" #include "phyloTree.h" #include "seqWindow.h" +#include "trackLayout.h" #include "trashDir.h" #define PHYLOPLACE_DATA_DIR "hgPhyloPlaceData" // Allow users to upload a lot of sequences, but put limits on how much detail we'll show and // how many custom tracks we'll create. 
#define MAX_SUBTREE_SIZE 5000 #define MAX_MICROBETRACE_SUBTREE_SIZE 500 #define MAX_SUBTREE_BUTTONS 5 #define MAX_SEQ_DETAILS 1000 #define MAX_SUBTREE_CTS 10 // For usher's -K option (single subtree): #define SINGLE_SUBTREE_SIZE "2000" #define USHER_NUM_THREADS "16" @@ -133,125 +134,138 @@ // -- shared by all metadata rows, not allocated for each struct char **columnValues; // Metadata column values -- allocated for each struct }; struct geneInfo /* Information sufficient to determine whether a genome change causes a coding change. */ { struct geneInfo *next; struct psl *psl; // Alignment of transcript to genome struct dnaSeq *txSeq; // Transcript sequence struct genbankCds *cds; // CDS (for those few pathogens that have transcript UTRs) int cdsStart; // genePred cdsStart (genome coord, really cds end if - strand) int cdsEnd; // genePred cdsEnd (genome coord, really cds start if - strand) }; -struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome, +struct tempName *vcfFromFasta(struct lineFile *lf, char *org, char *db, struct dnaSeq *refGenome, struct slName **maskSites, struct hash *treeNames, struct slName **retSampleIds, struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs, struct slPair **retFailedPsls, int *pStartTime); /* Read in FASTA from lf and make sure each item has a reasonable size and not too high * percentage of N's. Align to reference, extract SNVs from alignment, and save as VCF * with sample genotype columns. 
*/ -struct usherResults *runUsher(char *db, char *usherPath, char *usherAssignmentsPath, char *vcfFile, +struct usherResults *runUsher(char *org, char *usherPath, char *usherAssignmentsPath, char *vcfFile, int subtreeSize, struct slName **pUserSampleIds, struct treeChoices *treeChoices, int *pStartTime); /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and * subtrees to trash files, return list of slRef to struct tempName for the trash files * and parse other results out of stderr output. The usher-sampled version of usher might * modify userSampleIds, adding a prefix if a sample with the same name is already in the tree. */ -struct usherResults *runMatUtilsExtractSubtrees(char *db, char *matUtilsPath, char *protobufPath, +struct usherResults *runMatUtilsExtractSubtrees(char *org, char *matUtilsPath, char *protobufPath, int subtreeSize, struct slName *sampleIds, struct treeChoices *treeChoices, int *pStartTime); /* Open a pipe from Yatish Turakhia and Jakob McBroome's matUtils extract to extract subtrees * containing sampleIds, save resulting subtrees to trash files, return subtree results. * Caller must ensure that sampleIds are names of leaves in the protobuf tree. */ -boolean serverIsConfigured(char *db); +boolean serverIsConfigured(char *org); /* Return TRUE if all necessary configuration settings are in place to run usher-sampled-server. */ -boolean serverIsRunning(char *db, FILE *errFile); +boolean serverIsRunning(char *org, FILE *errFile); /* Return TRUE if we can find a PID for server and that PID looks alive according to /proc. */ -boolean startServer(char *db, struct treeChoices *treeChoices, FILE *errFile); +boolean startServer(char *org, struct treeChoices *treeChoices, FILE *errFile); /* Start up an usher-sampled-server process to run in the background. */ -void serverReloadProtobufs(char *db, struct treeChoices *treeChoices); -/* Send a reload command and list of protobufs for db to usher server. 
*/ +void serverReloadProtobufs(char *org, struct treeChoices *treeChoices); +/* Send a reload command and list of protobufs for org to usher server. */ -void serverStop(char *db); +void serverStop(char *org); /* Send stop command to usher server. */ -void serverSetThreadCount(char *db, int val); +void serverSetThreadCount(char *org, int val); /* Send thread command and value to usher server. */ -void serverSetTimeout(char *db, int val); +void serverSetTimeout(char *org, int val); /* Send timeout command and value (in seconds) to usher server. */ char *microbeTraceHost(); /* Return the MicrobeTrace hostname from an hg.conf param, or NULL if missing. Do not free result. */ struct slPair *getAaMutations(struct singleNucChange *sncList, struct singleNucChange *ancestorMuts, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin); /* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. */ struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome); /* If config.ra has a source of gene annotations, then return the gene list. */ -void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList, +void treeToAuspiceJson(struct subtreeInfo *sti, char *org, char *db, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin, struct hash *sampleMetadata, struct hash *sampleUrls, struct hash *samplePlacements, char *jsonFile, char *source); /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). 
*/ -struct tempName *writeCustomTracks(char *db, struct tempName *vcfTn, struct usherResults *ur, +struct tempName *writeCustomTracks(char *org, char *ref, char *db, + struct tempName *vcfTn, struct usherResults *ur, struct slName *sampleIds, char *source, int fontHeight, struct phyloTree *sampleTree, int *pStartTime); /* Write one custom track per subtree, and one custom track with just the user's uploaded samples. */ struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId); /* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */ struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds); /* Prune all descendants of node that have no leaf descendants in sampleIds. */ -struct slName *phyloPlaceDbList(struct cart *cart); -/* Each subdirectory of PHYLOPLACE_DATA_DIR that contains a config.ra file is a supported db - * or track hub name (without the hub_number_ prefix). Return a list of them, or NULL if none - * are found. */ +struct slPair *phyloPlaceOrgList(struct cart *cart); +/* Each subdirectory of PHYLOPLACE_DATA_DIR that contains an organism.ra file is a collection of + * reference sequences that uploaded sequences will be matched against using nextclade sort. + * Some of those references might also be dbs or track hub names (without the hub_number_ prefix). + * Each subdirectory of PHYLOPLACE_DATA_DIR that contains a config.ra file contains a single + * reference which might also be a db or track hub name (without the hub_number_ prefix). + * Return a list of {name, label} pairs, SARS-CoV-2 first, combining the two categories. */ -char *phyloPlaceDbSetting(char *db, char *settingName); -/* Return a setting from hgPhyloPlaceData/<db>/config.ra or NULL if not found. */ +char *phyloPlaceOrgSetting(char *org, char *settingName); +/* Return cloned setting value if found in hgPhyloPlaceData/<org>/organism.ra or + * old-style hgPhyloPlaceData/<org>/config.ra, or NULL if not found.
*/ -char *phyloPlaceDbSettingPath(char *db, char *settingName); -/* Return path to a file named by a setting from hgPhyloPlaceData/<db>/config.ra, - * or NULL if not found. (Append hgPhyloPlaceData/<db>/ to the beginning of relative path) */ +char *phyloPlaceOrgSettingPath(char *org, char *settingName); +/* Return cgi-bin-relative path to a file named by a setting for org, or NULL if not found. */ -struct treeChoices *loadTreeChoices(char *db); -/* If <db>/config.ra specifies a treeChoices file, load it up, else return NULL. */ +char *phyloPlaceRefSetting(char *org, char *ref, char *settingName); +/* Return cloned setting value if found in hgPhyloPlaceData/<org>/<ref>/reference.ra or + * old-style hgPhyloPlaceData/<org>/config.ra, or NULL if not found. */ + +char *phyloPlaceRefSettingPath(char *org, char *ref, char *settingName); +/* Return cgi-bin-relative path to a file named by a setting from + * hgPhyloPlaceData/<org>/<ref>/reference.ra or old-style hgPhyloPlaceData/<org>/config.ra, + * or NULL if not found. */ + +struct treeChoices *loadTreeChoices(char *org, char *db); +/* If config specifies a treeChoices file, load it up, else return NULL. */ boolean isInternalNodeName(char *nodeName, int minNewNode); /* Return TRUE if nodeName looks like an internal node ID from the protobuf tree, i.e. is numeric * or <prefix>_<number> and, if minNewNode > 0, number is less than minNewNode. */ void reportTiming(int *pStartTime, char *message); /* Print out a report to stderr of how much time something took. */ boolean hgPhyloPlaceEnabled(); /* Return TRUE if hgPhyloPlace is enabled in hg.conf and db wuhCor1 exists.
*/ -char *phyloPlaceSamples(struct lineFile *lf, char *db, char *refName, char *defaultProtobuf, - boolean doMeasureTiming, int subtreeSize, int fontHeight, - boolean *retSuccess); +boolean phyloPlaceSamples(struct lineFile *lf, char *db, char *refName, char *defaultProtobuf, + boolean doMeasureTiming, int subtreeSize, struct trackLayout *tl, + struct cart *cart); /* Given a lineFile that contains either FASTA, VCF, or a list of sequence names/ids: * If FASTA/VCF, then prepare VCF for usher; if that goes well then run usher, report results, - * make custom track files and return the top-level custom track file. + * make custom track files. * If list of seq names/ids, then attempt to find their full names in the protobuf, run matUtils - * to make subtrees, show subtree results, and return NULL. Set retSuccess to TRUE if we were - * able to get at least some results for the user's input. */ + * to make subtrees, show subtree results. + * Return TRUE if we were able to get at least some results for the user's input. */ #endif //_PHYLO_PLACE_H_