08ec019f344fc52bd83ea3a98bedccd0048d732c angie Mon May 24 00:04:08 2021 -0700 Let the user upload a file containing names or IDs of sequences already in the selected tree; run matUtils extract to get subtrees that include those sequences. protobufs.tab gets a new optional column to specify a file that maps alias to tree name/ID (e.g. for mapping EPI_ISL to public names/IDs). diff --git src/hg/hgPhyloPlace/phyloPlace.h src/hg/hgPhyloPlace/phyloPlace.h index 722441d..b2a803c 100644 --- src/hg/hgPhyloPlace/phyloPlace.h +++ src/hg/hgPhyloPlace/phyloPlace.h @@ -20,30 +20,31 @@ #define MAX_SEQ_DETAILS 100 #define MAX_SUBTREE_CTS 10 // For usher's -K option (single subtree): #define SINGLE_SUBTREE_SIZE "1000" #define NEXTSTRAIN_DRAG_DROP_DOC "https://docs.nextstrain.org/projects/auspice/en/latest/advanced-functionality/drag-drop-csv-tsv.html" struct treeChoices /* Phylogenetic tree versions for the user to choose from. */ { char **protobufFiles; // Mutation annotated tree files in protobuf format for UShER char **metadataFiles; // Sample metadata a la GISAID's nextmeta download option char **sources; // GISAID or public char **descriptions; // Menu labels to describe the options to the user + char **aliasFiles; // Two-column files associating IDs/aliases with full tree names int count; // Number of choices (and size of each array) }; struct seqInfo /* User sequences, alignments and statistics */ { struct seqInfo *next; struct dnaSeq *seq; // Uploaded sequence struct psl *psl; // Alignment to reference (if FASTA uploaded) struct singleNucChange *sncList; // SNVs in seq struct singleNucChange *maskedSncList; // SNVs that were masked (not used for placement) struct slRef *maskedReasonsList; // Reason (from Problematic Sites) for masking each SNV uint nCountStart; // #Ns at beginning of seq uint nCountMiddle; // #Ns not at beginning or end of seq uint nCountEnd; // #Ns at end of seq @@ -148,30 +149,37 @@ boolean *informativeBases, struct slName **maskSites, struct slName 
**retSampleIds, struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs, struct slPair **retFailedPsls, int *pStartTime); /* Read in FASTA from lf and make sure each item has a reasonable size and not too high * percentage of N's. Align to reference, extract SNVs from alignment, and save as VCF * with sample genotype columns. */ struct usherResults *runUsher(char *usherPath, char *usherAssignmentsPath, char *vcfFile, int subtreeSize, struct slName *userSampleIds, struct hash *condensedNodes, int *pStartTime); /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and * subtrees to trash files, return list of slRef to struct tempName for the trash files * and parse other results out of stderr output. */ +struct usherResults *runMatUtilsExtractSubtrees(char *matUtilsPath, char *protobufPath, + int subtreeSize, struct slName *sampleIds, + struct hash *condensedNodes, int *pStartTime); +/* Open a pipe from Yatish Turakhia and Jakob McBroome's matUtils extract to extract subtrees + * containing sampleIds, save resulting subtrees to trash files, return subtree results. + * Caller must ensure that sampleIds are names of leaves in the protobuf tree. */ + struct slPair *getAaMutations(struct singleNucChange *sncList, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin); /* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. */ struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome); /* If config.ra has a source of gene annotations, then return the gene list. */ void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin, struct hash *sampleMetadata, struct hash *sampleUrls, char *jsonFile, char *source); /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). 
*/ struct tempName *writeCustomTracks(struct tempName *vcfTn, struct usherResults *ur, struct slName *sampleIds, struct phyloTree *bigTree, @@ -191,21 +199,25 @@ char *phyloPlaceDbSettingPath(char *db, char *settingName); /* Return path to a file named by a setting from hgPhyloPlaceData/<db>/config.ra, * or NULL if not found. (Prepend hgPhyloPlaceData/<db>/ to a relative path) */ struct treeChoices *loadTreeChoices(char *db); /* If <db>/config.ra specifies a treeChoices file, load it up, else return NULL. */ void reportTiming(int *pStartTime, char *message); /* Print out a report to stderr of how much time something took. */ boolean hgPhyloPlaceEnabled(); /* Return TRUE if hgPhyloPlace is enabled in hg.conf and db wuhCor1 exists. */ char *phyloPlaceSamples(struct lineFile *lf, char *db, char *defaultProtobuf, - boolean doMeasureTiming, int subtreeSize, int fontHeight); -/* Given a lineFile that contains either FASTA or VCF, prepare VCF for usher; - * if that goes well then run usher, report results, make custom track files - * and return the top-level custom track file; otherwise return NULL. */ + boolean doMeasureTiming, int subtreeSize, int fontHeight, + boolean *retSuccess); +/* Given a lineFile that contains either FASTA, VCF, or a list of sequence names/ids: + * If FASTA/VCF, then prepare VCF for usher; if that goes well then run usher, report results, + * make custom track files and return the top-level custom track file. + * If list of seq names/ids, then attempt to find their full names in the protobuf, run matUtils + * to make subtrees, show subtree results, and return NULL. Set retSuccess to TRUE if we were + * able to get at least some results for the user's input. */ #endif //_PHYLO_PLACE_H_