7c0d73b5b9eb8303ee04ba236d31a6703552581c angie Wed Dec 2 16:32:15 2020 -0800 hgPhyloPlace metadata: add Nextstrain_clade and genbank_accession columns, refactor in preparation for public data. * Move the metadata-file-reading code into phyloPlace.c so the metadata file can be used for looking up lineage (no need for separate idToLineage file). * Support looking up metadata by GenBank ID in addition to EPI_ISL ID, in preparation for using protobuf and metadata from public sequences instead of GISAID. diff --git src/hg/hgPhyloPlace/phyloPlace.h src/hg/hgPhyloPlace/phyloPlace.h index 3a9ba84..9408a7f 100644 --- src/hg/hgPhyloPlace/phyloPlace.h +++ src/hg/hgPhyloPlace/phyloPlace.h @@ -73,62 +73,80 @@ struct tempName *subtreeTn; // Newick file from usher (may have condensed nodes) struct phyloTree *subtree; // Parsed subtree (#*** with annotated muts? shoudl we?_ struct hash *subtreeIdToIx; // Map of subtree nodes to VCF output sample order struct slName *subtreeUserSampleIds; // List of user-uploaded samples in this subtree struct slName *subtreeNameList; // List of leaf names with nicer names for cond. nodes }; struct usherResults /* Tree+samples download file, sample placements, and subtrees parsed from usher output. */ { struct tempName *bigTreePlusTn; // Newick file: original tree plus user's samples struct hash *samplePlacements; // Info about each sample's placement in the tree struct subtreeInfo *subtreeInfoList; // For each subtree: tree, file, node info etc. }; +struct sampleMetadata +/* Information about a virus sample. 
*/ + { + char *strain; // Strain name, usually of the form Country/ArbitraryId/YYYY-MM-DD + char *epiId; // GISAID EPI_ISL_[0-9]+ ID + char *gbAcc; // GenBank accession + char *date; // Sample collection date + char *author; // Author(s) to credit + char *nClade; // Nextstrain year-letter clade + char *gClade; // GISAID amino acid change clade + char *lineage; // Pangolin lineage + char *country; // Country in which sample was collected + char *division; // Administrative division in which sample was collected (country or state) + char *location; // Location in which sample was collected (city) + char *countryExp; // Country in which host was exposed to virus + char *divExp; // Administrative division in which host was exposed to virus + char *origLab; // Originating lab + char *subLab; // Submitting lab + char *region; // Continent on which sample was collected + }; + struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome, boolean *informativeBases, struct slName **maskSites, struct slName **retSampleIds, struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs, struct slPair **retFailedPsls, int *pStartTime); /* Read in FASTA from lf and make sure each item has a reasonable size and not too high * percentage of N's. Align to reference, extract SNVs from alignment, and save as VCF * with sample genotype columns. */ struct usherResults *runUsher(char *usherPath, char *usherAssignmentsPath, char *vcfFile, int subtreeSize, struct slName *userSampleIds, struct hash *condensedNodes, int *pStartTime); /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and * subtrees to trash files, return list of slRef to struct tempName for the trash files * and parse other results out of stderr output. 
*/ void treeToAuspiceJson(struct subtreeInfo *sti, char *db, struct dnaSeq *ref, - char *bigGenePredFile, char *metadataFile, char *jsonFile); + char *bigGenePredFile, struct hash *sampleMetadata, char *jsonFile); /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */ struct tempName *writeCustomTracks(struct tempName *vcfTn, struct usherResults *ur, struct slName *sampleIds, struct phyloTree *bigTree, int fontHeight, int *pStartTime); /* Write one custom track per subtree, and one custom track with just the user's uploaded samples. */ -char *epiIdFromSampleName(char *sampleId); -/* If an EPI_ISL_# ID is present somewhere in sampleId, extract and return it, otherwise NULL. */ - -char *lineageForSample(char *db, char *sampleId); -/* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */ +struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId); +/* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */ struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds); /* Prune all descendants of node that have no leaf descendants in sampleIds. */ char *phyloPlaceDbSetting(char *db, char *settingName); /* Return a setting from hgPhyloPlaceData/<db>/config.ra or NULL if not found. */ char *phyloPlaceDbSettingPath(char *db, char *settingName); /* Return path to a file named by a setting from hgPhyloPlaceData/<db>/config.ra, * or NULL if not found. (Append hgPhyloPlaceData/<db>/ to the beginning of relative path) */ void reportTiming(int *pStartTime, char *message); /* Print out a report to stderr of how much time something took. */ boolean hgPhyloPlaceEnabled();