2d05d30ed4df1612d72ba84c812d004de935b122
angie
  Fri May 17 16:08:54 2024 -0700
Add lib module mmHash (memory-mapped hash), util tabToMmHash, and hgPhyloPlace support for using mmHash files instead of tab-separated files for metadata and name lookup. Using mmHash for name lookup saves about 50-55 seconds for SARS-CoV-2 hgPhyloPlace name/ID queries.

diff --git src/hg/hgPhyloPlace/phyloPlace.h src/hg/hgPhyloPlace/phyloPlace.h
index fabc6b0..96f8552 100644
--- src/hg/hgPhyloPlace/phyloPlace.h
+++ src/hg/hgPhyloPlace/phyloPlace.h
@@ -117,52 +117,60 @@
     struct phyloTree *subtree;               // Parsed subtree (#*** with annotated muts? shoudl we?_
     struct hash *subtreeIdToIx;              // Map of subtree nodes to VCF output sample order
     struct slName *subtreeUserSampleIds;     // List of user-uploaded samples in this subtree
     struct slName *subtreeNameList;          // List of leaf names with nicer names for cond. nodes
     };
 
 struct usherResults
 /* Tree+samples download file, sample placements, and subtrees parsed from usher output. */
     {
     struct tempName *bigTreePlusTn;          // Newick file: original tree plus user's samples
     struct hash *samplePlacements;           // Info about each sample's placement in the tree
     struct subtreeInfo *singleSubtreeInfo;   // Comprehensive subtree with all uploaded samples
     struct subtreeInfo *subtreeInfoList;     // For each subtree: tree, file, node info etc.
     };
 
-struct sampleMetadata
-/* Information about a virus sample. */
+struct sampleMetadataStore
+/* Storage for sample metadata: hash of array of strings for named columns. */
     {
+    struct mmHash *mmh;      // Either NULL (if hash is non-NULL) or a memory-mapped hash.
+    struct hash *hash;       // Either NULL (if mmh is non-NULL) or a regular hash.
     size_t columnCount;      // Number of metadata columns
     char **columnNames;      // Metadata column names (e.g. date, genbank_accession, pangolin_lineage)
-                             // -- shared by all metadata rows, not allocated for each struct
-    char **columnValues;     // Metadata column values -- allocated for each struct
     };
 
 struct geneInfo
 /* Information sufficient to determine whether a genome change causes a coding change. */
     {
     struct geneInfo *next;
     struct psl *psl;         // Alignment of transcript to genome
     struct dnaSeq *txSeq;    // Transcript sequence
     struct genbankCds *cds;  // CDS (for those few pathogens that have transcript UTRs)
     int cdsStart;            // genePred cdsStart (genome coord, really cds end if - strand)
     int cdsEnd;              // genePred cdsEnd (genome coord, really cds start if - strand)
     };
 
+struct hashOrMmHash
+/* Wrapper for either a regular hash or a memory-mapped hash. Not using a union because I need
+ * to know which one I have. */
+    {
+    struct mmHash *mmh;      // Either NULL (if hash is non-NULL) or a memory-mapped hash.
+    struct hash *hash;       // Either NULL (if mmh is non-NULL) or a regular hash.
+    };
+
 struct tempName *vcfFromFasta(struct lineFile *lf, char *org, char *db, struct dnaSeq *refGenome,
-                              struct slName **maskSites, struct hash *treeNames,
+                              struct slName **maskSites, struct hashOrMmHash *treeNames,
                               struct slName **retSampleIds, struct seqInfo **retSeqInfo,
                               struct slPair **retFailedSeqs, struct slPair **retFailedPsls,
                               int *pStartTime);
 /* Read in FASTA from lf and make sure each item has a reasonable size and not too high
  * percentage of N's. Align to reference, extract SNVs from alignment, and save as VCF
  * with sample genotype columns. */
 
 struct usherResults *runUsher(char *org, char *usherPath, char *usherAssignmentsPath, char *vcfFile,
                               int subtreeSize, struct slName **pUserSampleIds,
                               struct treeChoices *treeChoices, int *pStartTime);
 /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and
  * subtrees to trash files, return list of slRef to struct tempName for the trash files
  * and parse other results out of stderr output. The usher-sampled version of usher might
  * modify userSampleIds, adding a prefix if a sample with the same name is already in the tree. */
 
@@ -193,45 +201,46 @@
 
 void serverSetTimeout(char *org, int val);
 /* Send timeout command and value (in seconds) to usher server. */
 
 char *microbeTraceHost();
 /* Return the MicrobeTrace hostname from an hg.conf param, or NULL if missing. Do not free result. */
 
 struct slPair *getAaMutations(struct singleNucChange *sncList, struct singleNucChange *ancestorMuts,
                               struct geneInfo *geneInfoList, struct seqWindow *gSeqWin);
 /* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. */
 
 struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome);
 /* If config.ra has a source of gene annotations, then return the gene list. */
 
 void treeToAuspiceJson(struct subtreeInfo *sti, char *org, char *db, struct geneInfo *geneInfoList,
-                       struct seqWindow *gSeqWin, struct hash *sampleMetadata,
+                       struct seqWindow *gSeqWin, struct sampleMetadataStore *sampleMetadata,
                        struct hash *sampleUrls, struct hash *samplePlacements,
                        char *jsonFile, char *source);
 /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format
  * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */
 
 struct tempName *writeCustomTracks(char *org, char *ref, char *db,
                                    struct tempName *vcfTn, struct usherResults *ur,
                                    struct slName *sampleIds, char *source, int fontHeight,
                                    struct phyloTree *sampleTree, int *pStartTime);
 /* Write one custom track per subtree, and one custom track with just the user's uploaded samples. */
 
-struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId);
-/* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */
+char **metadataForSample(struct sampleMetadataStore *sampleMetadata, char *sampleId);
+/* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession.
+ * Return NULL if not found. */
 
 struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds);
 /* Prune all descendants of node that have no leaf descendants in sampleIds. */
 
 struct slPair *phyloPlaceOrgList(struct cart *cart);
 /* Each subdirectory of PHYLOPLACE_DATA_DIR that contains an organism.ra file is a collection of
  * reference sequences that uploaded sequences will be matched against using nextclade sort.
  * Some of those references might also be dbs or track hub names (without the hub_number_ prefix).
  * Each subdirectory of PHYLOPLACE_DATA_DIR that contains a config.ra file contains a single
  * reference which might also be a db or track hub name (without the hub_number_ prefix).
  * Return a list of {name, label} pairs, SARS-CoV-2 first, combining the two categories. */
 
 char *phyloPlaceOrgSetting(char *org, char *settingName);
 /* Return cloned setting value if found in hgPhyloPlaceData/<org>/organism.ra or
  * old-style hgPhyloPlaceData/<org>/config.ra, or NULL if not found. */
 