2d05d30ed4df1612d72ba84c812d004de935b122 angie Fri May 17 16:08:54 2024 -0700 Add lib module mmHash (memory-mapped hash), util tabToMmHash, and hgPhyloPlace support for using mmHash files instead of tab-separated files for metadata and name lookup. Using mmHash for name lookup saves about 50-55 seconds for SARS-CoV-2 hgPhyloPlace name/ID queries. diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c index c8308dd..1b978f7 100644 --- src/hg/hgPhyloPlace/vcfFromFasta.c +++ src/hg/hgPhyloPlace/vcfFromFasta.c @@ -1,23 +1,24 @@ /* Align user sequence to the reference genome and write VCF */ /* Copyright (C) 2020-2024 The Regents of the University of California */ #include "common.h" #include "fa.h" #include "genoFind.h" #include "iupac.h" +#include "mmHash.h" #include "parsimonyProto.h" #include "phyloPlace.h" #include "pipeline.h" #include "psl.h" #include "trashDir.h" double maxNs = 0.5; // Using blat defaults for these: int gfRepMatch = 1024*4; char *gfOoc = NULL; int gfStepSize = gfTileSize; int gfOutMinIdentityPpt = 900; // The default minScore for gfLongDnaInMem's 4500 base preferred size is 30, but we want way @@ -41,31 +42,31 @@ subChar(seqName, '(', '_'); subChar(seqName, ')', '_'); subChar(seqName, ':', '_'); subChar(seqName, ';', '_'); subChar(seqName, ',', '_'); // Strip any underscores from beginning/end; strip double-underscores while (seqName[0] == '_') memmove(seqName, seqName+1, strlen(seqName)+1); char *p; while (*(p = seqName + strlen(seqName) - 1) == '_') *p = '\0'; strSwapStrs(seqName, strlen(seqName), "__", "_"); } } -static struct seqInfo *checkSequences(struct dnaSeq *seqs, struct hash *treeNames, +static struct seqInfo *checkSequences(struct dnaSeq *seqs, struct hashOrMmHash *treeNames, int minSeqSize, int maxSeqSize, struct slPair **retFailedSeqs) /* Return a list of sequences that pass basic QC checks (appropriate size etc). * If any sequences have names that are already in the tree, add a prefix so usher doesn't * reject them. If any sequence name contains characters that have special meaning in the Newick * format, replace them with something harmless. * Set retFailedSeqs to the list of sequences that failed checks. */ { struct seqInfo *filteredSeqs = NULL; struct hash *uniqNames = hashNew(0); *retFailedSeqs = NULL; struct dnaSeq *seq, *nextSeq; for (seq = seqs; seq != NULL; seq = nextSeq) { boolean passes = TRUE; nextSeq = seq->next; @@ -128,40 +129,49 @@ if (seq->dna[i] != 'n' && isIupacAmbiguous(seq->dna[i])) ambigCount++; if (passes) { replaceNewickChars(seq->name); if (hashLookup(uniqNames, seq->name)) { struct dyString *dy = dyStringCreate("Sequence name '%s' has already been used; " "ignoring subsequent usage " "(%d bases, %d N's, %d ambiguous).", seq->name, seq->size, nCountTotal, ambigCount); slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq); } else { - if (treeNames && - (isInternalNodeName(seq->name, 0) || hashLookup(treeNames, seq->name))) + // If non-NULL treeNames was passed in then we're using original usher not usher-sampled + // and need to modify uploaded names that are already in the tree, if any. + if (treeNames) + { + boolean foundInTreeNames = FALSE; + if (treeNames->hash) + foundInTreeNames = (hashLookup(treeNames->hash, seq->name) != NULL); + else + foundInTreeNames = (mmHashFindVal(treeNames->mmh, seq->name) != NULL); + if (foundInTreeNames || isInternalNodeName(seq->name, 0)) { // usher will reject any sequence whose name is already in the tree, even a // numeric internal node name. Add a prefix so usher won't reject sequence. char newName[strlen(seq->name)+32]; safef(newName, sizeof newName, "uploaded_%s", seq->name); freeMem(seq->name); seq->name = cloneString(newName); } + } struct seqInfo *si; AllocVar(si); si->seq = seq; si->nCountStart = nCountStart; si->nCountMiddle = nCountMiddle; si->nCountEnd = nCountEnd; si->ambigCount = ambigCount; hashAdd(uniqNames, seq->name, NULL); slAddHead(&filteredSeqs, si); } } } slReverse(&filteredSeqs); slReverse(retFailedSeqs); hashFree(&uniqNames); @@ -960,31 +970,31 @@ } slReverse(&sampleIds); writeSnvsToVcfFile(snvsByPos, ref, sampleNames, sampleCount, maskSites, tn->forCgi); } // Clean up freeMem(snvsByPos); unlink(tnSeqFile.forCgi); unlink(nextcladeOut); *retSampleIds = sampleIds; *retSeqInfo = filteredSeqs; return tn; } struct tempName *vcfFromFasta(struct lineFile *lf, char *org, char *db, struct dnaSeq *refGenome, - struct slName **maskSites, struct hash *treeNames, + struct slName **maskSites, struct hashOrMmHash *treeNames, struct slName **retSampleIds, struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs, struct slPair **retFailedPsls, int *pStartTime) /* Read in FASTA from lf and make sure each item has a reasonable size and not too high * percentage of N's. Align to reference, extract SNVs from alignment, and save as VCF * with sample genotype columns. */ { struct tempName *tn = NULL; struct dnaSeq *allSeqs = faReadAllMixedInLf(lf); int minSeqSize = 0, maxSeqSize = 0; // Default to SARS-CoV-2 or hMPXV values if setting is missing from config.ra. char *minSeqSizeSetting = phyloPlaceRefSetting(org, db, "minSeqSize"); if (isEmpty(minSeqSizeSetting)) minSeqSize = sameString(db, "wuhCor1") ? 10000 : 100000; else