0fffa3c31de4845a9bd3f06c0992f971e4d8a7a3 angie Fri Oct 28 15:08:06 2022 -0700 Performance improvements for trees with millions of sequences: * Use @yceh's usher-sampled-server if configured; it preloads protobufs and can start placing sequences immediately using usher-sampled, a faster version of usher * Use usher-sampled instead of usher if server is not configured but usher-sampled is available * Load sample metadata file in a pthread while usher(-sampled(-server)) or matUtils is running * Skip checking for sample name clashes in uploaded fasta when using usher-sampled(-server)'s new --no-ignore-prefix option (but look for the prefix when parsing results) * Avoid parsing the protobuf and traversing the big tree unless absolutely necessary ** Subtrees from usher/matUtils have not included condensed nodes in a long time; remove lots of condensedNodes/summarization code from phyloPlace.c, runUsher.c, writeCustomTracks.c ** Use subtrees instead of big tree when possible (in findNearestNeighbor, treeToBaseAlleles, uploadedSamplesTree) ** Skip the informativeBases stuff that inhibits masking of sites from Problematic Sites set when the tree was built with an earlier version -- that pretty much never applies anymore now that only daily-updated trees are offered, not a range from old to new. ** Allow config.ra to specify a flat file of sample names (needed for searching user's uploaded names/IDs before calling matUtils) instead of getting names from the big tree diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c index 6e06e530..f914657 100644 --- src/hg/hgPhyloPlace/vcfFromFasta.c +++ src/hg/hgPhyloPlace/vcfFromFasta.c @@ -103,31 +103,32 @@ for (i = 0; i < seq->size; i++) if (seq->dna[i] != 'n' && isIupacAmbiguous(seq->dna[i])) ambigCount++; if (passes) { if (hashLookup(uniqNames, seq->name)) { struct dyString *dy = dyStringCreate("Sequence name '%s' has already been used; " "ignoring subsequent usage " "(%d bases, %d N's, %d ambiguous).", seq->name, seq->size, nCountTotal, ambigCount); slPairAdd(retFailedSeqs, dyStringCannibalize(&dy), seq); } else { - if (isInternalNodeName(seq->name, 0) || hashLookup(treeNames, seq->name)) + if (treeNames && + (isInternalNodeName(seq->name, 0) || hashLookup(treeNames, seq->name))) { // usher will reject any sequence whose name is already in the tree, even a // numeric internal node name. Add a prefix so usher won't reject sequence. char newName[strlen(seq->name)+32]; safef(newName, sizeof newName, "uploaded_%s", seq->name); freeMem(seq->name); seq->name = cloneString(newName); } struct seqInfo *si; AllocVar(si); si->seq = seq; si->nCountStart = nCountStart; si->nCountMiddle = nCountMiddle; si->nCountEnd = nCountEnd; si->ambigCount = ambigCount; @@ -356,49 +357,48 @@ /* If refBase is not N, add a call (or no-call if altBase is N or '-') to snvsByPos[tStart]. * Return the VCF genotype code (-1 for missing/N, 0 for ref, 1 for first alt etc). */ { if (refBase != 'n') { if (snvsByPos[tStart] == NULL) snvsByPos[tStart] = snvInfoNew(refBase, altBase, gtIx, gtCount); else snvInfoAddCall(snvsByPos[tStart], refBase, altBase, gtIx); return snvsByPos[tStart]->genotypes[gtIx]; } return -1; } static void extractSnvs(struct psl *psls, struct dnaSeq *ref, struct seqInfo *querySeqs, - struct snvInfo **snvsByPos, boolean *informativeBases, - struct slName **maskSites) + struct snvInfo **snvsByPos, struct slName **maskSites) /* For each PSL, find single-nucleotide differences between ref and query (one of querySeqs), * building up target position-indexed array of snvInfo w/genotypes in same order as querySeqs. - * Add explicit no-calls for unaligned positions in informativeBases. */ + * Add explicit no-calls for unaligned positions that are not masked. */ { int gtCount = slCount(querySeqs); struct psl *psl; for (psl = psls; psl != NULL; psl = psl->next) { if (!sameString(psl->strand, "+")) errAbort("extractSnvs: expected strand to be '+' but it's '%s'", psl->strand); int gtIx; struct seqInfo *qSeq = mustFindSeqAndIx(psl->qName, querySeqs, >Ix); // Add no-calls for unaligned bases before first block int tStart; for (tStart = 0; tStart < psl->tStart; tStart++) { - if (informativeBases[tStart]) + if (!maskSites[tStart]) { char refBase = ref->dna[tStart]; addCall(snvsByPos, tStart, refBase, '-', gtIx, gtCount); } } int blkIx; for (blkIx = 0; blkIx < psl->blockCount; blkIx++) { // Add genotypes for aligned bases tStart = psl->tStarts[blkIx]; int tEnd = tStart + psl->blockSizes[blkIx]; int qStart = psl->qStarts[blkIx]; for (; tStart < tEnd; tStart++, qStart++) { char refBase = ref->dna[tStart]; @@ -412,42 +412,42 @@ struct slName *maskedReasons = maskSites[tStart]; if (maskedReasons) { slAddHead(&qSeq->maskedSncList, snc); slAddHead(&qSeq->maskedReasonsList, slRefNew(maskedReasons)); } else slAddHead(&qSeq->sncList, snc); } } if (blkIx < psl->blockCount - 1) { // Add no-calls for unaligned bases between this block and the next for (; tStart < psl->tStarts[blkIx+1]; tStart++) { - if (informativeBases[tStart]) + if (!maskSites[tStart]) { char refBase = ref->dna[tStart]; addCall(snvsByPos, tStart, refBase, '-', gtIx, gtCount); } } } } // Add no-calls for unaligned bases after last block for (tStart = psl->tEnd; tStart < ref->size; tStart++) { - if (informativeBases[tStart]) + if (!maskSites[tStart]) { char refBase = ref->dna[tStart]; addCall(snvsByPos, tStart, refBase, '-', gtIx, gtCount); } } slReverse(&qSeq->sncList); } } static void writeVcfSnv(struct snvInfo **snvsByPos, int gtCount, char *chrom, int chromStart, FILE *f) /* If snvsByPos[chromStart] is not NULL, write out a VCF data row with genotypes. */ { struct snvInfo *snv = snvsByPos[chromStart]; if (snv && (snv->altCount > 0 || snv->unkCount > 0)) @@ -491,37 +491,37 @@ fprintf(f, "##fileformat=VCFv4.2\n"); fprintf(f, "##reference=%s\n", ref->name); fprintf(f, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); int i; for (i = 0; i < sampleCount; i++) fprintf(f, "\t%s", sampleNames[i]); fputc('\n', f); int chromStart; for (chromStart = 0; chromStart < ref->size; chromStart++) if (!maskSites[chromStart]) writeVcfSnv(snvsByPos, sampleCount, ref->name, chromStart, f); carefulClose(&f); } static void pslSnvsToVcfFile(struct psl *psls, struct dnaSeq *ref, struct seqInfo *querySeqs, - char *vcfFileName, boolean *informativeBases, struct slName **maskSites) + char *vcfFileName, struct slName **maskSites) /* Find single-nucleotide differences between each query sequence and reference, and * write out a VCF file with genotype columns for the queries. */ { struct snvInfo *snvsByPos[ref->size]; memset(snvsByPos, 0, sizeof snvsByPos); -extractSnvs(psls, ref, querySeqs, snvsByPos, informativeBases, maskSites); +extractSnvs(psls, ref, querySeqs, snvsByPos, maskSites); int sampleCount = slCount(querySeqs); char *sampleNames[sampleCount]; struct seqInfo *qSeq; int i; for (i = 0, qSeq = querySeqs; i < sampleCount; i++, qSeq = qSeq->next) sampleNames[i] = qSeq->seq->name; writeSnvsToVcfFile(snvsByPos, ref, sampleNames, sampleCount, maskSites, vcfFileName); } static void analyzeGaps(struct seqInfo *filteredSeqs, struct dnaSeq *refGenome) /* Tally up actual insertions and deletions in each psl; ignore skipped N bases. */ { struct seqInfo *si; for (si = filteredSeqs; si != NULL; si = si->next) { @@ -570,32 +570,31 @@ dyStringPrintf(dyDel, "%d-%d:%s", tGapEnd - delLen + 1, tGapEnd, delSeq); } else dyStringPrintf(dyDel, "%d-%d:%d bases", tGapEnd - delLen + 1, tGapEnd, delLen); } } si->insBases = insBases; si->delBases = delBases; si->insRanges = dyStringCannibalize(&dyIns); si->delRanges = dyStringCannibalize(&dyDel); } } } struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome, - boolean *informativeBases, struct slName **maskSites, - struct hash *treeNames, + struct slName **maskSites, struct hash *treeNames, struct slName **retSampleIds, struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs, struct slPair **retFailedPsls, int *pStartTime) /* Read in FASTA from lf and make sure each item has a reasonable size and not too high * percentage of N's. Align to reference, extract SNVs from alignment, and save as VCF * with sample genotype columns. */ { struct tempName *tn = NULL; struct slName *sampleIds = NULL; struct dnaSeq *allSeqs = faReadAllMixedInLf(lf); int minSeqSize = 0, maxSeqSize = 0; // Default to SARS-CoV-2 or hMPXV values if setting is missing from config.ra. char *minSeqSizeSetting = phyloPlaceDbSetting(db, "minSeqSize"); if (isEmpty(minSeqSizeSetting)) minSeqSize = sameString(db, "wuhCor1") ? 10000 : 100000; @@ -606,30 +605,29 @@ maxSeqSize = sameString(db, "wuhCor1") ? 35000 : 220000; else maxSeqSize = atoi(maxSeqSizeSetting); struct seqInfo *filteredSeqs = checkSequences(allSeqs, treeNames, minSeqSize, maxSeqSize, retFailedSeqs); reportTiming(pStartTime, "read and check uploaded FASTA"); if (filteredSeqs) { struct psl *alignments = alignSequences(refGenome, filteredSeqs, pStartTime); struct psl *filteredAlignments = checkAlignments(alignments, filteredSeqs, retFailedPsls); removeUnalignedSeqs(&filteredSeqs, filteredAlignments, retFailedSeqs); if (filteredAlignments) { AllocVar(tn); trashDirFile(tn, "ct", "pp", ".vcf"); - pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, informativeBases, - maskSites); + pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, maskSites); struct psl *psl; for (psl = filteredAlignments; psl != NULL; psl = psl->next) slNameAddHead(&sampleIds, psl->qName); analyzeGaps(filteredSeqs, refGenome); slReverse(&sampleIds); reportTiming(pStartTime, "write VCF for uploaded FASTA"); } } *retSampleIds = sampleIds; *retSeqInfo = filteredSeqs; return tn; }