0c0fe3658fc96b44ea96c2b30f58194779569961 angie Mon Jun 10 18:17:25 2024 -0700 Fix: was forgetting to explicitly set reference alleles when making VCF from nextclade output, resulting in way too many no-calls which made usher-sampled way too slow. Thanks @aviczhl2 for reporting the slowness. diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c index 1b978f7..4c90605 100644 --- src/hg/hgPhyloPlace/vcfFromFasta.c +++ src/hg/hgPhyloPlace/vcfFromFasta.c @@ -714,103 +714,112 @@ struct dyString *dy = dyStringCreate("Sequence %s could not be aligned to the reference; " "skipping", si->seq->name); slPairAdd(&newFailedSeqs, dyStringCannibalize(&dy), si); } } slReverse(&alignedSeqs); slReverse(&newFailedSeqs); *pFilteredSeqs = alignedSeqs; *retFailedSeqs = slCat(*retFailedSeqs, newFailedSeqs); hashFree(&alignedSeqNames); return gtCount; } static void parseNextcladeSubstitutions(char *substStr, struct seqInfo *si, struct dnaSeq *ref, struct slName **maskSites, int gtIx, int gtCount, - struct snvInfo **snvsByPos) + struct snvInfo **snvsByPos, boolean *positionsCalled) /* Add genotypes for substitutions; also build up si fields for substs and masked substs */ { int substCount = chopByChar(substStr, ',', NULL, 0); char *substArr[substCount]; chopCommas(substStr, substArr); int i; for (i = 0; i < substCount; i++) { char *subst = substArr[i]; int t = atol(subst+1) - 1; char altBase = subst[strlen(subst)-1]; int gt = addCall(snvsByPos, t, ref->dna[t], tolower(altBase), gtIx, gtCount); + positionsCalled[t] = TRUE; if (gt > 0) { struct singleNucChange *snc = sncNew(t, ref->dna[t], '\0', altBase); struct slName *maskedReasons = maskSites[t]; if (maskedReasons) { slAddHead(&si->maskedSncList, snc); slAddHead(&si->maskedReasonsList, slRefNew(maskedReasons)); } else slAddHead(&si->sncList, snc); } } } static void parseNextcladeMissing(char *nRangeStr, struct dnaSeq *ref, struct slName **maskSites, - int gtIx, int gtCount, struct snvInfo **snvsByPos) + int gtIx, int gtCount, struct snvInfo **snvsByPos, + boolean *positionsCalled) /* Add no-calls to snvsByPos for ranges of N bases */ { int nRangeCount = chopByChar(nRangeStr, ',', NULL, 0); char *nRangeArr[nRangeCount]; chopCommas(nRangeStr, nRangeArr); int i; for (i = 0; i < nRangeCount; i++) { char *nRange = nRangeArr[i]; int nStart = atol(nRange) - 1; int nEnd = nStart + 1; char *p = strchr(nRange, '-'); if (p) nEnd = atol(p+1); int t; for (t = nStart; t < nEnd; t++) if (!maskSites[t]) + { addCall(snvsByPos, t, ref->dna[t], 'n', gtIx, gtCount); + positionsCalled[t] = TRUE; + } } } static void parseNextcladeAmbig(char *ambigStr, struct dnaSeq *ref, - int gtIx, int gtCount, struct snvInfo **snvsByPos) + int gtIx, int gtCount, struct snvInfo **snvsByPos, + boolean *positionsCalled) /* Add calls to snvsByPos for non-N ambiguous bases */ { int ambigCount = chopByChar(ambigStr, ',', NULL, 0); char *ambigArr[ambigCount]; chopCommas(ambigStr, ambigArr); int i; for (i = 0; i < ambigCount; i++) { char *ambig = ambigArr[i]; char *p = strchr(ambig, ':'); if (! p) errAbort("missing ':' separator in ambig '%s'", ambig); int aStart = atol(p+1) - 1; int aEnd = aStart+1; p = strchr(ambig, '-'); if (p != NULL) aEnd = atol(p+1); int t; for (t = aStart; t < aEnd; t++) + { addCall(snvsByPos, t, ref->dna[t], ambig[0], gtIx, gtCount); + positionsCalled[t] = TRUE; + } } } static void parseNextcladeDeletions(char *delStr, struct seqInfo *si, struct dnaSeq *ref) /* We don't add calls for insertions or deletions, but we do describe them in seqInfo */ { int delRangeCount = chopByChar(delStr, ',', NULL, 0); char *delRangeArr[delRangeCount]; chopCommas(delStr, delRangeArr); int delBases = 0; struct dyString *dy = dyStringNew(0); int i; for (i = 0; i < delRangeCount; i++) { char *delRange = delRangeArr[i]; @@ -855,86 +864,109 @@ errAbort("missing ':' separator in insertion '%s'", ins); char *insSeq = p+1; int insLen = strlen(insSeq); insBases += insLen; if (isNotEmpty(dy->string)) dyStringAppend(dy, ", "); if (insLen <= 12) dyStringPrintf(dy, "%d-%d:%s", t, t+1, insSeq); else dyStringPrintf(dy, "%d-%d:%d bases", t, t+1, insLen); } si->insBases = insBases; si->insRanges = dyStringCannibalize(&dy); } +static void addRefCalls(struct snvInfo **snvsByPos, boolean *positionsCalled, struct dnaSeq *ref, + int gtIx, int gtCount) +/* Add call for ref allele at any base not already called otherwise. */ +{ +int i; +for (i = 0; i < ref->size; i++) + { + if (! positionsCalled[i]) + addCall(snvsByPos, i, ref->dna[i], ref->dna[i], gtIx, gtCount); + } +} + static struct snvInfo **parseNextcladeRun(char *nextcladeOut, struct seqInfo **pFilteredSeqs, struct dnaSeq *ref, struct slName **maskSites, struct slPair **retFailedSeqs) /* Parse the TSV output from nextcladeRun into a per-base array of snvInfo for making VCF, * while filling in alignment-related seqInfo fields. * Append any unaligned sequences to retFailedSeqs and set pFilteredSeqs to aligned seqs. */ { struct snvInfo **snvsByPos = NULL; AllocArray(snvsByPos, ref->size); int gtCount = countNextcladeAlignedSeqs(nextcladeOut, pFilteredSeqs, retFailedSeqs); struct lineFile *lf = lineFileOpen(nextcladeOut, TRUE); // First line is header; get column indexes char *row[100]; int headerColCount = lineFileChopTab(lf, row); if (headerColCount <= 0) errAbort("nextclade run output is empty"); int seqNameIx = stringArrayIx("seqName", row, headerColCount); int alStartIx = stringArrayIx("alignmentStart", row, headerColCount); int alEndIx = stringArrayIx("alignmentEnd", row, headerColCount); int substIx = stringArrayIx("substitutions", row, headerColCount); int delIx = stringArrayIx("deletions", row, headerColCount); int insIx = stringArrayIx("insertions", row, headerColCount); int nIx = stringArrayIx("missing", row, headerColCount); int ambigIx = stringArrayIx("nonACGTNs", row, headerColCount); if (seqNameIx == -1 || alStartIx == -1 || alEndIx == -1 || substIx == -1 || delIx == -1 || insIx == -1 || nIx == -1 || ambigIx == -1) errAbort("nextclade run output header line does not contain all expected columns"); +boolean *positionsCalled = NULL; +AllocArray(positionsCalled, ref->size); int colCount = 0; while ((colCount = lineFileChopTab(lf, row)) > 0) { + zeroBytes(positionsCalled, ref->size * sizeof positionsCalled[0]); char *seqName = row[seqNameIx]; // Nextclade still includes unaligned sequences in its output, just with empty alignment columns if (isEmpty(row[alStartIx])) continue; int gtIx; struct seqInfo *si = mustFindSeqAndIx(seqName, *pFilteredSeqs, >Ix); int tStart = atol(row[alStartIx]) - 1; // Add no-calls for unaligned bases before alignmentStart int t; for (t = 0; t < tStart; t++) { if (!maskSites[t]) + { addCall(snvsByPos, t, ref->dna[t], 'n', gtIx, gtCount); + positionsCalled[t] = TRUE; + } } int tEnd = atol(row[alEndIx]); // Add no-calls for unaligned bases after alignmentEnd for (t = tEnd; t < ref->size; t++) { if (!maskSites[t]) + { addCall(snvsByPos, t, ref->dna[t], 'n', gtIx, gtCount); + positionsCalled[t] = TRUE; + } } - parseNextcladeSubstitutions(row[substIx], si, ref, maskSites, gtIx, gtCount, snvsByPos); - parseNextcladeMissing(row[nIx], ref, maskSites, gtIx, gtCount, snvsByPos); - parseNextcladeAmbig(row[ambigIx], ref, gtIx, gtCount, snvsByPos); + parseNextcladeSubstitutions(row[substIx], si, ref, maskSites, gtIx, gtCount, snvsByPos, + positionsCalled); + parseNextcladeMissing(row[nIx], ref, maskSites, gtIx, gtCount, snvsByPos, positionsCalled); + parseNextcladeAmbig(row[ambigIx], ref, gtIx, gtCount, snvsByPos, positionsCalled); parseNextcladeDeletions(row[delIx], si, ref); parseNextcladeInsertions(row[insIx], si, ref); + addRefCalls(snvsByPos, positionsCalled, ref, gtIx, gtCount); si->basesAligned = tEnd - tStart - si->delBases - si->insBases; si->tStart = tStart; si->tEnd = tEnd; } lineFileClose(&lf); return snvsByPos; } static struct tempName *alignWithNextclade(struct seqInfo *filteredSeqs, char *nextcladeDataset, struct dnaSeq *ref, struct slName **maskSites, struct slName **retSampleIds, struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs, int *pStartTime) /* Use nextclade to align filteredSeqs to ref, extract SNVs from alignment, and * save as VCF with sample genotype columns. */