834653e22227fcea282e12a48910bb73068b34df angie Mon Apr 29 15:33:06 2024 -0700 Don't depend on having a PSL alignment for the results table, precompute the same values whether aligning with gf/blat or nextclade. diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c index 919c5ed..c8308dd 100644 --- src/hg/hgPhyloPlace/vcfFromFasta.c +++ src/hg/hgPhyloPlace/vcfFromFasta.c @@ -216,33 +216,31 @@ if (otherPsl) { struct dyString *dy = dyStringCreate("Warning: multiple alignments to reference found for " "sequence %s (%d-%d and %d-%d). " "Skipping alignment of %d-%d", psl->qName, otherPsl->qStart, otherPsl->qEnd, psl->qStart, psl->qEnd, psl->qStart, psl->qEnd); slPairAdd(retFailedPsls, dyStringCannibalize(&dy), psl); passes = FALSE; } else hashAdd(alignedSeqs, psl->qName, psl); if (passes) { si = hashFindVal(userSeqsByName, psl->qName); - if (si) - si->psl = psl; - else + if (! si) warn("Aligned sequence name '%s' does not match any input sequence name", psl->qName); slAddHead(&filteredPsls, psl); } } hashFree(&alignedSeqs); hashFree(&userSeqsByName); slReverse(&filteredPsls); slReverse(retFailedPsls); return filteredPsls; } static void removeUnalignedSeqs(struct seqInfo **pSeqs, struct psl *alignments, struct slPair **pFailedSeqs) /* If any seqs were not aligned, then remove them from *pSeqs and add them to *pFailedSeqs. */ { @@ -360,30 +358,31 @@ } snv->genotypes[gtIx] = 1 + altIx; } } static struct seqInfo *mustFindSeqAndIx(char *qName, struct seqInfo *querySeqs, int *retGtIx) /* Find qName and its index in querySeqs or errAbort. */ { struct seqInfo *qSeq; int gtIx; for (gtIx = 0, qSeq = querySeqs; qSeq != NULL; gtIx++, qSeq = qSeq->next) if (sameString(qName, qSeq->seq->name)) break; if (!qSeq) errAbort("PSL query '%s' not found in input sequences", qName); +if (retGtIx) *retGtIx = gtIx; return qSeq; } static int addCall(struct snvInfo **snvsByPos, int tStart, char refBase, char altBase, int gtIx, int gtCount) /* If refBase is not N, add a call (or no-call if altBase is N or '-') to snvsByPos[tStart]. * Return the VCF genotype code (-1 for missing/N, 0 for ref, 1 for first alt etc). */ { if (refBase != 'n') { if (snvsByPos[tStart] == NULL) snvsByPos[tStart] = snvInfoNew(refBase, altBase, gtIx, gtCount); else snvInfoAddCall(snvsByPos[tStart], refBase, altBase, gtIx); @@ -532,38 +531,39 @@ * write out a VCF file with genotype columns for the queries. */ { struct snvInfo **snvsByPos = NULL; AllocArray(snvsByPos, ref->size); extractSnvs(psls, ref, querySeqs, snvsByPos, maskSites); int sampleCount = slCount(querySeqs); char *sampleNames[sampleCount]; struct seqInfo *qSeq; int i; for (i = 0, qSeq = querySeqs; i < sampleCount; i++, qSeq = qSeq->next) sampleNames[i] = qSeq->seq->name; writeSnvsToVcfFile(snvsByPos, ref, sampleNames, sampleCount, maskSites, vcfFileName); freeMem(snvsByPos); } -static void analyzeGaps(struct seqInfo *filteredSeqs, struct dnaSeq *refGenome) +static void analyzeGaps(struct psl *filteredAlignments, struct seqInfo *filteredSeqs, + struct dnaSeq *refGenome) /* Tally up actual insertions and deletions in each psl; ignore skipped N bases. */ { -struct seqInfo *si; -for (si = filteredSeqs; si != NULL; si = si->next) +struct psl *psl; +for (psl = filteredAlignments; psl != NULL; psl = psl->next) { - struct psl *psl = si->psl; - if (psl && (psl->qBaseInsert || psl->tBaseInsert)) + struct seqInfo *si = mustFindSeqAndIx(psl->qName, filteredSeqs, NULL); + if (psl->qBaseInsert || psl->tBaseInsert) { struct dyString *dyIns = dyStringNew(0); struct dyString *dyDel = dyStringNew(0); int insBases = 0, delBases = 0; int ix; for (ix = 0; ix < psl->blockCount - 1; ix++) { int qGapStart = psl->qStarts[ix] + psl->blockSizes[ix]; int qGapEnd = psl->qStarts[ix+1]; int qGapLen = qGapEnd - qGapStart; int tGapStart = psl->tStarts[ix] + psl->blockSizes[ix]; int tGapEnd = psl->tStarts[ix+1]; int tGapLen = tGapEnd - tGapStart; if (qGapLen > tGapLen) { @@ -591,54 +591,60 @@ { char delSeq[delLen+1]; safencpy(delSeq, sizeof delSeq, refGenome->dna + tGapEnd - delLen, delLen); touppers(delSeq); dyStringPrintf(dyDel, "%d-%d:%s", tGapEnd - delLen + 1, tGapEnd, delSeq); } else dyStringPrintf(dyDel, "%d-%d:%d bases", tGapEnd - delLen + 1, tGapEnd, delLen); } } si->insBases = insBases; si->delBases = delBases; si->insRanges = dyStringCannibalize(&dyIns); si->delRanges = dyStringCannibalize(&dyDel); } + si->basesAligned = 0; + int i; + for (i = 0; i < psl->blockCount; i++) + si->basesAligned += psl->blockSizes[i]; + si->tStart = psl->tStart; + si->tEnd = psl->tEnd; } } static struct tempName *alignWithBlat(struct seqInfo *filteredSeqs, struct dnaSeq *refGenome, struct slName **maskSites, struct slName **retSampleIds, struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs, struct slPair **retFailedPsls, int *pStartTime) /* Use blat gf code to align filteredSeqs to refGenome, extract SNVs from alignment, and * save as VCF with sample genotype columns. */ { struct tempName *tn = NULL; struct slName *sampleIds = NULL; struct psl *alignments = alignSequences(refGenome, filteredSeqs, pStartTime); struct psl *filteredAlignments = checkAlignments(alignments, filteredSeqs, retFailedPsls); removeUnalignedSeqs(&filteredSeqs, filteredAlignments, retFailedSeqs); if (filteredAlignments) { AllocVar(tn); trashDirFile(tn, "ct", "pp", ".vcf"); pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, maskSites); struct psl *psl; for (psl = filteredAlignments; psl != NULL; psl = psl->next) slNameAddHead(&sampleIds, psl->qName); - analyzeGaps(filteredSeqs, refGenome); + analyzeGaps(filteredAlignments, filteredSeqs, refGenome); slReverse(&sampleIds); reportTiming(pStartTime, "write VCF for uploaded FASTA"); } *retSampleIds = sampleIds; *retSeqInfo = filteredSeqs; return tn; } char *runNextcladeRun(char *seqFile, char *nextcladeDataset) /* Run the nextclade run command on uploaded seqs, return output TSV file name. */ { char *nextcladePath = getNextcladePath(); struct tempName tnNextcladeOut; trashDirFile(&tnNextcladeOut, "ct", "usher_nextclade_run", ".tsv"); #define NEXTCLADE_NUM_THREADS "4" @@ -895,30 +901,33 @@ if (!maskSites[t]) addCall(snvsByPos, t, ref->dna[t], 'n', gtIx, gtCount); } int tEnd = atol(row[alEndIx]); // Add no-calls for unaligned bases after alignmentEnd for (t = tEnd; t < ref->size; t++) { if (!maskSites[t]) addCall(snvsByPos, t, ref->dna[t], 'n', gtIx, gtCount); } parseNextcladeSubstitutions(row[substIx], si, ref, maskSites, gtIx, gtCount, snvsByPos); parseNextcladeMissing(row[nIx], ref, maskSites, gtIx, gtCount, snvsByPos); parseNextcladeAmbig(row[ambigIx], ref, gtIx, gtCount, snvsByPos); parseNextcladeDeletions(row[delIx], si, ref); parseNextcladeInsertions(row[insIx], si, ref); + si->basesAligned = tEnd - tStart - si->delBases - si->insBases; + si->tStart = tStart; + si->tEnd = tEnd; } lineFileClose(&lf); return snvsByPos; } static struct tempName *alignWithNextclade(struct seqInfo *filteredSeqs, char *nextcladeDataset, struct dnaSeq *ref, struct slName **maskSites, struct slName **retSampleIds, struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs, int *pStartTime) /* Use nextclade to align filteredSeqs to ref, extract SNVs from alignment, and * save as VCF with sample genotype columns. */ { struct tempName *tn = NULL; struct slName *sampleIds = NULL;