834653e22227fcea282e12a48910bb73068b34df
angie
  Mon Apr 29 15:33:06 2024 -0700
Don't depend on having a PSL alignment for the results table, precompute the same values whether aligning with gf/blat or nextclade.

diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c
index 919c5ed..c8308dd 100644
--- src/hg/hgPhyloPlace/vcfFromFasta.c
+++ src/hg/hgPhyloPlace/vcfFromFasta.c
@@ -216,33 +216,31 @@
     if (otherPsl)
         {
         struct dyString *dy = dyStringCreate("Warning: multiple alignments to reference found for "
                                              "sequence %s (%d-%d and %d-%d).  "
                                              "Skipping alignment of %d-%d",
                                               psl->qName, otherPsl->qStart, otherPsl->qEnd,
                                               psl->qStart, psl->qEnd, psl->qStart, psl->qEnd);
         slPairAdd(retFailedPsls, dyStringCannibalize(&dy), psl);
         passes = FALSE;
         }
     else
         hashAdd(alignedSeqs, psl->qName, psl);
     if (passes)
         {
         si = hashFindVal(userSeqsByName, psl->qName);
-        if (si)
-            si->psl = psl;
-        else
+        if (! si)
             warn("Aligned sequence name '%s' does not match any input sequence name", psl->qName);
         slAddHead(&filteredPsls, psl);
         }
     }
 hashFree(&alignedSeqs);
 hashFree(&userSeqsByName);
 slReverse(&filteredPsls);
 slReverse(retFailedPsls);
 return filteredPsls;
 }
 
 static void removeUnalignedSeqs(struct seqInfo **pSeqs, struct psl *alignments,
                                 struct slPair **pFailedSeqs)
 /* If any seqs were not aligned, then remove them from *pSeqs and add them to *pFailedSeqs. */
 {
@@ -360,30 +358,31 @@
         }
     snv->genotypes[gtIx] = 1 + altIx;
     }
 }
 
 static struct seqInfo *mustFindSeqAndIx(char *qName, struct seqInfo *querySeqs, int *retGtIx)
 /* Find qName and its index in querySeqs or errAbort. */
 {
 struct seqInfo *qSeq;
 int gtIx;
 for (gtIx = 0, qSeq = querySeqs;  qSeq != NULL;  gtIx++, qSeq = qSeq->next)
     if (sameString(qName, qSeq->seq->name))
         break;
 if (!qSeq)
     errAbort("PSL query '%s' not found in input sequences", qName);
+if (retGtIx)
     *retGtIx = gtIx;
 return qSeq;
 }
 
 static int addCall(struct snvInfo **snvsByPos, int tStart, char refBase, char altBase, int gtIx,
                     int gtCount)
 /* If refBase is not N, add a call (or no-call if altBase is N or '-') to snvsByPos[tStart].
  * Return the VCF genotype code (-1 for missing/N, 0 for ref, 1 for first alt etc). */
 {
 if (refBase != 'n')
     {
     if (snvsByPos[tStart] == NULL)
         snvsByPos[tStart] = snvInfoNew(refBase, altBase, gtIx, gtCount);
     else
         snvInfoAddCall(snvsByPos[tStart], refBase, altBase, gtIx);
@@ -532,38 +531,39 @@
  * write out a VCF file with genotype columns for the queries. */
 {
 struct snvInfo **snvsByPos = NULL;
 AllocArray(snvsByPos, ref->size);
 extractSnvs(psls, ref, querySeqs, snvsByPos, maskSites);
 int sampleCount = slCount(querySeqs);
 char *sampleNames[sampleCount];
 struct seqInfo *qSeq;
 int i;
 for (i = 0, qSeq = querySeqs;  i < sampleCount;  i++, qSeq = qSeq->next)
     sampleNames[i] = qSeq->seq->name;
 writeSnvsToVcfFile(snvsByPos, ref, sampleNames, sampleCount, maskSites, vcfFileName);
 freeMem(snvsByPos);
 }
 
-static void analyzeGaps(struct seqInfo *filteredSeqs, struct dnaSeq *refGenome)
+static void analyzeGaps(struct psl *filteredAlignments, struct seqInfo *filteredSeqs,
+                        struct dnaSeq *refGenome)
 /* Tally up actual insertions and deletions in each psl; ignore skipped N bases. */
 {
-struct seqInfo *si;
-for (si = filteredSeqs;  si != NULL;  si = si->next)
+struct psl *psl;
+for (psl = filteredAlignments;  psl != NULL;  psl = psl->next)
     {
-    struct psl *psl = si->psl;
-    if (psl && (psl->qBaseInsert || psl->tBaseInsert))
+    struct seqInfo *si = mustFindSeqAndIx(psl->qName, filteredSeqs, NULL);
+    if (psl->qBaseInsert || psl->tBaseInsert)
         {
         struct dyString *dyIns = dyStringNew(0);
         struct dyString *dyDel = dyStringNew(0);
         int insBases = 0, delBases = 0;
         int ix;
         for (ix = 0;  ix < psl->blockCount - 1;  ix++)
             {
             int qGapStart = psl->qStarts[ix] + psl->blockSizes[ix];
             int qGapEnd = psl->qStarts[ix+1];
             int qGapLen = qGapEnd - qGapStart;
             int tGapStart = psl->tStarts[ix] + psl->blockSizes[ix];
             int tGapEnd = psl->tStarts[ix+1];
             int tGapLen = tGapEnd - tGapStart;
             if (qGapLen > tGapLen)
                 {
@@ -591,54 +591,60 @@
                     {
                     char delSeq[delLen+1];
                     safencpy(delSeq, sizeof delSeq, refGenome->dna + tGapEnd - delLen, delLen);
                     touppers(delSeq);
                     dyStringPrintf(dyDel, "%d-%d:%s", tGapEnd - delLen + 1, tGapEnd, delSeq);
                     }
                 else
                     dyStringPrintf(dyDel, "%d-%d:%d bases", tGapEnd - delLen + 1, tGapEnd, delLen);
                 }
             }
         si->insBases = insBases;
         si->delBases = delBases;
         si->insRanges = dyStringCannibalize(&dyIns);
         si->delRanges = dyStringCannibalize(&dyDel);
         }
+    si->basesAligned = 0;
+    int i;
+    for (i = 0;  i < psl->blockCount;  i++)
+        si->basesAligned += psl->blockSizes[i];
+    si->tStart = psl->tStart;
+    si->tEnd = psl->tEnd;
     }
 }
 
 static struct tempName *alignWithBlat(struct seqInfo *filteredSeqs, struct dnaSeq *refGenome,
                                       struct slName **maskSites, struct slName **retSampleIds,
                                       struct seqInfo **retSeqInfo, struct slPair **retFailedSeqs,
                                       struct slPair **retFailedPsls, int *pStartTime)
 /* Use blat gf code to align filteredSeqs to refGenome, extract SNVs from alignment, and
  * save as VCF with sample genotype columns. */
 {
 struct tempName *tn = NULL;
 struct slName *sampleIds = NULL;
 struct psl *alignments = alignSequences(refGenome, filteredSeqs, pStartTime);
 struct psl *filteredAlignments = checkAlignments(alignments, filteredSeqs, retFailedPsls);
 removeUnalignedSeqs(&filteredSeqs, filteredAlignments, retFailedSeqs);
 if (filteredAlignments)
     {
     AllocVar(tn);
     trashDirFile(tn, "ct", "pp", ".vcf");
     pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, maskSites);
     struct psl *psl;
     for (psl = filteredAlignments;  psl != NULL;  psl = psl->next)
         slNameAddHead(&sampleIds, psl->qName);
-    analyzeGaps(filteredSeqs, refGenome);
+    analyzeGaps(filteredAlignments, filteredSeqs, refGenome);
     slReverse(&sampleIds);
     reportTiming(pStartTime, "write VCF for uploaded FASTA");
     }
 *retSampleIds = sampleIds;
 *retSeqInfo = filteredSeqs;
 return tn;
 }
 
 char *runNextcladeRun(char *seqFile, char *nextcladeDataset)
 /* Run the nextclade run command on uploaded seqs, return output TSV file name. */
 {
 char *nextcladePath = getNextcladePath();
 struct tempName tnNextcladeOut;
 trashDirFile(&tnNextcladeOut, "ct", "usher_nextclade_run", ".tsv");
 #define NEXTCLADE_NUM_THREADS "4"
@@ -895,30 +901,33 @@
         if (!maskSites[t])
             addCall(snvsByPos, t, ref->dna[t], 'n', gtIx, gtCount);
         }
     int tEnd = atol(row[alEndIx]);
     // Add no-calls for unaligned bases after alignmentEnd
     for (t = tEnd;  t < ref->size;  t++)
         {
         if (!maskSites[t])
             addCall(snvsByPos, t, ref->dna[t], 'n', gtIx, gtCount);
         }
     parseNextcladeSubstitutions(row[substIx], si, ref, maskSites, gtIx, gtCount, snvsByPos);
     parseNextcladeMissing(row[nIx], ref, maskSites, gtIx, gtCount, snvsByPos);
     parseNextcladeAmbig(row[ambigIx], ref, gtIx, gtCount, snvsByPos);
     parseNextcladeDeletions(row[delIx], si, ref);
     parseNextcladeInsertions(row[insIx], si, ref);
+    si->basesAligned = tEnd - tStart - si->delBases - si->insBases;
+    si->tStart = tStart;
+    si->tEnd = tEnd;
     }
 lineFileClose(&lf);
 return snvsByPos;
 }
 
 static struct tempName *alignWithNextclade(struct seqInfo *filteredSeqs, char *nextcladeDataset,
                                            struct dnaSeq *ref, struct slName **maskSites,
                                            struct slName **retSampleIds,
                                            struct seqInfo **retSeqInfo,
                                            struct slPair **retFailedSeqs, int *pStartTime)
 /* Use nextclade to align filteredSeqs to ref, extract SNVs from alignment, and
  * save as VCF with sample genotype columns. */
 {
 struct tempName *tn = NULL;
 struct slName *sampleIds = NULL;