src/hg/hgPhyloPlace/vcfFromFasta.c 0c0fe3658fc96b44ea96c2b30f58194779569961

0c0fe3658fc96b44ea96c2b30f58194779569961
angie
  Mon Jun 10 18:17:25 2024 -0700
Fix: was forgetting to explicitly set reference alleles when making VCF from nextclade output, resulting in way too many no-calls which made usher-sampled way too slow.  Thanks @aviczhl2 for reporting the slowness.

diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c
index 1b978f7..4c90605 100644
--- src/hg/hgPhyloPlace/vcfFromFasta.c
+++ src/hg/hgPhyloPlace/vcfFromFasta.c
@@ -714,103 +714,112 @@
         struct dyString *dy = dyStringCreate("Sequence %s could not be aligned to the reference; "
                                              "skipping", si->seq->name);
         slPairAdd(&newFailedSeqs, dyStringCannibalize(&dy), si);
         }
     }
 slReverse(&alignedSeqs);
 slReverse(&newFailedSeqs);
 *pFilteredSeqs = alignedSeqs;
 *retFailedSeqs = slCat(*retFailedSeqs, newFailedSeqs);
 hashFree(&alignedSeqNames);
 return gtCount;
 }
 
 static void parseNextcladeSubstitutions(char *substStr, struct seqInfo *si, struct dnaSeq *ref,
                                         struct slName **maskSites, int gtIx, int gtCount,
-                                        struct snvInfo **snvsByPos)
+                                        struct snvInfo **snvsByPos, boolean *positionsCalled)
 /* Add genotypes for substitutions; also build up si fields for substs and masked substs */
 {
 int substCount = chopByChar(substStr, ',', NULL, 0);
 char *substArr[substCount];
 chopCommas(substStr, substArr);
 int i;
 for (i = 0;  i < substCount; i++)
     {
     char *subst = substArr[i];
     int t = atol(subst+1) - 1;
     char altBase = subst[strlen(subst)-1];
     int gt = addCall(snvsByPos, t, ref->dna[t], tolower(altBase), gtIx, gtCount);
+    positionsCalled[t] = TRUE;
     if (gt > 0)
         {
         struct singleNucChange *snc = sncNew(t, ref->dna[t], '\0', altBase);
         struct slName *maskedReasons = maskSites[t];
         if (maskedReasons)
             {
             slAddHead(&si->maskedSncList, snc);
             slAddHead(&si->maskedReasonsList, slRefNew(maskedReasons));
             }
         else
             slAddHead(&si->sncList, snc);
         }
     }
 }
 
 static void parseNextcladeMissing(char *nRangeStr, struct dnaSeq *ref, struct slName **maskSites,
-                                  int gtIx, int gtCount, struct snvInfo **snvsByPos)
+                                  int gtIx, int gtCount, struct snvInfo **snvsByPos,
+                                  boolean *positionsCalled)
 /*  Add no-calls to snvsByPos for ranges of N bases */
 {
 int nRangeCount = chopByChar(nRangeStr, ',', NULL, 0);
 char *nRangeArr[nRangeCount];
 chopCommas(nRangeStr, nRangeArr);
 int i;
 for (i = 0;  i < nRangeCount;  i++)
     {
     char *nRange = nRangeArr[i];
     int nStart = atol(nRange) - 1;
     int nEnd = nStart + 1;
     char *p = strchr(nRange, '-');
     if (p)
         nEnd = atol(p+1);
     int t;
     for (t = nStart;  t < nEnd;  t++)
         if (!maskSites[t])
+            {
             addCall(snvsByPos, t, ref->dna[t], 'n', gtIx, gtCount);
+            positionsCalled[t] = TRUE;
+            }
     }
 }
 
 static void parseNextcladeAmbig(char *ambigStr, struct dnaSeq *ref,
-                                int gtIx, int gtCount, struct snvInfo **snvsByPos)
+                                int gtIx, int gtCount, struct snvInfo **snvsByPos,
+                                boolean *positionsCalled)
 /*  Add calls to snvsByPos for non-N ambiguous bases */
 {
 int ambigCount = chopByChar(ambigStr, ',', NULL, 0);
 char *ambigArr[ambigCount];
 chopCommas(ambigStr, ambigArr);
 int i;
 for (i = 0;  i < ambigCount;  i++)
     {
     char *ambig = ambigArr[i];
     char *p = strchr(ambig, ':');
     if (! p)
         errAbort("missing ':' separator in ambig '%s'", ambig);
     int aStart = atol(p+1) - 1;
     int aEnd = aStart+1;
     p = strchr(ambig, '-');
     if (p != NULL)
         aEnd = atol(p+1);
     int t;
     for (t = aStart;  t < aEnd;  t++)
+        {
         addCall(snvsByPos, t, ref->dna[t], ambig[0], gtIx, gtCount);
+        positionsCalled[t] = TRUE;
+        }
     }
 }
 
 static void parseNextcladeDeletions(char *delStr, struct seqInfo *si, struct dnaSeq *ref)
 /*  We don't add calls for insertions or deletions, but we do describe them in seqInfo */
 {
 int delRangeCount = chopByChar(delStr, ',', NULL, 0);
 char *delRangeArr[delRangeCount];
 chopCommas(delStr, delRangeArr);
 int delBases = 0;
 struct dyString *dy = dyStringNew(0);
 int i;
 for (i = 0;  i < delRangeCount;  i++)
     {
     char *delRange = delRangeArr[i];
@@ -855,86 +864,109 @@
         errAbort("missing ':' separator in insertion '%s'", ins);
     char *insSeq = p+1;
     int insLen = strlen(insSeq);
     insBases += insLen;
     if (isNotEmpty(dy->string))
         dyStringAppend(dy, ", ");
     if (insLen <= 12)
         dyStringPrintf(dy, "%d-%d:%s", t, t+1, insSeq);
     else
         dyStringPrintf(dy, "%d-%d:%d bases", t, t+1, insLen);
     }
 si->insBases = insBases;
 si->insRanges = dyStringCannibalize(&dy);
 }
 
+static void addRefCalls(struct snvInfo **snvsByPos, boolean *positionsCalled, struct dnaSeq *ref,
+                        int gtIx, int gtCount)
+/* Add call for ref allele at any base not already called otherwise. */
+{
+int i;
+for (i = 0;  i < ref->size;  i++)
+    {
+    if (! positionsCalled[i])
+        addCall(snvsByPos, i, ref->dna[i], ref->dna[i], gtIx, gtCount);
+    }
+}
+
 static struct snvInfo **parseNextcladeRun(char *nextcladeOut, struct seqInfo **pFilteredSeqs,
                                           struct dnaSeq *ref, struct slName **maskSites,
                                           struct slPair **retFailedSeqs)
 /* Parse the TSV output from nextcladeRun into a per-base array of snvInfo for making VCF,
  * while filling in alignment-related seqInfo fields.
  * Append any unaligned sequences to retFailedSeqs and set pFilteredSeqs to aligned seqs. */
 {
 struct snvInfo **snvsByPos = NULL;
 AllocArray(snvsByPos, ref->size);
 int gtCount = countNextcladeAlignedSeqs(nextcladeOut, pFilteredSeqs, retFailedSeqs);
 struct lineFile *lf = lineFileOpen(nextcladeOut, TRUE);
 // First line is header; get column indexes
 char *row[100];
 int headerColCount = lineFileChopTab(lf, row);
 if (headerColCount <= 0)
     errAbort("nextclade run output is empty");
 int seqNameIx = stringArrayIx("seqName", row,  headerColCount);
 int alStartIx = stringArrayIx("alignmentStart", row,  headerColCount);
 int alEndIx = stringArrayIx("alignmentEnd", row,  headerColCount);
 int substIx = stringArrayIx("substitutions", row,  headerColCount);
 int delIx = stringArrayIx("deletions", row,  headerColCount);
 int insIx = stringArrayIx("insertions", row,  headerColCount);
 int nIx = stringArrayIx("missing", row,  headerColCount);
 int ambigIx = stringArrayIx("nonACGTNs", row,  headerColCount);
 if (seqNameIx == -1 || alStartIx == -1 || alEndIx == -1 || substIx == -1 || delIx == -1 ||
     insIx == -1 || nIx == -1 || ambigIx == -1)
     errAbort("nextclade run output header line does not contain all expected columns");
+boolean *positionsCalled = NULL;
+AllocArray(positionsCalled, ref->size);
 int colCount = 0;
 while ((colCount = lineFileChopTab(lf, row)) > 0)
     {
+    zeroBytes(positionsCalled, ref->size * sizeof positionsCalled[0]);
     char *seqName = row[seqNameIx];
     // Nextclade still includes unaligned sequences in its output, just with empty alignment columns
     if (isEmpty(row[alStartIx]))
         continue;
     int gtIx;
     struct seqInfo *si = mustFindSeqAndIx(seqName, *pFilteredSeqs, &gtIx);
     int tStart = atol(row[alStartIx]) - 1;
     // Add no-calls for unaligned bases before alignmentStart
     int t;
     for (t = 0;  t < tStart;  t++)
         {
         if (!maskSites[t])
+            {
             addCall(snvsByPos, t, ref->dna[t], 'n', gtIx, gtCount);
+            positionsCalled[t] = TRUE;
+            }
         }
     int tEnd = atol(row[alEndIx]);
     // Add no-calls for unaligned bases after alignmentEnd
     for (t = tEnd;  t < ref->size;  t++)
         {
         if (!maskSites[t])
+            {
             addCall(snvsByPos, t, ref->dna[t], 'n', gtIx, gtCount);
+            positionsCalled[t] = TRUE;
+            }
         }
-    parseNextcladeSubstitutions(row[substIx], si, ref, maskSites, gtIx, gtCount, snvsByPos);
-    parseNextcladeMissing(row[nIx], ref, maskSites, gtIx, gtCount, snvsByPos);
-    parseNextcladeAmbig(row[ambigIx], ref, gtIx, gtCount, snvsByPos);
+    parseNextcladeSubstitutions(row[substIx], si, ref, maskSites, gtIx, gtCount, snvsByPos,
+                                positionsCalled);
+    parseNextcladeMissing(row[nIx], ref, maskSites, gtIx, gtCount, snvsByPos, positionsCalled);
+    parseNextcladeAmbig(row[ambigIx], ref, gtIx, gtCount, snvsByPos, positionsCalled);
     parseNextcladeDeletions(row[delIx], si, ref);
     parseNextcladeInsertions(row[insIx], si, ref);
+    addRefCalls(snvsByPos, positionsCalled, ref, gtIx, gtCount);
     si->basesAligned = tEnd - tStart - si->delBases - si->insBases;
     si->tStart = tStart;
     si->tEnd = tEnd;
     }
 lineFileClose(&lf);
 return snvsByPos;
 }
 
 static struct tempName *alignWithNextclade(struct seqInfo *filteredSeqs, char *nextcladeDataset,
                                            struct dnaSeq *ref, struct slName **maskSites,
                                            struct slName **retSampleIds,
                                            struct seqInfo **retSeqInfo,
                                            struct slPair **retFailedSeqs, int *pStartTime)
 /* Use nextclade to align filteredSeqs to ref, extract SNVs from alignment, and
  * save as VCF with sample genotype columns. */