bdc1ddd9389c69f90464e15fff6661b52befdd1c
angie
  Sun May 23 13:49:32 2021 -0700
If a sequence can't be aligned to the reference then move it from filteredSeqs to failedSeqs so it doesn't end up in the VCF with no actual calls.

diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c
index c929d7a..e9082a5 100644
--- src/hg/hgPhyloPlace/vcfFromFasta.c
+++ src/hg/hgPhyloPlace/vcfFromFasta.c
@@ -202,30 +202,65 @@
         si = hashFindVal(userSeqsByName, psl->qName);
         if (si)
             si->psl = psl;
         else
             warn("Aligned sequence name '%s' does not match any input sequence name", psl->qName);
         slAddHead(&filteredPsls, psl);
         }
     }
 hashFree(&alignedSeqs);
 hashFree(&userSeqsByName);
 slReverse(&filteredPsls);
 slReverse(retFailedPsls);
 return filteredPsls;
 }
 
+static void removeUnalignedSeqs(struct seqInfo **pSeqs, struct psl *alignments,
+                                struct slPair **pFailedSeqs)
+/* If any seqs were not aligned, then remove them from *pSeqs and add them to *pFailedSeqs. */
+{
+struct seqInfo *alignedSeqs = NULL;
+struct slPair *newFailedSeqs = NULL;
+struct seqInfo *seq, *nextSeq;
+for (seq = *pSeqs;  seq != NULL;  seq = nextSeq)
+    {
+    nextSeq = seq->next;
+    boolean found = FALSE;
+    struct psl *psl;
+    for (psl = alignments;  psl != NULL;  psl = psl->next)
+        {
+        if (sameString(psl->qName, seq->seq->name))
+            {
+            found = TRUE;
+            break;
+            }
+        }
+    if (found)
+        slAddHead(&alignedSeqs, seq);
+    else
+        {
+        struct dyString *dy = dyStringCreate("Sequence %s could not be aligned to the reference; "
+                                             "skipping", seq->seq->name);
+        slPairAdd(&newFailedSeqs, dyStringCannibalize(&dy), seq);
+        }
+    }
+slReverse(&alignedSeqs);
+slReverse(&newFailedSeqs);
+*pSeqs = alignedSeqs;
+*pFailedSeqs = slCat(*pFailedSeqs, newFailedSeqs);
+}
+
 struct snvInfo
 /* Summary of SNV with sample genotypes (position and samples are externally defined) */
     {
     int altCount;     // number of alternate alleles
     int altAlloced;   // number of allocated array elements for alt alleles
     char *altBases;   // alternate allele values (not '\0'-terminated string)
     int *altCounts;   // number of occurrences of each alt allele
     int *genotypes;   // -1 for no-call, 0 for ref, otherwise 1+index in altBases
     int unkCount;     // number of no-calls (genotype is -1)
     char refBase;     // reference allele
     };
 
 static struct snvInfo *snvInfoNew(char refBase, char altBase, int gtIx, int gtCount)
 /* Alloc & return a new snvInfo for ref & alt.  If alt == ref, just record the genotype call. */
 {
@@ -549,30 +584,31 @@
                               struct slPair **retFailedSeqs, struct slPair **retFailedPsls,
                               int *pStartTime)
 /* Read in FASTA from lf and make sure each item has a reasonable size and not too high
  * percentage of N's.  Align to reference, extract SNVs from alignment, and save as VCF
  * with sample genotype columns. */
 {
 struct tempName *tn = NULL;
 struct slName *sampleIds = NULL;
 struct dnaSeq *allSeqs = faReadAllMixedInLf(lf);
 struct seqInfo *filteredSeqs = checkSequences(allSeqs, retFailedSeqs);
 reportTiming(pStartTime, "read and check uploaded FASTA");
 if (filteredSeqs)
     {
     struct psl *alignments = alignSequences(db, refGenome, filteredSeqs, pStartTime);
     struct psl *filteredAlignments = checkAlignments(alignments, filteredSeqs, retFailedPsls);
+    removeUnalignedSeqs(&filteredSeqs, filteredAlignments, retFailedSeqs);
     if (filteredAlignments)
         {
         AllocVar(tn);
         trashDirFile(tn, "ct", "pp", ".vcf");
         pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, informativeBases,
                          maskSites);
         struct psl *psl;
         for (psl = filteredAlignments;  psl != NULL;  psl = psl->next)
             slNameAddHead(&sampleIds, psl->qName);
         analyzeGaps(filteredSeqs, refGenome);
         slReverse(&sampleIds);
         reportTiming(pStartTime, "write VCF for uploaded FASTA");
         }
     }
 *retSampleIds = sampleIds;