5956c40955854fd83b75627124f6420518a78960
angie
  Wed Feb 24 20:51:42 2021 -0800
Add more columns from summary table to TSV summary file.  Move PSL indel-tallying code upstream & store results for summaries.

diff --git src/hg/hgPhyloPlace/vcfFromFasta.c src/hg/hgPhyloPlace/vcfFromFasta.c
index 501fbf0..ca46b1e 100644
--- src/hg/hgPhyloPlace/vcfFromFasta.c
+++ src/hg/hgPhyloPlace/vcfFromFasta.c
@@ -469,51 +469,114 @@
 /* Find single-nucleotide differences between each query sequence and reference, and
  * write out a VCF file with genotype columns for the queries. */
 {
 struct snvInfo *snvsByPos[ref->size];
 memset(snvsByPos, 0, sizeof snvsByPos);
 extractSnvs(psls, ref, querySeqs, snvsByPos, informativeBases, maskSites);
 int sampleCount = slCount(querySeqs);
 char *sampleNames[sampleCount];
 struct seqInfo *qSeq;
 int i;
 for (i = 0, qSeq = querySeqs;  i < sampleCount;  i++, qSeq = qSeq->next)
     sampleNames[i] = qSeq->seq->name;
 writeSnvsToVcfFile(snvsByPos, ref, sampleNames, sampleCount, maskSites, vcfFileName);
 }
 
+static void analyzeGaps(struct seqInfo *filteredSeqs, struct dnaSeq *refGenome)
+/* Tally up actual insertions and deletions in each psl; ignore skipped N bases. */
+{
+struct seqInfo *si;
+for (si = filteredSeqs;  si != NULL;  si = si->next)
+    {
+    struct psl *psl = si->psl;
+    if (psl && (psl->qBaseInsert || psl->tBaseInsert))
+        {
+        struct dyString *dyIns = dyStringNew(0);
+        struct dyString *dyDel = dyStringNew(0);
+        int insBases = 0, delBases = 0;
+        int ix;
+        for (ix = 0;  ix < psl->blockCount - 1;  ix++)
+            {
+            int qGapStart = psl->qStarts[ix] + psl->blockSizes[ix];
+            int qGapEnd = psl->qStarts[ix+1];
+            int qGapLen = qGapEnd - qGapStart;
+            int tGapStart = psl->tStarts[ix] + psl->blockSizes[ix];
+            int tGapEnd = psl->tStarts[ix+1];
+            int tGapLen = tGapEnd - tGapStart;
+            if (qGapLen > tGapLen)
+                {
+                int insLen = qGapLen - tGapLen;
+                insBases += insLen;
+                if (isNotEmpty(dyIns->string))
+                    dyStringAppend(dyIns, ", ");
+                if (insLen <= 12)
+                    {
+                    char insSeq[insLen+1];
+                    safencpy(insSeq, sizeof insSeq, si->seq->dna + qGapEnd - insLen, insLen);
+                    touppers(insSeq);
+                    dyStringPrintf(dyIns, "%d-%d:%s", tGapEnd, tGapEnd+1, insSeq);
+                    }
+                else
+                    dyStringPrintf(dyIns, "%d-%d:%d bases", tGapEnd, tGapEnd+1, insLen);
+                }
+            else if (tGapLen > qGapLen)
+                {
+                int delLen = tGapLen - qGapLen;;
+                delBases += delLen;
+                if (isNotEmpty(dyDel->string))
+                    dyStringAppend(dyDel, ", ");
+                if (delLen <= 12)
+                    {
+                    char delSeq[delLen+1];
+                    safencpy(delSeq, sizeof delSeq, refGenome->dna + tGapEnd - delLen, delLen);
+                    touppers(delSeq);
+                    dyStringPrintf(dyDel, "%d-%d:%s", tGapEnd - delLen + 1, tGapEnd, delSeq);
+                    }
+                else
+                    dyStringPrintf(dyDel, "%d-%d:%d bases", tGapEnd - delLen + 1, tGapEnd, delLen);
+                }
+            }
+        si->insBases = insBases;
+        si->delBases = delBases;
+        si->insRanges = dyStringCannibalize(&dyIns);
+        si->delRanges = dyStringCannibalize(&dyDel);
+        }
+    }
+}
+
 struct tempName *vcfFromFasta(struct lineFile *lf, char *db, struct dnaSeq *refGenome,
                               boolean *informativeBases, struct slName **maskSites,
                               struct slName **retSampleIds, struct seqInfo **retSeqInfo,
                               struct slPair **retFailedSeqs, struct slPair **retFailedPsls,
                               int *pStartTime)
 /* Read in FASTA from lf and make sure each item has a reasonable size and not too high
  * percentage of N's.  Align to reference, extract SNVs from alignment, and save as VCF
  * with sample genotype columns. */
 {
 struct tempName *tn = NULL;
 struct slName *sampleIds = NULL;
 struct dnaSeq *allSeqs = faReadAllMixedInLf(lf);
 struct seqInfo *filteredSeqs = checkSequences(allSeqs, retFailedSeqs);
 reportTiming(pStartTime, "read and check uploaded FASTA");
 if (filteredSeqs)
     {
     struct psl *alignments = alignSequences(db, refGenome, filteredSeqs, pStartTime);
     struct psl *filteredAlignments = checkAlignments(alignments, filteredSeqs, retFailedPsls);
     if (filteredAlignments)
         {
         AllocVar(tn);
         trashDirFile(tn, "ct", "pp", ".vcf");
         pslSnvsToVcfFile(filteredAlignments, refGenome, filteredSeqs, tn->forCgi, informativeBases,
                          maskSites);
         struct psl *psl;
         for (psl = filteredAlignments;  psl != NULL;  psl = psl->next)
             slNameAddHead(&sampleIds, psl->qName);
+        analyzeGaps(filteredSeqs, refGenome);
         slReverse(&sampleIds);
         reportTiming(pStartTime, "write VCF for uploaded FASTA");
         }
     }
 *retSampleIds = sampleIds;
 *retSeqInfo = filteredSeqs;
 return tn;
 }