5956c40955854fd83b75627124f6420518a78960
angie
  Wed Feb 24 20:51:42 2021 -0800
Add more columns from summary table to TSV summary file.  Move PSL indel-tallying code upstream & store results for summaries.

diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index 6f49fe1..5efe19e 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -1340,31 +1340,30 @@
     dyStringPrintf(dy, "%d N bases at end", si->nCountEnd);
 }
 
 static void summarizeSequences(struct seqInfo *seqInfoList, boolean isFasta,
                                struct usherResults *ur, struct tempName *jsonTns[],
                                struct hash *sampleMetadata, struct mutationAnnotatedTree *bigTree,
                                struct dnaSeq *refGenome)
 /* Show a table with composition & alignment stats for each sequence that passed basic QC. */
 {
 if (seqInfoList)
     {
     puts("<table class='seqSummary'>");
     printSummaryHeader(isFasta);
     puts("<tbody>");
     struct dyString *dy = dyStringNew(0);
-    struct dyString *dyExtra = dyStringNew(0);
     struct seqInfo *si;
     for (si = seqInfoList;  si != NULL;  si = si->next)
         {
         puts("<tr>");
         printf("<th>%s</td>", replaceChars(si->seq->name, "|", " | "));
         if (isFasta)
             {
             if (si->nCountStart || si->nCountEnd)
                 {
                 int effectiveLength = si->seq->size - (si->nCountStart + si->nCountEnd);
                 dyStringClear(dy);
                 dyStringPrintf(dy, "%d ", effectiveLength);
                 appendExcludingNs(dy, si);
                 dyStringPrintf(dy, " (original size %d)", si->seq->size);
                 printf("<td class='%s'>%d", qcClassForLength(effectiveLength), effectiveLength);
@@ -1405,98 +1404,41 @@
                                si->ambigCount - alignedAmbigCount);
             printTooltip(dy->string);
             }
         printf("</td>");
         if (isFasta)
             {
             struct psl *psl = si->psl;
             if (psl)
                 {
                 int aliCount = psl->match + psl->misMatch + psl->repMatch;
                 printf("<td class='%s'>%d ", qcClassForLength(aliCount), aliCount);
                 dyStringClear(dy);
                 dyStringPrintf(dy, "bases %d - %d align to reference bases %d - %d",
                                psl->qStart+1, psl->qEnd, psl->tStart+1, psl->tEnd);
                 printTooltip(dy->string);
-                int insBases = 0, insCount = 0, delBases = 0, delCount = 0;
-                if (psl->qBaseInsert || psl->tBaseInsert)
-                    {
-                    // Tally up actual insertions and deletions; ignore skipped N bases.
-                    dyStringClear(dy);
-                    dyStringClear(dyExtra);
-                    int ix;
-                    for (ix = 0;  ix < psl->blockCount - 1;  ix++)
-                        {
-                        int qGapStart = psl->qStarts[ix] + psl->blockSizes[ix];
-                        int qGapEnd = psl->qStarts[ix+1];
-                        int qGapLen = qGapEnd - qGapStart;
-                        int tGapStart = psl->tStarts[ix] + psl->blockSizes[ix];
-                        int tGapEnd = psl->tStarts[ix+1];
-                        int tGapLen = tGapEnd - tGapStart;
-                        if (qGapLen > tGapLen)
-                            {
-                            insCount++;
-                            int insLen = qGapLen - tGapLen;
-                            insBases += insLen;
-                            if (isNotEmpty(dy->string))
-                                dyStringAppend(dy, ", ");
-                            if (insLen <= 12)
-                                {
-                                char insSeq[insLen+1];
-                                safencpy(insSeq, sizeof insSeq, si->seq->dna + qGapEnd - insLen,
-                                         insLen);
-                                touppers(insSeq);
-                                dyStringPrintf(dy, "%d-%d:%s",
-                                               tGapEnd, tGapEnd+1, insSeq);
-                                }
-                            else
-                                dyStringPrintf(dy, "%d-%d:%d bases",
-                                               tGapEnd, tGapEnd+1, insLen);
-                            }
-                        else if (tGapLen > qGapLen)
-                            {
-                            delCount++;
-                            int delLen = tGapLen - qGapLen;;
-                            delBases += delLen;
-                            if (isNotEmpty(dyExtra->string))
-                                dyStringAppend(dyExtra, ", ");
-                            if (delLen <= 12)
-                                {
-                                char delSeq[delLen+1];
-                                safencpy(delSeq, sizeof delSeq, refGenome->dna + tGapEnd - delLen,
-                                         delLen);
-                                touppers(delSeq);
-                                dyStringPrintf(dyExtra, "%d-%d:%s",
-                                               tGapEnd - delLen + 1, tGapEnd, delSeq);
-                                }
-                            else
-                                dyStringPrintf(dyExtra, "%d-%d:%d bases",
-                                               tGapEnd - delLen + 1, tGapEnd, delLen);
-                            }
-                        }
-                    }
                 printf("</td><td class='%s'>%d ",
-                       qcClassForIndel(insBases), insBases);
-                if (insBases)
+                       qcClassForIndel(si->insBases), si->insBases);
+                if (si->insBases)
                     {
-                    printTooltip(dy->string);
+                    printTooltip(si->insRanges);
                     }
                 printf("</td><td class='%s'>%d ",
-                       qcClassForIndel(delBases), delBases);
-                if (delBases)
+                       qcClassForIndel(si->delBases), si->delBases);
+                if (si->delBases)
                     {
-                    printTooltip(dyExtra->string);
+                    printTooltip(si->delRanges);
                     }
                 printf("</td>");
                 }
             else
                 printf("<td colspan=3 class='%s'> not alignable </td>",
                        qcClassForLength(0));
             }
         int snvCount = slCount(si->sncList) - alignedAmbigCount;
         printf("<td class='%s'>%d", qcClassForSNVs(snvCount), snvCount);
         if (snvCount > 0)
             {
             dyStringClear(dy);
             struct singleNucChange *snc;
             for (snc = si->sncList;  snc != NULL;  snc = snc->next)
                 {
@@ -1618,31 +1560,32 @@
 struct baseVal *bv;
 for (bv = imputedBases;  bv != NULL;  bv = bv->next)
     {
     struct singleNucChange *snc;
     AllocVar(snc);
     snc->chromStart = bv->chromStart;
     snc->refBase = '?';
     snc->newBase = bv->val[0];
     slAddHead(&sncList, snc);
     }
 slReverse(&sncList);
 return sncList;
 }
 
 static void writeOneTsvRow(FILE *f, char *sampleId, struct usherResults *results,
-                           struct geneInfo *geneInfoList, struct seqWindow *gSeqWin)
+                           struct hash *seqInfoHash, struct geneInfo *geneInfoList,
+                           struct seqWindow *gSeqWin)
 /* Write one row of tab-separate summary for sampleId. */
 {
     struct placementInfo *info = hashFindVal(results->samplePlacements, sampleId);
     // sample name / ID
     fprintf(f, "%s\t", sampleId);
     // nucleotide mutations
     struct slName *mut;
     for (mut = info->sampleMuts;  mut != NULL;  mut = mut->next)
         {
         if (mut != info->sampleMuts)
             fputc(',', f);
         fputs(mut->name, f);
         }
     fputc('\t', f);
     // AA mutations
@@ -1663,73 +1606,125 @@
             fprintf(f, "%s:%s", geneAaMut->name, aaMut->name);
             }
         }
     fputc('\t', f);
     // imputed bases (if any)
     struct baseVal *bv;
     for (bv = info->imputedBases;  bv != NULL;  bv = bv->next)
         {
         if (bv != info->imputedBases)
             fputc(',', f);
         fprintf(f, "%d%s", bv->chromStart+1, bv->val);
         }
     fputc('\t', f);
     // path through tree to sample
     printVariantPathNoNodeNames(f, info->variantPath);
+    // number of equally parsimonious placements
+    fprintf(f, "\t%d", info->bestNodeCount);
+    // parsimony score
+    fprintf(f, "\t%d", info->parsimonyScore);
+    struct seqInfo *si = hashFindVal(seqInfoHash, sampleId);
+    if (si)
+        {
+        if (si->psl)
+            {
+            // length
+            fprintf(f, "\t%d", si->seq->size);
+            struct psl *psl = si->psl;
+            // aligned bases, indel counts & ranges
+            int aliCount = psl->match + psl->misMatch + psl->repMatch;
+            fprintf(f, "\t%d\t%d\t%s\t%d\t%s",
+                    aliCount, si->insBases, emptyForNull(si->insRanges),
+                    si->delBases, emptyForNull(si->delRanges));
+            }
+        else
+            fprintf(f, "\t\t\t\t\t\t");
+        // SNVs that were masked (Problematic Sites track), not used in placement
+        fputc('\t', f);
+        struct singleNucChange *snc;
+        for (snc = si->maskedSncList;  snc != NULL;  snc = snc->next)
+            {
+            if (snc != si->maskedSncList)
+                fputc(',', f);
+            fprintf(f, "%c%d%c", snc->refBase, snc->chromStart+1, snc->newBase);
+            }
+        }
+    else
+        {
+        warn("writeOneTsvRow: no sequenceInfo for sample '%s'", sampleId);
+        fprintf(f, "\t\t\t\t\t\t\t");
+        }
     fputc('\n', f);
 }
 
 static void rWriteTsvSummaryTreeOrder(struct phyloTree *node, FILE *f, struct usherResults *results,
-                                      struct geneInfo *geneInfoList, struct seqWindow *gSeqWin)
+                                      struct hash *seqInfoHash, struct geneInfo *geneInfoList,
+                                      struct seqWindow *gSeqWin)
 /* As we encounter leaves (user-uploaded samples) in depth-first search order, write out a line
  * of TSV summary for each one. */
 {
 if (node->numEdges)
     {
     int i;
     for (i = 0;  i < node->numEdges;  i++)
-        rWriteTsvSummaryTreeOrder(node->edges[i], f, results, geneInfoList, gSeqWin);
+        rWriteTsvSummaryTreeOrder(node->edges[i], f, results, seqInfoHash, geneInfoList, gSeqWin);
     }
 else
     {
-    writeOneTsvRow(f, node->ident->name, results, geneInfoList, gSeqWin);
+    writeOneTsvRow(f, node->ident->name, results, seqInfoHash, geneInfoList, gSeqWin);
     }
 }
 
 
+static struct hash *hashFromSeqInfoList(struct seqInfo *seqInfoList)
+/* Hash sequence name to seqInfo for quick lookup. */
+{
+struct hash *hash = hashNew(0);
+struct seqInfo *si;
+for (si = seqInfoList;  si != NULL;  si = si->next)
+    hashAdd(hash, si->seq->name, si);
+return hash;
+}
+
 static struct tempName *writeTsvSummary(struct usherResults *results, struct phyloTree *sampleTree,
-                                        struct slName *sampleIds, struct geneInfo *geneInfoList,
-                                        struct seqWindow *gSeqWin, int *pStartTime)
+                                        struct slName *sampleIds, struct seqInfo *seqInfoList,
+                                        struct geneInfo *geneInfoList, struct seqWindow *gSeqWin,
+                                        int *pStartTime)
 /* Write a tab-separated summary file for download.  If the user uploaded enough samples to make
  * a tree, then write out samples in tree order; otherwise use sampleIds list. */
 {
 struct tempName *tsvTn = NULL;
 AllocVar(tsvTn);
 trashDirFile(tsvTn, "ct", "usher", ".tsv");
 FILE *f = mustOpen(tsvTn->forCgi, "w");
-fprintf(f, "name\tnuc_mutations\taa_mutations\timputed_bases\tmutation_path\n");
+fprintf(f, "name\tnuc_mutations\taa_mutations\timputed_bases\tmutation_path"
+        "\tplacement_count\tparsimony_score_increase\tlength\taligned_bases"
+        "\tins_bases\tins_ranges\tdel_bases\tdel_ranges\tmasked_mutations"
+        "\n");
+struct hash *seqInfoHash = hashFromSeqInfoList(seqInfoList);
 if (sampleTree)
     {
-    rWriteTsvSummaryTreeOrder(sampleTree, f, results, geneInfoList, gSeqWin);
+    rWriteTsvSummaryTreeOrder(sampleTree, f, results, seqInfoHash, geneInfoList, gSeqWin);
     }
 else
     {
     struct slName *sample;
     for (sample = sampleIds;  sample != NULL;  sample = sample->next)
-        writeOneTsvRow(f, sample->name, results, geneInfoList, gSeqWin);
+        writeOneTsvRow(f, sample->name, results, seqInfoHash, geneInfoList, gSeqWin);
     }
 carefulClose(&f);
+hashFree(&seqInfoHash);
 reportTiming(pStartTime, "write tsv summary");
 return tsvTn;
 }
 
 static struct slName **getProblematicSites(char *db)
 /* If config.ra specfies maskFile them return array of lists (usually NULL) of reasons that
  * masking is recommended, one per position in genome; otherwise return NULL. */
 {
 struct slName **pSites = NULL;
 char *pSitesFile = phyloPlaceDbSettingPath(db, "maskFile");
 if (isNotEmpty(pSitesFile) && fileExists(pSitesFile))
     {
     AllocArray(pSites, chromSize);
     struct bbiFile *bbi = bigBedFileOpen(pSitesFile);
     struct lm *lm = lmInit(0);
@@ -1917,32 +1912,32 @@
                 }
             reportTiming(&startTime, "describe placements");
             }
         else
             printf("<p>(Skipping details and subtrees; "
                    "you uploaded %d sequences, and details/subtrees are shown only when "
                    "you upload at most %d sequences.)</p>\n",
                    seqCount, MAX_SEQ_DETAILS);
 
         // Make custom tracks for uploaded samples and subtree(s).
         struct phyloTree *sampleTree = NULL;
         struct tempName *ctTn = writeCustomTracks(vcfTn, results, sampleIds, bigTree->tree,
                                                   source, fontHeight, &sampleTree, &startTime);
 
         // Make a TSV summary file
-        struct tempName *tsvTn = writeTsvSummary(results, sampleTree, sampleIds, geneInfoList,
-                                                 gSeqWin, &startTime);
+        struct tempName *tsvTn = writeTsvSummary(results, sampleTree, sampleIds, seqInfoList,
+                                                 geneInfoList, gSeqWin, &startTime);
 
         // Offer big tree w/new samples for download
         puts("<h3>Downloads</h3>");
         puts("<ul>");
         printf("<li><a href='%s' download>SARS-CoV-2 phylogenetic tree "
                "with your samples (Newick file)</a>\n", results->bigTreePlusTn->forHtml);
         printf("<li><a href='%s' download>TSV summary of sequences and placements</a>\n",
                tsvTn->forHtml);
         for (ix = 0, ti = results->subtreeInfoList;  ti != NULL;  ti = ti->next, ix++)
            {
             int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds);
             printf("<li><a href='%s' download>Subtree with %s", ti->subtreeTn->forHtml,
                    ti->subtreeUserSampleIds->name);
             if (subtreeUserSampleCount > 10)
                 printf(" and %d other samples", subtreeUserSampleCount - 1);