327d2ed5c8b98712241ead111cb880b19c87666b
angie
  Tue Apr 13 17:50:41 2021 -0700
Add spike mutation category (VoC, escape, RBD, cleavage, spike, D614G) to spike mutation summary TSV output.

diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index 10f181d..1edf288 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -1693,37 +1693,79 @@
     }
 struct baseVal *bv;
 for (bv = imputedBases;  bv != NULL;  bv = bv->next)
     {
     struct singleNucChange *snc;
     AllocVar(snc);
     snc->chromStart = bv->chromStart;
     snc->refBase = '?';
     snc->newBase = bv->val[0];
     slAddHead(&sncList, snc);
     }
 slReverse(&sncList);
 return sncList;
 }
 
+enum spikeMutType
+/* Some categories of Spike mutation are more concerning than others. */
+{
+    smtNone,        // Just a spike mutation.
+    smtVoC,         // Thought to be the problematic mutation in a Variant of Concern.
+    smtEscape,      // Implicated in Antibody Escape experiments.
+    smtRbd,         // Receptor Binding Domain
+    smtCleavage,    // Furin cleavage site
+    smtD614G        // Went from rare to ~99% frequency in first half of 2020; old news.
+};
+
+static char *spikeMutTypeToString(enum spikeMutType smt)
+/* Returns a string version of smt.  Do not free the returned value. */
+{
+char *string = NULL;
+switch (smt)
+    {
+    case smtNone:
+        string = "spike";
+        break;
+    case smtVoC:
+        string = "VoC";
+        break;
+    case smtEscape:
+        string = "escape";
+        break;
+    case smtRbd:
+        string = "RBD";
+        break;
+    case smtCleavage:
+        string = "cleavage";
+        break;
+    case smtD614G:
+        string = "D614G";
+        break;
+    default:
+        errAbort("spikeMutTypeToString: Invalid value %d", smt);
+    }
+return string;
+}
+
 struct aaMutInfo
 // A protein change, who has it, and how important we think it is.
 {
     char *name;                 // The name that people are used to seeing, e.g. "E484K', "N501Y"
     struct slName *sampleIds;   // The uploaded samples that have it
     int priority;               // For sorting; lower number means scarier.
     int pos;                    // 1-based position
+    enum spikeMutType spikeType;// If spike mutation, what kind?
     char oldAa;                 // Reference AA
     char newAa;                 // Alt AA
 };
 
 int aaMutInfoCmp(const void *a, const void *b)
 /* Compare aaMutInfo priority for sorting. */
 {
 const struct aaMutInfo *amiA = *(struct aaMutInfo * const *)a;
 const struct aaMutInfo *amiB = *(struct aaMutInfo * const *)b;
 return amiA->priority - amiB->priority;
 }
 
 // For now, hardcode SARS-CoV-2 Spike RBD coords and antibody escape positions (1-based).
 static int rbdStart = 319;
 static int rbdEnd = 541;
@@ -1794,83 +1836,108 @@
 escapeMutPos[494] = TRUE;
 escapeMutPos[499] = TRUE;
 escapeMutPos[504] = TRUE;
 escapeMutPos[514] = TRUE;
 escapeMutPos[517] = TRUE;
 escapeMutPos[522] = TRUE;
 escapeMutPos[525] = TRUE;
 escapeMutPos[526] = TRUE;
 escapeMutPos[527] = TRUE;
 escapeMutPos[528] = TRUE;
 escapeMutPos[529] = TRUE;
 escapeMutPos[530] = TRUE;
 escapeMutPos[531] = TRUE;
 }
 
-static int spikePriority(int pos, char newAa)
-/* Lower number for scarier spike mutation, per spike mutations track / RBD. */
+static int spikePriority(int pos, char newAa, enum spikeMutType *pSmt)
+/* Lower number for scarier spike mutation, per spike mutations track / RBD. */
 {
 if (escapeMutPos == NULL)
     initEscapeMutPos();
 int priority = 0;
+enum spikeMutType smt = smtNone;
 if (pos >= rbdStart && pos <= rbdEnd)
     {
     // Receptor binding domain
     priority = 100;
+    smt = smtRbd;
     // Antibody escape mutation in Variant of Concern/Interest
     if (pos == 484)
+        {
         priority = 10;
+        smt = smtVoC;
+        }
     else if (pos == 501)
+        {
         priority = 15;
+        smt = smtVoC;
+        }
     else if (pos == 452)
+        {
         priority = 20;
+        smt = smtVoC;
+        }
     // Other antibody escape mutations
     else if (pos == 439 || pos == 477)
+        {
         priority = 25;
+        smt = smtEscape;
+        }
     else if (escapeMutPos[pos])
+        {
         priority = 50;
+        smt = smtEscape;
+        }
     }
 else if (pos >= 675 && pos <= 681)
+    {
     // Furin cleavage site; circumstantial evidence for Q677{H,P} spread in US.
     // Interesting threads on SPHERES 2021-02-26 about P681H tradeoff between infectivity vs
     // range of cell types that can be infected and other observations about that region.
     priority = 110;
+    smt = smtCleavage;
+    }
 else if (pos == 614 && newAa == 'G')
+    {
     // Old hat
     priority = 1000;
+    smt = smtD614G;
+    }
 else
     // Somewhere else in Spike
     priority = 500;
+if (pSmt != NULL)
+    *pSmt = smt;
 return priority;
 }
 
 static void addSpikeChange(struct hash *spikeChanges, char *aaMutStr, char *sampleId)
 /* Tally up an observance of a S gene change in a sample. */
 {
 // Parse oldAa, pos, newAA out of aaMutStr.  Need a more elegant way of doing this;
 // this is a rush job to get something usable out there asap.
 char oldAa = aaMutStr[0];
 int pos = atoi(aaMutStr+1);
 char newAa = aaMutStr[strlen(aaMutStr)-1];
 struct hashEl *hel = hashLookup(spikeChanges, aaMutStr);
 if (hel == NULL)
     {
     struct aaMutInfo *ami;
     AllocVar(ami);
     ami->name = cloneString(aaMutStr);
     slNameAddHead(&ami->sampleIds, sampleId);
-    ami->priority = spikePriority(pos, newAa);
+    ami->priority = spikePriority(pos, newAa, &ami->spikeType);
     ami->pos = pos;
     ami->oldAa = oldAa;
     ami->newAa = newAa;
     hashAdd(spikeChanges, aaMutStr, ami);
     }
 else
     {
     struct aaMutInfo *ami = hel->val;
     slNameAddHead(&ami->sampleIds, sampleId);
     }
 }
 
 static void writeOneTsvRow(FILE *f, char *sampleId, struct usherResults *results,
                            struct hash *seqInfoHash, struct geneInfo *geneInfoList,
                            struct seqWindow *gSeqWin, struct hash *spikeChanges)
@@ -2023,58 +2090,59 @@
         writeOneTsvRow(f, sample->name, results, seqInfoHash, geneInfoList, gSeqWin, spikeChanges);
     }
 carefulClose(&f);
 hashFree(&seqInfoHash);
 reportTiming(pStartTime, "write tsv summary");
 return tsvTn;
 }
 
 static struct tempName *writeSpikeChangeSummary(struct hash *spikeChanges, int totalSampleCount)
 /* Write a tab-separated summary of S (Spike) gene changes for download. */
 {
 struct tempName *tsvTn = NULL;
 AllocVar(tsvTn);
 trashDirFile(tsvTn, "ct", "usher_S_muts", ".tsv");
 FILE *f = mustOpen(tsvTn->forCgi, "w");
-fprintf(f, "aa_mutation\tsample_count\tsample_frequency\tsample_ids"
+fprintf(f, "aa_mutation\tsample_count\tsample_frequency\tsample_ids\tcategory"
         "\n");
 struct aaMutInfo *sChanges[spikeChanges->elCount];
 struct hashCookie cookie = hashFirst(spikeChanges);
 int ix = 0;
 struct hashEl *hel;
 while ((hel = hashNext(&cookie)) != NULL)
     {
     if (ix >= spikeChanges->elCount)
         errAbort("writeSpikeChangeSummary: hash elCount is %d but got more elements",
                  spikeChanges->elCount);
     sChanges[ix++] = hel->val;
     }
 if (ix != spikeChanges->elCount)
     errAbort("writeSpikeChangeSummary: hash elCount is %d but got fewer elements (%d)",
              spikeChanges->elCount, ix);
 qsort(sChanges, ix, sizeof(sChanges[0]), aaMutInfoCmp);
 for (ix = 0;  ix < spikeChanges->elCount;  ix++)
     {
     struct aaMutInfo *ami = sChanges[ix];
     int sampleCount = slCount(ami->sampleIds);
     fprintf(f, "S:%s\t%d\t%f",
             ami->name, sampleCount, (double)sampleCount / (double)totalSampleCount);
     slReverse(&ami->sampleIds);
     fprintf(f, "\t%s", ami->sampleIds->name);
     struct slName *sample;
     for (sample = ami->sampleIds->next;  sample != NULL;  sample = sample->next)
         fprintf(f, ",%s", sample->name);
+    fprintf(f, "\t%s", spikeMutTypeToString(ami->spikeType));
     fputc('\n', f);
     }
 carefulClose(&f);
 return tsvTn;
 }
 
 static struct tempName *makeSubtreeZipFile(struct usherResults *results, struct tempName *jsonTns[],
                                            struct tempName *singleSubtreeJsonTn, int *pStartTime)
 /* Make a zip archive file containing all of the little subtree Newick and JSON files so
  * user doesn't have to click on each one. */
 {
 struct tempName *zipTn;
 AllocVar(zipTn);
 trashDirFile(zipTn, "ct", "usher_subtrees", ".zip");
 int subtreeCount = slCount(results->subtreeInfoList);