327d2ed5c8b98712241ead111cb880b19c87666b angie Tue Apr 13 17:50:41 2021 -0700 Add spike mutation category (VoC, escape, RBD, cleavage, spike, D614G) to spike mutation summary TSV output. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 10f181d..1edf288 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -1693,37 +1693,79 @@ } struct baseVal *bv; for (bv = imputedBases; bv != NULL; bv = bv->next) { struct singleNucChange *snc; AllocVar(snc); snc->chromStart = bv->chromStart; snc->refBase = '?'; snc->newBase = bv->val[0]; slAddHead(&sncList, snc); } slReverse(&sncList); return sncList; } +enum spikeMutType +/* Some categories of Spike mutation are more concerning than others. */ +{ + smtNone, // Just a spike mutation. + smtVoC, // Thought to be the problematic mutation in a Variant of Concern. + smtEscape, // Implicated in Antibody Escape experiments. + smtRbd, // Receptor Binding Domain + smtCleavage, // Furin cleavage site + smtD614G // Went from rare to ~99% frequency in first half of 2020; old news. +}; + +static char *spikeMutTypeToString(enum spikeMutType smt) +/* Returns a string version of smt. Do not free the returned value. */ +{ +char *string = NULL; +switch (smt) + { + case smtNone: + string = "spike"; + break; + case smtVoC: + string = "VoC"; + break; + case smtEscape: + string = "escape"; + break; + case smtRbd: + string = "RBD"; + break; + case smtCleavage: + string = "cleavage"; + break; + case smtD614G: + string = "D614G"; + break; + default: + errAbort("spikeMutTypeToString: Invalid value %d", smt); + } +return string; +} + struct aaMutInfo // A protein change, who has it, and how important we think it is. { char *name; // The name that people are used to seeing, e.g. "E484K', "N501Y" struct slName *sampleIds; // The uploaded samples that have it int priority; // For sorting; lower number means scarier. int pos; // 1-based position + enum spikeMutType spikeType;// If spike mutation, what kind? char oldAa; // Reference AA char newAa; // Alt AA }; int aaMutInfoCmp(const void *a, const void *b) /* Compare aaMutInfo priority for sorting. */ { const struct aaMutInfo *amiA = *(struct aaMutInfo * const *)a; const struct aaMutInfo *amiB = *(struct aaMutInfo * const *)b; return amiA->priority - amiB->priority; } // For now, hardcode SARS-CoV-2 Spike RBD coords and antibody escape positions (1-based). static int rbdStart = 319; static int rbdEnd = 541; @@ -1794,83 +1836,108 @@ escapeMutPos[494] = TRUE; escapeMutPos[499] = TRUE; escapeMutPos[504] = TRUE; escapeMutPos[514] = TRUE; escapeMutPos[517] = TRUE; escapeMutPos[522] = TRUE; escapeMutPos[525] = TRUE; escapeMutPos[526] = TRUE; escapeMutPos[527] = TRUE; escapeMutPos[528] = TRUE; escapeMutPos[529] = TRUE; escapeMutPos[530] = TRUE; escapeMutPos[531] = TRUE; } -static int spikePriority(int pos, char newAa) -/* Lower number for scarier spike mutation, per spike mutations track / RBD. */ +static int spikePriority(int pos, char newAa, enum spikeMutType *pSmt) +/* Lower number for scarier spike mutation, per spike mutations track / RBD. */ { if (escapeMutPos == NULL) initEscapeMutPos(); int priority = 0; +enum spikeMutType smt = smtNone; if (pos >= rbdStart && pos <= rbdEnd) { // Receptor binding domain priority = 100; + smt = smtRbd; // Antibody escape mutation in Variant of Concern/Interest if (pos == 484) + { priority = 10; + smt = smtVoC; + } else if (pos == 501) + { priority = 15; + smt = smtVoC; + } else if (pos == 452) + { priority = 20; + smt = smtVoC; + } // Other antibody escape mutations else if (pos == 439 || pos == 477) + { priority = 25; + smt = smtEscape; + } else if (escapeMutPos[pos]) + { priority = 50; + smt = smtEscape; + } } else if (pos >= 675 && pos <= 681) + { // Furin cleavage site; circumstantial evidence for Q677{H,P} spread in US. // Interesting threads on SPHERES 2021-02-26 about P681H tradeoff between infectivity vs // range of cell types that can be infected and other observations about that region. priority = 110; + smt = smtCleavage; + } else if (pos == 614 && newAa == 'G') + { // Old hat priority = 1000; + smt = smtD614G; + } else // Somewhere else in Spike priority = 500; +if (pSmt != NULL) + *pSmt = smt; return priority; } static void addSpikeChange(struct hash *spikeChanges, char *aaMutStr, char *sampleId) /* Tally up an observance of a S gene change in a sample. */ { // Parse oldAa, pos, newAA out of aaMutStr. Need a more elegant way of doing this; // this is a rush job to get something usable out there asap. char oldAa = aaMutStr[0]; int pos = atoi(aaMutStr+1); char newAa = aaMutStr[strlen(aaMutStr)-1]; struct hashEl *hel = hashLookup(spikeChanges, aaMutStr); if (hel == NULL) { struct aaMutInfo *ami; AllocVar(ami); ami->name = cloneString(aaMutStr); slNameAddHead(&ami->sampleIds, sampleId); - ami->priority = spikePriority(pos, newAa); + ami->priority = spikePriority(pos, newAa, &ami->spikeType); ami->pos = pos; ami->oldAa = oldAa; ami->newAa = newAa; hashAdd(spikeChanges, aaMutStr, ami); } else { struct aaMutInfo *ami = hel->val; slNameAddHead(&ami->sampleIds, sampleId); } } static void writeOneTsvRow(FILE *f, char *sampleId, struct usherResults *results, struct hash *seqInfoHash, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin, struct hash *spikeChanges) @@ -2023,58 +2090,59 @@ writeOneTsvRow(f, sample->name, results, seqInfoHash, geneInfoList, gSeqWin, spikeChanges); } carefulClose(&f); hashFree(&seqInfoHash); reportTiming(pStartTime, "write tsv summary"); return tsvTn; } static struct tempName *writeSpikeChangeSummary(struct hash *spikeChanges, int totalSampleCount) /* Write a tab-separated summary of S (Spike) gene changes for download. */ { struct tempName *tsvTn = NULL; AllocVar(tsvTn); trashDirFile(tsvTn, "ct", "usher_S_muts", ".tsv"); FILE *f = mustOpen(tsvTn->forCgi, "w"); -fprintf(f, "aa_mutation\tsample_count\tsample_frequency\tsample_ids" +fprintf(f, "aa_mutation\tsample_count\tsample_frequency\tsample_ids\tcategory" "\n"); struct aaMutInfo *sChanges[spikeChanges->elCount]; struct hashCookie cookie = hashFirst(spikeChanges); int ix = 0; struct hashEl *hel; while ((hel = hashNext(&cookie)) != NULL) { if (ix >= spikeChanges->elCount) errAbort("writeSpikeChangeSummary: hash elCount is %d but got more elements", spikeChanges->elCount); sChanges[ix++] = hel->val; } if (ix != spikeChanges->elCount) errAbort("writeSpikeChangeSummary: hash elCount is %d but got fewer elements (%d)", spikeChanges->elCount, ix); qsort(sChanges, ix, sizeof(sChanges[0]), aaMutInfoCmp); for (ix = 0; ix < spikeChanges->elCount; ix++) { struct aaMutInfo *ami = sChanges[ix]; int sampleCount = slCount(ami->sampleIds); fprintf(f, "S:%s\t%d\t%f", ami->name, sampleCount, (double)sampleCount / (double)totalSampleCount); slReverse(&ami->sampleIds); fprintf(f, "\t%s", ami->sampleIds->name); struct slName *sample; for (sample = ami->sampleIds->next; sample != NULL; sample = sample->next) fprintf(f, ",%s", sample->name); + fprintf(f, "\t%s", spikeMutTypeToString(ami->spikeType)); fputc('\n', f); } carefulClose(&f); return tsvTn; } static struct tempName *makeSubtreeZipFile(struct usherResults *results, struct tempName *jsonTns[], struct tempName *singleSubtreeJsonTn, int *pStartTime) /* Make a zip archive file containing all of the little subtree Newick and JSON files so * user doesn't have to click on each one. */ { struct tempName *zipTn; AllocVar(zipTn); trashDirFile(zipTn, "ct", "usher_subtrees", ".zip"); int subtreeCount = slCount(results->subtreeInfoList);