8441e4b8e776dea862c62a466f765c8cd94b1e3d angie Mon Mar 1 00:55:04 2021 -0800 Add a TSV download file summarizing S (Spike) protein mutations, emphasizing antibody escape and Receptor Binding Domain mutations. Don't show too many buttons at the top. Try a little harder to find metadata for collapsed nodes in subtrees. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 7fc3fe5..184faff 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -144,31 +144,30 @@ filename, maxChoices); break; } struct dyString *dy = dyStringNew(0); addPathIfNecessary(dy, db, words[0]); treeChoices->protobufFiles[treeChoices->count] = cloneString(dy->string); addPathIfNecessary(dy, db, words[1]); treeChoices->metadataFiles[treeChoices->count] = cloneString(dy->string); treeChoices->sources[treeChoices->count] = cloneString(words[2]); // Description can be either a file or just some text. addPathIfNecessary(dy, db, words[3]); if (fileExists(dy->string)) { char *desc = NULL; readInGulp(dy->string, &desc, NULL); - fprintf(stderr, "reading '%s' --> '%s'\n", dy->string, desc); treeChoices->descriptions[treeChoices->count] = desc; } else treeChoices->descriptions[treeChoices->count] = cloneString(words[3]); treeChoices->count++; dyStringFree(&dy); } lineFileClose(&lf); } return treeChoices; } static char *urlFromTn(struct tempName *tn) /* Make a full URL to a trash file that our net.c code will be able to follow, for when we can't * just leave it up to the user's web browser to do the right thing with "../". */ @@ -762,30 +761,37 @@ met = hashFindVal(sampleMetadata, gbId); } if (!met) met = hashFindVal(sampleMetadata, sampleId); if (!met && strchr(sampleId, '|')) { char copy[strlen(sampleId)+1]; safecpy(copy, sizeof copy, sampleId); char *words[4]; int wordCount = chopString(copy, "|", words, ArraySize(words)); if (isNotEmpty(words[0])) met = hashFindVal(sampleMetadata, words[0]); if (met == NULL && wordCount > 1 && isNotEmpty(words[1])) met = hashFindVal(sampleMetadata, words[1]); } +// If it's one of our collapsed node names, dig out the example name and try that. +if (!met && isdigit(sampleId[0]) && strstr(sampleId, "_from_")) + { + char *eg = strstr(sampleId, "_eg_"); + if (eg) + met = hashFindVal(sampleMetadata, eg+strlen("_eg_")); + } return met; } static char *lineageForSample(struct hash *sampleMetadata, char *sampleId) /* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */ { char *lineage = NULL; struct sampleMetadata *met = metadataForSample(sampleMetadata, sampleId); if (met) lineage = met->lineage; return lineage; } static void displayNearestNeighbors(struct mutationAnnotatedTree *bigTree, char *source, struct placementInfo *info, struct hash *sampleMetadata) @@ -1613,63 +1619,231 @@ } struct baseVal *bv; for (bv = imputedBases; bv != NULL; bv = bv->next) { struct singleNucChange *snc; AllocVar(snc); snc->chromStart = bv->chromStart; snc->refBase = '?'; snc->newBase = bv->val[0]; slAddHead(&sncList, snc); } slReverse(&sncList); return sncList; } +struct aaMutInfo +// A protein change, who has it, and how important we think it is. +{ + char *name; // The name that people are used to seeing, e.g. "E484K', "N501Y" + struct slName *sampleIds; // The uploaded samples that have it + int priority; // For sorting; lower number means scarier. + int pos; // 1-based position + char oldAa; // Reference AA + char newAa; // Alt AA +}; + +int aaMutInfoCmp(const void *a, const void *b) +/* Compare aaMutInfo priority for sorting. */ +{ +const struct aaMutInfo *amiA = *(struct aaMutInfo * const *)a; +const struct aaMutInfo *amiB = *(struct aaMutInfo * const *)b; +return amiA->priority - amiB->priority; +} + +// For now, hardcode SARS-CoV-2 Spike RBD coords and antibody escape positions (1-based). +static int rbdStart = 319; +static int rbdEnd = 541; +static boolean *escapeMutPos = NULL; + +static void initEscapeMutPos() +/* Allocate excapeMutPos and set positions implicated by at least a couple experiments from Bloom Lab + * or that appear in Whelan, Rappuoli or McCoy tracks. */ +{ +AllocArray(escapeMutPos, rbdEnd); +escapeMutPos[332] = TRUE; +escapeMutPos[334] = TRUE; +escapeMutPos[335] = TRUE; +escapeMutPos[337] = TRUE; +escapeMutPos[339] = TRUE; +escapeMutPos[340] = TRUE; +escapeMutPos[345] = TRUE; +escapeMutPos[346] = TRUE; +escapeMutPos[348] = TRUE; +escapeMutPos[352] = TRUE; +escapeMutPos[357] = TRUE; +escapeMutPos[359] = TRUE; +escapeMutPos[361] = TRUE; +escapeMutPos[362] = TRUE; +escapeMutPos[363] = TRUE; +escapeMutPos[365] = TRUE; +escapeMutPos[366] = TRUE; +escapeMutPos[367] = TRUE; +escapeMutPos[369] = TRUE; +escapeMutPos[370] = TRUE; +escapeMutPos[371] = TRUE; +escapeMutPos[372] = TRUE; +escapeMutPos[373] = TRUE; +escapeMutPos[374] = TRUE; +escapeMutPos[376] = TRUE; +escapeMutPos[378] = TRUE; +escapeMutPos[383] = TRUE; +escapeMutPos[384] = TRUE; +escapeMutPos[385] = TRUE; +escapeMutPos[386] = TRUE; +escapeMutPos[408] = TRUE; +escapeMutPos[417] = TRUE; +escapeMutPos[441] = TRUE; +escapeMutPos[444] = TRUE; +escapeMutPos[445] = TRUE; +escapeMutPos[445] = TRUE; +escapeMutPos[447] = TRUE; +escapeMutPos[449] = TRUE; +escapeMutPos[450] = TRUE; +escapeMutPos[452] = TRUE; +escapeMutPos[455] = TRUE; +escapeMutPos[456] = TRUE; +escapeMutPos[458] = TRUE; +escapeMutPos[472] = TRUE; +escapeMutPos[473] = TRUE; +escapeMutPos[474] = TRUE; +escapeMutPos[476] = TRUE; +escapeMutPos[477] = TRUE; +escapeMutPos[478] = TRUE; +escapeMutPos[479] = TRUE; +escapeMutPos[483] = TRUE; +escapeMutPos[484] = TRUE; +escapeMutPos[485] = TRUE; +escapeMutPos[486] = TRUE; +escapeMutPos[487] = TRUE; +escapeMutPos[489] = TRUE; +escapeMutPos[490] = TRUE; +escapeMutPos[494] = TRUE; +escapeMutPos[499] = TRUE; +escapeMutPos[504] = TRUE; +escapeMutPos[514] = TRUE; +escapeMutPos[517] = TRUE; +escapeMutPos[522] = TRUE; +escapeMutPos[525] = TRUE; +escapeMutPos[526] = TRUE; +escapeMutPos[527] = TRUE; +escapeMutPos[528] = TRUE; +escapeMutPos[529] = TRUE; +escapeMutPos[530] = TRUE; +escapeMutPos[531] = TRUE; +} + +static int spikePriority(int pos, char newAa) +/* Lower number for scarier spike mutation, per spike mutations track / RBD. */ +{ +if (escapeMutPos == NULL) + initEscapeMutPos(); +int priority = 0; +if (pos >= rbdStart && pos <= rbdEnd) + { + // Receptor binding domain + priority = 100; + // Antibody escape mutation in Variant of Concern/Interest + if (pos == 484) + priority = 10; + else if (pos == 501) + priority = 15; + else if (pos == 452) + priority = 20; + // Other antibody escape mutations + else if (pos == 439 || pos == 477) + priority = 25; + else if (escapeMutPos[pos]) + priority = 50; + } +else if (pos >= 675 && pos <= 681) + // Furin cleavage site; circumstantial evidence for Q677{H,P} spread in US. + // Interesting threads on SPHERES 2021-02-26 about P681H tradeoff between infectivity vs + // range of cell types that can be infected and other observations about that region. + priority = 110; +else if (pos == 614 && newAa == 'G') + // Old hat + priority = 1000; +else + // Somewhere else in Spike + priority = 500; +return priority; +} + +static void addSpikeChange(struct hash *spikeChanges, char *aaMutStr, char *sampleId) +/* Tally up an observance of a S gene change in a sample. */ +{ +// Parse oldAa, pos, newAA out of aaMutStr. Need a more elegant way of doing this; +// this is a rush job to get something usable out there asap. +char oldAa = aaMutStr[0]; +int pos = atoi(aaMutStr+1); +char newAa = aaMutStr[strlen(aaMutStr)-1]; +struct hashEl *hel = hashLookup(spikeChanges, aaMutStr); +if (hel == NULL) + { + struct aaMutInfo *ami; + AllocVar(ami); + ami->name = cloneString(aaMutStr); + slNameAddHead(&ami->sampleIds, sampleId); + ami->priority = spikePriority(pos, newAa); + ami->pos = pos; + ami->oldAa = oldAa; + ami->newAa = newAa; + hashAdd(spikeChanges, aaMutStr, ami); + } +else + { + struct aaMutInfo *ami = hel->val; + slNameAddHead(&ami->sampleIds, sampleId); + } +} + static void writeOneTsvRow(FILE *f, char *sampleId, struct usherResults *results, struct hash *seqInfoHash, struct geneInfo *geneInfoList, - struct seqWindow *gSeqWin) -/* Write one row of tab-separate summary for sampleId. */ + struct seqWindow *gSeqWin, struct hash *spikeChanges) +/* Write one row of tab-separate summary for sampleId. Accumulate S gene AA change info. */ { struct placementInfo *info = hashFindVal(results->samplePlacements, sampleId); // sample name / ID fprintf(f, "%s\t", sampleId); // nucleotide mutations struct slName *mut; for (mut = info->sampleMuts; mut != NULL; mut = mut->next) { if (mut != info->sampleMuts) fputc(',', f); fputs(mut->name, f); } fputc('\t', f); // AA mutations struct singleNucChange *sncList = sncListFromSampleMutsAndImputed(info->sampleMuts, info->imputedBases); struct slPair *geneAaMutations = getAaMutations(sncList, geneInfoList, gSeqWin); struct slPair *geneAaMut; boolean first = TRUE; for (geneAaMut = geneAaMutations; geneAaMut != NULL; geneAaMut = geneAaMut->next) { struct slName *aaMut; for (aaMut = geneAaMut->val; aaMut != NULL; aaMut = aaMut->next) { if (first) first = FALSE; else fputc(',', f); fprintf(f, "%s:%s", geneAaMut->name, aaMut->name); + if (sameString(geneAaMut->name, "S")) + addSpikeChange(spikeChanges, aaMut->name, sampleId); } } fputc('\t', f); // imputed bases (if any) struct baseVal *bv; for (bv = info->imputedBases; bv != NULL; bv = bv->next) { if (bv != info->imputedBases) fputc(',', f); fprintf(f, "%d%s", bv->chromStart+1, bv->val); } fputc('\t', f); // path through tree to sample printVariantPathNoNodeNames(f, info->variantPath); // number of equally parsimonious placements @@ -1679,110 +1853,154 @@ struct seqInfo *si = hashFindVal(seqInfoHash, sampleId); if (si) { if (si->psl) { // length fprintf(f, "\t%d", si->seq->size); struct psl *psl = si->psl; // aligned bases, indel counts & ranges int aliCount = psl->match + psl->misMatch + psl->repMatch; fprintf(f, "\t%d\t%d\t%s\t%d\t%s", aliCount, si->insBases, emptyForNull(si->insRanges), si->delBases, emptyForNull(si->delRanges)); } else - fprintf(f, "\t\t\t\t\t\t"); + fprintf(f, "\tn/a\tn/a\tn/a\tn/a\tn/a\tn/a"); // SNVs that were masked (Problematic Sites track), not used in placement fputc('\t', f); struct singleNucChange *snc; for (snc = si->maskedSncList; snc != NULL; snc = snc->next) { if (snc != si->maskedSncList) fputc(',', f); fprintf(f, "%c%d%c", snc->refBase, snc->chromStart+1, snc->newBase); } } else { warn("writeOneTsvRow: no sequenceInfo for sample '%s'", sampleId); - fprintf(f, "\t\t\t\t\t\t\t"); + fprintf(f, "\tn/a\tn/a\tn/a\tn/a\tn/a\tn/a\tn/a"); } fputc('\n', f); } static void rWriteTsvSummaryTreeOrder(struct phyloTree *node, FILE *f, struct usherResults *results, struct hash *seqInfoHash, struct geneInfo *geneInfoList, - struct seqWindow *gSeqWin) + struct seqWindow *gSeqWin, struct hash *spikeChanges) /* As we encounter leaves (user-uploaded samples) in depth-first search order, write out a line * of TSV summary for each one. */ { if (node->numEdges) { int i; for (i = 0; i < node->numEdges; i++) - rWriteTsvSummaryTreeOrder(node->edges[i], f, results, seqInfoHash, geneInfoList, gSeqWin); + rWriteTsvSummaryTreeOrder(node->edges[i], f, results, seqInfoHash, geneInfoList, gSeqWin, + spikeChanges); } else { - writeOneTsvRow(f, node->ident->name, results, seqInfoHash, geneInfoList, gSeqWin); + writeOneTsvRow(f, node->ident->name, results, seqInfoHash, geneInfoList, gSeqWin, spikeChanges); } } static struct hash *hashFromSeqInfoList(struct seqInfo *seqInfoList) /* Hash sequence name to seqInfo for quick lookup. */ { struct hash *hash = hashNew(0); struct seqInfo *si; for (si = seqInfoList; si != NULL; si = si->next) hashAdd(hash, si->seq->name, si); return hash; } static struct tempName *writeTsvSummary(struct usherResults *results, struct phyloTree *sampleTree, struct slName *sampleIds, struct seqInfo *seqInfoList, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin, - int *pStartTime) + struct hash *spikeChanges, int *pStartTime) /* Write a tab-separated summary file for download. If the user uploaded enough samples to make - * a tree, then write out samples in tree order; otherwise use sampleIds list. */ + * a tree, then write out samples in tree order; otherwise use sampleIds list. + * Accumulate S gene changes. */ { struct tempName *tsvTn = NULL; AllocVar(tsvTn); -trashDirFile(tsvTn, "ct", "usher", ".tsv"); +trashDirFile(tsvTn, "ct", "usher_samples", ".tsv"); FILE *f = mustOpen(tsvTn->forCgi, "w"); fprintf(f, "name\tnuc_mutations\taa_mutations\timputed_bases\tmutation_path" "\tplacement_count\tparsimony_score_increase\tlength\taligned_bases" "\tins_bases\tins_ranges\tdel_bases\tdel_ranges\tmasked_mutations" "\n"); struct hash *seqInfoHash = hashFromSeqInfoList(seqInfoList); if (sampleTree) { - rWriteTsvSummaryTreeOrder(sampleTree, f, results, seqInfoHash, geneInfoList, gSeqWin); + rWriteTsvSummaryTreeOrder(sampleTree, f, results, seqInfoHash, geneInfoList, gSeqWin, + spikeChanges); } else { struct slName *sample; for (sample = sampleIds; sample != NULL; sample = sample->next) - writeOneTsvRow(f, sample->name, results, seqInfoHash, geneInfoList, gSeqWin); + writeOneTsvRow(f, sample->name, results, seqInfoHash, geneInfoList, gSeqWin, spikeChanges); } carefulClose(&f); hashFree(&seqInfoHash); reportTiming(pStartTime, "write tsv summary"); return tsvTn; } +static struct tempName *writeSpikeChangeSummary(struct hash *spikeChanges, int totalSampleCount) +/* Write a tab-separated summary of S (Spike) gene changes for download. */ +{ +struct tempName *tsvTn = NULL; +AllocVar(tsvTn); +trashDirFile(tsvTn, "ct", "usher_S_muts", ".tsv"); +FILE *f = mustOpen(tsvTn->forCgi, "w"); +fprintf(f, "aa_mutation\tsample_count\tsample_frequency\tsample_ids" + "\n"); +struct aaMutInfo *sChanges[spikeChanges->elCount]; +struct hashCookie cookie = hashFirst(spikeChanges); +int ix = 0; +struct hashEl *hel; +while ((hel = hashNext(&cookie)) != NULL) + { + if (ix >= spikeChanges->elCount) + errAbort("writeSpikeChangeSummary: hash elCount is %d but got more elements", + spikeChanges->elCount); + sChanges[ix++] = hel->val; + } +if (ix != spikeChanges->elCount) + errAbort("writeSpikeChangeSummary: hash elCount is %d but got fewer elements (%d)", + spikeChanges->elCount, ix); +qsort(sChanges, ix, sizeof(sChanges[0]), aaMutInfoCmp); +for (ix = 0; ix < spikeChanges->elCount; ix++) + { + struct aaMutInfo *ami = sChanges[ix]; + int sampleCount = slCount(ami->sampleIds); + fprintf(f, "S:%s\t%d\t%f", + ami->name, sampleCount, (double)sampleCount / (double)totalSampleCount); + slReverse(&ami->sampleIds); + fprintf(f, "\t%s", ami->sampleIds->name); + struct slName *sample; + for (sample = ami->sampleIds->next; sample != NULL; sample = sample->next) + fprintf(f, ",%s", sample->name); + fputc('\n', f); + } +carefulClose(&f); +return tsvTn; +} + static struct slName **getProblematicSites(char *db) /* If config.ra specfies maskFile them return array of lists (usually NULL) of reasons that * masking is recommended, one per position in genome; otherwise return NULL. */ { struct slName **pSites = NULL; char *pSitesFile = phyloPlaceDbSettingPath(db, "maskFile"); if (isNotEmpty(pSitesFile) && fileExists(pSitesFile)) { AllocArray(pSites, chromSize); struct bbiFile *bbi = bigBedFileOpen(pSitesFile); struct lm *lm = lmInit(0); struct bigBedInterval *bb, *bbList = bigBedIntervalQuery(bbi, chrom, 0, chromSize, 0, lm); for (bb = bbList; bb != NULL; bb = bb->next) { char *extra = bb->rest; @@ -1924,31 +2142,33 @@ AllocVar(singleSubtreeJsonTn); trashDirFile(singleSubtreeJsonTn, "ct", "singleSubtreeAuspice", ".json"); treeToAuspiceJson(results->singleSubtreeInfo, db, geneInfoList, gSeqWin, sampleMetadata, singleSubtreeJsonTn->forCgi, source); struct tempName *jsonTns[subtreeCount]; struct subtreeInfo *ti; int ix; for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { AllocVar(jsonTns[ix]); trashDirFile(jsonTns[ix], "ct", "subtreeAuspice", ".json"); treeToAuspiceJson(ti, db, geneInfoList, gSeqWin, sampleMetadata, jsonTns[ix]->forCgi, source); } puts("<p></p>"); - int subtreeButtonCount = (seqCount <= MAX_SEQ_DETAILS) ? subtreeCount : 0; + int subtreeButtonCount = subtreeCount; + if (seqCount > MAX_SEQ_DETAILS || subtreeCount > MAX_SUBTREE_BUTTONS) + subtreeButtonCount = 0; makeButtonRow(singleSubtreeJsonTn, jsonTns, subtreeButtonCount, isFasta); printf("<p>If you have metadata you wish to display, click a 'view subtree in " "Nextstrain' button, and then you can drag on a CSV file to " "<a href='"NEXTSTRAIN_DRAG_DROP_DOC"' target=_blank>add it to the tree view</a>." "</p>\n"); if (seqCount <= MAX_SEQ_DETAILS) { summarizeSequences(seqInfoList, isFasta, results, jsonTns, sampleMetadata, bigTree, refGenome); reportTiming(&startTime, "write summary table (including reading in lineages)"); for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds); printf("<h3>Subtree %d: ", ix+1); if (subtreeUserSampleCount > 1) @@ -1965,41 +2185,45 @@ subtree, sampleMetadata, bigTree, source); } reportTiming(&startTime, "describe placements"); } else printf("<p>(Skipping details and subtrees; " "you uploaded %d sequences, and details/subtrees are shown only when " "you upload at most %d sequences.)</p>\n", seqCount, MAX_SEQ_DETAILS); // Make custom tracks for uploaded samples and subtree(s). struct phyloTree *sampleTree = NULL; struct tempName *ctTn = writeCustomTracks(vcfTn, results, sampleIds, bigTree->tree, source, fontHeight, &sampleTree, &startTime); - // Make a TSV summary file + // Make a sample summary TSV file and accumulate S gene changes + struct hash *spikeChanges = hashNew(0); struct tempName *tsvTn = writeTsvSummary(results, sampleTree, sampleIds, seqInfoList, - geneInfoList, gSeqWin, &startTime); + geneInfoList, gSeqWin, spikeChanges, &startTime); + struct tempName *sTsvTn = writeSpikeChangeSummary(spikeChanges, slCount(sampleIds)); // Offer big tree w/new samples for download puts("<h3>Downloads</h3>"); puts("<ul>"); printf("<li><a href='%s' download>SARS-CoV-2 phylogenetic tree " "with your samples (Newick file)</a>\n", results->bigTreePlusTn->forHtml); printf("<li><a href='%s' download>TSV summary of sequences and placements</a>\n", tsvTn->forHtml); + printf("<li><a href='%s' download>TSV summary of S (Spike) gene changes</a>\n", + sTsvTn->forHtml); for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds); printf("<li><a href='%s' download>Subtree with %s", ti->subtreeTn->forHtml, ti->subtreeUserSampleIds->name); if (subtreeUserSampleCount > 10) printf(" and %d other samples", subtreeUserSampleCount - 1); else { struct slName *sln; for (sln = ti->subtreeUserSampleIds->next; sln != NULL; sln = sln->next) printf(", %s", sln->name); } puts(" (Newick file)</a>"); printf("<li><a href='%s' download>Auspice JSON for subtree with %s",