7e58340888377874edaad1dbc5174e20295f890c angie Mon Feb 22 14:17:33 2021 -0800 Support upload of more sequences, add TSV file summarizing sample variants and placements. Requested by Joe de Risi (UCSF). Increase timeout to 10 minutes; make TSV with each sample's ID, nuc muts, AA muts, imputed bases and path from root to sample. Also use Yatish's new -K subtree algorithm in usher: one subtree encompassing all uploaded samples, plus the specified number of samples randomly selected from the rest of the tree. Don't show every single sample name in the title because there can be 1000 samples in the same subtree now. :) diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 67eb685..2e5feaf 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -571,44 +571,44 @@ struct phyloTree *leafKids[leafCount]; int leafIx = 0; for (i = 0; i < node->numEdges; i++) { struct phyloTree *kid = node->edges[i]; if (kid->numEdges == 0) leafKids[leafIx++] = kid; } qsort(leafKids, leafCount, sizeof(leafKids[0]), mutCountCmp); neighbors = slNameNew(leafKids[0]->ident->name); } } return neighbors; } -static void printVariantPathNoNodeNames(struct variantPathNode *variantPath) -/* Print out variant path with no node names (even if non-numeric) */ +static void printVariantPathNoNodeNames(FILE *f, struct variantPathNode *variantPath) +/* Print out variant path with no node names (even if non-numeric) to f. */ { struct variantPathNode *vpn; for (vpn = variantPath; vpn != NULL; vpn = vpn->next) { if (vpn != variantPath) - printf(" > "); + fprintf(f, " > "); struct singleNucChange *snc; for (snc = vpn->sncList; snc != NULL; snc = snc->next) { if (snc != vpn->sncList) - printf(", "); - printf("%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); + fprintf(f, ", "); + fprintf(f, "%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); } } } static struct hash *getSampleMetadata(char *metadataFile) /* If config.ra defines a metadataFile, load its contents into a hash indexed by EPI ID and return; * otherwise return NULL. */ { struct hash *sampleMetadata = NULL; if (isNotEmpty(metadataFile) && fileExists(metadataFile)) { sampleMetadata = hashNew(0); struct lineFile *lf = lineFileOpen(metadataFile, TRUE); int headerWordCount = 0; char **headerWords = NULL; @@ -804,38 +804,38 @@ if (info->bestNodeCount == 1) printf("<p>The placement in the tree is unambiguous; " "there are no other parsimony-optimal placements in the phylogenetic tree.</p>\n"); else if (info->bestNodeCount > 1) printf("<p>This placement is not the only parsimony-optimal placement in the tree; " "%d other placements exist.</p>\n", info->bestNodeCount - 1); if (showBestNodePaths && info->bestNodes) { if (info->bestNodeCount != slCount(info->bestNodes)) errAbort("Inconsistent bestNodeCount (%d) and number of bestNodes (%d)", info->bestNodeCount, slCount(info->bestNodes)); if (info->bestNodeCount > 1) printf("<ul><li><b>used for placement</b>: "); if (differentString(info->bestNodes->name, "?") && !isAllDigits(info->bestNodes->name)) printf("%s ", info->bestNodes->name); - printVariantPathNoNodeNames(info->bestNodes->variantPath); + printVariantPathNoNodeNames(stdout, info->bestNodes->variantPath); struct bestNodeInfo *bn; for (bn = info->bestNodes->next; bn != NULL; bn = bn->next) { printf("\n<li>"); if (differentString(bn->name, "?") && !isAllDigits(bn->name)) printf("%s ", bn->name); - printVariantPathNoNodeNames(bn->variantPath); + printVariantPathNoNodeNames(stdout, bn->variantPath); char *lineage = lineageForSample(sampleMetadata, bn->name); if (isNotEmpty(lineage)) printf(": lineage %s", lineage); } if (info->bestNodeCount > 1) puts("</ul>"); puts("</p>"); } } static int placementInfoRefCmpSampleMuts(const void *va, const void *vb) /* Compare slRef->placementInfo->sampleMuts lists. Shorter lists first. Using alpha sort * to distinguish between different sampleMuts contents arbitrarily; the purpose is to * clump samples with identical lists. */ { @@ -1494,30 +1494,168 @@ { printf("<td>%d", ix+1); if (ti && nextstrainHost()) { char *nextstrainUrl = nextstrainUrlFromTn(jsonTns[ix]); printf(" (<a href='%s' target=_blank>view in Nextstrain<a>)", nextstrainUrl); } printf("</td>"); } puts("</tr>"); } puts("</tbody></table><p></p>"); } } +static struct singleNucChange *sncListFromSampleMutsAndImputed(struct slName *sampleMuts, + struct baseVal *imputedBases) +/* Convert a list of "<ref><pos><alt>" names to struct singleNucChange list. + * However, if <alt> is ambiguous, skip it because variantProjector doesn't like it. + * Add imputed base predictions. */ +{ +struct singleNucChange *sncList = NULL; +struct slName *mut; +for (mut = sampleMuts; mut != NULL; mut = mut->next) + { + char ref = mut->name[0]; + if (ref < 'A' || ref > 'Z') + errAbort("sncListFromSampleMuts: expected ref base value, got '%c' in '%s'", + ref, mut->name); + int pos = atoi(&(mut->name[1])); + if (pos < 1 || pos > chromSize) + errAbort("sncListFromSampleMuts: expected pos between 1 and %d, got %d in '%s'", + chromSize, pos, mut->name); + char alt = mut->name[strlen(mut->name)-1]; + if (alt < 'A' || alt > 'Z') + errAbort("sncListFromSampleMuts: expected alt base value, got '%c' in '%s'", + alt, mut->name); + if (isIupacAmbiguous(alt)) + continue; + struct singleNucChange *snc; + AllocVar(snc); + snc->chromStart = pos-1; + snc->refBase = ref; + snc->newBase = alt; + slAddHead(&sncList, snc); + } +struct baseVal *bv; +for (bv = imputedBases; bv != NULL; bv = bv->next) + { + struct singleNucChange *snc; + AllocVar(snc); + snc->chromStart = bv->chromStart; + snc->refBase = '?'; + snc->newBase = bv->val[0]; + slAddHead(&sncList, snc); + } +slReverse(&sncList); +return sncList; +} + +static void writeOneTsvRow(FILE *f, char *sampleId, struct usherResults *results, + struct geneInfo *geneInfoList, struct seqWindow *gSeqWin) +/* Write one row of tab-separate summary for sampleId. */ +{ + struct placementInfo *info = hashFindVal(results->samplePlacements, sampleId); + // sample name / ID + fprintf(f, "%s\t", sampleId); + // nucleotide mutations + struct slName *mut; + for (mut = info->sampleMuts; mut != NULL; mut = mut->next) + { + if (mut != info->sampleMuts) + fputc(',', f); + fputs(mut->name, f); + } + fputc('\t', f); + // AA mutations + struct singleNucChange *sncList = sncListFromSampleMutsAndImputed(info->sampleMuts, + info->imputedBases); + struct slPair *geneAaMutations = getAaMutations(sncList, geneInfoList, gSeqWin); + struct slPair *geneAaMut; + boolean first = TRUE; + for (geneAaMut = geneAaMutations; geneAaMut != NULL; geneAaMut = geneAaMut->next) + { + struct slName *aaMut; + for (aaMut = geneAaMut->val; aaMut != NULL; aaMut = aaMut->next) + { + if (first) + first = FALSE; + else + fputc(',', f); + fprintf(f, "%s:%s", geneAaMut->name, aaMut->name); + } + } + fputc('\t', f); + // imputed bases (if any) + struct baseVal *bv; + for (bv = info->imputedBases; bv != NULL; bv = bv->next) + { + if (bv != info->imputedBases) + fputc(',', f); + fprintf(f, "%d%s", bv->chromStart+1, bv->val); + } + fputc('\t', f); + // path through tree to sample + printVariantPathNoNodeNames(f, info->variantPath); + fputc('\n', f); +} + +static void rWriteTsvSummaryTreeOrder(struct phyloTree *node, FILE *f, struct usherResults *results, + struct geneInfo *geneInfoList, struct seqWindow *gSeqWin) +/* As we encounter leaves (user-uploaded samples) in depth-first search order, write out a line + * of TSV summary for each one. */ +{ +if (node->numEdges) + { + int i; + for (i = 0; i < node->numEdges; i++) + rWriteTsvSummaryTreeOrder(node->edges[i], f, results, geneInfoList, gSeqWin); + } +else + { + writeOneTsvRow(f, node->ident->name, results, geneInfoList, gSeqWin); + } +} + + +static struct tempName *writeTsvSummary(struct usherResults *results, struct phyloTree *sampleTree, + struct slName *sampleIds, struct geneInfo *geneInfoList, + struct seqWindow *gSeqWin, int *pStartTime) +/* Write a tab-separated summary file for download. If the user uploaded enough samples to make + * a tree, then write out samples in tree order; otherwise use sampleIds list. */ +{ +struct tempName *tsvTn = NULL; +AllocVar(tsvTn); +trashDirFile(tsvTn, "ct", "usher", ".tsv"); +FILE *f = mustOpen(tsvTn->forCgi, "w"); +fprintf(f, "name\tnuc_mutations\taa_mutations\timputed_bases\tmutation_path\n"); +if (sampleTree) + { + rWriteTsvSummaryTreeOrder(sampleTree, f, results, geneInfoList, gSeqWin); + } +else + { + struct slName *sample; + for (sample = sampleIds; sample != NULL; sample = sample->next) + writeOneTsvRow(f, sample->name, results, geneInfoList, gSeqWin); + } +carefulClose(&f); +reportTiming(pStartTime, "write tsv summary"); +return tsvTn; +} + static struct slName **getProblematicSites(char *db) /* If config.ra specfies maskFile them return array of lists (usually NULL) of reasons that * masking is recommended, one per position in genome; otherwise return NULL. */ { struct slName **pSites = NULL; char *pSitesFile = phyloPlaceDbSettingPath(db, "maskFile"); if (isNotEmpty(pSitesFile) && fileExists(pSitesFile)) { AllocArray(pSites, chromSize); struct bbiFile *bbi = bigBedFileOpen(pSitesFile); struct lm *lm = lmInit(0); struct bigBedInterval *bb, *bbList = bigBedIntervalQuery(bbi, chrom, 0, chromSize, 0, lm); for (bb = bbList; bb != NULL; bb = bb->next) { char *extra = bb->rest; @@ -1624,104 +1762,138 @@ else if (lfLooksLikeVcf(lf)) { vcfTn = checkAndSaveVcf(lf, refGenome, maskSites, &seqInfoList, &sampleIds); reportTiming(&startTime, "check uploaded VCF"); } else { if (isNotEmpty(lf->fileName)) warn("Sorry, can't recognize your file %s as FASTA or VCF.\n", lf->fileName); else warn("Sorry, can't recognize your uploaded data as FASTA or VCF.\n"); } lineFileClose(&lf); if (vcfTn) { + fflush(stdout); + int seqCount = slCount(seqInfoList); struct usherResults *results = runUsher(usherPath, usherAssignmentsPath, vcfTn->forCgi, subtreeSize, sampleIds, bigTree->condensedNodes, &startTime); - if (results->subtreeInfoList) + if (results->subtreeInfoList || seqCount > MAX_SEQ_DETAILS) { int subtreeCount = slCount(results->subtreeInfoList); // Sort subtrees by number of user samples (largest first). slSort(&results->subtreeInfoList, subTreeInfoUserSampleCmp); // Make Nextstrain/auspice JSON file for each subtree. char *bigGenePredFile = phyloPlaceDbSettingPath(db, "bigGenePredFile"); + struct geneInfo *geneInfoList = getGeneInfoList(bigGenePredFile, refGenome); + struct seqWindow *gSeqWin = chromSeqWindowNew(db, chrom, 0, chromSize); struct hash *sampleMetadata = getSampleMetadata(metadataFile); struct tempName *jsonTns[subtreeCount]; struct subtreeInfo *ti; int ix; for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { AllocVar(jsonTns[ix]); trashDirFile(jsonTns[ix], "ct", "subtreeAuspice", ".json"); - treeToAuspiceJson(ti, db, refGenome, bigGenePredFile, sampleMetadata, - jsonTns[ix]->forCgi, source); + treeToAuspiceJson(ti, db, geneInfoList, gSeqWin, sampleMetadata, jsonTns[ix]->forCgi, + source); } puts("<p></p>"); + if (subtreeCount > 0 && subtreeCount <= MAX_SUBTREE_BUTTONS) + { makeButtonRow(jsonTns, subtreeCount, isFasta); - printf("<p>If you have metadata you wish to display, click a 'view subtree in Nextstrain' " - "button, and then you can drag on a CSV file to " + printf("<p>If you have metadata you wish to display, click a 'view subtree in " + "Nextstrain' button, and then you can drag on a CSV file to " "<a href='"NEXTSTRAIN_DRAG_DROP_DOC"' target=_blank>add it to the tree view</a>." "</p>\n"); + } + if (seqCount <= MAX_SEQ_DETAILS) + { summarizeSequences(seqInfoList, isFasta, results, jsonTns, sampleMetadata, bigTree, refGenome); reportTiming(&startTime, "write summary table (including reading in lineages)"); for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds); printf("<h3>Subtree %d: ", ix+1); if (subtreeUserSampleCount > 1) printf("%d related samples", subtreeUserSampleCount); else if (subtreeCount > 1) printf("Unrelated sample"); printf("</h3>\n"); makeNextstrainButton("viewNextstrainSub", ix, jsonTns); puts("<br>"); // Make a sub-subtree with only user samples for display: struct phyloTree *subtree = phyloOpenTree(ti->subtreeTn->forCgi); subtree = phyloPruneToIds(subtree, ti->subtreeUserSampleIds); - describeSamplePlacements(ti->subtreeUserSampleIds, results->samplePlacements, subtree, - sampleMetadata, bigTree, source); + describeSamplePlacements(ti->subtreeUserSampleIds, results->samplePlacements, + subtree, sampleMetadata, bigTree, source); } reportTiming(&startTime, "describe placements"); + } + else + printf("<p>(Skipping details and subtrees; " + "you uploaded %d sequences, and details/subtrees are shown only when " + "you upload at most %d sequences.)</p>\n", + seqCount, MAX_SEQ_DETAILS); // Make custom tracks for uploaded samples and subtree(s). + struct phyloTree *sampleTree = NULL; struct tempName *ctTn = writeCustomTracks(vcfTn, results, sampleIds, bigTree->tree, - source, fontHeight, &startTime); + source, fontHeight, &sampleTree, &startTime); + + // Make a TSV summary file + struct tempName *tsvTn = writeTsvSummary(results, sampleTree, sampleIds, geneInfoList, + gSeqWin, &startTime); // Offer big tree w/new samples for download puts("<h3>Downloads</h3>"); puts("<ul>"); printf("<li><a href='%s' download>SARS-CoV-2 phylogenetic tree " "with your samples (Newick file)</a>\n", results->bigTreePlusTn->forHtml); + printf("<li><a href='%s' download>TSV summary of sequences and placements</a>\n", + tsvTn->forHtml); for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { + int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds); printf("<li><a href='%s' download>Subtree with %s", ti->subtreeTn->forHtml, ti->subtreeUserSampleIds->name); + if (subtreeUserSampleCount > 10) + printf(" and %d other samples", subtreeUserSampleCount - 1); + else + { struct slName *sln; for (sln = ti->subtreeUserSampleIds->next; sln != NULL; sln = sln->next) printf(", %s", sln->name); + } puts(" (Newick file)</a>"); printf("<li><a href='%s' download>Auspice JSON for subtree with %s", jsonTns[ix]->forHtml, ti->subtreeUserSampleIds->name); + if (subtreeUserSampleCount > 10) + printf(" and %d other samples", subtreeUserSampleCount - 1); + else + { + struct slName *sln; for (sln = ti->subtreeUserSampleIds->next; sln != NULL; sln = sln->next) printf(", %s", sln->name); + } puts(" (JSON file)</a>"); } puts("</ul>"); // Notify in opposite order of custom track creation. puts("<h3>Custom tracks for viewing in the Genome Browser</h3>"); printf("<p>Added custom track of uploaded samples.</p>\n"); + if (subtreeCount <= MAX_SUBTREE_CTS) printf("<p>Added %d subtree custom track%s.</p>\n", subtreeCount, (subtreeCount > 1 ? "s" : "")); - ctFile = urlFromTn(ctTn); } else { warn("No subtree output from usher.\n"); } } return ctFile; }