17cdc50389b482ee9f3f1ce649ed297201269d53 angie Thu Feb 25 20:41:01 2021 -0800 If usher's output includes clade and lineage assignments, parse them and include them in the summary table. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index c63468d..7fc3fe5 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -1033,30 +1033,53 @@ int *retIx) /* Find the subtree that contains sample name and set *retIx to its index in the list. * If we can't find it, return NULL and set *retIx to -1. */ { struct subtreeInfo *ti; int ix; for (ti = subtreeInfoList, ix = 0; ti != NULL; ti = ti->next, ix++) if (slNameInList(ti->subtreeUserSampleIds, name)) break; if (ti == NULL) ix = -1; *retIx = ix; return ti; } +static void lookForCladesAndLineages(struct seqInfo *seqInfoList, struct hash *samplePlacements, + boolean *retGotClades, boolean *retGotLineages) +/* See if UShER has annotated any clades and/or lineages for seqs. */ +{ +boolean gotClades = FALSE, gotLineages = FALSE; +struct seqInfo *si; +for (si = seqInfoList; si != NULL; si = si->next) + { + struct placementInfo *pi = hashFindVal(samplePlacements, si->seq->name); + if (pi) + { + if (isNotEmpty(pi->nextClade)) + gotClades = TRUE; + if (isNotEmpty(pi->pangoLineage)) + gotLineages = TRUE; + if (gotClades && gotLineages) + break; + } + } +*retGotClades = gotClades; +*retGotLineages = gotLineages; +} + static char *nextstrainHost() /* Return the nextstrain hostname from an hg.conf param, or NULL if missing. */ { return cfgOption("nextstrainHost"); } static char *nextstrainUrlFromTn(struct tempName *jsonTn) /* Return a link to Nextstrain to view an annotated subtree. */ { char *jsonUrlForNextstrain = urlFromTn(jsonTn); char *protocol = strstr(jsonUrlForNextstrain, "://"); if (protocol) jsonUrlForNextstrain = protocol + strlen("://"); struct dyString *dy = dyStringCreate("%s/fetch/%s", nextstrainHost(), jsonUrlForNextstrain); return dyStringCannibalize(&dy); @@ -1107,31 +1130,31 @@ } } if (0 && isFasta) { printf(" "); struct dyString *js = dyStringCreate("window.open('https://master.clades.nextstrain.org/" "?input-fasta=%s');", "needATn"); //#*** TODO: save FASTA to file cgiMakeOnClickButton("viewNextclade", js->string, "view sequences in Nextclade"); } puts("</p>"); } #define TOOLTIP(text) " <div class='tooltip'>(?)<span class='tooltiptext'>" text "</span></div>" -static void printSummaryHeader(boolean isFasta) +static void printSummaryHeader(boolean isFasta, boolean gotClades, boolean gotLineages) /* Print the summary table header row with tooltips explaining columns. */ { puts("<thead><tr>"); if (isFasta) puts("<th>Fasta Sequence</th>\n" "<th>Size" TOOLTIP("Length of uploaded sequence in bases, excluding runs of N bases at " "beginning and/or end") "</th>\n<th>#Ns" TOOLTIP("Number of 'N' bases in uploaded sequence, excluding runs of N bases at " "beginning and/or end") "</th>"); else puts("<th>VCF Sample</th>\n" "<th>#Ns" @@ -1148,37 +1171,45 @@ "</th>\n<th>Inserted bases" TOOLTIP("Number of bases in aligned portion of uploaded sequence that are not present in " "reference NC_045512.2 Wuhan/Hu-1") "</th>\n<th>Deleted bases" TOOLTIP("Number of bases in reference NC_045512.2 Wuhan/Hu-1 that are not " "present in aligned portion of uploaded sequence") "</th>"); puts("<th>#SNVs used for placement" TOOLTIP("Number of single-nucleotide variants in uploaded sample " "(does not include N's or mixed bases) used by UShER to place sample " "in phylogenetic tree") "</th>\n<th>#Masked SNVs" TOOLTIP("Number of single-nucleotide variants in uploaded sample that are masked " "(not used for placement) because they occur at known " "<a href='https://virological.org/t/issues-with-sars-cov-2-sequencing-data/473/12' " - "target=_blank>Problematic Sites</a>") - "</th>\n<th>Neighboring sample in tree" + "target=_blank>Problematic Sites</a>"));; +if (gotClades) + puts("</th>\n<th>Nextstrain clade" + TOOLTIP("The <a href='https://nextstrain.org/blog/2021-01-06-updated-SARS-CoV-2-clade-naming' " + "target=_blank>Nextstrain clade</a> assigned to the sample by UShER")); +if (gotLineages) + puts("</th>\n<th>Pango lineage" + TOOLTIP("The <a href='https://cov-lineages.org/' " + "target=_blank>Pango lineage</a> assigned to the sample by UShER")); +puts("</th>\n<th>Neighboring sample in tree" TOOLTIP("A sample already in the tree that is a child of the node at which the uploaded " "sample was placed, to give an example of a closely related sample") "</th>\n<th>Lineage of neighbor" - TOOLTIP("The <a href='https://github.com/cov-lineages/pangolin' target=_blank>" - "Pangolin lineage</a> assigned to the nearest neighboring sample already in the tree") + TOOLTIP("The <a href='https://cov-lineages.org/' target=_blank>" + "Pango lineage</a> assigned to the nearest neighboring sample already in the tree") "</th>\n<th>#Imputed values for mixed bases" TOOLTIP("If the uploaded sequence contains mixed/ambiguous bases, then UShER may assign " "values based on maximum parsimony") "</th>\n<th>#Maximally parsimonious placements" TOOLTIP("Number of potential placements in the tree with minimal parsimony score; " "the higher the number, the less confident the placement") "</th>\n<th>Parsimony score" TOOLTIP("Number of mutations/changes that must be added to the tree when placing the " "uploaded sample; the higher the number, the more diverged the sample") "</th>\n<th>Subtree number" TOOLTIP("Sequence number of subtree that contains this sample") "</th></tr></thead>"); } @@ -1348,31 +1379,33 @@ if (si->nCountStart && si->nCountEnd) dyStringAppend(dy, " and "); if (si->nCountEnd) dyStringPrintf(dy, "%d N bases at end", si->nCountEnd); } static void summarizeSequences(struct seqInfo *seqInfoList, boolean isFasta, struct usherResults *ur, struct tempName *jsonTns[], struct hash *sampleMetadata, struct mutationAnnotatedTree *bigTree, struct dnaSeq *refGenome) /* Show a table with composition & alignment stats for each sequence that passed basic QC. */ { if (seqInfoList) { puts("<table class='seqSummary'>"); - printSummaryHeader(isFasta); + boolean gotClades = FALSE, gotLineages = FALSE; + lookForCladesAndLineages(seqInfoList, ur->samplePlacements, &gotClades, &gotLineages); + printSummaryHeader(isFasta, gotClades, gotLineages); puts("<tbody>"); struct dyString *dy = dyStringNew(0); struct seqInfo *si; for (si = seqInfoList; si != NULL; si = si->next) { puts("<tr>"); printf("<th>%s</td>", replaceChars(si->seq->name, "|", " | ")); if (isFasta) { if (si->nCountStart || si->nCountEnd) { int effectiveLength = si->seq->size - (si->nCountStart + si->nCountEnd); dyStringClear(dy); dyStringPrintf(dy, "%d ", effectiveLength); appendExcludingNs(dy, si); @@ -1477,57 +1510,67 @@ dyStringPrintf(dy, "%c%d%c (%s", snc->refBase, snc->chromStart+1, snc->newBase, reasonList->name); for (reason = reasonList->next; reason != NULL; reason = reason->next) { replaceChar(reason->name, '_', ' '); dyStringPrintf(dy, ", %s", reason->name); } dyStringAppendC(dy, ')'); } printTooltip(dy->string); } printf("</td>"); struct placementInfo *pi = hashFindVal(ur->samplePlacements, si->seq->name); if (pi) { + if (gotClades) + printf("<td>%s</td>", pi->nextClade ? pi->nextClade : "n/a"); + if (gotLineages) + printf("<td>%s</td>", pi->pangoLineage ? pi->pangoLineage : "n/a"); struct slName *neighbor = findNearestNeighbor(bigTree, pi->sampleId, pi->variantPath); char *lineage = neighbor ? lineageForSample(sampleMetadata, neighbor->name) : "?"; printf("<td>%s</td><td>%s</td>", neighbor ? replaceChars(neighbor->name, "|", " | ") : "?", lineage ? lineage : "?"); int imputedCount = slCount(pi->imputedBases); printf("<td class='%s'>%d", qcClassForImputedBases(imputedCount), imputedCount); if (imputedCount > 0) { dyStringClear(dy); struct baseVal *bv; for (bv = pi->imputedBases; bv != NULL; bv = bv->next) { dyStringAppendSep(dy, ", "); dyStringPrintf(dy, "%d: %s", bv->chromStart+1, bv->val); } printTooltip(dy->string); } printf("</td><td class='%s'>%d", qcClassForPlacements(pi->bestNodeCount), pi->bestNodeCount); printf("</td><td class='%s'>%d", qcClassForPScore(pi->parsimonyScore), pi->parsimonyScore); printf("</td>"); } else + { + if (gotClades) + printf("<td>n/a></td>"); + if (gotLineages) + printf("<td>n/a></td>"); printf("<td>n/a</td><td>n/a</td><td>n/a</td><td>n/a</td><td>n/a</td>"); + } int ix; struct subtreeInfo *ti = subtreeInfoForSample(ur->subtreeInfoList, si->seq->name, &ix); if (ix < 0) //#*** Probably an error. printf("<td>n/a</td>"); else { printf("<td>%d", ix+1); if (ti && nextstrainHost()) { char *nextstrainUrl = nextstrainUrlFromTn(jsonTns[ix]); printf(" (<a href='%s' target=_blank>view in Nextstrain<a>)", nextstrainUrl); } printf("</td>"); }