17cdc50389b482ee9f3f1ce649ed297201269d53 angie Thu Feb 25 20:41:01 2021 -0800 If usher's output includes clade and lineage assignments, parse them and include them in the summary table. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index c63468d..7fc3fe5 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -1033,30 +1033,53 @@ int *retIx) /* Find the subtree that contains sample name and set *retIx to its index in the list. * If we can't find it, return NULL and set *retIx to -1. */ { struct subtreeInfo *ti; int ix; for (ti = subtreeInfoList, ix = 0; ti != NULL; ti = ti->next, ix++) if (slNameInList(ti->subtreeUserSampleIds, name)) break; if (ti == NULL) ix = -1; *retIx = ix; return ti; } +static void lookForCladesAndLineages(struct seqInfo *seqInfoList, struct hash *samplePlacements, + boolean *retGotClades, boolean *retGotLineages) +/* See if UShER has annotated any clades and/or lineages for seqs. */ +{ +boolean gotClades = FALSE, gotLineages = FALSE; +struct seqInfo *si; +for (si = seqInfoList; si != NULL; si = si->next) + { + struct placementInfo *pi = hashFindVal(samplePlacements, si->seq->name); + if (pi) + { + if (isNotEmpty(pi->nextClade)) + gotClades = TRUE; + if (isNotEmpty(pi->pangoLineage)) + gotLineages = TRUE; + if (gotClades && gotLineages) + break; + } + } +*retGotClades = gotClades; +*retGotLineages = gotLineages; +} + static char *nextstrainHost() /* Return the nextstrain hostname from an hg.conf param, or NULL if missing. */ { return cfgOption("nextstrainHost"); } static char *nextstrainUrlFromTn(struct tempName *jsonTn) /* Return a link to Nextstrain to view an annotated subtree. */ { char *jsonUrlForNextstrain = urlFromTn(jsonTn); char *protocol = strstr(jsonUrlForNextstrain, "://"); if (protocol) jsonUrlForNextstrain = protocol + strlen("://"); struct dyString *dy = dyStringCreate("%s/fetch/%s", nextstrainHost(), jsonUrlForNextstrain); return dyStringCannibalize(&dy); @@ -1107,31 +1130,31 @@ } } if (0 && isFasta) { printf(" "); struct dyString *js = dyStringCreate("window.open('https://master.clades.nextstrain.org/" "?input-fasta=%s');", "needATn"); //#*** TODO: save FASTA to file cgiMakeOnClickButton("viewNextclade", js->string, "view sequences in Nextclade"); } puts("

"); } #define TOOLTIP(text) "
(?)" text "
" -static void printSummaryHeader(boolean isFasta) +static void printSummaryHeader(boolean isFasta, boolean gotClades, boolean gotLineages) /* Print the summary table header row with tooltips explaining columns. */ { puts(""); if (isFasta) puts("Fasta Sequence\n" "Size" TOOLTIP("Length of uploaded sequence in bases, excluding runs of N bases at " "beginning and/or end") "\n#Ns" TOOLTIP("Number of 'N' bases in uploaded sequence, excluding runs of N bases at " "beginning and/or end") ""); else puts("VCF Sample\n" "#Ns" @@ -1148,37 +1171,45 @@ "\nInserted bases" TOOLTIP("Number of bases in aligned portion of uploaded sequence that are not present in " "reference NC_045512.2 Wuhan/Hu-1") "\nDeleted bases" TOOLTIP("Number of bases in reference NC_045512.2 Wuhan/Hu-1 that are not " "present in aligned portion of uploaded sequence") ""); puts("#SNVs used for placement" TOOLTIP("Number of single-nucleotide variants in uploaded sample " "(does not include N's or mixed bases) used by UShER to place sample " "in phylogenetic tree") "\n#Masked SNVs" TOOLTIP("Number of single-nucleotide variants in uploaded sample that are masked " "(not used for placement) because they occur at known " "Problematic Sites") - "\nNeighboring sample in tree" + "target=_blank>Problematic Sites"));; +if (gotClades) + puts("\nNextstrain clade" + TOOLTIP("The Nextstrain clade assigned to the sample by UShER")); +if (gotLineages) + puts("\nPango lineage" + TOOLTIP("The Pango lineage assigned to the sample by UShER")); +puts("\nNeighboring sample in tree" TOOLTIP("A sample already in the tree that is a child of the node at which the uploaded " "sample was placed, to give an example of a closely related sample") "\nLineage of neighbor" - TOOLTIP("The " - "Pangolin lineage assigned to the nearest neighboring sample already in the tree") + TOOLTIP("The " + "Pango lineage assigned to the nearest neighboring sample already in the tree") "\n#Imputed values for mixed bases" TOOLTIP("If the uploaded sequence contains mixed/ambiguous bases, then UShER may assign " "values based on maximum parsimony") "\n#Maximally parsimonious placements" TOOLTIP("Number of potential placements in the tree with minimal parsimony score; " "the higher the number, the less confident the placement") "\nParsimony score" TOOLTIP("Number of mutations/changes that must be added to the tree when placing the " "uploaded sample; the higher the number, the more diverged the sample") "\nSubtree number" TOOLTIP("Sequence number of subtree that contains this sample") ""); } @@ -1348,31 +1379,33 @@ if (si->nCountStart && si->nCountEnd) dyStringAppend(dy, " and "); if (si->nCountEnd) dyStringPrintf(dy, "%d N bases at end", si->nCountEnd); } static void summarizeSequences(struct seqInfo *seqInfoList, boolean isFasta, struct usherResults *ur, struct tempName *jsonTns[], struct hash *sampleMetadata, struct mutationAnnotatedTree *bigTree, struct dnaSeq *refGenome) /* Show a table with composition & alignment stats for each sequence that passed basic QC. */ { if (seqInfoList) { puts(""); - printSummaryHeader(isFasta); + boolean gotClades = FALSE, gotLineages = FALSE; + lookForCladesAndLineages(seqInfoList, ur->samplePlacements, &gotClades, &gotLineages); + printSummaryHeader(isFasta, gotClades, gotLineages); puts(""); struct dyString *dy = dyStringNew(0); struct seqInfo *si; for (si = seqInfoList; si != NULL; si = si->next) { puts(""); printf("", pi->nextClade ? pi->nextClade : "n/a"); + if (gotLineages) + printf("", pi->pangoLineage ? pi->pangoLineage : "n/a"); struct slName *neighbor = findNearestNeighbor(bigTree, pi->sampleId, pi->variantPath); char *lineage = neighbor ? lineageForSample(sampleMetadata, neighbor->name) : "?"; printf("", neighbor ? replaceChars(neighbor->name, "|", " | ") : "?", lineage ? lineage : "?"); int imputedCount = slCount(pi->imputedBases); printf(""); } else + { + if (gotClades) + printf(""); + if (gotLineages) + printf(""); printf(""); + } int ix; struct subtreeInfo *ti = subtreeInfoForSample(ur->subtreeInfoList, si->seq->name, &ix); if (ix < 0) //#*** Probably an error. printf(""); else { printf(""); }
%s", replaceChars(si->seq->name, "|", " | ")); if (isFasta) { if (si->nCountStart || si->nCountEnd) { int effectiveLength = si->seq->size - (si->nCountStart + si->nCountEnd); dyStringClear(dy); dyStringPrintf(dy, "%d ", effectiveLength); appendExcludingNs(dy, si); @@ -1477,57 +1510,67 @@ dyStringPrintf(dy, "%c%d%c (%s", snc->refBase, snc->chromStart+1, snc->newBase, reasonList->name); for (reason = reasonList->next; reason != NULL; reason = reason->next) { replaceChar(reason->name, '_', ' '); dyStringPrintf(dy, ", %s", reason->name); } dyStringAppendC(dy, ')'); } printTooltip(dy->string); } printf(""); struct placementInfo *pi = hashFindVal(ur->samplePlacements, si->seq->name); if (pi) { + if (gotClades) + printf("%s%s%s%s%d", qcClassForImputedBases(imputedCount), imputedCount); if (imputedCount > 0) { dyStringClear(dy); struct baseVal *bv; for (bv = pi->imputedBases; bv != NULL; bv = bv->next) { dyStringAppendSep(dy, ", "); dyStringPrintf(dy, "%d: %s", bv->chromStart+1, bv->val); } printTooltip(dy->string); } printf("%d", qcClassForPlacements(pi->bestNodeCount), pi->bestNodeCount); printf("%d", qcClassForPScore(pi->parsimonyScore), pi->parsimonyScore); printf("n/a>n/a>n/an/an/an/an/an/a%d", ix+1); if (ti && nextstrainHost()) { char *nextstrainUrl = nextstrainUrlFromTn(jsonTns[ix]); printf(" (view in Nextstrain)", nextstrainUrl); } printf("