2a9d39b83afd2a71f4a0d57042b4a6fcee81a556 angie Thu Jan 26 14:54:41 2023 -0800 Added two RSV clade systems (Goya et al. and Ramaekers et al.) and different sources (nextclade vs. tree vs. direct assignments) as coloring options in Nextstrain JSON output for RSV. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 19602d5..09d35ed 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -771,30 +771,35 @@ if (lineageIx < 0) lineageIx = stringArrayIx("pango_lineage", headerWords, headerWordCount); int countryIx = stringArrayIx("country", headerWords, headerWordCount); int divisionIx = stringArrayIx("division", headerWords, headerWordCount); int locationIx = stringArrayIx("location", headerWords, headerWordCount); int countryExpIx = stringArrayIx("country_exposure", headerWords, headerWordCount); int divExpIx = stringArrayIx("division_exposure", headerWords, headerWordCount); int origLabIx = stringArrayIx("originating_lab", headerWords, headerWordCount); int subLabIx = stringArrayIx("submitting_lab", headerWords, headerWordCount); int regionIx = stringArrayIx("region", headerWords, headerWordCount); int nCladeUsherIx = stringArrayIx("Nextstrain_clade_usher", headerWords, headerWordCount); int lineageUsherIx = stringArrayIx("pango_lineage_usher", headerWords, headerWordCount); int authorsIx = stringArrayIx("authors", headerWords, headerWordCount); int pubsIx = stringArrayIx("publications", headerWords, headerWordCount); int nLineageIx = stringArrayIx("Nextstrain_lineage", headerWords, headerWordCount); + int gnCladeIx = stringArrayIx("goya_nextclade", headerWords, headerWordCount); + int rnCladeIx = stringArrayIx("ramaekers_nextclade", headerWords, headerWordCount); + int guCladeIx = stringArrayIx("goya_usher", headerWords, headerWordCount); + int ruCladeIx = stringArrayIx("ramaekers_usher", headerWords, headerWordCount); + int rtCladeIx = stringArrayIx("ramaekers_tableS1", headerWords, headerWordCount); while (lineFileNext(lf, &line, NULL)) { char *words[headerWordCount]; int wordCount = chopTabs(line, words); lineFileExpectWords(lf, headerWordCount, wordCount); struct sampleMetadata *met; AllocVar(met); if (strainIx >= 0) met->strain = cloneString(words[strainIx]); if (epiIdIx >= 0) met->epiId = cloneString(words[epiIdIx]); if (genbankIx >= 0 && !sameString("?", words[genbankIx])) met->gbAcc = cloneString(words[genbankIx]); if (dateIx >= 0) met->date = cloneString(words[dateIx]); @@ -820,30 +825,43 @@ met->origLab = cloneString(words[origLabIx]); if (subLabIx >= 0) met->subLab = cloneString(words[subLabIx]); if (regionIx >= 0) met->region = cloneString(words[regionIx]); if (nCladeUsherIx >= 0) met->nCladeUsher = cloneString(words[nCladeUsherIx]); if (lineageUsherIx >= 0) met->lineageUsher = cloneString(words[lineageUsherIx]); if (authorsIx >= 0) met->authors = cloneString(words[authorsIx]); if (pubsIx >= 0) met->pubs = cloneString(words[pubsIx]); if (nLineageIx >= 0) met->nLineage = cloneString(words[nLineageIx]); + // For RSV, use lineage for Ramaekers clades and nClade for Goya clades. + // This is getting ugly and we really should specify metadata columns in config.ra files. + if (gnCladeIx >= 0) + met->nClade = cloneString(words[gnCladeIx]); + if (rnCladeIx >= 0) + met->lineage = cloneString(words[rnCladeIx]); + if (guCladeIx >= 0) + met->nCladeUsher = cloneString(words[guCladeIx]); + if (ruCladeIx >= 0) + met->lineageUsher = cloneString(words[ruCladeIx]); + // Uglier still, use gClade to store Ramaekers Table S1 designations because it's left over. + if (rtCladeIx >= 0) + met->gClade = cloneString(words[rtCladeIx]); // If epiId and/or genbank ID is included, we'll probably be using that to look up items. if (epiIdIx >= 0 && !isEmpty(words[epiIdIx])) hashAdd(sampleMetadata, words[epiIdIx], met); if (genbankIx >= 0 && !isEmpty(words[genbankIx]) && !sameString("?", words[genbankIx])) { if (strchr(words[genbankIx], '.')) { // Index by versionless accession char copy[strlen(words[genbankIx])+1]; safecpy(copy, sizeof copy, words[genbankIx]); char *dot = strchr(copy, '.'); *dot = '\0'; hashAdd(sampleMetadata, copy, met); } else @@ -1399,46 +1417,67 @@ TOOLTIP("Number of bases in aligned portion of uploaded sequence that are not present in " "reference %s") "\nDeleted bases" TOOLTIP("Number of bases in reference %s that are not " "present in aligned portion of uploaded sequence") "", refName, refName, refName); puts("#SNVs used for placement" TOOLTIP("Number of single-nucleotide variants in uploaded sample " "(does not include N's or mixed bases) used by UShER to place sample " "in phylogenetic tree") "\n#Masked SNVs" TOOLTIP("Number of single-nucleotide variants in uploaded sample that are masked " "(not used for placement) because they occur at known " "Problematic Sites"));; +boolean isRsv = (stringIn("GCF_000855545", db) || stringIn("GCF_002815475", db)); if (gotClades) { if (sameString(db, "wuhCor1")) puts("\nNextstrain clade" TOOLTIP("The Nextstrain clade assigned to the sample by " "placement in the tree")); + else if (isRsv) + puts("\n" + "Goya 2020 clade" + TOOLTIP("The clade described in " + "Goya et al. 2020, " + ""Toward unified molecular surveillance of RSV: A proposal for " + "genotype definition" " + "assigned by placement in the tree")); else puts("\nNextstrain lineage" TOOLTIP("The Nextstrain lineage assigned by " "placement in the tree")); } if (gotLineages) + { + if (isRsv) + puts("\n" + "Ramaekers 2020 clade" + TOOLTIP("The clade described in " + "" + "Ramaekers et al. 2020, " + ""Towards a unified classification for human respiratory syncytial virus " + "genotypes" " + "assigned by placement in the tree")); + else puts("\nPango lineage" TOOLTIP("The Pango lineage assigned to the sample by UShER")); + } puts("\nNeighboring sample in tree" TOOLTIP("A sample already in the tree that is a child of the node at which the uploaded " "sample was placed, to give an example of a closely related sample") "\nLineage of neighbor"); if (sameString(db, "wuhCor1")) puts(TOOLTIP("The " "Pango lineage assigned by pangolin " "to the nearest neighboring sample already in the tree")); else puts(TOOLTIP("The lineage assigned by Nextclade " "to the nearest neighboring sample already in the tree")); puts("\n#Imputed values for mixed bases" TOOLTIP("If the uploaded sequence contains mixed/ambiguous bases, then UShER may assign " "values based on maximum parsimony") "\n#Maximally parsimonious placements"