afafa0301ea4b14fbe1fbd5aa379c5351ecd640d angie Tue Aug 2 19:41:44 2022 -0700 Add support for non-wuhCor1 genomes (e.g. monkeypox GenArk hub). * Search in hgPhyloPlaceData for config.ra files, taking assembly name (minus hub prefix) from directory name. * Add a menu input to the main page for switching between supported genomes if there are more than one. * Replace hardcoded values or global vars with dnaSeq attributes, assembly metadata queries or new config.ra settings. * Separate out SARS-CoV-2-specific help text like GISAID/CNCB descriptions. * Support metadata columns for GenBank-specific stuff & Nextstrain lineages (for MPXV). * also a little refactoring in runUsher in preparation for supporting usher server mode: parse new placement info file so we don't have to parse that data form usher stderr output. TODO: update Nextstrain/Auspice JSON output to use appropriate metadata columns and support monkeypox genes. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index ef70bd4..c7ebb67 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -12,73 +12,139 @@ #include "hash.h" #include "hgConfig.h" #include "htmshell.h" #include "hui.h" #include "iupac.h" #include "jsHelper.h" #include "linefile.h" #include "obscure.h" #include "parsimonyProto.h" #include "phyloPlace.h" #include "phyloTree.h" #include "pipeline.h" #include "psl.h" #include "ra.h" #include "regexHelper.h" +#include "trackHub.h" #include "trashDir.h" #include "vcf.h" // Globals: static boolean measureTiming = FALSE; -// wuhCor1-specific: -char *chrom = "NC_045512v2"; -int chromSize = 29903; - // Parameter constants: int maxGenotypes = 1000; // Upper limit on number of samples user can upload at once. boolean showParsimonyScore = FALSE; +struct slName *phyloPlaceDbList() +/* Each subdirectory of PHYLOPLACE_DATA_DIR that contains a config.ra file is a supported db + * or track hub name (without the hub_number_ prefix). Return a list of them, or NULL if none + * are found. */ +{ +struct slName *dbList = NULL; +// I was hoping the pattern would be wildMatch'd only against filenames so I could use "config.ra", +// but both directories and files must match the pattern so must use "*". +struct slName *dataDirPaths = pathsInDirAndSubdirs(PHYLOPLACE_DATA_DIR, "*"); +struct slName *path; +for (path = dataDirPaths; path != NULL; path = path->next) + { + if (endsWith(path->name, "/config.ra")) + { + char dir[PATH_LEN], name[FILENAME_LEN], extension[FILEEXT_LEN]; + splitPath(path->name, dir, name, extension); + if (endsWith(dir, "/")) + dir[strlen(dir)-1] = '\0'; + char *db = strrchr(dir, '/'); + if (db == NULL) + db = dir; + else + db++; + if (hDbExists(db)) + slNameAddHead(&dbList, db); + else + { + struct trackHubGenome *hubGenome = trackHubGetGenomeUndecorated(db); + if (hubGenome != NULL) + slNameAddHead(&dbList, hubGenome->name); + else + { + // Not connected to session currently. If the name looks like an NCBI Assembly ID + // then try connecting to the corresponding UCSC assembly hub. + regmatch_t substrs[5]; + if (regexMatchSubstr(db, "^(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})\\.[0-9]$", + substrs, ArraySize(substrs))) + { + char gcPrefix[4], first3[4], mid3[4], last3[4]; + regexSubstringCopy(db, substrs[1], gcPrefix, sizeof gcPrefix); + regexSubstringCopy(db, substrs[2], first3, sizeof first3); + regexSubstringCopy(db, substrs[3], mid3, sizeof mid3); + regexSubstringCopy(db, substrs[4], last3, sizeof last3); + struct dyString *dy = dyStringCreate("https://hgdownload.soe.ucsc.edu/hubs/%s/%s/%s/" + "%s/%s/hub.txt", + gcPrefix, first3, mid3, last3, db); + struct errCatch *errCatch = errCatchNew(); + struct trackHub *hub = NULL; + if (errCatchStart(errCatch)) + { + hub = trackHubOpen(dy->string, db); + } + errCatchEnd(errCatch); + if (hub != NULL && hub->genomeList != NULL && + endsWith(hub->genomeList->name, db)) + slNameAddHead(&dbList, hub->genomeList->name); + } + } + } + } + } +// Reverse alphabetical sort to put wuhCor1/SARS-CoV-2 first +slNameSort(&dbList); +slReverse(&dbList); +return dbList; +} + char *phyloPlaceDbSetting(char *db, char *settingName) /* Return a setting from hgPhyloPlaceData/<db>/config.ra or NULL if not found. */ { static struct hash *configHash = NULL; static char *configDb = NULL; -if (!sameOk(db, configDb)) +char *dbBase = trackHubSkipHubName(db); +if (!sameOk(dbBase, configDb)) { char configFile[1024]; - safef(configFile, sizeof configFile, PHYLOPLACE_DATA_DIR "/%s/config.ra", db); + safef(configFile, sizeof configFile, PHYLOPLACE_DATA_DIR "/%s/config.ra", dbBase); if (fileExists(configFile)) { configHash = raReadSingle(configFile); - configDb = cloneString(db); + configDb = cloneString(dbBase); } } -if (sameOk(db, configDb)) +if (sameOk(dbBase, configDb)) return cloneString(hashFindVal(configHash, settingName)); return NULL; } char *phyloPlaceDbSettingPath(char *db, char *settingName) /* Return path to a file named by a setting from hgPhyloPlaceData/<db>/config.ra, * or NULL if not found. (Append hgPhyloPlaceData/<db>/ to the beginning of relative path) */ { char *fileName = phyloPlaceDbSetting(db, settingName); if (isNotEmpty(fileName) && fileName[0] != '/' && !fileExists(fileName)) { - struct dyString *dy = dyStringCreate(PHYLOPLACE_DATA_DIR "/%s/%s", db, fileName); + struct dyString *dy = dyStringCreate(PHYLOPLACE_DATA_DIR "/%s/%s", + trackHubSkipHubName(db), fileName); if (fileExists(dy->string)) return dyStringCannibalize(&dy); else return NULL; } return fileName; } char *getUsherPath(boolean abortIfNotFound) /* Return hgPhyloPlaceData/usher if it exists, else NULL. Do not free the returned value. */ { char *usherPath = PHYLOPLACE_DATA_DIR "/usher"; if (fileExists(usherPath)) return usherPath; else if (abortIfNotFound) @@ -94,52 +160,54 @@ return matUtilsPath; else if (abortIfNotFound) errAbort("Missing matUtils executable (expected to be at %s)", matUtilsPath); return NULL; } char *getUsherAssignmentsPath(char *db, boolean abortIfNotFound) /* If <db>/config.ra specifies the file for use by usher --load-assignments and the file exists, * return the path, else NULL. Do not free the returned value. */ { char *usherAssignmentsPath = phyloPlaceDbSettingPath(db, "usherAssignmentsFile"); if (isNotEmpty(usherAssignmentsPath) && fileExists(usherAssignmentsPath)) return usherAssignmentsPath; else if (abortIfNotFound) errAbort("Missing usher protobuf file (config setting in " - PHYLOPLACE_DATA_DIR "/%s/config.ra = %s", db, usherAssignmentsPath); + PHYLOPLACE_DATA_DIR "/%s/config.ra = %s", + trackHubSkipHubName(db), usherAssignmentsPath); return NULL; } //#*** This needs to go in a lib so CGIs know whether to include it in the menu. needs better name. boolean hgPhyloPlaceEnabled() /* Return TRUE if hgPhyloPlace is enabled in hg.conf and db wuhCor1 exists. */ { char *cfgSetting = cfgOption("hgPhyloPlaceEnabled"); boolean isEnabled = (isNotEmpty(cfgSetting) && differentWord(cfgSetting, "off") && differentWord(cfgSetting, "no")); +//#*** TODO -- make less wuhCor1-specific return (isEnabled && hDbExists("wuhCor1")); } static void addPathIfNecessary(struct dyString *dy, char *db, char *fileName) /* If fileName exists, copy it into dy, else try hgPhyloPlaceData/<db>/fileName */ { dyStringClear(dy); if (fileExists(fileName)) dyStringAppend(dy, fileName); else - dyStringPrintf(dy, PHYLOPLACE_DATA_DIR "/%s/%s", db, fileName); + dyStringPrintf(dy, PHYLOPLACE_DATA_DIR "/%s/%s", trackHubSkipHubName(db), fileName); } struct treeChoices *loadTreeChoices(char *db) /* If <db>/config.ra specifies a treeChoices file, load it up, else return NULL. */ { struct treeChoices *treeChoices = NULL; char *filename = phyloPlaceDbSettingPath(db, "treeChoices"); if (isNotEmpty(filename) && fileExists(filename)) { AllocVar(treeChoices); int maxChoices = 128; AllocArray(treeChoices->protobufFiles, maxChoices); AllocArray(treeChoices->metadataFiles, maxChoices); AllocArray(treeChoices->sources, maxChoices); AllocArray(treeChoices->descriptions, maxChoices); @@ -224,31 +292,32 @@ { if (node->numEdges > 0) { if (node->priv) { struct singleNucChange *snc, *sncs = node->priv; for (snc = sncs; snc != NULL; snc = snc->next) informativeBases[snc->chromStart] = TRUE; } int i; for (i = 0; i < node->numEdges; i++) rInformativeBasesFromTree(node->edges[i], informativeBases); } } -static boolean *informativeBasesFromTree(struct phyloTree *bigTree, struct slName **maskSites) +static boolean *informativeBasesFromTree(struct phyloTree *bigTree, struct slName **maskSites, + int chromSize) /* Return an array indexed by reference position with TRUE at positions that have a non-leaf * variant in bigTree. If a position is in maskSites but is informative in bigTree then remove * it from maskSites because it was not masked (yet) back when the tree was built. */ { boolean *informativeBases; AllocArray(informativeBases, chromSize); if (bigTree) { rInformativeBasesFromTree(bigTree, informativeBases); int i; for (i = 0; i < chromSize; i++) { if (maskSites[i] && informativeBases[i]) maskSites[i] = NULL; } @@ -349,34 +418,34 @@ { lineFileAbort(lf, "VCF header did not include #CHROM line defining sample IDs for " "genotype columns"); } int colCount = chopTabs(line, NULL); int genotypeCount = colCount - VCF_NUM_COLS_BEFORE_GENOTYPES; if (genotypeCount != sampleCount) { lineFileAbort(lf, "VCF header defines %d samples but there are %d genotype columns", sampleCount, genotypeCount); } char *words[colCount]; chopTabs(line, words); //#*** TODO: check that POS is sorted int pos = strtol(words[1], NULL, 10); - if (pos > chromSize) + if (pos > refGenome->size) { lineFileAbort(lf, "VCF POS value %d exceeds size of reference sequence (%d)", - pos, chromSize); + pos, refGenome->size); } // make sure REF value (if given) matches reference genome int chromStart = pos - 1; struct slName *maskedReasons = maskSites[chromStart]; char *ref = words[3]; if (strlen(ref) != 1) { // Not an SNV -- skip it. //#*** should probably report or at least count these... continue; } char refBase = toupper(refGenome->dna[chromStart]); if (ref[0] == '*' || ref[0] == '.') ref[0] = refBase; else if (ref[0] != refBase) @@ -422,31 +491,31 @@ slAddHead(&si->maskedReasonsList, slRefNew(maskedReasons)); } else { if (isIupacAmbiguous(alt[0])) si->ambigCount++; slAddHead(&si->sncList, snc); } } } } } } if (!maskedReasons) { - fputs(chrom, f); + fputs(refGenome->name, f); for (i = 1; i < colCount; i++) { fputc('\t', f); fputs(words[i], f); } fputc('\n', f); } } } } errCatchEnd(errCatch); carefulClose(&f); if (errCatch->gotError) { warn("%s", errCatch->message->string); @@ -479,35 +548,35 @@ struct singleNucChange *snc; for (snc = si->sncList; snc != NULL; snc = snc->next) { char mutStr[64]; safef(mutStr, sizeof mutStr, "%c%d%c", snc->refBase, snc->chromStart+1, snc->newBase); slNameAddHead(&sampleMuts, mutStr); } slReverse(&sampleMuts); pi->sampleMuts = sampleMuts; } else errAbort("addSampleMutsFromSeqInfo: no seqInfo for placed sequence '%s'", pi->sampleId); } } -static void displaySampleMuts(struct placementInfo *info) +static void displaySampleMuts(struct placementInfo *info, char *refAcc) { printf("<p>Differences from the reference genome " - "(<a href='https://www.ncbi.nlm.nih.gov/nuccore/NC_045512.2' target=_blank>" - "NC_045512.2</a>): "); + "(<a href='https://www.ncbi.nlm.nih.gov/nuccore/%s' target=_blank>%s</a>): ", + refAcc, refAcc); if (info->sampleMuts == NULL) printf("(None; identical to reference)"); else { struct slName *sln; for (sln = info->sampleMuts; sln != NULL; sln = sln->next) { if (sln != info->sampleMuts) printf(", "); printf("%s", sln->name); } } puts("</p>"); } @@ -708,30 +777,33 @@ int nCladeIx = stringArrayIx("Nextstrain_clade", headerWords, headerWordCount); int gCladeIx = stringArrayIx("GISAID_clade", headerWords, headerWordCount); int lineageIx = stringArrayIx("pangolin_lineage", headerWords, headerWordCount); if (lineageIx < 0) lineageIx = stringArrayIx("pango_lineage", headerWords, headerWordCount); int countryIx = stringArrayIx("country", headerWords, headerWordCount); int divisionIx = stringArrayIx("division", headerWords, headerWordCount); int locationIx = stringArrayIx("location", headerWords, headerWordCount); int countryExpIx = stringArrayIx("country_exposure", headerWords, headerWordCount); int divExpIx = stringArrayIx("division_exposure", headerWords, headerWordCount); int origLabIx = stringArrayIx("originating_lab", headerWords, headerWordCount); int subLabIx = stringArrayIx("submitting_lab", headerWords, headerWordCount); int regionIx = stringArrayIx("region", headerWords, headerWordCount); int nCladeUsherIx = stringArrayIx("Nextstrain_clade_usher", headerWords, headerWordCount); int lineageUsherIx = stringArrayIx("pango_lineage_usher", headerWords, headerWordCount); + int authorsIx = stringArrayIx("authors", headerWords, headerWordCount); + int pubsIx = stringArrayIx("publications", headerWords, headerWordCount); + int nLineageIx = stringArrayIx("Nextstrain_lineage", headerWords, headerWordCount); while (lineFileNext(lf, &line, NULL)) { char *words[headerWordCount]; int wordCount = chopTabs(line, words); lineFileExpectWords(lf, headerWordCount, wordCount); struct sampleMetadata *met; AllocVar(met); if (strainIx >= 0) met->strain = cloneString(words[strainIx]); if (epiIdIx >= 0) met->epiId = cloneString(words[epiIdIx]); if (genbankIx >= 0 && !sameString("?", words[genbankIx])) met->gbAcc = cloneString(words[genbankIx]); if (dateIx >= 0) met->date = cloneString(words[dateIx]); @@ -751,30 +823,36 @@ met->location = cloneString(words[locationIx]); if (countryExpIx >= 0) met->countryExp = cloneString(words[countryExpIx]); if (divExpIx >= 0) met->divExp = cloneString(words[divExpIx]); if (origLabIx >= 0) met->origLab = cloneString(words[origLabIx]); if (subLabIx >= 0) met->subLab = cloneString(words[subLabIx]); if (regionIx >= 0) met->region = cloneString(words[regionIx]); if (nCladeUsherIx >= 0) met->nCladeUsher = cloneString(words[nCladeUsherIx]); if (lineageUsherIx >= 0) met->lineageUsher = cloneString(words[lineageUsherIx]); + if (authorsIx >= 0) + met->authors = cloneString(words[authorsIx]); + if (pubsIx >= 0) + met->pubs = cloneString(words[pubsIx]); + if (nLineageIx >= 0) + met->nLineage = cloneString(words[nLineageIx]); // If epiId and/or genbank ID is included, we'll probably be using that to look up items. if (epiIdIx >= 0 && !isEmpty(words[epiIdIx])) hashAdd(sampleMetadata, words[epiIdIx], met); if (genbankIx >= 0 && !isEmpty(words[genbankIx]) && !sameString("?", words[genbankIx])) { if (strchr(words[genbankIx], '.')) { // Index by versionless accession char copy[strlen(words[genbankIx])+1]; safecpy(copy, sizeof copy, words[genbankIx]); char *dot = strchr(copy, '.'); *dot = '\0'; hashAdd(sampleMetadata, copy, met); } else @@ -849,74 +927,95 @@ if (!met && isdigit(sampleId[0]) && strstr(sampleId, "_from_")) { char *eg = strstr(sampleId, "_eg_"); if (eg) met = hashFindVal(sampleMetadata, eg+strlen("_eg_")); } return met; } static char *lineageForSample(struct hash *sampleMetadata, char *sampleId) /* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */ { char *lineage = NULL; struct sampleMetadata *met = metadataForSample(sampleMetadata, sampleId); if (met) - lineage = met->lineage; + lineage = met->lineage ? met->lineage : met->nLineage; return lineage; } static void findNearestNeighbors(struct hash *samplePlacements, struct hash *sampleMetadata, struct mutationAnnotatedTree *bigTree) /* For each placed sample, find the nearest neighbor in the bigTree and its assigned lineage, * and fill in those two fields of placementInfo. */ { if (!bigTree) return; struct hashCookie cookie = hashFirst(samplePlacements); struct hashEl *hel; while ((hel = hashNext(&cookie)) != NULL) { struct placementInfo *info = hel->val; info->nearestNeighbor = findNearestNeighbor(bigTree, info->sampleId, info->variantPath); if (isNotEmpty(info->nearestNeighbor)) info->neighborLineage = lineageForSample(sampleMetadata, info->nearestNeighbor); } } -static void printLineageUrl(char *lineage) +static void printLineageLink(char *lineage, char *db) +/* Print lineage with link to outbreak.info or pango-designation issue if appropriate. + * Caller must handle case of empty/NULL lineage. */ +{ +boolean isSarsCov2 = sameString(db, "wuhCor1"); +if (isEmpty(lineage)) + errAbort("programmer error:printLineageLink called with empty lineage"); +else if (isSarsCov2 && startsWith("proposed", lineage)) + printf("<a href='"PANGO_DESIGNATION_ISSUE_URLBASE"%s' target=_blank>%s</a>", + lineage+strlen("proposed"), lineage); +else if (isSarsCov2 && differentString(lineage, "None") && differentString(lineage, "Unassigned")) + { + // Trim _suffix that I add to extra tree annotations for problematic branches, if present. + char lineageCopy[strlen(lineage)+1]; + safecpy(lineageCopy, sizeof lineageCopy, lineage); + char *p = strchr(lineageCopy, '_'); + if (p != NULL) + *p = '\0'; + printf("<a href='"OUTBREAK_INFO_URLBASE"%s' target=_blank>%s</a>", lineageCopy, lineageCopy); + } +else + printf("%s", lineage); +} + +static void maybePrintLineageLink(char *lineage, char *db) /* If lineage is not empty/NULL, print ": lineage <lineage>" and link to outbreak.info * (unless lineage is "None") */ { if (isNotEmpty(lineage)) { - if (differentString(lineage, "None")) - printf(": lineage <a href='"OUTBREAK_INFO_URLBASE"%s' target=_blank>%s</a>", - lineage, lineage); - else - printf(": lineage %s", lineage); + printf(": lineage "); + printLineageLink(lineage, db); } } -static void displayNearestNeighbors(struct placementInfo *info, char *source) +static void displayNearestNeighbors(struct placementInfo *info, char *source, char *db) /* Use info->variantPaths to find sample's nearest neighbor(s) in tree. */ { if (isNotEmpty(info->nearestNeighbor)) { printf("<p>Nearest neighboring %s sequence already in phylogenetic tree: %s", source, info->nearestNeighbor); - printLineageUrl(info->neighborLineage); + maybePrintLineageLink(info->neighborLineage, db); puts("</p>"); } } static int placementInfoRefCmpSampleMuts(const void *va, const void *vb) /* Compare slRef->placementInfo->sampleMuts lists. Shorter lists first. Using alpha sort * to distinguish between different sampleMuts contents arbitrarily; the purpose is to * clump samples with identical lists. */ { struct slRef * const *rra = va; struct slRef * const *rrb = vb; struct placementInfo *pa = (*rra)->val; struct placementInfo *pb = (*rrb)->val; int diff = slCount(pa->sampleMuts) - slCount(pb->sampleMuts); if (diff == 0) @@ -1023,31 +1122,31 @@ if (info) { if (isNotEmpty(info->nearestNeighbor)) neighbor = info->nearestNeighbor; if (isNotEmpty(info->neighborLineage)) lineage = info->neighborLineage; } printf("%-*s %s %s\n", maxLen, asciiRow, neighbor, lineage); } slNameFreeList(&rowList); dyStringFree(&dy); } static void describeSamplePlacements(struct slName *sampleIds, struct hash *samplePlacements, struct phyloTree *subtree, struct hash *sampleMetadata, - char *source) + char *source, char *refAcc, char *db) /* Report how each sample fits into the big tree. */ { // Sort sample placements by sampleMuts so we can group identical samples. struct slRef *placementRefs = getPlacementRefList(sampleIds, samplePlacements); slSort(&placementRefs, placementInfoRefCmpSampleMuts); int relatedCount = slCount(placementRefs); int clumpSize = countIdentical(placementRefs); if (clumpSize < relatedCount && relatedCount > 2) { // Not all of the related sequences are identical, so they will be broken down into // separate "clumps". List all related samples first to avoid confusion. puts("<pre>"); asciiTreeWithNeighborInfo(subtree, samplePlacements); puts("</pre>"); } @@ -1067,42 +1166,42 @@ slNameAddHead(&sortedSamples, info->sampleId); } slNameSort(&sortedSamples); printf("<b>%d identical samples:</b>\n<ul>\n", clumpSize); struct slName *sln; for (sln = sortedSamples; sln != NULL; sln = sln->next) printf("<li><b>%s</b>\n", sln->name); puts("</ul>"); } else { printf("<b>%s</b>\n", info->sampleId); ref = ref->next; } refsToGo = ref; - displaySampleMuts(info); + displaySampleMuts(info, refAcc); if (info->imputedBases) { puts("<p>Base values imputed by parsimony:\n<ul>"); struct baseVal *bv; for (bv = info->imputedBases; bv != NULL; bv = bv->next) printf("<li>%d: %s\n", bv->chromStart+1, bv->val); puts("</ul>"); puts("</p>"); } displayVariantPath(info->variantPath, clumpSize == 1 ? info->sampleId : "samples"); - displayNearestNeighbors(info, source); + displayNearestNeighbors(info, source, db); if (showParsimonyScore && info->parsimonyScore > 0) printf("<p>Parsimony score added by your sample: %d</p>\n", info->parsimonyScore); //#*** TODO: explain parsimony score } } struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds) /* Prune all descendants of node that have no leaf descendants in sampleIds. */ { if (node->numEdges) { struct phyloTree *prunedKids = NULL; int i; for (i = 0; i < node->numEdges; i++) { @@ -1151,31 +1250,33 @@ static void lookForCladesAndLineages(struct hash *samplePlacements, boolean *retGotClades, boolean *retGotLineages) /* See if UShER has annotated any clades and/or lineages for seqs. */ { boolean gotClades = FALSE, gotLineages = FALSE; struct hashEl *hel; struct hashCookie cookie = hashFirst(samplePlacements); while ((hel = hashNext(&cookie)) != NULL) { struct placementInfo *pi = hel->val; if (pi) { if (isNotEmpty(pi->nextClade)) gotClades = TRUE; if (isNotEmpty(pi->pangoLineage)) + { gotLineages = TRUE; + } if (gotClades && gotLineages) break; } } *retGotClades = gotClades; *retGotLineages = gotLineages; } static char *nextstrainHost() /* Return the nextstrain hostname from an hg.conf param, or NULL if missing. */ { return cfgOption("nextstrainHost"); } static char *nextstrainUrlFromTn(struct tempName *jsonTn) @@ -1249,87 +1350,101 @@ } } if (0 && isFasta) { printf(" "); struct dyString *js = dyStringCreate("window.open('https://master.clades.nextstrain.org/" "?input-fasta=%s');", "needATn"); //#*** TODO: save FASTA to file cgiMakeOnClickButton("viewNextclade", js->string, "view sequences in Nextclade"); } puts("</p>"); } #define TOOLTIP(text) " <div class='tooltip'>(?)<span class='tooltiptext'>" text "</span></div>" -static void printSummaryHeader(boolean isFasta, boolean gotClades, boolean gotLineages) +static void printSummaryHeader(boolean isFasta, boolean gotClades, boolean gotLineages, + char *refName, char *db) /* Print the summary table header row with tooltips explaining columns. */ { puts("<thead><tr>"); if (isFasta) puts("<th>Fasta Sequence</th>\n" "<th>Size" TOOLTIP("Length of uploaded sequence in bases, excluding runs of N bases at " "beginning and/or end") "</th>\n<th>#Ns" TOOLTIP("Number of 'N' bases in uploaded sequence, excluding runs of N bases at " "beginning and/or end") "</th>"); else puts("<th>VCF Sample</th>\n" "<th>#Ns" TOOLTIP("Number of no-call variants for this sample in uploaded VCF, " "i.e. '.' used in genotype column") "</th>"); puts("<th>#Mixed" TOOLTIP("Number of IUPAC ambiguous bases, e.g. 'R' for 'A or G'") "</th>"); if (isFasta) - puts("<th>Bases aligned" - TOOLTIP("Number of bases aligned to reference NC_045512.2 Wuhan/Hu-1, including " + printf("<th>Bases aligned" + TOOLTIP("Number of bases aligned to reference %s, including " "matches and mismatches") "</th>\n<th>Inserted bases" TOOLTIP("Number of bases in aligned portion of uploaded sequence that are not present in " - "reference NC_045512.2 Wuhan/Hu-1") + "reference %s") "</th>\n<th>Deleted bases" - TOOLTIP("Number of bases in reference NC_045512.2 Wuhan/Hu-1 that are not " + TOOLTIP("Number of bases in reference %s that are not " "present in aligned portion of uploaded sequence") - "</th>"); + "</th>", refName, refName, refName); puts("<th>#SNVs used for placement" TOOLTIP("Number of single-nucleotide variants in uploaded sample " "(does not include N's or mixed bases) used by UShER to place sample " "in phylogenetic tree") "</th>\n<th>#Masked SNVs" TOOLTIP("Number of single-nucleotide variants in uploaded sample that are masked " "(not used for placement) because they occur at known " "<a href='https://virological.org/t/issues-with-sars-cov-2-sequencing-data/473/12' " "target=_blank>Problematic Sites</a>"));; if (gotClades) + { + if (sameString(db, "wuhCor1")) puts("</th>\n<th>Nextstrain clade" TOOLTIP("The <a href='https://nextstrain.org/blog/2021-01-06-updated-SARS-CoV-2-clade-naming' " - "target=_blank>Nextstrain clade</a> assigned to the sample by UShER")); + "target=_blank>Nextstrain clade</a> assigned to the sample by " + "placement in the tree")); + else + puts("</th>\n<th>Nextstrain lineage" + TOOLTIP("The Nextstrain lineage assigned by " + "placement in the tree")); + } if (gotLineages) puts("</th>\n<th>Pango lineage" TOOLTIP("The <a href='https://cov-lineages.org/' " "target=_blank>Pango lineage</a> assigned to the sample by UShER")); puts("</th>\n<th>Neighboring sample in tree" TOOLTIP("A sample already in the tree that is a child of the node at which the uploaded " "sample was placed, to give an example of a closely related sample") - "</th>\n<th>Lineage of neighbor" - TOOLTIP("The <a href='https://cov-lineages.org/' target=_blank>" - "Pango lineage</a> assigned to the nearest neighboring sample already in the tree") - "</th>\n<th>#Imputed values for mixed bases" + "</th>\n<th>Lineage of neighbor"); +if (sameString(db, "wuhCor1")) + puts(TOOLTIP("The <a href='https://cov-lineages.org/' target=_blank>" + "Pango lineage</a> assigned by pangolin " + "to the nearest neighboring sample already in the tree")); +else + puts(TOOLTIP("The lineage assigned by Nextclade " + "to the nearest neighboring sample already in the tree")); +puts("</th>\n<th>#Imputed values for mixed bases" TOOLTIP("If the uploaded sequence contains mixed/ambiguous bases, then UShER may assign " "values based on maximum parsimony") "</th>\n<th>#Maximally parsimonious placements" TOOLTIP("Number of potential placements in the tree with minimal parsimony score; " "the higher the number, the less confident the placement") "</th>\n<th>Parsimony score" TOOLTIP("Number of mutations/changes that must be added to the tree when placing the " "uploaded sample; the higher the number, the more diverged the sample") "</th>\n<th>Subtree number" TOOLTIP("Sequence number of subtree that contains this sample") "</th></tr></thead>"); } // Default QC thresholds for calling an input sequence excellent/good/fair/bad [/fail]: @@ -1489,73 +1604,75 @@ printf(TOOLTIP("%s"), text); } static void appendExcludingNs(struct dyString *dy, struct seqInfo *si) /* Append a note to dy about how many N bases and start and/or end are excluded from statistic. */ { dyStringAppend(dy, "excluding "); if (si->nCountStart) dyStringPrintf(dy, "%d N bases at start", si->nCountStart); if (si->nCountStart && si->nCountEnd) dyStringAppend(dy, " and "); if (si->nCountEnd) dyStringPrintf(dy, "%d N bases at end", si->nCountEnd); } -static void printLineageTd(char *lineage, char *alt) +static void printLineageTd(char *lineage, char *alt, char *db) /* Print a table cell with lineage (& link to outbreak.info if not 'None') or alt if no lineage. */ { -if (lineage && differentString(lineage, "None")) - printf("<td><a href='"OUTBREAK_INFO_URLBASE"%s' target=_blank>%s</a></td>", lineage, lineage); -else if (lineage) - printf("<td>%s</td>", lineage); -else +if (isEmpty(lineage)) printf("<td>%s</td>", alt); +else + { + printf("<td>"); + printLineageLink(lineage, db); + printf("</td>"); + } } static void printSubtreeTd(struct subtreeInfo *subtreeInfoList, struct tempName *jsonTns[], char *seqName) /* Print a table cell with subtree (& link if possible) if found. */ { int ix; struct subtreeInfo *ti = subtreeInfoForSample(subtreeInfoList, seqName, &ix); if (ix < 0) //#*** Probably an error. printf("<td>n/a</td>"); else { printf("<td>%d", ix+1); if (ti && nextstrainHost()) { char *nextstrainUrl = nextstrainUrlFromTn(jsonTns[ix]); printf(" (<a href='%s' target=_blank>view in Nextstrain<a>)", nextstrainUrl); } printf("</td>"); } } static void summarizeSequences(struct seqInfo *seqInfoList, boolean isFasta, struct usherResults *ur, struct tempName *jsonTns[], - struct hash *sampleMetadata, struct dnaSeq *refGenome) + char *refAcc, char *db) /* Show a table with composition & alignment stats for each sequence that passed basic QC. */ { if (seqInfoList) { puts("<table class='seqSummary'>"); boolean gotClades = FALSE, gotLineages = FALSE; lookForCladesAndLineages(ur->samplePlacements, &gotClades, &gotLineages); - printSummaryHeader(isFasta, gotClades, gotLineages); + printSummaryHeader(isFasta, gotClades, gotLineages, refAcc, db); puts("<tbody>"); struct dyString *dy = dyStringNew(0); struct seqInfo *si; for (si = seqInfoList; si != NULL; si = si->next) { puts("<tr>"); printf("<th>%s</td>", replaceChars(si->seq->name, "|", " | ")); if (isFasta) { if (si->nCountStart || si->nCountEnd) { int effectiveLength = si->seq->size - (si->nCountStart + si->nCountEnd); dyStringClear(dy); dyStringPrintf(dy, "%d ", effectiveLength); appendExcludingNs(dy, si); @@ -1663,34 +1780,34 @@ { replaceChar(reason->name, '_', ' '); dyStringPrintf(dy, ", %s", reason->name); } dyStringAppendC(dy, ')'); } printTooltip(dy->string); } printf("</td>"); struct placementInfo *pi = hashFindVal(ur->samplePlacements, si->seq->name); if (pi) { if (gotClades) printf("<td>%s</td>", pi->nextClade ? pi->nextClade : "n/a"); if (gotLineages) - printLineageTd(pi->pangoLineage, "n/a"); + printLineageTd(pi->pangoLineage, "n/a", db); printf("<td>%s</td>", pi->nearestNeighbor ? replaceChars(pi->nearestNeighbor, "|", " | ") : "?"); - printLineageTd(pi->neighborLineage, "?"); + printLineageTd(pi->neighborLineage, "?", db); int imputedCount = slCount(pi->imputedBases); printf("<td class='%s'>%d", qcClassForImputedBases(imputedCount), imputedCount); if (imputedCount > 0) { dyStringClear(dy); struct baseVal *bv; for (bv = pi->imputedBases; bv != NULL; bv = bv->next) { dyStringAppendSep(dy, ", "); dyStringPrintf(dy, "%d: %s", bv->chromStart+1, bv->val); } printTooltip(dy->string); } printf("</td><td class='%s'>%d", @@ -1704,31 +1821,31 @@ if (gotClades) printf("<td>n/a</td>"); if (gotLineages) printf("<td>n/a</td>"); printf("<td>n/a</td><td>n/a</td><td>n/a</td><td>n/a</td><td>n/a</td>"); } printSubtreeTd(ur->subtreeInfoList, jsonTns, si->seq->name); puts("</tr>"); } puts("</tbody></table><p></p>"); } } static void summarizeSubtrees(struct slName *sampleIds, struct usherResults *results, struct hash *sampleMetadata, struct tempName *jsonTns[], - struct mutationAnnotatedTree *bigTree) + struct mutationAnnotatedTree *bigTree, char *db) /* Print a summary table of pasted/uploaded identifiers and subtrees */ { boolean gotClades = FALSE, gotLineages = FALSE; lookForCladesAndLineages(results->samplePlacements, &gotClades, &gotLineages); puts("<table class='seqSummary'><tbody>"); puts("<tr><th>Sequence</th>"); if (gotClades) puts("<th>Nextstrain clade (UShER)" TOOLTIP("The <a href='https://nextstrain.org/blog/2021-01-06-updated-SARS-CoV-2-clade-naming' " "target=_blank>Nextstrain clade</a> " "assigned to the sequence by UShER according to its place in the phylogenetic tree") "</th>"); if (gotLineages) puts("<th>Pango lineage (UShER)" TOOLTIP("The <a href='https://cov-lineages.org/' " @@ -1740,71 +1857,67 @@ "Pango lineage</a> assigned to the sequence by " "<a href='https://github.com/cov-lineages/pangolin/' target=_blank>pangolin</a>") "</th>" "<th>subtree</th></tr>"); struct slName *si; for (si = sampleIds; si != NULL; si = si->next) { puts("<tr>"); printf("<th>%s</td>", replaceChars(si->name, "|", " | ")); struct placementInfo *pi = hashFindVal(results->samplePlacements, si->name); if (pi) { if (gotClades) printf("<td>%s</td>", pi->nextClade ? pi->nextClade : "n/a"); if (gotLineages) - printLineageTd(pi->pangoLineage, "n/a"); + printLineageTd(pi->pangoLineage, "n/a", db); } else { if (gotClades) printf("<td>n/a</td>"); if (gotLineages) printf("<td>n/a</td>"); } // pangolin-assigned lineage char *lineage = lineageForSample(sampleMetadata, si->name); - if (isNotEmpty(lineage)) - printf("<td><a href='"OUTBREAK_INFO_URLBASE"%s' target=_blank>%s</a></td>", - lineage, lineage); - else - printf("<td>n/a</td>"); + printLineageTd(lineage, "n/a", db); // Maybe also #mutations with mouseover to show mutation path? printSubtreeTd(results->subtreeInfoList, jsonTns, si->name); } puts("</tbody></table><p></p>"); } static struct singleNucChange *sncListFromSampleMutsAndImputed(struct slName *sampleMuts, struct baseVal *imputedBases, struct seqWindow *gSeqWin) /* Convert a list of "<ref><pos><alt>" names to struct singleNucChange list. * However, if <alt> is ambiguous, skip it because variantProjector doesn't like it. * Add imputed base predictions. */ { struct singleNucChange *sncList = NULL; struct slName *mut; for (mut = sampleMuts; mut != NULL; mut = mut->next) { char ref = mut->name[0]; if (ref < 'A' || ref > 'Z') errAbort("sncListFromSampleMuts: expected ref base value, got '%c' in '%s'", ref, mut->name); int pos = atoi(&(mut->name[1])); - if (pos < 1 || pos > chromSize) + if (pos < 1 || pos > gSeqWin->end) errAbort("sncListFromSampleMuts: expected pos between 1 and %d, got %d in '%s'", - chromSize, pos, mut->name); + gSeqWin->end, pos, mut->name); char alt = mut->name[strlen(mut->name)-1]; if (alt < 'A' || alt > 'Z') errAbort("sncListFromSampleMuts: expected alt base value, got '%c' in '%s'", alt, mut->name); if (isIupacAmbiguous(alt)) continue; struct singleNucChange *snc; AllocVar(snc); snc->chromStart = pos-1; snc->refBase = snc->parBase = ref; snc->newBase = alt; slAddHead(&sncList, snc); } struct baseVal *bv; for (bv = imputedBases; bv != NULL; bv = bv->next) @@ -2275,31 +2388,31 @@ cmd[cIx++] = singleSubtreeJsonTn->forCgi; cmd[cIx++] = results->singleSubtreeInfo->subtreeTn->forCgi; struct subtreeInfo *ti; for (ti = results->subtreeInfoList; ti != NULL; ti = ti->next, sIx++) { cmd[cIx++] = jsonTns[sIx]->forCgi; cmd[cIx++] = ti->subtreeTn->forCgi; } cmd[cIx++] = NULL; struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, NULL, 0); pipelineClose(&pl); reportTiming(pStartTime, "make subtree zipfile"); return zipTn; } -static struct slName **getProblematicSites(char *db) +static struct slName **getProblematicSites(char *db, char *chrom, int chromSize) /* If config.ra specfies maskFile them return array of lists (usually NULL) of reasons that * masking is recommended, one per position in genome; otherwise return NULL. */ { struct slName **pSites = NULL; char *pSitesFile = phyloPlaceDbSettingPath(db, "maskFile"); if (isNotEmpty(pSitesFile) && fileExists(pSitesFile)) { AllocArray(pSites, chromSize); struct bbiFile *bbi = bigBedFileOpen(pSitesFile); struct lm *lm = lmInit(0); struct bigBedInterval *bb, *bbList = bigBedIntervalQuery(bbi, chrom, 0, chromSize, 0, lm); for (bb = bbList; bb != NULL; bb = bb->next) { char *extra = bb->rest; char *reason = nextWord(&extra); @@ -2473,31 +2586,31 @@ { char *words[3]; int wordCount = chopTabs(line, words); lineFileExpectWords(lf, 2, wordCount); char *fullName = hashFindVal(nameHash, words[1]); if (fullName) hashAdd(nameHash, words[0], fullName); else { missCount++; if (missExample == NULL) missExample = cloneString(words[1]); } } lineFileClose(&lf); - if (missCount > 0) + if (missCount > 0 && verboseLevel() > 1) fprintf(stderr, "aliasFile %s: %d values in second column were not found in tree, " "e.g. '%s'", aliasFile, missCount, missExample); } } static struct hash *getTreeNames(struct mutationAnnotatedTree *bigTree, char *aliasFile, boolean addComponents) /* Make a hash of full names of leaves of bigTree; if addComponents, also map components of those * names to the full names in case the user gives us partial names/IDs. */ { int nodeCount = bigTree->nodeHash->elCount; struct hash *nameHash = hashNew(digitsBaseTwo(nodeCount) + 3); rAddLeafNames(bigTree->tree, bigTree->condensedNodes, nameHash, addComponents); addAliases(nameHash, aliasFile); return nameHash; @@ -2626,32 +2739,34 @@ struct tempName *vcfTn = NULL; struct slName *sampleIds = NULL; char *usherPath = getUsherPath(TRUE); char *protobufPath = NULL; char *source = NULL; char *metadataFile = NULL; char *aliasFile = NULL; getProtobufMetadataSource(db, defaultProtobuf, &protobufPath, &metadataFile, &source, &aliasFile); struct mutationAnnotatedTree *bigTree = parseParsimonyProtobuf(protobufPath); reportTiming(&startTime, "parse protobuf file"); if (! bigTree) { warn("Problem parsing %s; can't make subtree subtracks.", protobufPath); } lineFileCarefulNewlines(lf); -struct slName **maskSites = getProblematicSites(db); -boolean *informativeBases = informativeBasesFromTree(bigTree->tree, maskSites); +char *chrom = hDefaultChrom(db); +int chromSize = hChromSize(db, chrom); +struct slName **maskSites = getProblematicSites(db, chrom, chromSize); +boolean *informativeBases = informativeBasesFromTree(bigTree->tree, maskSites, chromSize); struct dnaSeq *refGenome = hChromSeq(db, chrom, 0, chromSize); boolean isFasta = FALSE; boolean subtreesOnly = FALSE; struct seqInfo *seqInfoList = NULL; if (lfLooksLikeFasta(lf)) { struct slPair *failedSeqs; struct slPair *failedPsls; struct hash *treeNames = getTreeNames(bigTree, NULL, FALSE); vcfTn = vcfFromFasta(lf, db, refGenome, informativeBases, maskSites, treeNames, &sampleIds, &seqInfoList, &failedSeqs, &failedPsls, &startTime); if (failedSeqs) { puts("<p>"); struct slPair *fail; @@ -2705,31 +2820,31 @@ sampleIds, bigTree->condensedNodes, &startTime); } if (results && results->singleSubtreeInfo) { if (retSuccess) *retSuccess = TRUE; puts("<p></p>"); readQcThresholds(db); int subtreeCount = slCount(results->subtreeInfoList); // Sort subtrees by number of user samples (largest first). slSort(&results->subtreeInfoList, subTreeInfoUserSampleCmp); // Make Nextstrain/auspice JSON file for each subtree. char *bigGenePredFile = phyloPlaceDbSettingPath(db, "bigGenePredFile"); struct geneInfo *geneInfoList = getGeneInfoList(bigGenePredFile, refGenome); - struct seqWindow *gSeqWin = chromSeqWindowNew(db, chrom, 0, chromSize); + struct seqWindow *gSeqWin = memSeqWindowNew(chrom, refGenome->dna); struct hash *sampleMetadata = getSampleMetadata(metadataFile); struct hash *sampleUrls = hashNew(0); struct tempName *jsonTns[subtreeCount]; struct subtreeInfo *ti; int ix; for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { AllocVar(jsonTns[ix]); char subtreeName[512]; safef(subtreeName, sizeof(subtreeName), "subtreeAuspice%d", ix+1); trashDirFile(jsonTns[ix], "ct", subtreeName, ".json"); treeToAuspiceJson(ti, db, geneInfoList, gSeqWin, sampleMetadata, NULL, jsonTns[ix]->forCgi, source); // Add a link for every sample to this subtree, so the single-subtree JSON can // link to subtree JSONs @@ -2754,72 +2869,79 @@ "</p>\n"); puts("<p><em>Note: " "The Nextstrain subtree views, and Download files below, are temporary files and will " "expire within two days. " "Please download the Nextstrain subtree JSON files if you will want to view them " "again in the future. The JSON files can be drag-dropped onto " "<a href='https://auspice.us/' target=_blank>https://auspice.us/</a>." "</em></p>"); struct tempName *tsvTn = NULL, *sTsvTn = NULL; struct tempName *zipTn = makeSubtreeZipFile(results, jsonTns, singleSubtreeJsonTn, &startTime); struct tempName *ctTn = NULL; if (subtreesOnly) { - summarizeSubtrees(sampleIds, results, sampleMetadata, jsonTns, bigTree); + summarizeSubtrees(sampleIds, results, sampleMetadata, jsonTns, bigTree, db); reportTiming(&startTime, "describe subtrees"); } else { findNearestNeighbors(results->samplePlacements, sampleMetadata, bigTree); // Make custom tracks for uploaded samples and subtree(s). struct phyloTree *sampleTree = NULL; - ctTn = writeCustomTracks(vcfTn, results, sampleIds, bigTree, - source, fontHeight, &sampleTree, &startTime); + ctTn = writeCustomTracks(db, vcfTn, results, sampleIds, bigTree, source, fontHeight, + &sampleTree, &startTime); // Make a sample summary TSV file and accumulate S gene changes struct hash *spikeChanges = hashNew(0); tsvTn = writeTsvSummary(results, sampleTree, sampleIds, seqInfoHash, geneInfoList, gSeqWin, spikeChanges, &startTime); sTsvTn = writeSpikeChangeSummary(spikeChanges, slCount(sampleIds)); downloadsRow(results->bigTreePlusTn->forHtml, tsvTn->forHtml, sTsvTn->forHtml, zipTn->forHtml); int seqCount = slCount(seqInfoList); if (seqCount <= MAX_SEQ_DETAILS) { - summarizeSequences(seqInfoList, isFasta, results, jsonTns, sampleMetadata, refGenome); + char *refAcc = cloneString(chrom); + if (regexMatch(refAcc, "v[0-9]+$")) + { + char *v = strrchr(refAcc, 'v'); + assert(v != NULL); + *v = '.'; + } + summarizeSequences(seqInfoList, isFasta, results, jsonTns, refAcc, db); reportTiming(&startTime, "write summary table (including reading in lineages)"); for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds); printf("<h3>Subtree %d: ", ix+1); if (subtreeUserSampleCount > 1) printf("%d related samples", subtreeUserSampleCount); else if (subtreeCount > 1) printf("Unrelated sample"); printf("</h3>\n"); makeNextstrainButtonN("viewNextstrainSub", ix, subtreeUserSampleCount, subtreeSize, jsonTns); puts("<br>"); // Make a sub-subtree with only user samples for display: struct phyloTree *subtree = phyloOpenTree(ti->subtreeTn->forCgi); subtree = phyloPruneToIds(subtree, ti->subtreeUserSampleIds); describeSamplePlacements(ti->subtreeUserSampleIds, results->samplePlacements, - subtree, sampleMetadata, source); + subtree, sampleMetadata, source, refAcc, db); } reportTiming(&startTime, "describe placements"); } else printf("<p>(Skipping details; " "you uploaded %d sequences, and details are shown only when " "you upload at most %d sequences.)</p>\n", seqCount, MAX_SEQ_DETAILS); } puts("<h3>Downloads</h3>"); if (! subtreesOnly) { puts("<ul>"); // Offer big tree w/new samples for download