705420e703b067fbcad43ab67ed3e131552e7ac8 angie Wed Apr 24 14:23:03 2024 -0700 Pathogen drop-down choices can now be groups of references/trees, for example 'Dengue (types 1 - 4)' instead of a separate choice for each type. Instead of config.ra, each group has an organism.ra and subdirectories named after reference accessions that contain reference.ra files. nextclade sort is used to match the user's uploaded sequences against available references for the selected pathogen. SARS-CoV-2, M. tuberculosis and hMPXV still have only one reference and still use config.ra, but RSV, Dengue and Influenza will become groups. Presentation is still kinda rough, just a loop on the original results output. The server commands part needs testing and will not work yet for groups (currently used only for SARS-CoV-2). diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 69085ed..28189b4 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -1,18 +1,18 @@ /* Place SARS-CoV-2 sequences in phylogenetic tree using usher program. */ -/* Copyright (C) 2020-2022 The Regents of the University of California */ +/* Copyright (C) 2020-2024 The Regents of the University of California */ #include "common.h" #include "bigBed.h" #include "cart.h" #include "cheapcgi.h" #include "errCatch.h" #include "fa.h" #include "genePred.h" #include "grp.h" #include "hCommon.h" #include "hash.h" #include "hgConfig.h" #include "htmshell.h" #include "hubConnect.h" #include "hui.h" @@ -27,149 +27,267 @@ #include "psl.h" #include "pthreadWrap.h" #include "ra.h" #include "regexHelper.h" #include "trackHub.h" #include "trashDir.h" #include "vcf.h" // Globals: static boolean measureTiming = FALSE; // Parameter constants: int maxGenotypes = 1000; // Upper limit on number of samples user can upload at once. boolean showParsimonyScore = FALSE; int minSamplesForOwnTree = 3; // If user uploads at least this many samples, show tree for them. +char *leftLabelWidthForLongNames = "55";// Leave plenty of room for tree and long virus strain names -struct slName *phyloPlaceDbList(struct cart *cart) -/* Each subdirectory of PHYLOPLACE_DATA_DIR that contains a config.ra file is a supported data - * source and might also be a db or track hub name (without the hub_number_ prefix). Return a - * list of them, or NULL if none are found. */ +char *phyloPlaceOrgSetting(char *org, char *settingName) +/* Return cloned setting value if found in hgPhyloPlaceData/<org>/organism.ra or + * old-style hgPhyloPlaceData/<org>/config.ra, or NULL if not found. */ { -struct slName *dbList = NULL; -// I was hoping the pattern would be wildMatch'd only against filenames so I could use "config.ra", -// but both directories and files must match the pattern so must use "*". -struct slName *dataDirPaths = pathsInDirAndSubdirs(PHYLOPLACE_DATA_DIR, "*"); -struct slName *path; -for (path = dataDirPaths; path != NULL; path = path->next) +static struct hash *orgHashes = NULL; +if (orgHashes == NULL) + orgHashes = hashNew(0); +char *orgSkipHub = trackHubSkipHubName(org); +struct hash *orgHash = hashFindVal(orgHashes, orgSkipHub); +if (orgHash == NULL) { - if (endsWith(path->name, "/config.ra")) + char raFile[1024]; + safef(raFile, sizeof raFile, PHYLOPLACE_DATA_DIR "/%s/organism.ra", orgSkipHub); + if (fileExists(raFile)) + orgHash = raReadSingle(raFile); + else { - char dir[PATH_LEN], name[FILENAME_LEN], extension[FILEEXT_LEN]; - splitPath(path->name, dir, name, extension); - if (endsWith(dir, "/")) - dir[strlen(dir)-1] = '\0'; - char *db = strrchr(dir, '/'); - if (db == NULL) - db = dir; + safef(raFile, sizeof raFile, PHYLOPLACE_DATA_DIR "/%s/config.ra", orgSkipHub); + if (fileExists(raFile)) + orgHash = raReadSingle(raFile); + } + if (orgHash == NULL) + errAbort("phyloPlaceOrgSetting: can't find organism.ra or config.ra for '%s'", orgSkipHub); + hashAdd(orgHashes, orgSkipHub, orgHash); + } +return cloneString(hashFindVal(orgHash, settingName)); +} + +// TODO: libify +INLINE boolean isUrl(char *url) +{ +return (startsWith("http://", url) || startsWith("https://", url) || startsWith("ftp://", url)); +} + +char *phyloPlaceOrgSettingPath(char *org, char *settingName) +/* Return cgi-bin-relative path to a file named by a setting for org, or NULL if not found. */ +{ +char *fileName = phyloPlaceOrgSetting(org, settingName); +if (isNotEmpty(fileName) && fileName[0] != '/' && !isUrl(fileName) && !fileExists(fileName)) + { + struct dyString *dy = dyStringCreate(PHYLOPLACE_DATA_DIR "/%s/%s", + trackHubSkipHubName(org), fileName); + if (fileExists(dy->string)) + return dyStringCannibalize(&dy); + else + return NULL; + } +return fileName; +} + +char *phyloPlaceRefSetting(char *org, char *ref, char *settingName) +/* Return cloned setting value if found in hgPhyloPlaceData/<org>/<ref>/reference.ra or + * old-style hgPhyloPlaceData/<ref>/config.ra, or NULL if not found. */ +{ +static struct hash *refHashes = NULL; +if (refHashes == NULL) + refHashes = hashNew(0); +char *orgSkipHub = trackHubSkipHubName(org); +char *refSkipHub = trackHubSkipHubName(ref); +char orgRefKey[strlen(orgSkipHub) + strlen(refSkipHub) + 16]; +safef(orgRefKey, sizeof orgRefKey, "%s|%s", orgSkipHub, refSkipHub); +struct hash *refHash = hashFindVal(refHashes, orgRefKey); +if (refHash == NULL) + { + char raFile[1024]; + safef(raFile, sizeof raFile, PHYLOPLACE_DATA_DIR "/%s/%s/reference.ra", orgSkipHub, refSkipHub); + if (fileExists(raFile)) + refHash = raReadSingle(raFile); + else + { + safef(raFile, sizeof raFile, PHYLOPLACE_DATA_DIR "/%s/config.ra", refSkipHub); + if (fileExists(raFile)) + refHash = raReadSingle(raFile); + } + if (refHash == NULL) + errAbort("phyloPlaceOrgSetting: can't find "PHYLOPLACE_DATA_DIR"/%s/%s/reference.ra or " + PHYLOPLACE_DATA_DIR"/%s/config.ra", orgSkipHub, refSkipHub, refSkipHub); + hashAdd(refHashes, orgRefKey, refHash); + } +return cloneString(hashFindVal(refHash, settingName)); +} + +char *phyloPlaceRefSettingPath(char *org, char *ref, char *settingName) +/* Return cgi-bin-relative path to a file named by a setting from + * hgPhyloPlaceData/<org>/<ref>/reference.ra or old-style hgPhyloPlaceData/<ref>/config.ra, + * or NULL if not found. */ +{ +char *fileName = phyloPlaceRefSetting(org, ref, settingName); +if (isNotEmpty(fileName) && fileName[0] != '/' && !isUrl(fileName) && !fileExists(fileName)) + { + struct dyString *dy = dyStringCreate(PHYLOPLACE_DATA_DIR "/%s/%s/%s", + org, trackHubSkipHubName(ref), fileName); + if (fileExists(dy->string)) + return dyStringCannibalize(&dy); else - db++; - if (hDbExists(db)) - slNameAddHead(&dbList, db); + { + dyStringClear(dy); + dyStringPrintf(dy, PHYLOPLACE_DATA_DIR "/%s/%s", + trackHubSkipHubName(ref), fileName); + if (fileExists(dy->string)) + return dyStringCannibalize(&dy); else + return NULL; + } + } +return fileName; +} + +static char *connectIfHub(struct cart *cart, char *db) +/* If db is an "undecorated" hub name (e.g. GCF_... without the hub_... prefix), then connect to + * it if it isn't already connected. Return the complete hub name (with hub_... prefix) if + * connected successfully, or just db otherwise. */ +{ +char *maybeHubDb = db; +if (! hDbExists(db)) { // Not a db -- see if it's a hub that is already connected: struct trackHubGenome *hubGenome = trackHubGetGenomeUndecorated(db); if (hubGenome != NULL) - slNameAddHead(&dbList, hubGenome->name); + maybeHubDb = hubGenome->name; else { // Not connected to session currently. If the name looks like an NCBI Assembly ID // then try connecting to the corresponding UCSC assembly hub. regmatch_t substrs[5]; if (regexMatchSubstr(db, "^(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})\\.[0-9]$", substrs, ArraySize(substrs))) { char gcPrefix[4], first3[4], mid3[4], last3[4]; regexSubstringCopy(db, substrs[1], gcPrefix, sizeof gcPrefix); regexSubstringCopy(db, substrs[2], first3, sizeof first3); regexSubstringCopy(db, substrs[3], mid3, sizeof mid3); regexSubstringCopy(db, substrs[4], last3, sizeof last3); struct dyString *dy = dyStringCreate("https://hgdownload.soe.ucsc.edu/hubs/%s/%s/%s/" "%s/%s/hub.txt", gcPrefix, first3, mid3, last3, db); // Use cart variables to pretend user clicked to connect to this hub. cartSetString(cart, hgHubDataText, dy->string); cartSetString(cart, hgHubGenome, db); struct errCatch *errCatch = errCatchNew(); char *hubDb = NULL; if (errCatchStart(errCatch)) { hubDb = hubConnectLoadHubs(cart); } errCatchEnd(errCatch); if (hubDb != NULL) - slNameAddHead(&dbList, hubDb); - } - else - { - // Doesn't appear to be a hub; count on its config to specify a .2bit file. - slNameAddHead(&dbList, db); + maybeHubDb = hubDb; } } } +return maybeHubDb; } + +static char *finalDirName(char *path) +/* Clone & return the final directory name from a path to a file. */ +{ +char dir[PATH_LEN], name[FILENAME_LEN], extension[FILEEXT_LEN]; +splitPath(path, dir, name, extension); +if (endsWith(dir, "/")) + dir[strlen(dir)-1] = '\0'; +char *finalDir = strrchr(dir, '/'); +if (finalDir == NULL) + finalDir = dir; +else + finalDir++; +return cloneString(finalDir); } -// Reverse alphabetical sort to put wuhCor1/SARS-CoV-2 first -slNameSort(&dbList); -slReverse(&dbList); -return dbList; + +static int labelCmp(const void *va, const void *vb) +/* Compare two slPairs on their string values -- but treat SARS-CoV-2 as more important. */ +{ +const struct slPair *a = *((struct slPair **)va); +const struct slPair *b = *((struct slPair **)vb); +if (sameString((char *)(a->val), "SARS-CoV-2")) + return -1; +else if (sameString((char *)(b->val), "SARS-CoV-2")) + return 1; +return strcmp((char *)(a->val), (char *)(b->val)); } -char *phyloPlaceDbSetting(char *db, char *settingName) -/* Return a setting from hgPhyloPlaceData/<db>/config.ra or NULL if not found. */ +struct slPair *phyloPlaceOrgList(struct cart *cart) +/* Each subdirectory of PHYLOPLACE_DATA_DIR that contains an organism.ra file is a collection of + * reference sequences that uploaded sequences will be matched against using nextclade sort. + * Some of those references might also be dbs or track hub names (without the hub_number_ prefix). + * Each subdirectory of PHYLOPLACE_DATA_DIR that contains a config.ra file contains a single + * reference which might also be a db or track hub name (without the hub_number_ prefix). + * Return a list of {name, label} pairs, SARS-CoV-2 first, combining the two categories. */ +{ +struct slPair *orgList = NULL; +// I was hoping the pattern would be wildMatch'd only against filenames so I could use "*.ra", +// but both directories and files must match the pattern, so must use "*". +struct slName *dataDirPaths = pathsInDirAndSubdirs(PHYLOPLACE_DATA_DIR, "*"); +struct slName *path; +for (path = dataDirPaths; path != NULL; path = path->next) { -static struct hash *configHash = NULL; -static char *configDb = NULL; -char *dbBase = trackHubSkipHubName(db); -if (!sameOk(dbBase, configDb)) + if (endsWith(path->name, "/organism.ra")) { - char configFile[1024]; - safef(configFile, sizeof configFile, PHYLOPLACE_DATA_DIR "/%s/config.ra", dbBase); - if (fileExists(configFile)) + // Use the directory name as symbol for this collection of reference sequences, get label + // and add {symbol, label} to list. + char *org = finalDirName(path->name); + char *label = phyloPlaceOrgSetting(org, "name"); + if (isEmpty(label)) + label = org; + char *description = phyloPlaceOrgSetting(org, "description"); + if (isNotEmpty(description)) { - configHash = raReadSingle(configFile); - configDb = cloneString(dbBase); - } + struct dyString *dy = dyStringCreate("%s %s", label, description); + label = dyStringCannibalize(&dy); } -if (sameOk(dbBase, configDb)) - return cloneString(hashFindVal(configHash, settingName)); -return NULL; + slAddHead(&orgList, slPairNew(org, label)); } - -// TODO: libify -INLINE boolean isUrl(char *url) -{ -return (startsWith("http://", url) || startsWith("https://", url) || startsWith("ftp://", url)); -} - -char *phyloPlaceDbSettingPath(char *db, char *settingName) -/* Return path to a file named by a setting from hgPhyloPlaceData/<db>/config.ra, - * or NULL if not found. (Append hgPhyloPlaceData/<db>/ to the beginning of relative path) */ + else if (endsWith(path->name, "/config.ra")) { -char *fileName = phyloPlaceDbSetting(db, settingName); -if (isNotEmpty(fileName) && fileName[0] != '/' && !isUrl(fileName) && !fileExists(fileName)) + // Old-style single-reference directory with config.ra instead of organism.ra; get label + // from config.ra and add {symbol, label} to list. + char *db = finalDirName(path->name); + char *label = phyloPlaceRefSetting(db, db, "name"); + if (isEmpty(label)) + label = hGenome(db); + char *description = phyloPlaceRefSetting(db, db, "description"); + if (isNotEmpty(description)) { - struct dyString *dy = dyStringCreate(PHYLOPLACE_DATA_DIR "/%s/%s", - trackHubSkipHubName(db), fileName); - if (fileExists(dy->string)) - return dyStringCannibalize(&dy); - else - return NULL; + struct dyString *dy = dyStringCreate("%s %s", label, description); + label = dyStringCannibalize(&dy); } -return fileName; + // If it's a hub, use its full hub_... name: + char *maybeHubDb = connectIfHub(cart, db); + slAddHead(&orgList, slPairNew(maybeHubDb, label)); + } + } +// Sort by label, putting SARS-CoV-2 first and then others in alphabetical order. +slSort(&orgList, labelCmp); +return orgList; } char *getUsherPath(boolean abortIfNotFound) /* Return hgPhyloPlaceData/usher-sampled if it exists, else try hgPhyloPlaceData/usher, else NULL. * Do not free the returned value. */ { char *usherPath = PHYLOPLACE_DATA_DIR "/usher-sampled"; if (fileExists(usherPath)) return usherPath; else { usherPath = PHYLOPLACE_DATA_DIR "/usher"; if (fileExists(usherPath)) return usherPath; } @@ -177,117 +295,121 @@ errAbort("Missing usher executable (expected to be at %s)", usherPath); return NULL; } char *getMatUtilsPath(boolean abortIfNotFound) /* Return hgPhyloPlaceData/matUtils if it exists, else NULL. Do not free the returned value. */ { char *matUtilsPath = PHYLOPLACE_DATA_DIR "/matUtils"; if (fileExists(matUtilsPath)) return matUtilsPath; else if (abortIfNotFound) errAbort("Missing matUtils executable (expected to be at %s)", matUtilsPath); return NULL; } -char *getUsherAssignmentsPath(char *db, boolean abortIfNotFound) -/* If <db>/config.ra specifies the file for use by usher --load-assignments and the file exists, - * return the path, else NULL. Do not free the returned value. */ +static char *getNextcladePath() +/* Return hgPhyloPlaceData/nextclade if it exists, else errAbort. Do not free the returned value. */ { -char *usherAssignmentsPath = phyloPlaceDbSettingPath(db, "usherAssignmentsFile"); -if (isNotEmpty(usherAssignmentsPath) && fileExists(usherAssignmentsPath)) - return usherAssignmentsPath; -else if (abortIfNotFound) - errAbort("Missing usher protobuf file (config setting in " - PHYLOPLACE_DATA_DIR "/%s/config.ra = %s", - trackHubSkipHubName(db), usherAssignmentsPath); +char *nextcladePath = PHYLOPLACE_DATA_DIR "/nextclade"; +if (fileExists(nextcladePath)) + return nextcladePath; +else + errAbort("Missing nextclade executable (expected to be at %s)", nextcladePath); return NULL; } //#*** This needs to go in a lib so CGIs know whether to include it in the menu. needs better name. boolean hgPhyloPlaceEnabled() /* Return TRUE if hgPhyloPlace is enabled in hg.conf and db wuhCor1 exists. */ { char *cfgSetting = cfgOption("hgPhyloPlaceEnabled"); boolean isEnabled = (isNotEmpty(cfgSetting) && differentWord(cfgSetting, "off") && differentWord(cfgSetting, "no")); //#*** TODO -- make less wuhCor1-specific return (isEnabled && hDbExists("wuhCor1")); } -static void addPathIfNecessary(struct dyString *dy, char *db, char *fileName) -/* If fileName exists, copy it into dy, else try hgPhyloPlaceData/<db>/fileName */ +static void addPathIfNecessary(struct dyString *dy, char *org, char *ref, char *fileName) +/* If fileName exists, copy it into dy, else try hgPhyloPlaceData/<org>/<ref>/fileName */ { dyStringClear(dy); if (fileExists(fileName)) dyStringAppend(dy, fileName); else - dyStringPrintf(dy, PHYLOPLACE_DATA_DIR "/%s/%s", trackHubSkipHubName(db), fileName); + { + dyStringPrintf(dy, PHYLOPLACE_DATA_DIR "/%s/%s/%s", org, trackHubSkipHubName(ref), fileName); + if (! fileExists(dy->string)) + { + dyStringClear(dy); + dyStringPrintf(dy, PHYLOPLACE_DATA_DIR "/%s/%s", trackHubSkipHubName(ref), fileName); + } + } } -struct treeChoices *loadTreeChoices(char *db) -/* If <db>/config.ra specifies a treeChoices file, load it up, else return NULL. */ +struct treeChoices *loadTreeChoices(char *org, char *ref) +/* If config specifies a treeChoices file, load it up, else return NULL. */ { struct treeChoices *treeChoices = NULL; -char *filename = phyloPlaceDbSettingPath(db, "treeChoices"); +char *filename = phyloPlaceRefSettingPath(org, ref, "treeChoices"); if (isNotEmpty(filename) && fileExists(filename)) { AllocVar(treeChoices); int maxChoices = 128; AllocArray(treeChoices->protobufFiles, maxChoices); AllocArray(treeChoices->metadataFiles, maxChoices); AllocArray(treeChoices->sources, maxChoices); AllocArray(treeChoices->descriptions, maxChoices); AllocArray(treeChoices->aliasFiles, maxChoices); AllocArray(treeChoices->sampleNameFiles, maxChoices); struct lineFile *lf = lineFileOpen(filename, TRUE); char *line; while (lineFileNextReal(lf, &line)) { char *words[7]; int wordCount = chopTabs(line, words); lineFileExpectAtLeast(lf, 4, wordCount); if (treeChoices->count >= maxChoices) { warn("File %s has too many lines, only showing first %d phylogenetic tree choices", filename, maxChoices); break; } struct dyString *dy = dyStringNew(0); - addPathIfNecessary(dy, db, words[0]); + addPathIfNecessary(dy, org, ref, words[0]); treeChoices->protobufFiles[treeChoices->count] = cloneString(dy->string); - addPathIfNecessary(dy, db, words[1]); + addPathIfNecessary(dy, org, ref, words[1]); treeChoices->metadataFiles[treeChoices->count] = cloneString(dy->string); treeChoices->sources[treeChoices->count] = cloneString(words[2]); // Description can be either a file or just some text. - addPathIfNecessary(dy, db, words[3]); + addPathIfNecessary(dy, org, ref, words[3]); if (fileExists(dy->string)) { char *desc = NULL; readInGulp(dy->string, &desc, NULL); treeChoices->descriptions[treeChoices->count] = desc; } else treeChoices->descriptions[treeChoices->count] = cloneString(words[3]); if (wordCount > 4) { - addPathIfNecessary(dy, db, words[4]); + addPathIfNecessary(dy, org, ref, words[4]); treeChoices->aliasFiles[treeChoices->count] = cloneString(dy->string); } if (wordCount > 5) { - addPathIfNecessary(dy, db, words[5]); + addPathIfNecessary(dy, org, ref, words[5]); treeChoices->sampleNameFiles[treeChoices->count] = cloneString(dy->string); } treeChoices->count++; dyStringFree(&dy); } lineFileClose(&lf); } return treeChoices; } static char *urlFromTn(struct tempName *tn) /* Make a full URL to a trash file that our net.c code will be able to follow, for when we can't * just leave it up to the user's web browser to do the right thing with "../". */ { struct dyString *dy = dyStringCreate("%s%s", hLocalHostCgiBinUrl(), tn->forHtml); @@ -585,50 +707,103 @@ { char mutStr[64]; safef(mutStr, sizeof mutStr, "%c%d%c", snc->refBase, snc->chromStart+1, snc->newBase); slNameAddHead(&sampleMuts, mutStr); } slReverse(&sampleMuts); pi->sampleMuts = sampleMuts; } else { errAbort("addSampleMutsFromSeqInfo: no seqInfo for placed sequence '%s'", pi->sampleId); } } } +static void beginCollapsibleSpan() +/* Make a button to expand a section that is hidden by default. */ +{ +static int seqNum = 0; +char idBase[32]; +safef(idBase, sizeof idBase, "collapsible_%d", seqNum); +char buttonId[64]; +safef(buttonId, sizeof buttonId, "%s_button", idBase); +printf("<img height='18' width='18' id='%s' src='../images/add_sm.gif' alt='+' " + "title='click to expand' style='cursor:pointer;'>\n", + buttonId); +jsOnEventByIdF("click", buttonId, "let button = document.getElementById('%s'); " + "let container = document.getElementById('%s'); " + "if (button && container) {" + " let src = button.getAttribute('src');" + " if (src) {" + " if (src.indexOf('remove') > 0) {" + " container.style.display = 'none';" + " button.setAttribute('src', src.replace('/remove', '/add'));" + " } else {" + " container.style.display = 'block';" + " button.setAttribute('src', src.replace('/add', '/remove'));" + " }" + " }" + " return false;" + "} return true;", + buttonId, idBase); +printf("<span id='%s' style='display: none;'>\n", idBase); +seqNum++; +} + +static void endCollapsibleSpan() +{ +puts("</span>"); +} + static void displaySampleMuts(struct placementInfo *info, char *refAcc) { printf("<p>Differences from the reference genome " "(<a href='https://www.ncbi.nlm.nih.gov/nuccore/%s' target=_blank>%s</a>): ", refAcc, refAcc); if (info->sampleMuts == NULL) printf("(None; identical to reference)"); else { + boolean makeCollapsible = (slCount(info->sampleMuts) > 20); + if (makeCollapsible) + beginCollapsibleSpan(); struct slName *sln; for (sln = info->sampleMuts; sln != NULL; sln = sln->next) { if (sln != info->sampleMuts) printf(", "); printf("%s", sln->name); } + if (makeCollapsible) + endCollapsibleSpan(); } puts("</p>"); } +static int variantPathCountMuts(struct variantPathNode *variantPath) +/* Return the total count of mutations along variantPath. */ +{ +int mutCount = 0; +struct variantPathNode *vpn; +for (vpn = variantPath; vpn != NULL; vpn = vpn->next) + { + mutCount += slCount(vpn->sncList); + } +return mutCount; +} + boolean isInternalNodeName(char *nodeName, int minNewNode) /* Return TRUE if nodeName looks like an internal node ID from the protobuf tree, i.e. is numeric * or <USHER_NODE_PREFIX>_<number> and, if minNewNode > 0, number is less than minNewNode. */ { if (startsWith(USHER_NODE_PREFIX, nodeName)) nodeName += strlen(USHER_NODE_PREFIX); return isAllDigits(nodeName) && (minNewNode <= 0 || (atoi(nodeName) < minNewNode)); } static void variantPathPrint(struct variantPathNode *variantPath) /* Print out a variantPath; print nodeName only if non-numeric * (i.e. a sample ID not internal node) */ { struct variantPathNode *vpn; for (vpn = variantPath; vpn != NULL; vpn = vpn->next) @@ -638,35 +813,41 @@ if (!isInternalNodeName(vpn->nodeName, 0)) printf("%s: ", vpn->nodeName); struct singleNucChange *snc; for (snc = vpn->sncList; snc != NULL; snc = snc->next) { if (snc != vpn->sncList) printf(", "); printf("%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); } } } static void displayVariantPath(struct variantPathNode *variantPath, char *sampleId) /* Display mutations on the path to this sample. */ { -printf("<p>Mutations along the path from the root of the phylogenetic tree to %s:<br>", +printf("<p>Mutations along the path from the root of the phylogenetic tree to %s:\n", sampleId); if (variantPath) { + boolean makeCollapsible = (variantPathCountMuts(variantPath) > 20); + if (makeCollapsible) + beginCollapsibleSpan(); + puts("<br>"); variantPathPrint(variantPath); + if (makeCollapsible) + endCollapsibleSpan(); puts("<br>"); } else puts("(None; your sample was placed at the root of the phylogenetic tree)"); puts("</p>"); } static int mutCountCmp(const void *a, const void *b) /* Compare number of mutations of phyloTree nodes a and b. */ { const struct phyloTree *nodeA = *(struct phyloTree * const *)a; const struct phyloTree *nodeB = *(struct phyloTree * const *)b; return slCount(nodeA->priv) - slCount(nodeB->priv); } @@ -1539,35 +1720,35 @@ static int qcThresholdsMaxSNVs[] = { 25, 35, 45, 55 }; static int qcThresholdsMaxMaskedSNVs[] = { 0, 1, 2, 3 }; static int qcThresholdsMaxImputed[] = { 0, 5, 20, 100 }; static int qcThresholdsMaxPlacements[] = { 1, 2, 3, 4 }; static int qcThresholdsMaxPScore[] = { 0, 2, 5, 10 }; static void wordsToQcThresholds(char **words, int *thresholds) /* Parse words from file into thresholds array. Caller must ensure words and thresholds each * have 4 items. */ { int i; for (i = 0; i < 4; i++) thresholds[i] = atoi(words[i]); } -static void readQcThresholds(char *db) +static void readQcThresholds(char *org, char *ref) /* If config.ra specifies a file with QC thresholds for excellent/good/fair/bad [/fail], * parse it and replace the default values in qcThresholds arrays. */ { -char *qcThresholdsFile = phyloPlaceDbSettingPath(db, "qcThresholds"); +char *qcThresholdsFile = phyloPlaceRefSettingPath(org, ref, "qcThresholds"); if (isNotEmpty(qcThresholdsFile)) { if (fileExists(qcThresholdsFile)) { struct lineFile *lf = lineFileOpen(qcThresholdsFile, TRUE); char *line; while (lineFileNext(lf, &line, NULL)) { char *words[16]; int wordCount = chopTabs(line, words); lineFileExpectWords(lf, 5, wordCount); if (sameWord(words[0], "length")) wordsToQcThresholds(words+1, qcThresholdsMinLength); else if (sameWord(words[0], "nCount")) wordsToQcThresholds(words+1, qcThresholdsMaxNs); @@ -2539,37 +2720,37 @@ cmd[cIx++] = singleSubtreeJsonTn->forCgi; cmd[cIx++] = results->singleSubtreeInfo->subtreeTn->forCgi; struct subtreeInfo *ti; for (ti = results->subtreeInfoList; ti != NULL; ti = ti->next, sIx++) { cmd[cIx++] = jsonTns[sIx]->forCgi; cmd[cIx++] = ti->subtreeTn->forCgi; } cmd[cIx++] = NULL; struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, NULL, 0); pipelineClose(&pl); reportTiming(pStartTime, "make subtree zipfile"); return zipTn; } -static struct slName **getProblematicSites(char *db, char *chrom, int chromSize) +static struct slName **getProblematicSites(char *org, char *ref, char *chrom, int chromSize) /* If config.ra specfies maskFile them return array of lists (usually NULL) of reasons that * masking is recommended, one per position in genome; otherwise return array of NULLs. */ { struct slName **pSites = NULL; AllocArray(pSites, chromSize); -char *pSitesFile = phyloPlaceDbSettingPath(db, "maskFile"); +char *pSitesFile = phyloPlaceRefSettingPath(org, ref, "maskFile"); if (isNotEmpty(pSitesFile) && fileExists(pSitesFile)) { struct bbiFile *bbi = bigBedFileOpen(pSitesFile); struct lm *lm = lmInit(0); struct bigBedInterval *bb, *bbList = bigBedIntervalQuery(bbi, chrom, 0, chromSize, 0, lm); for (bb = bbList; bb != NULL; bb = bb->next) { char *extra = bb->rest; char *reason = nextWord(&extra); int i; for (i = bb->start; i < bb->end; i++) slNameAddHead(&pSites[i], reason); } bigBedFileClose(&bbi); } @@ -2584,31 +2765,31 @@ printf("<a href='%s' download>Global phylogenetic tree with your sequences</a> | ", treeFile); printf("<a href='%s' download>TSV summary of sequences and placements</a> | ", sampleSummaryFile); printf("<a href='%s' download>TSV summary of Spike mutations</a> | ", spikeSummaryFile); printf("<a href='%s' download>ZIP file of subtree JSON and Newick files</a> | ", subtreeZipFile); puts("</p>"); } static int subTreeInfoUserSampleCmp(const void *pa, const void *pb) /* Compare subtreeInfo by number of user sample IDs (highest number first). */ { struct subtreeInfo *tiA = *(struct subtreeInfo **)pa; struct subtreeInfo *tiB = *(struct subtreeInfo **)pb; return slCount(tiB->subtreeUserSampleIds) - slCount(tiA->subtreeUserSampleIds); } -static void getProtobufMetadataSource(char *db, struct treeChoices *treeChoices, +static void getProtobufMetadataSource(struct treeChoices *treeChoices, char *protobufFile, char **retProtobufPath, char **retMetadataFile, char **retSource, char **retAliasFile, char **retSampleNameFile) /* If the config file specifies a list of tree choices, and protobufFile is a valid choice, then * set ret* to the files associated with that choice. Otherwise fall back on older conf settings. * Return the selected treeChoice if there is one. */ { if (treeChoices) { *retProtobufPath = protobufFile; if (isEmpty(*retProtobufPath)) *retProtobufPath = treeChoices->protobufFiles[0]; int i; for (i = 0; i < treeChoices->count; i++) if (sameString(treeChoices->protobufFiles[i], *retProtobufPath)) @@ -2618,36 +2799,31 @@ *retAliasFile = treeChoices->aliasFiles[i]; *retSampleNameFile = treeChoices->sampleNameFiles[i]; break; } if (i == treeChoices->count) { *retProtobufPath = treeChoices->protobufFiles[0]; *retMetadataFile = treeChoices->metadataFiles[0]; *retSource = treeChoices->sources[0]; *retAliasFile = treeChoices->aliasFiles[0]; *retSampleNameFile = treeChoices->sampleNameFiles[0]; } } else { - // Fall back on old settings - *retProtobufPath = getUsherAssignmentsPath(db, TRUE); - *retMetadataFile = phyloPlaceDbSettingPath(db, "metadataFile"); - *retSource = "GISAID"; - *retAliasFile = NULL; - *retSampleNameFile = NULL; + errAbort("getProtobufMetadataSource: treeChoices is NULL (missing protobufs.tab file?)"); } } static void addModVersion(struct hash *nameHash, char *id, char *fullName) /* Map id to fullName. If id has .version, strip that (modifying id!) and map versionless id * to fullName. */ { hashAdd(nameHash, id, fullName); char *p = strchr(id, '.'); if (p) { *p = '\0'; hashAdd(nameHash, id, fullName); } } @@ -2988,126 +3164,126 @@ struct hash *sampleMetadata = getSampleMetadata(metadataFile); reportTiming(&startTime, "read sample metadata (in a pthread)"); return sampleMetadata; } static pthread_t *mayStartLoaderPthread(char *filename, void *(*workerFunction)(void *)) /* Fork off a child process that parses a file and returns the resulting data structure. */ { pthread_t *pt; AllocVar(pt); if (! pthreadMayCreate(pt, NULL, workerFunction, filename)) pt = NULL; return pt; } -static struct dnaSeq *getChromSeq(char *db, char *refName) -/* Get the reference sequence for refName, using a .2bit file if configured, - * otherwise hdb lib functions (requires refName happens to be a real db or hub). */ +static struct dnaSeq *getChromSeq(char *org, char *ref, char *db) +/* Get the reference sequence for ref, using a .2bit file if configured, + * otherwise hdb lib functions (requires ref to be the same as db). */ { -char *twoBitName = phyloPlaceDbSettingPath(refName, "twoBitFile"); -char *chrom = phyloPlaceDbSetting(refName, "chrom"); +char *twoBitName = phyloPlaceOrgSettingPath(org, "twoBitFile"); struct dnaSeq *seq = NULL; if (isNotEmpty(twoBitName) && fileExists(twoBitName)) { + char *chrom = ref; struct slName *seqNames = twoBitSeqNames(twoBitName); - if (isEmpty(chrom)) - chrom = cloneString(seqNames->name); + if (!slNameFind(seqNames, chrom)) + chrom = seqNames->name; struct twoBitFile *tbf = twoBitOpen(twoBitName); seq = twoBitReadSeqFrag(tbf, chrom, 0, 0); // Convert to lower case so genoFind doesn't index it as containing no tiles. tolowers(seq->dna); twoBitClose(&tbf); slNameFreeList(&seqNames); } -else if (sameString(db, refName)) +else if (sameString(db, ref)) { + char *chrom = phyloPlaceRefSetting(org, ref, "chrom"); if (isEmpty(chrom)) chrom = hDefaultChrom(db); seq = hChromSeq(db, chrom, 0, hChromSize(db, chrom)); } else - errAbort("No twoBitFile or db/hub found for %s", refName); + errAbort("No twoBitFile or db/hub found for %s", ref); return seq; } static struct phyloTree *uploadedSamplesTree(char *singleSubtreeFile, struct slName *sampleIds) /* If the user uploaded enough samples to make a meaningful tree, then read in singleSubtreeFile * and prune all nodes that have no leaf descendants in sampleIds to get the tree of only the * uploaded samples. */ { struct phyloTree *tree = NULL; if (slCount(sampleIds) >= minSamplesForOwnTree) { tree = phyloOpenTree(singleSubtreeFile); tree = phyloPruneToIds(tree, sampleIds); } return tree; } -char *phyloPlaceSamples(struct lineFile *lf, char *db, char *refName, char *defaultProtobuf, - boolean doMeasureTiming, int subtreeSize, int fontHeight, - boolean *retSuccess) +static void phyloPlaceSamplesOneRef(struct lineFile *lf, char *db, char *org, + char *refName, char *defaultProtobuf, + boolean doMeasureTiming, int subtreeSize, + struct trackLayout *tl, struct cart *cart, boolean *retSuccess) /* Given a lineFile that contains either FASTA, VCF, or a list of sequence names/ids: * If FASTA/VCF, then prepare VCF for usher; if that goes well then run usher, report results, * make custom track files and return the top-level custom track file. * If list of seq names/ids, then attempt to find their full names in the protobuf, run matUtils * to make subtrees, show subtree results, and return NULL. Set retSuccess to TRUE if we were * able to get at least some results for the user's input. */ { char *ctFile = NULL; -if (retSuccess) - *retSuccess = FALSE; measureTiming = doMeasureTiming; int startTime = clock1000(); struct tempName *vcfTn = NULL; struct slName *sampleIds = NULL; char *usherPath = getUsherPath(TRUE); char *protobufPath = NULL; char *source = NULL; char *metadataFile = NULL; char *aliasFile = NULL; char *sampleNameFile = NULL; -struct treeChoices *treeChoices = loadTreeChoices(refName); -getProtobufMetadataSource(refName, treeChoices, defaultProtobuf, +struct treeChoices *treeChoices = loadTreeChoices(org, refName); +getProtobufMetadataSource(treeChoices, defaultProtobuf, &protobufPath, &metadataFile, &source, &aliasFile, &sampleNameFile); reportTiming(&startTime, "start up and find the tree etc. files"); struct mutationAnnotatedTree *bigTree = NULL; lineFileCarefulNewlines(lf); -struct dnaSeq *refGenome = getChromSeq(db, refName); -struct slName **maskSites = getProblematicSites(refName, refGenome->name, refGenome->size); +struct dnaSeq *refGenome = getChromSeq(org, refName, db); +struct slName **maskSites = getProblematicSites(org, refName, refGenome->name, refGenome->size); //#*** TODO: add CGI param option for this almost-never-needed tweak: if (0) { bigTree = parseParsimonyProtobuf(protobufPath); reportTiming(&startTime, "parse protobuf file (at startup, for excluding informativeBases " "from maskSites)"); informativeBasesFromTree(bigTree->tree, maskSites, refGenome->size); reportTiming(&startTime, "remove any informative bases in tree from maskSites"); } boolean isFasta = FALSE; boolean subtreesOnly = FALSE; struct seqInfo *seqInfoList = NULL; if (lfLooksLikeFasta(lf)) { struct slPair *failedSeqs; struct slPair *failedPsls; struct hash *treeNames = NULL; // We need to check uploaded names in fasta only for original usher, not usher-sampled(-server). - if (!serverIsConfigured(refName) && !endsWith(usherPath, "-sampled")) + if (!serverIsConfigured(org) && !endsWith(usherPath, "-sampled")) treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, FALSE, &startTime); - vcfTn = vcfFromFasta(lf, refName, refGenome, maskSites, treeNames, + vcfTn = vcfFromFasta(lf, org, refName, refGenome, maskSites, treeNames, &sampleIds, &seqInfoList, &failedSeqs, &failedPsls, &startTime); if (failedSeqs) { puts("<p>"); struct slPair *fail; for (fail = failedSeqs; fail != NULL; fail = fail->next) printf("%s<br>\n", fail->name); puts("</p>"); } if (failedPsls) { puts("<p>"); struct slPair *fail; for (fail = failedPsls; fail != NULL; fail = fail->next) printf("%s<br>\n", fail->name); @@ -3123,140 +3299,144 @@ vcfTn = checkAndSaveVcf(lf, refGenome, maskSites, &seqInfoList, &sampleIds); reportTiming(&startTime, "check uploaded VCF"); } else { subtreesOnly = TRUE; struct hash *treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, TRUE, &startTime); addAliases(treeNames, aliasFile); reportTiming(&startTime, "load sample name aliases"); sampleIds = readSampleIds(lf, treeNames); reportTiming(&startTime, "look up uploaded sample names"); } lineFileClose(&lf); if (sampleIds == NULL) { - return ctFile; + return; } // Kick off child thread to load metadata simultaneously with running usher or matUtils. pthread_t *metadataPthread = mayStartLoaderPthread(metadataFile, loadMetadataWorker); struct usherResults *results = NULL; if (vcfTn) { fflush(stdout); - results = runUsher(refName, usherPath, protobufPath, vcfTn->forCgi, subtreeSize, &sampleIds, + results = runUsher(org, usherPath, protobufPath, vcfTn->forCgi, subtreeSize, &sampleIds, treeChoices, &startTime); } else if (subtreesOnly) { char *matUtilsPath = getMatUtilsPath(TRUE); results = runMatUtilsExtractSubtrees(refName, matUtilsPath, protobufPath, subtreeSize, sampleIds, treeChoices, &startTime); } struct hash *sampleMetadata = NULL; if (metadataPthread) { pthreadJoin(metadataPthread, (void **)(&sampleMetadata)); reportTiming(&startTime, "wait for sample metadata loading thread to finish"); } else { // We really need metadata -- load it the slow way. sampleMetadata = getSampleMetadata(metadataFile); reportTiming(&startTime, "load sample metadata without pthread"); } if (results && results->singleSubtreeInfo) { if (retSuccess) *retSuccess = TRUE; puts("<p></p>"); - readQcThresholds(refName); + readQcThresholds(org, refName); int subtreeCount = slCount(results->subtreeInfoList); // Sort subtrees by number of user samples (largest first). slSort(&results->subtreeInfoList, subTreeInfoUserSampleCmp); // Make Nextstrain/auspice JSON file for each subtree. - char *bigGenePredFile = phyloPlaceDbSettingPath(refName, "bigGenePredFile"); + char *bigGenePredFile = phyloPlaceRefSettingPath(org, refName, "bigGenePredFile"); struct geneInfo *geneInfoList = getGeneInfoList(bigGenePredFile, refGenome); struct seqWindow *gSeqWin = memSeqWindowNew(refGenome->name, refGenome->dna); struct hash *sampleUrls = hashNew(0); struct tempName *jsonTns[subtreeCount]; struct subtreeInfo *ti; int ix; for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { AllocVar(jsonTns[ix]); char subtreeName[512]; safef(subtreeName, sizeof(subtreeName), "subtreeAuspice%d", ix+1); trashDirFile(jsonTns[ix], "ct", subtreeName, ".json"); - treeToAuspiceJson(ti, refName, geneInfoList, gSeqWin, sampleMetadata, NULL, + treeToAuspiceJson(ti, org, refName, geneInfoList, gSeqWin, sampleMetadata, NULL, results->samplePlacements, jsonTns[ix]->forCgi, source); // Add a link for every sample to this subtree, so the single-subtree JSON can // link to subtree JSONs char *subtreeUrl = nextstrainUrlFromTn(jsonTns[ix]); struct slName *sample; for (sample = ti->subtreeUserSampleIds; sample != NULL; sample = sample->next) hashAdd(sampleUrls, sample->name, subtreeUrl); } struct tempName *singleSubtreeJsonTn; AllocVar(singleSubtreeJsonTn); trashDirFile(singleSubtreeJsonTn, "ct", "singleSubtreeAuspice", ".json"); - treeToAuspiceJson(results->singleSubtreeInfo, refName, geneInfoList, gSeqWin, sampleMetadata, + treeToAuspiceJson(results->singleSubtreeInfo, org, refName, geneInfoList, gSeqWin, sampleMetadata, sampleUrls, results->samplePlacements, singleSubtreeJsonTn->forCgi, source); reportTiming(&startTime, "make Auspice JSON"); struct subtreeInfo *subtreeInfoForButtons = results->subtreeInfoList; if (subtreeCount > MAX_SUBTREE_BUTTONS) subtreeInfoForButtons = NULL; - boolean canDoCustomTracks = (!subtreesOnly && sameString(db, refName)); + char *dbSetting = phyloPlaceRefSetting(org, refName, "db"); + if (dbSetting) + db = connectIfHub(cart, dbSetting); + boolean canDoCustomTracks = (!subtreesOnly && + (sameString(db, refName) || isNotEmpty(dbSetting))); makeButtonRow(singleSubtreeJsonTn, jsonTns, subtreeInfoForButtons, subtreeSize, isFasta, canDoCustomTracks); printf("<p>If you have metadata you wish to display, click a 'view subtree in " "Nextstrain' button, and then you can drag on a CSV file to " "<a href='"NEXTSTRAIN_DRAG_DROP_DOC"' target=_blank>add it to the tree view</a>." "</p>\n"); puts("<p><em>Note: " "The Nextstrain subtree views, and Download files below, are temporary files and will " "expire within two days. " "Please download the Nextstrain subtree JSON files if you will want to view them " "again in the future. The JSON files can be drag-dropped onto " "<a href='https://auspice.us/' target=_blank>https://auspice.us/</a>." "</em></p>"); struct tempName *tsvTn = NULL, *sTsvTn = NULL; struct tempName *zipTn = makeSubtreeZipFile(results, jsonTns, singleSubtreeJsonTn, &startTime); struct tempName *ctTn = NULL; if (subtreesOnly) { summarizeSubtrees(sampleIds, results, sampleMetadata, jsonTns, refName, subtreeSize); reportTiming(&startTime, "describe subtrees"); } else { findNearestNeighbors(results, sampleMetadata); reportTiming(&startTime, "find nearest neighbors"); char *singleSubtreeFile = results->singleSubtreeInfo->subtreeTn->forCgi; struct phyloTree *sampleTree = uploadedSamplesTree(singleSubtreeFile, sampleIds); - if (sameString(db, refName)) + if (canDoCustomTracks) { // Make custom tracks for uploaded samples and subtree(s). - ctTn = writeCustomTracks(db, vcfTn, results, sampleIds, source, fontHeight, - sampleTree, &startTime); + ctTn = writeCustomTracks(org, refName, db, vcfTn, results, sampleIds, source, + tl->fontHeight, sampleTree, &startTime); } // Make a sample summary TSV file and accumulate S gene changes struct hash *seqInfoHash = hashFromSeqInfoListAndIds(seqInfoList, sampleIds); addSampleMutsFromSeqInfo(results->samplePlacements, seqInfoHash); struct hash *spikeChanges = hashNew(0); tsvTn = writeTsvSummary(results, sampleTree, sampleIds, seqInfoHash, geneInfoList, gSeqWin, spikeChanges, &startTime); sTsvTn = writeSpikeChangeSummary(spikeChanges, slCount(sampleIds)); downloadsRow(results->bigTreePlusTn->forHtml, tsvTn->forHtml, sTsvTn->forHtml, zipTn->forHtml); int seqCount = slCount(seqInfoList); if (seqCount <= MAX_SEQ_DETAILS) { @@ -3289,31 +3469,31 @@ } reportTiming(&startTime, "describe placements"); } else printf("<p>(Skipping details; " "you uploaded %d sequences, and details are shown only when " "you upload at most %d sequences.)</p>\n", seqCount, MAX_SEQ_DETAILS); } puts("<h3>Downloads</h3>"); if (! subtreesOnly) { puts("<ul>"); // Offer big tree w/new samples for download - printf("<li><a href='%s' download>SARS-CoV-2 phylogenetic tree " + printf("<li><a href='%s' download>phylogenetic tree " "with your samples (Newick file)</a>\n", results->bigTreePlusTn->forHtml); printf("<li><a href='%s' download>TSV summary of sequences and placements</a>\n", tsvTn->forHtml); printf("<li><a href='%s' download>TSV summary of S (Spike) gene changes</a>\n", sTsvTn->forHtml); } printf("<li><a href='%s' download>ZIP archive of subtree Newick and JSON files</a>\n", zipTn->forHtml); // For now, leave in the individual links so I don't break anybody's pipeline that's // scraping this page... for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds); printf("<li><a href='%s' download>Subtree with %s", ti->subtreeTn->forHtml, ti->subtreeUserSampleIds->name); @@ -3337,19 +3517,234 @@ printf(", %s", sln->name); } puts(" (JSON file)</a>"); } puts("</ul>"); if (ctTn != NULL) { // Notify in opposite order of custom track creation. puts("<h3>Custom tracks for viewing in the Genome Browser</h3>"); printf("<p>Added custom track of uploaded samples.</p>\n"); if (subtreeCount > 0 && subtreeCount <= MAX_SUBTREE_CTS) printf("<p>Added %d subtree custom track%s.</p>\n", subtreeCount, (subtreeCount > 1 ? "s" : "")); ctFile = urlFromTn(ctTn); + // Form submits subtree custom tracks to hgTracks + printf("<form action='%s' name='resultsForm_%s' method=%s>\n\n", + hgTracksName(), db, cartUsualString(cart, "formMethod", "POST")); + cartSaveSession(cart); + cgiMakeHiddenVar("db", db); + cgiMakeHiddenVar(CT_CUSTOM_TEXT_VAR, ctFile); + if (tl->leftLabelWidthChars < 0 || tl->leftLabelWidthChars == leftLabelWidthDefaultChars) + cgiMakeHiddenVar(leftLabelWidthVar, leftLabelWidthForLongNames); + cgiMakeButton("submit", "view in Genome Browser"); + puts("</form>"); + } + } +} + +static char *dumpLfToTrashFile(struct lineFile *lf) +/* Dump the contents of lf to a trash file (for passing to an executable). Return trash file name. */ +{ +struct tempName tmp; +trashDirFile(&tmp, "ct", "usher_tmp", ".txt"); +FILE *f = mustOpen(tmp.forCgi, "w"); +char *line; +int size; +while (lineFileNext(lf, &line, &size)) + { + fputs(line, f); + fputc('\n', f); + } +carefulClose(&f); +return cloneString(tmp.forCgi); +} + +char *runNextcladeSort(char *seqFile, char *nextcladeIndex) +/* Run the nextclade sort command on uploaded seqs, return output TSV file name. */ +{ +char *nextcladePath = getNextcladePath(); +struct tempName tnNextcladeOut; +trashDirFile(&tnNextcladeOut, "ct", "usher_nextclade_sort", ".tsv"); +#define NEXTCLADE_NUM_THREADS "4" +char *cmd[] = { nextcladePath, "sort", seqFile, "-m", nextcladeIndex, + "-j", NEXTCLADE_NUM_THREADS, "-r", tnNextcladeOut.forCgi, NULL }; +char **cmds[] = { cmd, NULL }; +struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, NULL, 0); +pipelineClose(&pl); +return cloneString(tnNextcladeOut.forCgi); +} + +static struct slPair *matchSamplesWithReferences(char *nextcladeIndex, struct lineFile *lf, + struct slName **retNoMatches, int *pStartTime) +/* Save lf's content (in memory from CGI post) to a temporary file so we can run nextclade sort + * to identify the best reference match for each uploaded sequence in lf. + * For each reference identified as the best match for at least one sequence, write all sequences + * for that reference to a different temporary file; return a list of pairs of {reference accession, + * filename that holds the uploaded sequences matched with that reference}. */ +{ +struct slPair *refFiles = NULL; +char *uploadedFile = dumpLfToTrashFile(lf); +reportTiming(pStartTime, "dump uploaded seqs to file"); + +char *nextcladeOut = runNextcladeSort(uploadedFile, nextcladeIndex); +reportTiming(pStartTime, "run nextclade sort"); + +struct lineFile *nlf = lineFileOpen(nextcladeOut, TRUE); +char *row[5]; +// Check header line... the nextclade guys change output columns frequently +if (lineFileRow(nlf, row)) + { + if (differentString(row[1], "seqName") || differentString(row[2], "dataset")) + errAbort("nextclade sort output header format has changed"); } +else + errAbort("nextclade sort output is empty"); +// Build up a hash of sample names to the best matching ref (the first one in the nextclade output +// file; ignore subsequent matches for the same sample), a list of samples that match no ref, and +// a hash of ref to open file handle to which we will write the sequences that matched that ref. +struct slName *noMatches = NULL; +struct hash *sampleRef = hashNew(0); +struct hash *refOpenFileHandles = hashNew(0); +while (lineFileRow(nlf, row)) + { + char *sample = row[1]; + char *ref = row[2]; + if (isEmpty(ref)) + slNameAddHead(&noMatches, sample); + else if (! hashFindVal(sampleRef, sample)) + { + hashAdd(sampleRef, sample, cloneString(ref)); + if (! hashFindVal(refOpenFileHandles, ref)) + { + struct tempName tnRefSamples; + trashDirFile(&tnRefSamples, "ct", "usher_ref_samples", ".txt"); + hashAdd(refOpenFileHandles, ref, mustOpen(tnRefSamples.forCgi, "w")); + slPairAdd(&refFiles, ref, cloneString(tnRefSamples.forCgi)); + } + } + } +lineFileClose(&nlf); +// Now go through the uploaded seqs again to write them out into separate files per ref. +FILE *f = mustOpen(uploadedFile, "r"); +char *nameLine = NULL; +struct dnaSeq *seq = NULL; +while (faReadMixedNext(f, FALSE, "uploaded_seq", TRUE, &nameLine, &seq)) + { + char *seqName = nameLine; + if (nameLine[0] == '>') + seqName = trimSpaces(nameLine+1); + char *ref = hashFindVal(sampleRef, seqName); + if (ref) + { + FILE *f = hashFindVal(refOpenFileHandles, ref); + if (f == NULL) + errAbort("matchSamplesWithReferences: can't find file handle for ref '%s' " + "for sample '%s'", ref, seqName); + faWriteNext(f, seqName, seq->dna, seq->size); + } + dnaSeqFree(&seq); + } +carefulClose(&f); +// Close per-ref file handles +struct hashEl *hel; +struct hashCookie cookie = hashFirst(refOpenFileHandles); +while ((hel = hashNext(&cookie)) != NULL) + { + carefulClose((FILE **)&(hel->val)); + } + +// Clean up +hashFree(&refOpenFileHandles); +freeHashAndVals(&sampleRef); +unlink(uploadedFile); +unlink(nextcladeOut); +if (retNoMatches) + *retNoMatches = noMatches; +reportTiming(pStartTime, "collated refs and samples"); +return refFiles; +} + +static void describeRef(char *org, char *ref, struct dyString *dyRefDesc) +/* Depending on whether name and description are specified in config, overwrite dyRefDesc with + * a description of the reference. */ +{ +char *name = phyloPlaceRefSetting(org, ref, "name"); +char *description = phyloPlaceRefSetting(org, ref, "description"); +dyStringClear(dyRefDesc); +if (isNotEmpty(description)) + { + if (isNotEmpty(name)) + dyStringPrintf(dyRefDesc, "%s %s (%s)", name, description, ref); + else + dyStringPrintf(dyRefDesc, "%s %s", ref, description); + } +else + { + if (isNotEmpty(name)) + dyStringPrintf(dyRefDesc, "%s (%s)", name, ref); + else + dyStringAppend(dyRefDesc, ref); + } +} + +boolean phyloPlaceSamples(struct lineFile *lf, char *db, char *org, char *defaultProtobuf, + boolean doMeasureTiming, int subtreeSize, struct trackLayout *tl, + struct cart *cart) +/* Given a lineFile that contains either FASTA, VCF, or a list of sequence names/ids: + * If FASTA/VCF, then prepare VCF for usher; if that goes well then run usher, report results, + * make custom track files. + * If list of seq names/ids, then attempt to find their full names in the protobuf, run matUtils + * to make subtrees, show subtree results. + * Return TRUE if we were able to get at least some results for the user's input. */ +{ +boolean success = FALSE; +char *nextcladeIndex = phyloPlaceOrgSettingPath(org, "nextcladeIndex"); +if (isNotEmpty(nextcladeIndex)) + { + if (! fileExists(nextcladeIndex)) + errAbort("config for '%s' specifies nextcladeIndex file '%s' but it does not exist", + org, nextcladeIndex); + if (! lfLooksLikeFasta(lf)) + { + char *name = phyloPlaceOrgSetting(org, "name"); + if (isEmpty(name)) + name = org; + char *description = emptyForNull(phyloPlaceOrgSetting(org, "description")); + errAbort("Sorry, only fasta input is supported for %s %s", + name, description); + } + struct slName *noMatches = NULL; + int startTime = clock1000(); + struct slPair *refFiles = matchSamplesWithReferences(nextcladeIndex, lf, &noMatches, &startTime); + lineFileClose(&lf); + if (noMatches != NULL) + { + printf("<br>No reference was found for the following sequences:\n<ul>\n"); + struct slName *noMatch; + for (noMatch = noMatches; noMatch != NULL; noMatch = noMatch->next) + printf("<li>%s\n", noMatch->name); + puts("</ul>"); + } + struct dyString *dyRefDesc = dyStringNew(0); + struct slPair *ref; + for (ref = refFiles; ref != NULL; ref = ref->next) + { + if (ref != refFiles) + puts("<hr>"); + describeRef(org, ref->name, dyRefDesc); + printf("<h2>Sequence(s) matched reference %s</h2>\n", dyRefDesc->string); + struct lineFile *rlf = lineFileOpen(ref->val, TRUE); + phyloPlaceSamplesOneRef(rlf, db, org, ref->name, defaultProtobuf, + doMeasureTiming, subtreeSize, tl, cart, &success); + } + puts("<br>"); + } +else + { + // No nextcladeIndex means this is the old-style single-reference setup (i.e. SARS-CoV-2). + phyloPlaceSamplesOneRef(lf, db, org, org, defaultProtobuf, doMeasureTiming, subtreeSize, + tl, cart, &success); } -return ctFile; +return success; }