1b20a1774021cdcb07303801d55b7ca3fa6cff09
braney
Wed Jan 8 05:54:20 2025 -0800
fix up chop functions so they don't call ArraySize with NULL
diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index 1314d28..bae73ed 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -1,4039 +1,4039 @@
/* Place SARS-CoV-2 sequences in phylogenetic tree using usher program. */
/* Copyright (C) 2020-2024 The Regents of the University of California */
#include "common.h"
#include "bigBed.h"
#include "cart.h"
#include "cheapcgi.h"
#include "errCatch.h"
#include "fa.h"
#include "genePred.h"
#include "grp.h"
#include "hCommon.h"
#include "hash.h"
#include "hgConfig.h"
#include "htmshell.h"
#include "hubConnect.h"
#include "hui.h"
#include "iupac.h"
#include "jsHelper.h"
#include "linefile.h"
#include "mmHash.h"
#include "obscure.h"
#include "parsimonyProto.h"
#include "phyloPlace.h"
#include "phyloTree.h"
#include "pipeline.h"
#include "psl.h"
#include "pthreadWrap.h"
#include "ra.h"
#include "regexHelper.h"
#include "sessionData.h"
#include "trackHub.h"
#include "trashDir.h"
#include "vcf.h"
// Globals:
static boolean measureTiming = FALSE;
// Parameter constants:
int maxGenotypes = 1000; // Upper limit on number of samples user can upload at once.
boolean showParsimonyScore = FALSE;
int minSamplesForOwnTree = 3; // If user uploads at least this many samples, show tree for them.
char *leftLabelWidthForLongNames = "55";// Leave plenty of room for tree and long virus strain names
char *phyloPlaceOrgSetting(char *org, char *settingName)
/* Return cloned setting value if found in hgPhyloPlaceData//organism.ra or
* old-style hgPhyloPlaceData//config.ra, or NULL if not found. */
{
static struct hash *orgHashes = NULL;
if (orgHashes == NULL)
orgHashes = hashNew(0);
char *orgSkipHub = trackHubSkipHubName(org);
struct hash *orgHash = hashFindVal(orgHashes, orgSkipHub);
if (orgHash == NULL)
{
char raFile[1024];
safef(raFile, sizeof raFile, PHYLOPLACE_DATA_DIR "/%s/organism.ra", orgSkipHub);
if (fileExists(raFile))
orgHash = raReadSingle(raFile);
else
{
safef(raFile, sizeof raFile, PHYLOPLACE_DATA_DIR "/%s/config.ra", orgSkipHub);
if (fileExists(raFile))
orgHash = raReadSingle(raFile);
}
if (orgHash == NULL)
errAbort("phyloPlaceOrgSetting: can't find organism.ra or config.ra for '%s'", orgSkipHub);
hashAdd(orgHashes, orgSkipHub, orgHash);
}
return cloneString(hashFindVal(orgHash, settingName));
}
// TODO: libify
INLINE boolean isUrl(char *url)
{
return (startsWith("http://", url) || startsWith("https://", url) || startsWith("ftp://", url));
}
char *phyloPlaceOrgSettingPath(char *org, char *settingName)
/* Return cgi-bin-relative path to a file named by a setting for org, or NULL if not found. */
{
char *fileName = phyloPlaceOrgSetting(org, settingName);
if (isNotEmpty(fileName) && fileName[0] != '/' && !isUrl(fileName) && !fileExists(fileName))
{
struct dyString *dy = dyStringCreate(PHYLOPLACE_DATA_DIR "/%s/%s",
trackHubSkipHubName(org), fileName);
if (fileExists(dy->string))
return dyStringCannibalize(&dy);
else
return NULL;
}
return fileName;
}
char *phyloPlaceRefSetting(char *org, char *ref, char *settingName)
/* Return cloned setting value if found in hgPhyloPlaceData///reference.ra or
* old-style hgPhyloPlaceData//config.ra, or NULL if not found. */
{
static struct hash *refHashes = NULL;
if (refHashes == NULL)
refHashes = hashNew(0);
char *orgSkipHub = trackHubSkipHubName(org);
char *refSkipHub = trackHubSkipHubName(ref);
char orgRefKey[strlen(orgSkipHub) + strlen(refSkipHub) + 16];
safef(orgRefKey, sizeof orgRefKey, "%s|%s", orgSkipHub, refSkipHub);
struct hash *refHash = hashFindVal(refHashes, orgRefKey);
if (refHash == NULL)
{
char raFile[1024];
safef(raFile, sizeof raFile, PHYLOPLACE_DATA_DIR "/%s/%s/reference.ra", orgSkipHub, refSkipHub);
if (fileExists(raFile))
refHash = raReadSingle(raFile);
else
{
safef(raFile, sizeof raFile, PHYLOPLACE_DATA_DIR "/%s/config.ra", refSkipHub);
if (fileExists(raFile))
refHash = raReadSingle(raFile);
}
if (refHash == NULL)
errAbort("phyloPlaceOrgSetting: can't find "PHYLOPLACE_DATA_DIR"/%s/%s/reference.ra or "
PHYLOPLACE_DATA_DIR"/%s/config.ra", orgSkipHub, refSkipHub, refSkipHub);
hashAdd(refHashes, orgRefKey, refHash);
}
return cloneString(hashFindVal(refHash, settingName));
}
char *phyloPlaceRefSettingPath(char *org, char *ref, char *settingName)
/* Return cgi-bin-relative path to a file named by a setting from
* hgPhyloPlaceData///reference.ra or old-style hgPhyloPlaceData//config.ra,
* or NULL if not found. */
{
char *fileName = phyloPlaceRefSetting(org, ref, settingName);
if (isNotEmpty(fileName) && fileName[0] != '/' && !isUrl(fileName) && !fileExists(fileName))
{
struct dyString *dy = dyStringCreate(PHYLOPLACE_DATA_DIR "/%s/%s/%s",
org, trackHubSkipHubName(ref), fileName);
if (fileExists(dy->string))
return dyStringCannibalize(&dy);
else
{
dyStringClear(dy);
dyStringPrintf(dy, PHYLOPLACE_DATA_DIR "/%s/%s",
trackHubSkipHubName(ref), fileName);
if (fileExists(dy->string))
return dyStringCannibalize(&dy);
else
return NULL;
}
}
return fileName;
}
static char *connectIfHub(struct cart *cart, char *db)
/* If db is an "undecorated" hub name (e.g. GCF_... without the hub_... prefix), then connect to
* it if it isn't already connected. Return the complete hub name (with hub_... prefix) if
* connected successfully, or just db otherwise. */
{
char *maybeHubDb = db;
if (! hDbExists(db))
{
// Not a db -- see if it's a hub that is already connected:
struct trackHubGenome *hubGenome = trackHubGetGenomeUndecorated(db);
if (hubGenome != NULL)
maybeHubDb = hubGenome->name;
else
{
// Not connected to session currently. If the name looks like an NCBI Assembly ID
// then try connecting to the corresponding UCSC assembly hub.
regmatch_t substrs[5];
if (regexMatchSubstr(db, "^(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})\\.[0-9]$",
substrs, ArraySize(substrs)))
{
char gcPrefix[4], first3[4], mid3[4], last3[4];
regexSubstringCopy(db, substrs[1], gcPrefix, sizeof gcPrefix);
regexSubstringCopy(db, substrs[2], first3, sizeof first3);
regexSubstringCopy(db, substrs[3], mid3, sizeof mid3);
regexSubstringCopy(db, substrs[4], last3, sizeof last3);
struct dyString *dy = dyStringCreate("https://hgdownload.soe.ucsc.edu/hubs/%s/%s/%s/"
"%s/%s/hub.txt",
gcPrefix, first3, mid3, last3, db);
// Use cart variables to pretend user clicked to connect to this hub.
cartSetString(cart, hgHubDataText, dy->string);
cartSetString(cart, hgHubGenome, db);
struct errCatch *errCatch = errCatchNew();
char *hubDb = NULL;
if (errCatchStart(errCatch))
{
hubDb = hubConnectLoadHubs(cart);
}
errCatchEnd(errCatch);
maybeHubDb = hubDb;
if (errCatch->gotError)
warn("%s", errCatch->message->string);
}
}
}
return maybeHubDb;
}
static char *finalDirName(char *path)
/* Clone & return the final directory name from a path to a file. */
{
char dir[PATH_LEN], name[FILENAME_LEN], extension[FILEEXT_LEN];
splitPath(path, dir, name, extension);
if (endsWith(dir, "/"))
dir[strlen(dir)-1] = '\0';
char *finalDir = strrchr(dir, '/');
if (finalDir == NULL)
finalDir = dir;
else
finalDir++;
return cloneString(finalDir);
}
static int labelCmp(const void *va, const void *vb)
/* Compare two slPairs on their string values -- but treat SARS-CoV-2 as more important. */
{
const struct slPair *a = *((struct slPair **)va);
const struct slPair *b = *((struct slPair **)vb);
if (sameString((char *)(a->val), "SARS-CoV-2"))
return -1;
else if (sameString((char *)(b->val), "SARS-CoV-2"))
return 1;
return strcmp((char *)(a->val), (char *)(b->val));
}
struct slPair *phyloPlaceOrgList(struct cart *cart)
/* Each subdirectory of PHYLOPLACE_DATA_DIR that contains an organism.ra file is a collection of
* reference sequences that uploaded sequences will be matched against using nextclade sort.
* Some of those references might also be dbs or track hub names (without the hub_number_ prefix).
* Each subdirectory of PHYLOPLACE_DATA_DIR that contains a config.ra file contains a single
* reference which might also be a db or track hub name (without the hub_number_ prefix).
* Return a list of {name, label} pairs, SARS-CoV-2 first, combining the two categories. */
{
struct slPair *orgList = NULL;
// I was hoping the pattern would be wildMatch'd only against filenames so I could use "*.ra",
// but both directories and files must match the pattern, so must use "*".
struct slName *dataDirPaths = pathsInDirAndSubdirs(PHYLOPLACE_DATA_DIR, "*");
struct slName *path;
for (path = dataDirPaths; path != NULL; path = path->next)
{
if (endsWith(path->name, "/organism.ra"))
{
// Use the directory name as symbol for this collection of reference sequences, get label
// and add {symbol, label} to list.
char *org = finalDirName(path->name);
char *label = phyloPlaceOrgSetting(org, "name");
if (isEmpty(label))
label = org;
char *description = phyloPlaceOrgSetting(org, "description");
if (isNotEmpty(description))
{
struct dyString *dy = dyStringCreate("%s %s", label, description);
label = dyStringCannibalize(&dy);
}
slAddHead(&orgList, slPairNew(org, label));
}
else if (endsWith(path->name, "/config.ra"))
{
// Old-style single-reference directory with config.ra instead of organism.ra; get label
// from config.ra and add {symbol, label} to list.
char *db = finalDirName(path->name);
char *label = phyloPlaceRefSetting(db, db, "name");
if (isEmpty(label))
label = hGenome(db);
char *description = phyloPlaceRefSetting(db, db, "description");
if (isNotEmpty(description))
{
struct dyString *dy = dyStringCreate("%s %s", label, description);
label = dyStringCannibalize(&dy);
}
// If it's a hub, use its full hub_... name:
char *maybeHubDb = connectIfHub(cart, db);
if (maybeHubDb == NULL)
maybeHubDb = db;
slAddHead(&orgList, slPairNew(maybeHubDb, label));
}
}
// Sort by label, putting SARS-CoV-2 first and then others in alphabetical order.
slSort(&orgList, labelCmp);
return orgList;
}
char *getUsherPath(boolean abortIfNotFound)
/* Return hgPhyloPlaceData/usher-sampled if it exists, else try hgPhyloPlaceData/usher, else NULL.
* Do not free the returned value. */
{
char *usherPath = PHYLOPLACE_DATA_DIR "/usher-sampled";
if (fileExists(usherPath))
return usherPath;
else
{
usherPath = PHYLOPLACE_DATA_DIR "/usher";
if (fileExists(usherPath))
return usherPath;
}
if (abortIfNotFound)
errAbort("Missing usher executable (expected to be at %s)", usherPath);
return NULL;
}
char *getMatUtilsPath(boolean abortIfNotFound)
/* Return hgPhyloPlaceData/matUtils if it exists, else NULL. Do not free the returned value. */
{
char *matUtilsPath = PHYLOPLACE_DATA_DIR "/matUtils";
if (fileExists(matUtilsPath))
return matUtilsPath;
else if (abortIfNotFound)
errAbort("Missing matUtils executable (expected to be at %s)", matUtilsPath);
return NULL;
}
char *getNextcladePath()
/* Return hgPhyloPlaceData/nextclade if it exists, else errAbort. Do not free the returned value. */
{
char *nextcladePath = PHYLOPLACE_DATA_DIR "/nextclade";
if (fileExists(nextcladePath))
return nextcladePath;
else
errAbort("Missing nextclade executable (expected to be at %s)", nextcladePath);
return NULL;
}
//#*** This needs to go in a lib so CGIs know whether to include it in the menu. needs better name.
boolean hgPhyloPlaceEnabled()
/* Return TRUE if hgPhyloPlace is enabled in hg.conf and db wuhCor1 exists. */
{
char *cfgSetting = cfgOption("hgPhyloPlaceEnabled");
boolean isEnabled = (isNotEmpty(cfgSetting) &&
differentWord(cfgSetting, "off") && differentWord(cfgSetting, "no"));
//#*** TODO -- make less wuhCor1-specific
return (isEnabled && hDbExists("wuhCor1"));
}
static void addPathIfNecessary(struct dyString *dy, char *org, char *ref, char *fileName)
/* If fileName exists, copy it into dy, else try hgPhyloPlaceData///fileName */
{
dyStringClear(dy);
if (fileExists(fileName))
dyStringAppend(dy, fileName);
else
{
dyStringPrintf(dy, PHYLOPLACE_DATA_DIR "/%s/%s/%s", org, trackHubSkipHubName(ref), fileName);
if (! fileExists(dy->string))
{
dyStringClear(dy);
dyStringPrintf(dy, PHYLOPLACE_DATA_DIR "/%s/%s", trackHubSkipHubName(ref), fileName);
}
}
}
struct treeChoices *loadTreeChoices(char *org, char *ref)
/* If config specifies a treeChoices file, load it up, else return NULL. */
{
struct treeChoices *treeChoices = NULL;
char *filename = phyloPlaceRefSettingPath(org, ref, "treeChoices");
if (isNotEmpty(filename) && fileExists(filename))
{
AllocVar(treeChoices);
int maxChoices = 128;
AllocArray(treeChoices->protobufFiles, maxChoices);
AllocArray(treeChoices->metadataFiles, maxChoices);
AllocArray(treeChoices->sources, maxChoices);
AllocArray(treeChoices->descriptions, maxChoices);
AllocArray(treeChoices->aliasFiles, maxChoices);
AllocArray(treeChoices->sampleNameFiles, maxChoices);
struct lineFile *lf = lineFileOpen(filename, TRUE);
char *line;
while (lineFileNextReal(lf, &line))
{
char *words[7];
int wordCount = chopTabs(line, words);
lineFileExpectAtLeast(lf, 4, wordCount);
if (treeChoices->count >= maxChoices)
{
warn("File %s has too many lines, only showing first %d phylogenetic tree choices",
filename, maxChoices);
break;
}
struct dyString *dy = dyStringNew(0);
addPathIfNecessary(dy, org, ref, words[0]);
treeChoices->protobufFiles[treeChoices->count] = cloneString(dy->string);
addPathIfNecessary(dy, org, ref, words[1]);
treeChoices->metadataFiles[treeChoices->count] = cloneString(dy->string);
treeChoices->sources[treeChoices->count] = cloneString(words[2]);
// Description can be either a file or just some text.
addPathIfNecessary(dy, org, ref, words[3]);
if (fileExists(dy->string))
{
char *desc = NULL;
readInGulp(dy->string, &desc, NULL);
treeChoices->descriptions[treeChoices->count] = desc;
}
else
treeChoices->descriptions[treeChoices->count] = cloneString(words[3]);
if (wordCount > 4)
{
addPathIfNecessary(dy, org, ref, words[4]);
treeChoices->aliasFiles[treeChoices->count] = cloneString(dy->string);
}
if (wordCount > 5)
{
addPathIfNecessary(dy, org, ref, words[5]);
treeChoices->sampleNameFiles[treeChoices->count] = cloneString(dy->string);
}
treeChoices->count++;
dyStringFree(&dy);
}
lineFileClose(&lf);
}
return treeChoices;
}
static char *urlFromTn(struct tempName *tn)
/* Make a full URL to a trash file that our net.c code will be able to follow, for when we can't
* just leave it up to the user's web browser to do the right thing with "../". */
{
struct dyString *dy = dyStringCreate("%s%s", hLocalHostCgiBinUrl(), tn->forHtml);
char *url = dyStringCannibalize(&dy);
int size = strlen(url) + 1;
strSwapStrs(url, size, "cgi-bin/../", "");
return url;
}
void reportTiming(int *pStartTime, char *message)
/* Print out a report to stderr of how much time something took. */
{
if (measureTiming)
{
int now = clock1000();
fprintf(stderr, "%dms to %s\n", now - *pStartTime, message);
*pStartTime = now;
}
}
static boolean lfLooksLikeFasta(struct lineFile *lf)
/* Peek at file to see if it looks like FASTA, i.e. begins with a >header. */
{
boolean hasFastaHeader = FALSE;
char *line;
if (lineFileNext(lf, &line, NULL))
{
if (line[0] == '>')
hasFastaHeader = TRUE;
lineFileReuse(lf);
}
return hasFastaHeader;
}
static void rInformativeBasesFromTree(struct phyloTree *node, boolean *informativeBases)
/* For each variant associated with a non-leaf node, set informativeBases[chromStart]. */
{
if (node->numEdges > 0)
{
if (node->priv)
{
struct singleNucChange *snc, *sncs = node->priv;
for (snc = sncs; snc != NULL; snc = snc->next)
informativeBases[snc->chromStart] = TRUE;
}
int i;
for (i = 0; i < node->numEdges; i++)
rInformativeBasesFromTree(node->edges[i], informativeBases);
}
}
static boolean *informativeBasesFromTree(struct phyloTree *bigTree, struct slName **maskSites,
int chromSize)
/* Return an array indexed by reference position with TRUE at positions that have a non-leaf
* variant in bigTree. If a position is in maskSites but is informative in bigTree then remove
* it from maskSites because it was not masked (yet) back when the tree was built. */
{
boolean *informativeBases;
AllocArray(informativeBases, chromSize);
if (bigTree)
{
rInformativeBasesFromTree(bigTree, informativeBases);
int i;
for (i = 0; i < chromSize; i++)
{
if (maskSites[i] && informativeBases[i])
maskSites[i] = NULL;
}
}
return informativeBases;
}
static boolean lfLooksLikeVcf(struct lineFile *lf)
/* Peek at file to see if it looks like VCF, i.e. begins with a ##fileformat=VCF header. */
{
boolean hasVcfHeader = FALSE;
char *line;
if (lineFileNext(lf, &line, NULL))
{
if (startsWith("##fileformat=VCF", line))
hasVcfHeader = TRUE;
lineFileReuse(lf);
}
return hasVcfHeader;
}
static struct tempName *checkAndSaveVcf(struct lineFile *lf, struct dnaSeq *refGenome,
struct slName **maskSites, struct seqInfo **retSeqInfoList,
struct slName **retSampleIds)
/* Save the contents of lf to a trash file. If it has a reasonable number of genotype columns
* with recognizable genotypes, and the coordinates seem to be in range, then return the path
* to the trash file. Otherwise complain and return NULL. */
{
struct tempName *tn;
AllocVar(tn);
trashDirFile(tn, "ct", "ct_pp", ".vcf");
FILE *f = mustOpen(tn->forCgi, "w");
struct seqInfo *seqInfoList = NULL;
struct slName *sampleIds = NULL;
struct errCatch *errCatch = errCatchNew();
if (errCatchStart(errCatch))
{
char *line;
int lineSize;
int sampleCount = 0;
while (lineFileNext(lf, &line, &lineSize))
{
if (startsWith("#CHROM\t", line))
{
//#*** TODO: if the user uploads a sample with the same ID as one already in the
//#*** saved assignment file, then usher will ignore it!
//#*** Better check for that and warn the user.
- int colCount = chopTabs(line, NULL);
+ int colCount = chopTabsLen(line);
if (colCount == 1)
{
lineFileAbort(lf, "VCF requires tab-separated columns, but no tabs found");
}
sampleCount = colCount - VCF_NUM_COLS_BEFORE_GENOTYPES;
if (sampleCount < 1 || sampleCount > maxGenotypes)
{
if (sampleCount < 1)
lineFileAbort(lf, "VCF header #CHROM line has %d columns; expecting at least %d "
"columns including sample IDs for genotype columns",
colCount, 10);
else
lineFileAbort(lf, "VCF header #CHROM line defines %d samples but only up to %d "
"are supported",
sampleCount, maxGenotypes);
}
char lineCopy[lineSize+1];
safecpy(lineCopy, sizeof lineCopy, line);
char *words[colCount];
chopTabs(lineCopy, words);
struct hash *uniqNames = hashNew(0);
int i;
for (i = VCF_NUM_COLS_BEFORE_GENOTYPES; i < colCount; i++)
{
if (hashLookup(uniqNames, words[i]))
lineFileAbort(lf, "VCF sample names in #CHROM line must be unique, but '%s' "
"appears more than once", words[i]);
hashAdd(uniqNames, words[i], NULL);
slNameAddHead(&sampleIds, words[i]);
struct seqInfo *si;
AllocVar(si);
si->seq = cloneDnaSeq(refGenome);
si->seq->name = cloneString(words[i]);
slAddHead(&seqInfoList, si);
}
slReverse(&seqInfoList);
slReverse(&sampleIds);
hashFree(&uniqNames);
fputs(line, f);
fputc('\n', f);
}
else if (line[0] == '#')
{
fputs(line, f);
fputc('\n', f);
}
else
{
if (sampleCount < 1)
{
lineFileAbort(lf, "VCF header did not include #CHROM line defining sample IDs for "
"genotype columns");
}
- int colCount = chopTabs(line, NULL);
+ int colCount = chopTabsLen(line);
int genotypeCount = colCount - VCF_NUM_COLS_BEFORE_GENOTYPES;
if (genotypeCount != sampleCount)
{
lineFileAbort(lf, "VCF header defines %d samples but there are %d genotype columns",
sampleCount, genotypeCount);
}
char *words[colCount];
chopTabs(line, words);
//#*** TODO: check that POS is sorted
int pos = strtol(words[1], NULL, 10);
if (pos > refGenome->size)
{
lineFileAbort(lf, "VCF POS value %d exceeds size of reference sequence (%d)",
pos, refGenome->size);
}
// make sure REF value (if given) matches reference genome
int chromStart = pos - 1;
struct slName *maskedReasons = maskSites[chromStart];
char *ref = words[3];
if (strlen(ref) != 1)
{
// Not an SNV -- skip it.
//#*** should probably report or at least count these...
continue;
}
char refBase = toupper(refGenome->dna[chromStart]);
if (ref[0] == '*' || ref[0] == '.')
ref[0] = refBase;
else if (ref[0] != refBase)
lineFileAbort(lf, "VCF REF value at position %d is '%s', expecting '%c' "
"(or '*' or '.')",
pos, ref, refBase);
char altStrCopy[strlen(words[4])+1];
safecpy(altStrCopy, sizeof altStrCopy, words[4]);
char *alts[strlen(words[4])+1];
chopCommas(altStrCopy, alts);
//#*** Would be nice to trim out indels from ALT column -- but that would require
//#*** adjusting genotype codes below.
struct seqInfo *si = seqInfoList;
int i;
for (i = VCF_NUM_COLS_BEFORE_GENOTYPES; i < colCount; i++, si = si->next)
{
if (words[i][0] != '.' && !isdigit(words[i][0]))
{
lineFileAbort(lf, "VCF genotype columns must contain numeric allele codes; "
"can't parse '%s'", words[i]);
}
else
{
if (words[i][0] == '.')
{
si->seq->dna[chromStart] = 'n';
si->nCountMiddle++;
}
else
{
int alIx = atol(words[i]);
if (alIx > 0)
{
char *alt = alts[alIx-1];
if (strlen(alt) == 1)
{
si->seq->dna[chromStart] = alt[0];
struct singleNucChange *snc = sncNew(chromStart, ref[0], '\0',
alt[0]);
if (maskedReasons)
{
slAddHead(&si->maskedSncList, snc);
slAddHead(&si->maskedReasonsList, slRefNew(maskedReasons));
}
else
{
if (isIupacAmbiguous(alt[0]))
si->ambigCount++;
slAddHead(&si->sncList, snc);
}
}
}
}
}
}
if (!maskedReasons)
{
fputs(refGenome->name, f);
for (i = 1; i < colCount; i++)
{
fputc('\t', f);
fputs(words[i], f);
}
fputc('\n', f);
}
}
}
}
errCatchEnd(errCatch);
carefulClose(&f);
if (errCatch->gotError)
{
warn("%s", errCatch->message->string);
unlink(tn->forCgi);
freez(&tn);
}
errCatchFree(&errCatch);
struct seqInfo *si;
for (si = seqInfoList; si != NULL; si = si->next)
slReverse(&si->sncList);
*retSeqInfoList = seqInfoList;
*retSampleIds = sampleIds;
return tn;
}
static void addSampleMutsFromSeqInfo(struct hash *samplePlacements, struct hash *seqInfoHash)
/* We used to get placementInfo->sampleMuts from DEBUG mode usher output in runUsher.c, but
* the DEBUG mode output went away with the change to server-mode usher. Instead, now we fill
* it in from seqInfo->sncList which has the same info. */
{
struct hashEl *hel;
struct hashCookie cookie = hashFirst(samplePlacements);
while ((hel = hashNext(&cookie)) != NULL)
{
struct placementInfo *pi = hel->val;
struct seqInfo *si = hashFindVal(seqInfoHash, pi->sampleId);
if (si)
{
struct slName *sampleMuts = NULL;
struct singleNucChange *snc;
for (snc = si->sncList; snc != NULL; snc = snc->next)
{
char mutStr[64];
safef(mutStr, sizeof mutStr, "%c%d%c", snc->refBase, snc->chromStart+1, snc->newBase);
slNameAddHead(&sampleMuts, mutStr);
}
slReverse(&sampleMuts);
pi->sampleMuts = sampleMuts;
}
else
{
errAbort("addSampleMutsFromSeqInfo: no seqInfo for placed sequence '%s'", pi->sampleId);
}
}
}
static void beginCollapsibleSpan()
/* Make a button to expand a section that is hidden by default. */
{
static int seqNum = 0;
char idBase[32];
safef(idBase, sizeof idBase, "collapsible_%d", seqNum);
char buttonId[64];
safef(buttonId, sizeof buttonId, "%s_button", idBase);
printf("\n",
buttonId);
jsOnEventByIdF("click", buttonId, "let button = document.getElementById('%s'); "
"let container = document.getElementById('%s'); "
"if (button && container) {"
" let src = button.getAttribute('src');"
" if (src) {"
" if (src.indexOf('remove') > 0) {"
" container.style.display = 'none';"
" button.setAttribute('src', src.replace('/remove', '/add'));"
" } else {"
" container.style.display = 'block';"
" button.setAttribute('src', src.replace('/add', '/remove'));"
" }"
" }"
" return false;"
"} return true;",
buttonId, idBase);
printf("\n", idBase);
seqNum++;
}
static void endCollapsibleSpan()
{
puts("");
}
static void displaySampleMuts(struct placementInfo *info, char *refAcc)
{
printf("
Differences from the reference genome "
"(%s): ",
refAcc, refAcc);
if (info->sampleMuts == NULL)
printf("(None; identical to reference)");
else
{
boolean makeCollapsible = (slCount(info->sampleMuts) > 20);
if (makeCollapsible)
beginCollapsibleSpan();
struct slName *sln;
for (sln = info->sampleMuts; sln != NULL; sln = sln->next)
{
if (sln != info->sampleMuts)
printf(", ");
printf("%s", sln->name);
}
if (makeCollapsible)
endCollapsibleSpan();
}
puts("
");
}
static int variantPathCountMuts(struct variantPathNode *variantPath)
/* Return the total count of mutations along variantPath. */
{
int mutCount = 0;
struct variantPathNode *vpn;
for (vpn = variantPath; vpn != NULL; vpn = vpn->next)
{
mutCount += slCount(vpn->sncList);
}
return mutCount;
}
boolean isInternalNodeName(char *nodeName, int minNewNode)
/* Return TRUE if nodeName looks like an internal node ID from the protobuf tree, i.e. is numeric
* or _ and, if minNewNode > 0, number is less than minNewNode. */
{
if (startsWith(USHER_NODE_PREFIX, nodeName))
nodeName += strlen(USHER_NODE_PREFIX);
return isAllDigits(nodeName) && (minNewNode <= 0 || (atoi(nodeName) < minNewNode));
}
static void variantPathPrint(struct variantPathNode *variantPath)
/* Print out a variantPath; print nodeName only if non-numeric
* (i.e. a sample ID not internal node) */
{
struct variantPathNode *vpn;
for (vpn = variantPath; vpn != NULL; vpn = vpn->next)
{
if (vpn != variantPath)
printf(" > ");
if (!isInternalNodeName(vpn->nodeName, 0))
printf("%s: ", vpn->nodeName);
struct singleNucChange *snc;
for (snc = vpn->sncList; snc != NULL; snc = snc->next)
{
if (snc != vpn->sncList)
printf(", ");
printf("%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase);
}
}
}
static void displayVariantPath(struct variantPathNode *variantPath, char *sampleId)
/* Display mutations on the path to this sample. */
{
printf("
Mutations along the path from the root of the phylogenetic tree to %s:\n",
sampleId);
if (variantPath)
{
boolean makeCollapsible = (variantPathCountMuts(variantPath) > 20);
if (makeCollapsible)
beginCollapsibleSpan();
puts(" ");
variantPathPrint(variantPath);
if (makeCollapsible)
endCollapsibleSpan();
puts(" ");
}
else
puts("(None; your sample was placed at the root of the phylogenetic tree)");
puts("
");
}
static int mutCountCmp(const void *a, const void *b)
/* Compare number of mutations of phyloTree nodes a and b. */
{
const struct phyloTree *nodeA = *(struct phyloTree * const *)a;
const struct phyloTree *nodeB = *(struct phyloTree * const *)b;
return slCount(nodeA->priv) - slCount(nodeB->priv);
}
static char *leafWithLeastMuts(struct phyloTree *node, struct hash *excludeHash)
/* If node has a leaf child (not in excludeHash) with no mutations of its own, return that leaf name.
* Otherwise, if node has leaf children, return the name of the leaf with the fewest mutations.
* Otherwise return NULL. */
{
char *leafName = NULL;
int leafCount = 0;
int i;
for (i = 0; i < node->numEdges; i++)
{
struct phyloTree *kid = node->edges[i];
if (kid->numEdges == 0 && !hashLookup(excludeHash, kid->ident->name))
{
leafCount++;
struct singleNucChange *kidMuts = kid->priv;
if (!kidMuts)
{
leafName = cloneString(kid->ident->name);
break;
}
}
}
if (leafName == NULL && leafCount)
{
// Pick the leaf with the fewest mutations.
struct phyloTree *leafKids[leafCount];
int leafIx = 0;
for (i = 0; i < node->numEdges; i++)
{
struct phyloTree *kid = node->edges[i];
if (kid->numEdges == 0 && !hashLookup(excludeHash, kid->ident->name))
leafKids[leafIx++] = kid;
}
qsort(leafKids, leafCount, sizeof(leafKids[0]), mutCountCmp);
leafName = cloneString(leafKids[0]->ident->name);
}
return leafName;
}
static char *findNearestNeighbor(struct phyloTree *tree, char *sampleId,
struct hash *samplePlacements)
/* Find sampleId's parent node in bigTree, then look for most similar leaf sibling(s),
* excluding other uploaded samples (which can be found in samplePlacements). */
{
struct phyloTree *sampleNode = phyloFindName(tree, sampleId);
if (!sampleNode || ! sampleNode->parent)
return NULL;
struct phyloTree *node = sampleNode->parent;
char *nearestNeighbor = leafWithLeastMuts(node, samplePlacements);
while (nearestNeighbor == NULL && node->parent != NULL)
{
node = node->parent;
nearestNeighbor = leafWithLeastMuts(node, samplePlacements);
}
return nearestNeighbor;
}
static void printVariantPathNoNodeNames(FILE *f, struct variantPathNode *variantPath)
/* Print out variant path with no node names (even if non-numeric) to f. */
{
struct variantPathNode *vpn;
for (vpn = variantPath; vpn != NULL; vpn = vpn->next)
{
if (vpn != variantPath)
fprintf(f, " > ");
struct singleNucChange *snc;
for (snc = vpn->sncList; snc != NULL; snc = snc->next)
{
if (snc != vpn->sncList)
fprintf(f, ", ");
fprintf(f, "%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase);
}
}
}
static void getSampleMetadataHash(char *metadataFile, struct sampleMetadataStore *sms)
/* If config.ra defines a metadataFile, load its contents into a hash indexed by name (and INSDC
* nucleotide accession sans dot if present) and return the hash; otherwise return NULL. */
{
if (isNotEmpty(metadataFile) && fileExists(metadataFile))
{
struct hash *sampleMetadata = hashNew(0);
struct lineFile *lf = lineFileOpen(metadataFile, TRUE);
int headerWordCount = 0;
char **headerWords = NULL;
char *line;
// Check for header line
if (lineFileNext(lf, &line, NULL))
{
if (startsWithWord("strain", line))
{
char *headerLine = cloneString(line);
headerWordCount = chopByChar(headerLine, '\t', NULL, 0);
AllocArray(headerWords, headerWordCount);
chopByChar(headerLine, '\t', headerWords, headerWordCount);
}
else
errAbort("Missing header line from metadataFile %s", metadataFile);
}
// Look for genbank_accession in header line so we can index by that as well
int genbankIx = stringArrayIx("genbank_accession", headerWords, headerWordCount);
while (lineFileNext(lf, &line, NULL))
{
char **columnValues = NULL;
AllocArray(columnValues, headerWordCount);
char *lineCopy = cloneString(line);
int wordCount = chopByChar(lineCopy, '\t', columnValues, headerWordCount);
lineFileExpectWords(lf, headerWordCount, wordCount);
if (genbankIx >= 0 && !isEmpty(columnValues[genbankIx]) &&
!sameString("?", columnValues[genbankIx]))
{
if (strchr(columnValues[genbankIx], '.'))
{
// Index by versionless accession
char copy[strlen(columnValues[genbankIx])+1];
safecpy(copy, sizeof copy, columnValues[genbankIx]);
char *dot = strchr(copy, '.');
*dot = '\0';
hashAdd(sampleMetadata, copy, columnValues);
}
else
hashAdd(sampleMetadata, columnValues[genbankIx], columnValues);
}
hashAdd(sampleMetadata, columnValues[0], columnValues);
}
lineFileClose(&lf);
sms->hash = sampleMetadata;
sms->columnCount = headerWordCount;
sms->columnNames = headerWords;
}
}
static struct sampleMetadataStore *getSampleMetadata(char *metadataFile)
/* If config.ra defines a metadataFile, load its contents into a hash indexed by name (and INSDC
* nucleotide accession sans dot if present) and return the hash; otherwise return NULL. */
{
struct sampleMetadataStore *sampleMetadata = NULL;
if (isNotEmpty(metadataFile) && fileExists(metadataFile))
{
AllocVar(sampleMetadata);
if (endsWith(metadataFile, ".mmh"))
sampleMetadata->mmh = mmHashFromFile(metadataFile);
else
getSampleMetadataHash(metadataFile, sampleMetadata);
}
return sampleMetadata;
}
char *gbIdFromSampleName(char *sampleId)
/* If a GenBank accession is present somewhere in sampleId, extract and return it, otherwise NULL. */
{
char *gbId = NULL;
regmatch_t substrs[2];
// If it's preceded by anything, make sure it's a | so we ignore isolate names that glom on the
// reference's GenBank accession in them (e.g. ..._MN908947.3/2022|OQ070230.1).
if (regexMatchSubstr(sampleId, "^(.*\\|)?([A-Z][A-Z][0-9]{6})", substrs, ArraySize(substrs)))
{
// Make sure there is a word boundary at the end of the match too
if (!isalnum(sampleId[substrs[1].rm_eo]))
gbId = cloneStringZ(sampleId+substrs[1].rm_so, substrs[1].rm_eo - substrs[1].rm_so);
}
return gbId;
}
static char **metadataForSampleHash(struct hash *sampleMetadata, char *sampleId)
/* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */
{
char **met = hashFindVal(sampleMetadata, sampleId);
if (met)
return met;
if (!met)
{
char *gbId = gbIdFromSampleName(sampleId);
if (gbId)
met = hashFindVal(sampleMetadata, gbId);
}
if (!met && strchr(sampleId, '|'))
{
char copy[strlen(sampleId)+1];
safecpy(copy, sizeof copy, sampleId);
char *words[4];
int wordCount = chopByChar(copy, '|', words, ArraySize(words));
if (isNotEmpty(words[0]))
met = hashFindVal(sampleMetadata, words[0]);
if (met == NULL && wordCount > 1 && isNotEmpty(words[1]))
met = hashFindVal(sampleMetadata, words[1]);
}
return met;
}
static char **metadataForSampleMmHash(struct sampleMetadataStore *sms, char *sampleId)
/* Look up sampleId in mmHash and parse string value into struct sampleMetadata, or return NULL if
* not found. */
{
char **met = NULL;
const char *metadataLine = mmHashFindVal(sms->mmh, sampleId);
if (metadataLine)
{
if (sms->columnNames == NULL)
{
const char *headerLine = mmHashFindVal(sms->mmh, "strain");
if (headerLine == NULL)
errAbort("metadataForSampleMmHash: can't find column definitions in file "
"(no line starts with 'strain')");
char *headerLineCopy = cloneString(headerLine);
sms->columnCount = chopByChar(headerLineCopy, '\t', NULL, 0);
AllocArray(sms->columnNames, sms->columnCount);
chopByChar(headerLineCopy, '\t', sms->columnNames, sms->columnCount);
}
char *lineCopy = cloneString(metadataLine);
int metWordCount = chopByChar(lineCopy, '\t', NULL, 0);
if (metWordCount != sms->columnCount)
errAbort("metadataForSampleMmHash: found %lu header columns but %d data columns for '%s'"
" ('%s')",
sms->columnCount, metWordCount, sampleId, metadataLine);
AllocArray(met, sms->columnCount);
chopByChar(lineCopy, '\t', met, sms->columnCount);
}
return met;
}
char **metadataForSample(struct sampleMetadataStore *sampleMetadata, char *sampleId)
/* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */
{
if (sampleMetadata == NULL)
return NULL;
if (sampleMetadata->hash)
return metadataForSampleHash(sampleMetadata->hash, sampleId);
else
return metadataForSampleMmHash(sampleMetadata, sampleId);
}
static char *lineageForSample(struct sampleMetadataStore *sms, char *sampleId)
/* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */
{
char *lineage = NULL;
char **met = metadataForSample(sms, sampleId);
if (met)
{
int ix;
if ((ix = stringArrayIx("pangolin_lineage", sms->columnNames, sms->columnCount)) >= 0 ||
(ix = stringArrayIx("pango_lineage", sms->columnNames, sms->columnCount)) >= 0 ||
(ix = stringArrayIx("Nextstrain_lineage", sms->columnNames, sms->columnCount)) >= 0 ||
(ix = stringArrayIx("GCC_nextclade", sms->columnNames, sms->columnCount)) >= 0)
lineage = met[ix];
}
return lineage;
}
static void printLineageLink(char *lineage, char *db)
/* Print lineage with link to outbreak.info or pango-designation issue if appropriate.
* Caller must handle case of empty/NULL lineage. */
{
boolean isSarsCov2 = sameString(db, "wuhCor1");
if (isEmpty(lineage))
errAbort("programmer error:printLineageLink called with empty lineage");
else if (isSarsCov2 && startsWith("proposed", lineage))
printf("%s",
lineage+strlen("proposed"), lineage);
else if (isSarsCov2 && differentString(lineage, "None") && differentString(lineage, "Unassigned"))
{
// Trim _suffix that I add to extra tree annotations for problematic branches, if present.
char lineageCopy[strlen(lineage)+1];
safecpy(lineageCopy, sizeof lineageCopy, lineage);
char *p = strchr(lineageCopy, '_');
if (p != NULL)
*p = '\0';
printf("%s", lineageCopy, lineageCopy);
}
else
printf("%s", lineage);
}
static void maybePrintLineageLink(char *lineage, char *db)
/* If lineage is not empty/NULL, print ": lineage " and link to outbreak.info
* (unless lineage is "None") */
{
if (isNotEmpty(lineage))
{
printf(": lineage ");
printLineageLink(lineage, db);
}
}
static void displayNearestNeighbors(struct placementInfo *info, char *source, char *db)
/* Use info->variantPaths to find sample's nearest neighbor(s) in tree. */
{
if (isNotEmpty(info->nearestNeighbor))
{
printf("
");
}
}
static int placementInfoRefCmpSampleMuts(const void *va, const void *vb)
/* Compare slRef->placementInfo->sampleMuts lists. Shorter lists first. Using alpha sort
* to distinguish between different sampleMuts contents arbitrarily; the purpose is to
* clump samples with identical lists. */
{
struct slRef * const *rra = va;
struct slRef * const *rrb = vb;
struct placementInfo *pa = (*rra)->val;
struct placementInfo *pb = (*rrb)->val;
int diff = slCount(pa->sampleMuts) - slCount(pb->sampleMuts);
if (diff == 0)
{
struct slName *slnA, *slnB;
for (slnA = pa->sampleMuts, slnB = pb->sampleMuts; slnA != NULL;
slnA = slnA->next, slnB = slnB->next)
{
diff = strcmp(slnA->name, slnB->name);
if (diff != 0)
break;
}
}
return diff;
}
static struct slRef *getPlacementRefList(struct slName *sampleIds, struct hash *samplePlacements)
/* Look up sampleIds in samplePlacements and return ref list of placements. */
{
struct slRef *placementRefs = NULL;
struct slName *sample;
for (sample = sampleIds; sample != NULL; sample = sample->next)
{
struct placementInfo *info = hashFindVal(samplePlacements, sample->name);
if (!info)
{
// Usher might have added a prefix to distinguish from a sequence with the same name
// already in the tree.
char nameWithPrefix[strlen(USHER_DEDUP_PREFIX) + strlen(sample->name) + 1];
safef(nameWithPrefix, sizeof nameWithPrefix, "%s%s", USHER_DEDUP_PREFIX, sample->name);
info = hashFindVal(samplePlacements, nameWithPrefix);
if (!info)
errAbort("getPlacementRefList: can't find placement info for sample '%s'",
sample->name);
}
slAddHead(&placementRefs, slRefNew(info));
}
slReverse(&placementRefs);
return placementRefs;
}
static int countIdentical(struct slRef *placementRefs)
/* Return the number of placements that have identical sampleMuts lists. */
{
int clumpCount = 0;
struct slRef *ref;
for (ref = placementRefs; ref != NULL; ref = ref->next)
{
clumpCount++;
if (ref->next == NULL || placementInfoRefCmpSampleMuts(&ref, &ref->next))
break;
}
return clumpCount;
}
static void asciiTree(struct phyloTree *node, char *indent, boolean isLast,
struct dyString *dyLine, struct slPair **pRowList)
/* Until we can make a real graphic, at least print an ascii tree build up a (reversed) list of
* lines so that we can add some text to the right later. */
{
if (isNotEmpty(indent) || isNotEmpty(node->ident->name))
{
if (node->ident->name && !isInternalNodeName(node->ident->name, 0))
{
dyStringPrintf(dyLine, "%s %s", indent, node->ident->name);
slPairAdd(pRowList, node->ident->name, cloneString(dyLine->string));
dyStringClear(dyLine);
}
}
int indentLen = strlen(indent);
char indentForKids[indentLen+1];
safecpy(indentForKids, sizeof indentForKids, indent);
if (indentLen >= 4)
{
if (isLast)
safecpy(indentForKids+indentLen - 4, 4 + 1, " ");
else
safecpy(indentForKids+indentLen - 4, 4 + 1, "| ");
}
if (node->numEdges > 0)
{
char kidIndent[strlen(indent)+5];
safef(kidIndent, sizeof kidIndent, "%s%s", indentForKids, "+---");
int i;
for (i = 0; i < node->numEdges; i++)
asciiTree(node->edges[i], kidIndent, (i == node->numEdges - 1), dyLine, pRowList);
}
}
static void asciiTreeWithNeighborInfo(struct phyloTree *subtree, struct hash *samplePlacements)
/* Print out an ascii tree with nearest neighbor & lineage to the right as suggested by Joe deRisi */
{
struct dyString *dy = dyStringNew(0);
struct slPair *rowList = NULL;
asciiTree(subtree, "", TRUE, dy, &rowList);
slReverse(&rowList);
int maxLen = 0;
struct slPair *row;
for (row = rowList; row != NULL; row = row->next)
{
char *asciiRow = row->val;
int len = strlen(asciiRow);
if (len > maxLen)
maxLen = len;
}
for (row = rowList; row != NULL; row = row->next)
{
char *asciiRow = row->val;
char *neighbor = "?";
char *lineage = "?";
struct placementInfo *info = hashFindVal(samplePlacements, row->name);
if (info)
{
if (isNotEmpty(info->nearestNeighbor))
neighbor = info->nearestNeighbor;
if (isNotEmpty(info->neighborLineage))
lineage = info->neighborLineage;
}
printf("%-*s %s %s\n", maxLen, asciiRow, neighbor, lineage);
}
slNameFreeList(&rowList);
dyStringFree(&dy);
}
static void describeSamplePlacements(struct slName *sampleIds, struct hash *samplePlacements,
struct phyloTree *subtree,
char *source, char *refAcc, char *db)
/* Report how each sample fits into the big tree. */
{
// Sort sample placements by sampleMuts so we can group identical samples.
struct slRef *placementRefs = getPlacementRefList(sampleIds, samplePlacements);
slSort(&placementRefs, placementInfoRefCmpSampleMuts);
int relatedCount = slCount(placementRefs);
int clumpSize = countIdentical(placementRefs);
if (clumpSize < relatedCount && relatedCount > 2)
{
// Not all of the related sequences are identical, so they will be broken down into
// separate "clumps". List all related samples first to avoid confusion.
puts("
\n", info->parsimonyScore);
//#*** TODO: explain parsimony score
}
}
struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds)
/* Prune all descendants of node that have no leaf descendants in sampleIds. */
{
if (node->numEdges)
{
struct phyloTree *prunedKids = NULL;
int i;
for (i = 0; i < node->numEdges; i++)
{
struct phyloTree *kidIntersected = phyloPruneToIds(node->edges[i], sampleIds);
if (kidIntersected)
slAddHead(&prunedKids, kidIntersected);
}
int kidCount = slCount(prunedKids);
assert(kidCount <= node->numEdges);
if (kidCount > 1)
{
slReverse(&prunedKids);
// There is no phyloTreeFree, but if we ever add one, should use it here.
node->numEdges = kidCount;
struct phyloTree *kid;
for (i = 0, kid = prunedKids; i < kidCount; i++, kid = kid->next)
{
node->edges[i] = kid;
kid->parent = node;
}
}
else
return prunedKids;
}
else if (! (node->ident->name && slNameInListUseCase(sampleIds, node->ident->name)))
node = NULL;
return node;
}
// NOTE: it would be more efficient to associate a subtree with each sample after parsing
// and sorting subtrees, e.g. hash and/or store a link in placementInfo so we don't have to search
// subtrees for every sample like this, but this will do for now.
static struct subtreeInfo *subtreeInfoForSample(struct subtreeInfo *subtreeInfoList, char *name,
int *retIx)
/* Find the subtree that contains sample name and set *retIx to its index in the list.
* If we can't find it, return NULL and set *retIx to -1. */
{
struct subtreeInfo *ti;
int ix;
for (ti = subtreeInfoList, ix = 0; ti != NULL; ti = ti->next, ix++)
if (slNameInListUseCase(ti->subtreeUserSampleIds, name))
break;
if (ti == NULL)
ix = -1;
*retIx = ix;
return ti;
}
static void findNearestNeighbors(struct usherResults *results, struct sampleMetadataStore *sms)
/* For each placed sample, find the nearest neighbor in the subtree and its assigned lineage,
* and fill in those two fields of placementInfo. */
{
struct hashCookie cookie = hashFirst(results->samplePlacements);
struct hashEl *hel;
while ((hel = hashNext(&cookie)) != NULL)
{
struct placementInfo *info = hel->val;
int ix;
struct subtreeInfo *ti = subtreeInfoForSample(results->subtreeInfoList, info->sampleId, &ix);
if (!ti)
continue;
info->nearestNeighbor = findNearestNeighbor(ti->subtree, info->sampleId,
results->samplePlacements);
if (isNotEmpty(info->nearestNeighbor))
info->neighborLineage = lineageForSample(sms, info->nearestNeighbor);
}
}
static void lookForCladesAndLineages(struct hash *samplePlacements,
boolean *retGotClades, boolean *retGotLineages)
/* See if UShER has annotated any clades and/or lineages for seqs. */
{
boolean gotClades = FALSE, gotLineages = FALSE;
struct hashEl *hel;
struct hashCookie cookie = hashFirst(samplePlacements);
while ((hel = hashNext(&cookie)) != NULL)
{
struct placementInfo *pi = hel->val;
if (pi)
{
if (isNotEmpty(pi->nextClade))
gotClades = TRUE;
if (isNotEmpty(pi->pangoLineage))
{
gotLineages = TRUE;
}
if (gotClades && gotLineages)
break;
}
}
*retGotClades = gotClades;
*retGotLineages = gotLineages;
}
static char *nextstrainHost()
/* Return the nextstrain hostname from an hg.conf param, or NULL if missing. Do not free result. */
{
return cfgOption("nextstrainHost");
}
static char *nextstrainUrlBase()
/* Alloc & return the part of the nextstrain URL before the JSON filename. */
{
struct dyString *dy = dyStringCreate("%s/fetch/", nextstrainHost());
return dyStringCannibalize(&dy);
}
static char *skipProtocol(char *url)
/* Skip the protocol:// part at the beginning of url if found. Do not free result. */
{
char *protocol = strstr(url, "://");
return protocol ? protocol + strlen("://") : url;
}
static char *nextstrainUrlFromTn(struct tempName *jsonTn)
/* Return a link to Nextstrain to view an annotated subtree. */
{
char *jsonUrlForNextstrain = urlFromTn(jsonTn);
// Nextstrain doesn't accept .json.gz files -- only .json, optionally compressed in HTTPS with
// Content-Encoding: gzip in the headers. Apache can be config'd to serve that up from .json.gz
// files on disk, see https://github.com/nextstrain/nextstrain.org/issues/1058
if (endsWith(jsonUrlForNextstrain, ".json.gz"))
chopSuffix(jsonUrlForNextstrain);
char *urlBase = nextstrainUrlBase();
struct dyString *dy = dyStringCreate("%s%s%s", urlBase, skipProtocol(jsonUrlForNextstrain),
NEXTSTRAIN_URL_PARAMS);
freeMem(jsonUrlForNextstrain);
freeMem(urlBase);
return dyStringCannibalize(&dy);
}
static void makeNextstrainButton(char *id, struct tempName *tn, char *label, char *mouseover)
/* Make a button to view an auspice JSON file in Nextstrain. */
{
char *nextstrainUrl = nextstrainUrlFromTn(tn);
struct dyString *js = dyStringCreate("window.open('%s');", nextstrainUrl);
cgiMakeOnClickButtonWithMsg(id, js->string, label, mouseover);
dyStringFree(&js);
freeMem(nextstrainUrl);
}
static void makeNextstrainButtonN(char *idBase, int ix, int userSampleCount, int subtreeSize,
struct tempName *jsonTns[])
/* Make a button to view one subtree in Nextstrain. idBase is a short string and
* ix is 0-based subtree number. */
{
char buttonId[256];
safef(buttonId, sizeof buttonId, "%s%d", idBase, ix+1);
char buttonLabel[256];
safef(buttonLabel, sizeof buttonLabel, "view subtree %d in Nextstrain", ix+1);
struct dyString *dyMo = dyStringCreate("view subtree %d with %d of your sequences and %d other "
"sequences from the phylogenetic tree for context",
ix+1, userSampleCount, subtreeSize - userSampleCount);
makeNextstrainButton(buttonId, jsonTns[ix], buttonLabel, dyMo->string);
dyStringFree(&dyMo);
}
static void makeNsSingleTreeButton(struct tempName *tn)
/* Make a button to view single subtree (with all uploaded samples) in Nextstrain. */
{
static int serial = 0;
char buttonId[256];
safef(buttonId, sizeof buttonId, "viewNextstrainSingleSubtree%d", serial++);
makeNextstrainButton(buttonId, tn,
"view downsampled global tree in Nextstrain",
"view one subtree that includes all of your uploaded sequences plus "
SINGLE_SUBTREE_SIZE" randomly selected sequences from the global phylogenetic "
"tree for context");
}
char *microbeTraceHost()
/* Return the MicrobeTrace hostname from an hg.conf param, or NULL if missing. Do not free result. */
{
return cfgOption("microbeTraceHost");
}
static char *microbeTraceUrlBase()
/* Alloc & return the part of the MicrobeTrace URL before the JSON filename. */
{
struct dyString *dy = dyStringCreate("%s/MicrobeTrace/?url=", microbeTraceHost());
return dyStringCannibalize(&dy);
}
static char *microbeTraceUrlFromTn(struct tempName *jsonTn)
/* Return a link to MicrobeTrace to view an annotated subtree. */
{
char *jsonUrl = urlFromTn(jsonTn);
char *urlBase = microbeTraceUrlBase();
struct dyString *dy = dyStringCreate("%s%s", urlBase, jsonUrl);
freeMem(jsonUrl);
freeMem(urlBase);
return dyStringCannibalize(&dy);
}
static char *makeSubtreeDropdown(struct subtreeInfo *subtreeInfoList, struct tempName **jsonTns)
/* Let user choose subtree to view */
{
static int serial = 0;
char subtreeDropdownName[128];
safef(subtreeDropdownName, sizeof subtreeDropdownName, "subtreeSelect%d", serial++);
int count = slCount(subtreeInfoList);
char *labels[count];
char *values[count];
struct subtreeInfo *ti;
int ix;
for (ix = 0, ti = subtreeInfoList; ti != NULL; ti = ti->next, ix++)
{
struct dyString *dy = dyStringCreate("subtree %d", ix+1);
labels[ix] = dyStringCannibalize(&dy);
values[ix] = urlFromTn(jsonTns[ix]);
}
cgiMakeDropListWithVals(subtreeDropdownName, labels, values, count, NULL);
for (ix = 0; ix < count; ix++)
{
freeMem(labels[ix]);
}
return cloneString(subtreeDropdownName);
}
static void makeSubtreeJumpButton(char *subtreeDropdownName, char *dest, char *destUrlBase,
char *destUrlParams, boolean skipProtocol, boolean skipGz)
/* Make a button with javascript to get a JSON filename from a dropdown element, format a link
* to dest, and jump to that dest when clicked. */
{
static int serial = 0;
char *mouseover = "view selected subtree with your sequences and other sequences from the "
"full phylogenetic tree for context";
struct dyString *js = dyStringCreate("jsonUrl = document.querySelector('select[name=\"%s\"]').value;"
"if (%d) { ix = jsonUrl.indexOf('://');"
" if (ix >= 0) { jsonUrl = jsonUrl.substr(ix+3); } }"
"if (%d) { ix = jsonUrl.indexOf('.gz');"
" if (ix >= 0) { jsonUrl = jsonUrl.substr(0, ix); } }"
"window.open('%s' + jsonUrl + '%s');",
subtreeDropdownName, skipProtocol, skipGz, destUrlBase, destUrlParams);
struct dyString *id = dyStringCreate("jumpTo%s_%d", dest, serial++);
printf("",
id->string, dest, mouseover);
jsOnEventById("click", id->string, js->string);
dyStringFree(&js);
dyStringFree(&id);
}
static void makeButtonRow(struct tempName *singleSubtreeJsonTn, struct tempName *jsonTns[],
struct subtreeInfo *subtreeInfoList, int subtreeSize, boolean isFasta,
boolean offerCustomTrack)
/* Russ's suggestion: row of buttons at the top to view results in GB, Nextstrain, Nextclade. */
{
puts("
");
puts("
");
if (offerCustomTrack)
{
puts("
");
cgiMakeButtonWithMsg("submit", "view in Genome Browser",
"view your uploaded sequences, their phylogenetic relationship and their "
"mutations along with many other datasets available in the Genome Browser");
puts("
");
}
// SingleSubtree -- only for Nextstrain, not really applicable to MicrobeTrace
if (nextstrainHost())
{
puts("
");
}
// If both Nextstrain and MicrobeTrace are configured then make a subtree dropdown and buttons
// to view in Nextstrain or MicrobeTrace
if (nextstrainHost() && microbeTraceHost())
{
puts("
Size"
TOOLTIP("Length of uploaded sequence in bases, excluding runs of N bases at "
"beginning and/or end")
"
\n
#Ns"
TOOLTIP("Number of 'N' bases in uploaded sequence, excluding runs of N bases at "
"beginning and/or end")
"
");
else
puts("
VCF Sample
\n"
"
#Ns"
TOOLTIP("Number of no-call variants for this sample in uploaded VCF, "
"i.e. '.' used in genotype column")
"
");
puts("
#Mixed"
TOOLTIP("Number of IUPAC ambiguous bases, e.g. 'R' for 'A or G'")
"
");
if (isFasta)
printf("
Bases aligned"
TOOLTIP("Number of bases aligned to reference %s, including "
"matches and mismatches")
"
\n
Inserted bases"
TOOLTIP("Number of bases in aligned portion of uploaded sequence that are not present in "
"reference %s")
"
\n
Deleted bases"
TOOLTIP("Number of bases in reference %s that are not "
"present in aligned portion of uploaded sequence")
"
", refName, refName, refName);
puts("
#SNVs used for placement"
TOOLTIP("Number of single-nucleotide variants in uploaded sample "
"(does not include N's or mixed bases) used by UShER to place sample "
"in phylogenetic tree")
"
\n
#Masked SNVs"
TOOLTIP("Number of single-nucleotide variants in uploaded sample that are masked "
"(not used for placement) because they occur at known "
"Problematic Sites"));;
boolean isRsv = (stringIn("GCF_000855545", db) || stringIn("GCF_002815475", db) ||
startsWith("RGCC", db));
if (gotClades)
{
if (sameString(db, "wuhCor1"))
puts("
\n
Nextstrain clade"
TOOLTIP("The Nextstrain clade assigned to the sample by "
"placement in the tree"));
else if (isRsv)
puts("
Nextstrain lineage"
TOOLTIP("The Nextstrain lineage assigned by "
"placement in the tree"));
}
if (gotLineages)
{
if (isRsv)
puts("
\n
RGCC 2023 clade"
TOOLTIP("The RSV Genotyping Consensus Consortium clade (manuscript in preparation) "
"assigned by placement in the tree"));
else
puts("
\n
Pango lineage"
TOOLTIP("The Pango lineage assigned to the sample by UShER"));
}
puts("
\n
Neighboring sample in tree"
TOOLTIP("A sample already in the tree that is a child of the node at which the uploaded "
"sample was placed, to give an example of a closely related sample")
"
\n
Lineage of neighbor");
if (sameString(db, "wuhCor1"))
puts(TOOLTIP("The "
"Pango lineage assigned by pangolin "
"to the nearest neighboring sample already in the tree"));
else if (isRsv)
puts(TOOLTIP("The RGCC 2023 clade assigned by Nextclade "
"to the nearest neighboring sample already in the tree"));
else
puts(TOOLTIP("The lineage assigned by Nextclade "
"to the nearest neighboring sample already in the tree"));
puts("
\n
#Imputed values for mixed bases"
TOOLTIP("If the uploaded sequence contains mixed/ambiguous bases, then UShER may assign "
"values based on maximum parsimony")
"
\n
#Maximally parsimonious placements"
TOOLTIP("Number of potential placements in the tree with minimal parsimony score; "
"the higher the number, the less confident the placement")
"
\n
Parsimony score"
TOOLTIP("Number of mutations/changes that must be added to the tree when placing the "
"uploaded sample; the higher the number, the more diverged the sample")
"
\n
Subtree number"
TOOLTIP("Sequence number of subtree that contains this sample")
"
");
}
// Default QC thresholds for calling an input sequence excellent/good/fair/bad [/fail]:
static int qcThresholdsMinLength[] = { 29750, 29500, 29000, 28000 };
static int qcThresholdsMaxNs[] = { 0, 5, 20, 100 };
static int qcThresholdsMaxMixed[] = { 0, 5, 20, 100 };
static int qcThresholdsMaxIndel[] = { 9, 18, 24, 36 };
static int qcThresholdsMaxSNVs[] = { 25, 35, 45, 55 };
static int qcThresholdsMaxMaskedSNVs[] = { 0, 1, 2, 3 };
static int qcThresholdsMaxImputed[] = { 0, 5, 20, 100 };
static int qcThresholdsMaxPlacements[] = { 1, 2, 3, 4 };
static int qcThresholdsMaxPScore[] = { 0, 2, 5, 10 };
static void wordsToQcThresholds(char **words, int *thresholds)
/* Parse words from file into thresholds array. Caller must ensure words and thresholds each
* have 4 items. */
{
int i;
for (i = 0; i < 4; i++)
thresholds[i] = atoi(words[i]);
}
static void readQcThresholds(char *org, char *ref)
/* If config.ra specifies a file with QC thresholds for excellent/good/fair/bad [/fail],
* parse it and replace the default values in qcThresholds arrays. */
{
char *qcThresholdsFile = phyloPlaceRefSettingPath(org, ref, "qcThresholds");
if (isNotEmpty(qcThresholdsFile))
{
if (fileExists(qcThresholdsFile))
{
struct lineFile *lf = lineFileOpen(qcThresholdsFile, TRUE);
char *line;
while (lineFileNext(lf, &line, NULL))
{
char *words[16];
int wordCount = chopTabs(line, words);
lineFileExpectWords(lf, 5, wordCount);
if (sameWord(words[0], "length"))
wordsToQcThresholds(words+1, qcThresholdsMinLength);
else if (sameWord(words[0], "nCount"))
wordsToQcThresholds(words+1, qcThresholdsMaxNs);
else if (sameWord(words[0], "mixedCount"))
wordsToQcThresholds(words+1, qcThresholdsMaxMixed);
else if (sameWord(words[0], "indelCount"))
wordsToQcThresholds(words+1, qcThresholdsMaxIndel);
else if (sameWord(words[0], "snvCount"))
wordsToQcThresholds(words+1, qcThresholdsMaxSNVs);
else if (sameWord(words[0], "maskedSnvCount"))
wordsToQcThresholds(words+1, qcThresholdsMaxMaskedSNVs);
else if (sameWord(words[0], "imputedBases"))
wordsToQcThresholds(words+1, qcThresholdsMaxImputed);
else if (sameWord(words[0], "placementCount"))
wordsToQcThresholds(words+1, qcThresholdsMaxPlacements);
else if (sameWord(words[0], "parsimony"))
wordsToQcThresholds(words+1, qcThresholdsMaxPScore);
else
warn("qcThresholds file %s: unrecognized parameter '%s', skipping",
qcThresholdsFile, words[0]);
}
lineFileClose(&lf);
}
else
warn("qcThresholds %s: file not found", qcThresholdsFile);
}
}
static char *qcClassForIntMin(int n, int thresholds[])
/* Return {qcExcellent, qcGood, qcMeh, qcBad or qcFail} depending on how n compares to the
* thresholds. Don't free result. */
{
if (n >= thresholds[0])
return "qcExcellent";
else if (n >= thresholds[1])
return "qcGood";
else if (n >= thresholds[2])
return "qcMeh";
else if (n >= thresholds[3])
return "qcBad";
else
return "qcFail";
}
static char *qcClassForLength(int length)
/* Return qc class for length of sequence. */
{
return qcClassForIntMin(length, qcThresholdsMinLength);
}
static char *qcClassForIntMax(int n, int thresholds[])
/* Return {qcExcellent, qcGood, qcMeh, qcBad or qcFail} depending on how n compares to the
* thresholds. Don't free result. */
{
if (n <= thresholds[0])
return "qcExcellent";
else if (n <= thresholds[1])
return "qcGood";
else if (n <= thresholds[2])
return "qcMeh";
else if (n <= thresholds[3])
return "qcBad";
else
return "qcFail";
}
static char *qcClassForNs(int nCount)
/* Return qc class for #Ns in sample. */
{
return qcClassForIntMax(nCount, qcThresholdsMaxNs);
}
static char *qcClassForMixed(int mixedCount)
/* Return qc class for #ambiguous bases in sample. */
{
return qcClassForIntMax(mixedCount, qcThresholdsMaxMixed);
}
static char *qcClassForIndel(int indelCount)
/* Return qc class for #inserted or deleted bases. */
{
return qcClassForIntMax(indelCount, qcThresholdsMaxIndel);
}
static char *qcClassForSNVs(int snvCount)
/* Return qc class for #SNVs in sample. */
{
return qcClassForIntMax(snvCount, qcThresholdsMaxSNVs);
}
static char *qcClassForMaskedSNVs(int maskedCount)
/* Return qc class for #SNVs at problematic sites. */
{
return qcClassForIntMax(maskedCount, qcThresholdsMaxMaskedSNVs);
}
static char *qcClassForImputedBases(int imputedCount)
/* Return qc class for #ambiguous bases for which UShER imputed values based on placement. */
{
return qcClassForIntMax(imputedCount, qcThresholdsMaxImputed);
}
static char *qcClassForPlacements(int placementCount)
/* Return qc class for number of equally parsimonious placements. */
{
return qcClassForIntMax(placementCount, qcThresholdsMaxPlacements);
}
static char *qcClassForPScore(int parsimonyScore)
/* Return qc class for parsimonyScore. */
{
return qcClassForIntMax(parsimonyScore, qcThresholdsMaxPScore);
}
static void printTooltip(char *text)
/* Print a tooltip with explanatory text. */
{
printf(TOOLTIP("%s"), text);
}
static void appendExcludingNs(struct dyString *dy, struct seqInfo *si)
/* Append a note to dy about how many N bases and start and/or end are excluded from statistic. */
{
dyStringAppend(dy, "excluding ");
if (si->nCountStart)
dyStringPrintf(dy, "%d N bases at start", si->nCountStart);
if (si->nCountStart && si->nCountEnd)
dyStringAppend(dy, " and ");
if (si->nCountEnd)
dyStringPrintf(dy, "%d N bases at end", si->nCountEnd);
}
static void printLineageTd(char *lineage, char *alt, char *db)
/* Print a table cell with lineage (& link to outbreak.info if not 'None') or alt if no lineage. */
{
if (isEmpty(lineage))
printf("
%s
", alt);
else
{
printf("
");
printLineageLink(lineage, db);
printf("
");
}
}
static void printSubtreeTd(struct subtreeInfo *subtreeInfoList, struct tempName *jsonTns[],
char *seqName, int subtreeSize)
/* Print a table cell with subtree (& link if possible) if found. */
{
int ix;
struct subtreeInfo *ti = subtreeInfoForSample(subtreeInfoList, seqName, &ix);
if (ix < 0)
//#*** Probably an error.
printf("
");
}
static int subTreeInfoUserSampleCmp(const void *pa, const void *pb)
/* Compare subtreeInfo by number of user sample IDs (highest number first). */
{
struct subtreeInfo *tiA = *(struct subtreeInfo **)pa;
struct subtreeInfo *tiB = *(struct subtreeInfo **)pb;
return slCount(tiB->subtreeUserSampleIds) - slCount(tiA->subtreeUserSampleIds);
}
static void getProtobufMetadataSource(struct treeChoices *treeChoices,
char *protobufFile, char **retProtobufPath,
char **retMetadataFile, char **retSource,
char **retAliasFile, char **retSampleNameFile)
/* If the config file specifies a list of tree choices, and protobufFile is a valid choice, then
* set ret* to the files associated with that choice. Otherwise fall back on older conf settings.
* Return the selected treeChoice if there is one. */
{
if (treeChoices)
{
*retProtobufPath = protobufFile;
if (isEmpty(*retProtobufPath))
*retProtobufPath = treeChoices->protobufFiles[0];
int i;
for (i = 0; i < treeChoices->count; i++)
if (sameString(treeChoices->protobufFiles[i], *retProtobufPath))
{
*retMetadataFile = treeChoices->metadataFiles[i];
*retSource = treeChoices->sources[i];
*retAliasFile = treeChoices->aliasFiles[i];
*retSampleNameFile = treeChoices->sampleNameFiles[i];
break;
}
if (i == treeChoices->count)
{
*retProtobufPath = treeChoices->protobufFiles[0];
*retMetadataFile = treeChoices->metadataFiles[0];
*retSource = treeChoices->sources[0];
*retAliasFile = treeChoices->aliasFiles[0];
*retSampleNameFile = treeChoices->sampleNameFiles[0];
}
}
else
{
errAbort("getProtobufMetadataSource: treeChoices is NULL (missing protobufs.tab file?)");
}
}
static void addModVersion(struct hash *nameHash, char *id, char *fullName)
/* Map id to fullName. If id has .version, strip that (modifying id!) and map versionless id
* to fullName. */
{
hashAdd(nameHash, id, fullName);
char *p = strchr(id, '.');
if (p)
{
*p = '\0';
hashAdd(nameHash, id, fullName);
}
}
static void maybeAddIsolate(struct hash *nameHash, char *name, char *fullName)
/* If name looks like country/isolate/year and isolate looks sufficiently distinctive
* then also map isolate to fullName. */
{
char *firstSlash = strchr(name, '/');
char *lastSlash = strrchr(name, '/');
if (firstSlash && lastSlash && (lastSlash - firstSlash) >= 4)
{
int len = strlen(name);
char isolate[len+1];
safencpy(isolate, sizeof isolate, firstSlash+1, lastSlash - firstSlash - 1);
hashAdd(nameHash, isolate, fullName);
}
}
static void addNameMaybeComponents(struct hash *nameHash, char *fullName, boolean addComponents)
/* Add entries to nameHash mapping fullName to itself, and components of fullName to fullName.
* If addComponents and fullName is |-separated name|ID|date, add name and ID to nameHash. */
{
char *fullNameHashStored = hashStoreName(nameHash, fullName);
// Now that we have hash storage for fullName, make it point to itself.
struct hashEl *hel = hashLookup(nameHash, fullName);
if (hel == NULL)
errAbort("Can't look up '%s' right after adding it", fullName);
hel->val = fullNameHashStored;
if (addComponents)
{
char *words[4];
char copy[strlen(fullName)+1];
safecpy(copy, sizeof copy, fullName);
int wordCount = chopByChar(copy, '|', words, ArraySize(words));
if (wordCount == 3)
{
// name|ID|date
hashAdd(nameHash, words[0], fullNameHashStored);
maybeAddIsolate(nameHash, words[0], fullNameHashStored);
addModVersion(nameHash, words[1], fullNameHashStored);
}
else if (wordCount == 2)
{
// ID|date
addModVersion(nameHash, words[0], fullNameHashStored);
// ID might be a COG-UK country/isolate/year
maybeAddIsolate(nameHash, words[0], fullNameHashStored);
}
}
}
static void rAddLeafNames(struct phyloTree *node, struct hash *condensedNodes, struct hash *nameHash,
boolean addComponents)
/* Recursively descend tree, adding leaf node names to nameHash (including all names of condensed
* leaf nodes). Also map components of full names (country/isolate/year|ID|date) to full names. */
{
if (node->numEdges == 0)
{
char *leafName = node->ident->name;
struct slName *nodeList = hashFindVal(condensedNodes, leafName);
if (nodeList)
{
struct slName *sample;
for (sample = nodeList; sample != NULL; sample = sample->next)
addNameMaybeComponents(nameHash, sample->name, addComponents);
}
else
addNameMaybeComponents(nameHash, leafName, addComponents);
}
else
{
int i;
for (i = 0; i < node->numEdges; i++)
rAddLeafNames(node->edges[i], condensedNodes, nameHash, addComponents);
}
}
static void addAliases(struct hash *nameHash, char *aliasFile)
/* If there is an aliasFile, then add its mappings of ID/alias to full tree name to nameHash. */
{
if (isNotEmpty(aliasFile) && fileExists(aliasFile))
{
struct lineFile *lf = lineFileOpen(aliasFile, TRUE);
char *missExample = NULL;
char *line;
while (lineFileNextReal(lf, &line))
{
char *words[3];
int wordCount = chopTabs(line, words);
lineFileExpectWords(lf, 2, wordCount);
char *fullName = hashFindVal(nameHash, words[1]);
if (fullName)
hashAdd(nameHash, words[0], fullName);
else
{
if (missExample == NULL)
missExample = cloneString(words[1]);
}
}
lineFileClose(&lf);
}
}
static struct hash *getTreeNamesHash(char *nameFile, char *protobufFile,
struct mutationAnnotatedTree **pBigTree,
boolean addComponents, int *pStartTime)
/* Make a hash of full names of leaves of bigTree, using nameFile if it exists, otherwise
* falling back on the bigTree itself, loading it if necessary. If addComponents, also map
* components of those names to the full names in case the user gives us partial names/IDs. */
{
struct hash *nameHash = NULL;
if (isNotEmpty(nameFile) && fileExists(nameFile))
{
nameHash = hashNew(26);
struct lineFile *lf = lineFileOpen(nameFile, TRUE);
char *line;
while (lineFileNext(lf, &line, NULL))
addNameMaybeComponents(nameHash, line, addComponents);
lineFileClose(&lf);
}
else
{
if (*pBigTree == NULL)
{
*pBigTree = parseParsimonyProtobuf(protobufFile);
reportTiming(pStartTime, "parse protobuf file (at startup, because sample names file was "
"not provided)");
}
struct mutationAnnotatedTree *bigTree = *pBigTree;
int nodeCount = bigTree->nodeHash->elCount;
nameHash = hashNew(digitsBaseTwo(nodeCount) + 3);
rAddLeafNames(bigTree->tree, bigTree->condensedNodes, nameHash, addComponents);
}
return nameHash;
}
static struct hashOrMmHash *getTreeNames(char *nameFile, char *protobufFile,
struct mutationAnnotatedTree **pBigTree,
boolean addComponents, int *pStartTime)
/* Make a hash of full names of leaves of bigTree, using nameFile if it exists, otherwise
* falling back on the bigTree itself, loading it if necessary. If addComponents, also map
* components of those names to the full names in case the user gives us partial names/IDs. */
{
struct hashOrMmHash *treeNames = NULL;
AllocVar(treeNames);
if (isNotEmpty(nameFile) && fileExists(nameFile) && endsWith(nameFile, ".mmh"))
treeNames->mmh = mmHashFromFile(nameFile);
else
treeNames->hash = getTreeNamesHash(nameFile, protobufFile, pBigTree, addComponents, pStartTime);
reportTiming(pStartTime, "get tree names");
return treeNames;
}
static const char *hashOrMmHashFindVal(struct hashOrMmHash *homh, char *key)
/* Look up string value for key in homh; return NULL if not found. */
{
const char *val = NULL;
if (homh)
{
if (homh->hash)
val = hashFindVal(homh->hash, key);
else
val = mmHashFindVal(homh->mmh, key);
}
return val;
}
static const char *matchName(struct hashOrMmHash *nameHash, char *name)
/* Look for a possibly partial name or ID provided by the user in nameHash. Return the result,
* possibly NULL. If the full name doesn't match, try components of the name. */
{
name = trimSpaces(name);
// GISAID fasta headers all have hCoV-19/country/isolate/year|EPI_ISL_#|date; strip the hCoV-19
// because Nextstrain strips it in nextmeta/nextfasta download files, and so do I when building
// UCSC's tree.
if (startsWithNoCase("hCoV-19/", name))
name += strlen("hCoV-19/");
const char *match = hashOrMmHashFindVal(nameHash, name);
int minWordSize=5;
if (match == NULL && strchr(name, '|'))
{
// GISAID fasta headers have name|ID|date, and so do our tree IDs; try ID and name separately.
char *words[4];
char copy[strlen(name)+1];
safecpy(copy, sizeof copy, name);
int wordCount = chopByChar(copy, '|', words, ArraySize(words));
if (wordCount == 3)
{
// name|ID|date; try ID first.
if (strlen(words[1]) > minWordSize)
match = hashOrMmHashFindVal(nameHash, words[1]);
if (match == NULL && strlen(words[0]) > minWordSize)
{
match = hashOrMmHashFindVal(nameHash, words[0]);
// Sometimes country/isolate names have spaces... strip out if present.
if (match == NULL && strchr(words[0], ' '))
{
stripChar(words[0], ' ');
match = hashOrMmHashFindVal(nameHash, words[0]);
}
}
}
else if (wordCount == 2)
{
// ID|date
if (strlen(words[0]) > minWordSize)
match = hashOrMmHashFindVal(nameHash, words[0]);
}
}
else if (match == NULL && strchr(name, ' '))
{
// GISAID sequence names may include spaces, in both country names ("South Korea") and
// isolate names. That messes up FASTA headers, so Nextstrain strips out spaces when
// making the nextmeta and nextfasta download files for GISAID. Try stripping out spaces:
char copy[strlen(name)+1];
safecpy(copy, sizeof copy, name);
stripChar(copy, ' ');
match = hashOrMmHashFindVal(nameHash, copy);
}
return match;
}
static boolean tallyMatch(const char *match, char *term,
struct slName **retMatches, struct slName **retUnmatched)
/* If match is non-NULL, add result to retMatches and return TRUE, otherwise add term to
* retUnmatched and return FALSE. */
{
boolean foundIt = FALSE;
if (match)
{
foundIt = TRUE;
slNameAddHead(retMatches, match);
}
else
slNameAddHead(retUnmatched, term);
return foundIt;
}
static boolean matchIdRange(struct hashOrMmHash *nameHash, char *line,
struct slName **retMatches, struct slName **retUnmatched)
/* If line looks like it might contain a range of IDs, for example EPI_ISL_123-129 from an EPI_SET,
* then expand the range(s) into individual IDs, look up the IDs, set retMatches and retUnmatched
* to per-ID results, and return TRUE. */
{
boolean foundAny = FALSE;
*retMatches = *retUnmatched = NULL;
regmatch_t substrArr[7];
// Line may contain a list of distinct IDs and/or ID ranges
#define oneIdExp "([A-Z_]+)([0-9]+)"
#define rangeEndExp "- *([A-Z_]*)([0-9]+)"
#define rangeListExp "^("oneIdExp",? *("rangeEndExp")?),? *"
while (regexMatchSubstr(line, rangeListExp, substrArr, ArraySize(substrArr)))
{
char *prefixA = regexSubstringClone(line, substrArr[2]);
char *numberA = regexSubstringClone(line, substrArr[3]);
if (regexSubstrMatched(substrArr[4]))
{
// Looks like a well-formed ID range
char *prefixB = regexSubstringClone(line, substrArr[5]);
char *numberB = regexSubstringClone(line, substrArr[6]);
int start = atol(numberA);
int end = atol(numberB);
if ((isEmpty(prefixB) || sameString(prefixA, prefixB)) && end >= start)
{
char oneId[strlen(line)+1];
int num;
for (num = start; num <= end; num++)
{
safef(oneId, sizeof oneId, "%s%d", prefixA, num);
const char *match = hashOrMmHashFindVal(nameHash, oneId);
foundAny |= tallyMatch(match, oneId, retMatches, retUnmatched);
}
}
else
{
// It matched the regex but the prefixes don't match and/or numbers are out of order
// so we don't know what to do with it -- try matchName just in case.
char *regMatch = regexSubstringClone(line, substrArr[1]);
const char *match = matchName(nameHash, regMatch);
foundAny |= tallyMatch(match, regMatch, retMatches, retUnmatched);
}
}
else
{
// Just one ID
char oneId[strlen(line)+1];
safef(oneId, sizeof oneId, "%s%s", prefixA, numberA);
const char *match = hashOrMmHashFindVal(nameHash, oneId);
foundAny |= tallyMatch(match, oneId, retMatches, retUnmatched);
}
// Skip past this match to see if the line has another range next.
line += (substrArr[0].rm_eo - substrArr[0].rm_so);
}
return foundAny;
}
static struct slName *readSampleIds(struct lineFile *lf, struct hashOrMmHash *nameHash,
struct slName **retUnmatched)
/* Read a file of sample names/IDs from the user; typically these will not be exactly the same
* as the protobuf's (UCSC protobuf names are typically country/isolate/year|ID|date), so attempt
* to find component matches if an exact match isn't found. */
{
struct slName *sampleIds = NULL;
struct slName *unmatched = NULL;
char *line;
while (lineFileNext(lf, &line, NULL))
{
// If tab-sep, just try first word in line
char *tab = strchr(line, '\t');
if (tab)
*tab = '\0';
const char *match = matchName(nameHash, line);
if (match)
slNameAddHead(&sampleIds, match);
else
{
struct slName *rangeMatches = NULL, *rangeUnmatched = NULL;
if (matchIdRange(nameHash, line, &rangeMatches, &rangeUnmatched))
{
sampleIds = slCat(rangeMatches, sampleIds);
unmatched = slCat(rangeUnmatched, unmatched);
}
else
{
// If comma-sep, just try first word in line
char *comma = strchr(line, ',');
if (comma)
{
*comma = '\0';
match = matchName(nameHash, line);
if (match)
slNameAddHead(&sampleIds, match);
else
slNameAddHead(&unmatched, line);
}
else
slNameAddHead(&unmatched, line);
}
}
}
if (retUnmatched != NULL)
*retUnmatched = unmatched;
else
slNameFreeList(&unmatched);
return sampleIds;
}
static void reportUnmatched(struct slName *unmatched, boolean foundNone)
/* Warn the user that some of their names/IDs could not be found in the tree. If it's a long list,
* show only the first handful. If nothing was unmatched but nothing was found either, warn about
* empty input. */
{
if (unmatched)
{
struct dyString *firstFew = dyStringNew(0);
int maxExamples = 5;
struct slName *example;
int i;
for (i = 0, example = unmatched; example != NULL && i < maxExamples;
i++, example = example->next)
{
dyStringAppendSep(firstFew, ", ");
dyStringPrintf(firstFew, "'%s'", example->name);
}
warn("Unable to find %d of your sequence names/IDs, e.g. %s",
slCount(unmatched), firstFew->string);
dyStringFree(&firstFew);
}
else if (foundNone)
warn("Could not find any names in input; empty file uploaded?");
}
void *loadMetadataWorker(void *arg)
/* pthread worker function to read a tab-sep metadata file and return it hashed by name. */
{
char *metadataFile = arg;
int startTime = clock1000();
struct sampleMetadataStore *sampleMetadata = getSampleMetadata(metadataFile);
reportTiming(&startTime, "read sample metadata (in a pthread)");
return sampleMetadata;
}
static pthread_t *mayStartLoaderPthread(char *filename, void *(*workerFunction)(void *))
/* Fork off a child process that parses a file and returns the resulting data structure. */
{
pthread_t *pt;
AllocVar(pt);
if (! pthreadMayCreate(pt, NULL, workerFunction, filename))
pt = NULL;
return pt;
}
static struct dnaSeq *getChromSeq(char *org, char *ref, char *db)
/* Get the reference sequence for ref, using a .2bit file if configured,
* otherwise hdb lib functions (requires ref to be the same as db). */
{
char *twoBitName = phyloPlaceOrgSettingPath(org, "twoBitFile");
struct dnaSeq *seq = NULL;
if (isNotEmpty(twoBitName) && fileExists(twoBitName))
{
char *chrom = ref;
struct slName *seqNames = twoBitSeqNames(twoBitName);
if (!slNameFind(seqNames, chrom))
chrom = seqNames->name;
struct twoBitFile *tbf = twoBitOpen(twoBitName);
seq = twoBitReadSeqFrag(tbf, chrom, 0, 0);
// Convert to lower case so genoFind doesn't index it as containing no tiles.
tolowers(seq->dna);
twoBitClose(&tbf);
slNameFreeList(&seqNames);
}
else if (sameString(db, ref))
{
char *chrom = phyloPlaceRefSetting(org, ref, "chrom");
if (isEmpty(chrom))
chrom = hDefaultChrom(db);
seq = hChromSeq(db, chrom, 0, hChromSize(db, chrom));
}
else
errAbort("No twoBitFile or db/hub found for %s", ref);
return seq;
}
static struct phyloTree *uploadedSamplesTree(char *singleSubtreeFile, struct slName *sampleIds)
/* If the user uploaded enough samples to make a meaningful tree, then read in singleSubtreeFile
* and prune all nodes that have no leaf descendants in sampleIds to get the tree of only the
* uploaded samples. */
{
struct phyloTree *tree = NULL;
if (slCount(sampleIds) >= minSamplesForOwnTree)
{
tree = phyloOpenTree(singleSubtreeFile);
tree = phyloPruneToIds(tree, sampleIds);
}
return tree;
}
static void saveTrashFile(struct tempName *tn)
/* If hg.conf specifies a long-term storage place for user data, then save tn there and make its
* original location a symbolic link to the new location. Note: tn has "hgPhyloPlace" in the path
* already, and we're not associating these with hgLogin user IDs, so there's no need to build
* the path up past sessionDataDir (unlike in hgSession). */
{
char *sessionDataDir = cfgOption("sessionDataDir");
if (isNotEmpty(sessionDataDir))
{
sessionDataSaveTrashFile(tn->forCgi, sessionDataDir);
}
}
static void phyloPlaceSamplesOneRef(struct lineFile *lf, char *db, char *org,
char *refName, char *defaultProtobuf,
boolean doMeasureTiming, int subtreeSize,
struct trackLayout *tl, struct cart *cart, boolean *retSuccess)
/* Given a lineFile that contains either FASTA, VCF, or a list of sequence names/ids:
* If FASTA/VCF, then prepare VCF for usher; if that goes well then run usher, report results,
* make custom track files and return the top-level custom track file.
* If list of seq names/ids, then attempt to find their full names in the protobuf, run matUtils
* to make subtrees, show subtree results, and return NULL. Set retSuccess to TRUE if we were
* able to get at least some results for the user's input. */
{
char *ctFile = NULL;
measureTiming = doMeasureTiming;
int startTime = clock1000();
struct tempName *vcfTn = NULL;
struct slName *sampleIds = NULL;
char *usherPath = getUsherPath(TRUE);
char *protobufPath = NULL;
char *source = NULL;
char *metadataFile = NULL;
char *aliasFile = NULL;
char *sampleNameFile = NULL;
struct treeChoices *treeChoices = loadTreeChoices(org, refName);
getProtobufMetadataSource(treeChoices, defaultProtobuf,
&protobufPath, &metadataFile, &source, &aliasFile, &sampleNameFile);
reportTiming(&startTime, "start up and find the tree etc. files");
struct mutationAnnotatedTree *bigTree = NULL;
lineFileCarefulNewlines(lf);
struct dnaSeq *refGenome = getChromSeq(org, refName, db);
struct slName **maskSites = getProblematicSites(org, refName, refGenome->name, refGenome->size);
//#*** TODO: add CGI param option for this almost-never-needed tweak:
if (0)
{
bigTree = parseParsimonyProtobuf(protobufPath);
reportTiming(&startTime, "parse protobuf file (at startup, for excluding informativeBases "
"from maskSites)");
informativeBasesFromTree(bigTree->tree, maskSites, refGenome->size);
reportTiming(&startTime, "remove any informative bases in tree from maskSites");
}
boolean isFasta = FALSE;
boolean subtreesOnly = FALSE;
struct seqInfo *seqInfoList = NULL;
if (lfLooksLikeFasta(lf))
{
struct slPair *failedSeqs;
struct slPair *failedPsls;
struct hashOrMmHash *treeNames = NULL;
// We need to check uploaded names in fasta only for original usher, not usher-sampled(-server).
if (!serverIsConfigured(org) && !endsWith(usherPath, "-sampled"))
treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, FALSE, &startTime);
vcfTn = vcfFromFasta(lf, org, refName, refGenome, maskSites, treeNames,
&sampleIds, &seqInfoList, &failedSeqs, &failedPsls, &startTime);
if (failedSeqs)
{
puts("
");
}
int refCount = slCount(refFiles);
boolean doNav = (refCount > 1);
struct refMatch *ref;
if (doNav)
{
// Make some navigation links at the top
puts("");
printf("
Your uploaded sequences matched %d different reference sequences. "
"Click on these links to jump to the results for each reference.\n",
refCount);
puts("