7c1221c572248594a25af2c26e99c123415efe26
angie
Tue Feb 23 11:38:29 2021 -0800
The hardcoded QC thresholds for the green/yellow/red coloring in the summary table are out of date. Loosen the defaults a bit to reflect what is now a normal amount of difference from the reference, and add a config file to make them easily adjustable in the future.
diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index 2e5feaf..00013b9 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -1146,114 +1146,180 @@
"Pangolin lineage assigned to the nearest neighboring sample already in the tree")
"\n
#Imputed values for mixed bases"
TOOLTIP("If the uploaded sequence contains mixed/ambiguous bases, then UShER may assign "
"values based on maximum parsimony")
" | \n#Maximally parsimonious placements"
TOOLTIP("Number of potential placements in the tree with minimal parsimony score; "
"the higher the number, the less confident the placement")
" | \nParsimony score"
TOOLTIP("Number of mutations/changes that must be added to the tree when placing the "
"uploaded sample; the higher the number, the more diverged the sample")
" | \nSubtree number"
TOOLTIP("Sequence number of subtree that contains this sample")
" | ");
}
-static char *qcClassForIntMin(int n, int minExc, int minGood, int minMeh, int minBad)
+
+// Default QC thresholds for calling an input sequence excellent/good/fair/bad [/fail]:
+static int qcThresholdsMinLength[] = { 29750, 29500, 29000, 28000 };
+static int qcThresholdsMaxNs[] = { 0, 5, 20, 100 };
+static int qcThresholdsMaxMixed[] = { 0, 5, 20, 100 };
+static int qcThresholdsMaxIndel[] = { 9, 18, 24, 36 };
+static int qcThresholdsMaxSNVs[] = { 25, 35, 45, 55 };
+static int qcThresholdsMaxMaskedSNVs[] = { 0, 1, 2, 3 };
+static int qcThresholdsMaxImputed[] = { 0, 5, 20, 100 };
+static int qcThresholdsMaxPlacements[] = { 1, 2, 3, 4 };
+static int qcThresholdsMaxPScore[] = { 0, 2, 5, 10 };
+
+static void wordsToQcThresholds(char **words, int *thresholds)
+/* Parse words from file into thresholds array. Caller must ensure words and thresholds each
+ * have 4 items. */
+{
+int i;
+for (i = 0; i < 4; i++)
+ thresholds[i] = atoi(words[i]);
+}
+
+static void readQcThresholds(char *db)
+/* If config.ra specifies a file with QC thresholds for excellent/good/fair/bad [/fail],
+ * parse it and replace the default values in qcThresholds arrays. */
+{
+char *qcThresholdsFile = phyloPlaceDbSettingPath(db, "qcThresholds");
+if (isNotEmpty(qcThresholdsFile))
+ {
+ if (fileExists(qcThresholdsFile))
+ {
+ struct lineFile *lf = lineFileOpen(qcThresholdsFile, TRUE);
+ char *line;
+ while (lineFileNext(lf, &line, NULL))
+ {
+ char *words[16];
+ int wordCount = chopTabs(line, words);
+ lineFileExpectWords(lf, 5, wordCount);
+ if (sameWord(words[0], "length"))
+ wordsToQcThresholds(words+1, qcThresholdsMinLength);
+ else if (sameWord(words[0], "nCount"))
+ wordsToQcThresholds(words+1, qcThresholdsMaxNs);
+ else if (sameWord(words[0], "mixedCount"))
+ wordsToQcThresholds(words+1, qcThresholdsMaxMixed);
+ else if (sameWord(words[0], "indelCount"))
+ wordsToQcThresholds(words+1, qcThresholdsMaxIndel);
+ else if (sameWord(words[0], "snvCount"))
+ wordsToQcThresholds(words+1, qcThresholdsMaxSNVs);
+ else if (sameWord(words[0], "maskedSnvCount"))
+ wordsToQcThresholds(words+1, qcThresholdsMaxMaskedSNVs);
+ else if (sameWord(words[0], "imputedBases"))
+ wordsToQcThresholds(words+1, qcThresholdsMaxImputed);
+ else if (sameWord(words[0], "placementCount"))
+ wordsToQcThresholds(words+1, qcThresholdsMaxPlacements);
+ else if (sameWord(words[0], "parsimony"))
+ wordsToQcThresholds(words+1, qcThresholdsMaxPScore);
+ else
+ warn("qcThresholds file %s: unrecognized parameter '%s', skipping",
+ qcThresholdsFile, words[0]);
+ }
+ lineFileClose(&lf);
+ }
+ else
+ warn("qcThresholds %s: file not found", qcThresholdsFile);
+ }
+}
+
+static char *qcClassForIntMin(int n, int thresholds[])
/* Return {qcExcellent, qcGood, qcMeh, qcBad or qcFail} depending on how n compares to the
* thresholds. Don't free result. */
{
-if (n >= minExc)
+if (n >= thresholds[0])
return "qcExcellent";
-else if (n >= minGood)
+else if (n >= thresholds[1])
return "qcGood";
-else if (n >= minMeh)
+else if (n >= thresholds[2])
return "qcMeh";
-else if (n >= minBad)
+else if (n >= thresholds[3])
return "qcBad";
else
return "qcFail";
}
static char *qcClassForLength(int length)
/* Return qc class for length of sequence. */
{
-return qcClassForIntMin(length, 29750, 29500, 29000, 28000);
+return qcClassForIntMin(length, qcThresholdsMinLength);
}
-static char *qcClassForIntMax(int n, int maxExc, int maxGood, int maxMeh, int maxBad)
+static char *qcClassForIntMax(int n, int thresholds[])
/* Return {qcExcellent, qcGood, qcMeh, qcBad or qcFail} depending on how n compares to the
* thresholds. Don't free result. */
{
-if (n <= maxExc)
+if (n <= thresholds[0])
return "qcExcellent";
-else if (n <= maxGood)
+else if (n <= thresholds[1])
return "qcGood";
-else if (n <= maxMeh)
+else if (n <= thresholds[2])
return "qcMeh";
-else if (n <= maxBad)
+else if (n <= thresholds[3])
return "qcBad";
else
return "qcFail";
}
static char *qcClassForNs(int nCount)
/* Return qc class for #Ns in sample. */
{
-return qcClassForIntMax(nCount, 0, 5, 20, 100);
+return qcClassForIntMax(nCount, qcThresholdsMaxNs);
}
static char *qcClassForMixed(int mixedCount)
/* Return qc class for #ambiguous bases in sample. */
{
-return qcClassForIntMax(mixedCount, 0, 5, 20, 100);
+return qcClassForIntMax(mixedCount, qcThresholdsMaxMixed);
}
static char *qcClassForIndel(int indelCount)
/* Return qc class for #inserted or deleted bases. */
{
-return qcClassForIntMax(indelCount, 0, 2, 5, 10);
+return qcClassForIntMax(indelCount, qcThresholdsMaxIndel);
}
static char *qcClassForSNVs(int snvCount)
/* Return qc class for #SNVs in sample. */
{
-return qcClassForIntMax(snvCount, 10, 20, 30, 40);
+return qcClassForIntMax(snvCount, qcThresholdsMaxSNVs);
}
static char *qcClassForMaskedSNVs(int maskedCount)
/* Return qc class for #SNVs at problematic sites. */
{
-return qcClassForIntMax(maskedCount, 0, 1, 2, 3);
+return qcClassForIntMax(maskedCount, qcThresholdsMaxMaskedSNVs);
}
static char *qcClassForImputedBases(int imputedCount)
/* Return qc class for #ambiguous bases for which UShER imputed values based on placement. */
{
-return qcClassForMixed(imputedCount);
+return qcClassForIntMax(imputedCount, qcThresholdsMaxImputed);
}
static char *qcClassForPlacements(int placementCount)
/* Return qc class for number of equally parsimonious placements. */
{
-return qcClassForIntMax(placementCount, 1, 2, 3, 4);
+return qcClassForIntMax(placementCount, qcThresholdsMaxPlacements);
}
static char *qcClassForPScore(int parsimonyScore)
/* Return qc class for parsimonyScore. */
{
-return qcClassForIntMax(parsimonyScore, 0, 2, 5, 10);
+return qcClassForIntMax(parsimonyScore, qcThresholdsMaxPScore);
}
static void printTooltip(char *text)
/* Print a tooltip with explanatory text. */
{
printf(TOOLTIP("%s"), text);
}
static void appendExcludingNs(struct dyString *dy, struct seqInfo *si)
/* Append a note to dy about how many N bases and start and/or end are excluded from statistic. */
{
dyStringAppend(dy, "excluding ");
if (si->nCountStart)
dyStringPrintf(dy, "%d N bases at start", si->nCountStart);
if (si->nCountStart && si->nCountEnd)
@@ -1769,30 +1835,31 @@
if (isNotEmpty(lf->fileName))
warn("Sorry, can't recognize your file %s as FASTA or VCF.\n", lf->fileName);
else
warn("Sorry, can't recognize your uploaded data as FASTA or VCF.\n");
}
lineFileClose(&lf);
if (vcfTn)
{
fflush(stdout);
int seqCount = slCount(seqInfoList);
struct usherResults *results = runUsher(usherPath, usherAssignmentsPath, vcfTn->forCgi,
subtreeSize, sampleIds, bigTree->condensedNodes,
&startTime);
if (results->subtreeInfoList || seqCount > MAX_SEQ_DETAILS)
{
+ readQcThresholds(db);
int subtreeCount = slCount(results->subtreeInfoList);
// Sort subtrees by number of user samples (largest first).
slSort(&results->subtreeInfoList, subTreeInfoUserSampleCmp);
// Make Nextstrain/auspice JSON file for each subtree.
char *bigGenePredFile = phyloPlaceDbSettingPath(db, "bigGenePredFile");
struct geneInfo *geneInfoList = getGeneInfoList(bigGenePredFile, refGenome);
struct seqWindow *gSeqWin = chromSeqWindowNew(db, chrom, 0, chromSize);
struct hash *sampleMetadata = getSampleMetadata(metadataFile);
struct tempName *jsonTns[subtreeCount];
struct subtreeInfo *ti;
int ix;
for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++)
{
AllocVar(jsonTns[ix]);
trashDirFile(jsonTns[ix], "ct", "subtreeAuspice", ".json");