7c1221c572248594a25af2c26e99c123415efe26 angie Tue Feb 23 11:38:29 2021 -0800 The hardcoded QC thresholds for the green/yellow/red coloring in the summary table are out of date. Loosen the defaults a bit to reflect what is now a normal amount of difference from the reference, and add a config file to make them easily adjustable in the future. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 2e5feaf..00013b9 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -1146,114 +1146,180 @@ "Pangolin lineage</a> assigned to the nearest neighboring sample already in the tree") "</th>\n<th>#Imputed values for mixed bases" TOOLTIP("If the uploaded sequence contains mixed/ambiguous bases, then UShER may assign " "values based on maximum parsimony") "</th>\n<th>#Maximally parsimonious placements" TOOLTIP("Number of potential placements in the tree with minimal parsimony score; " "the higher the number, the less confident the placement") "</th>\n<th>Parsimony score" TOOLTIP("Number of mutations/changes that must be added to the tree when placing the " "uploaded sample; the higher the number, the more diverged the sample") "</th>\n<th>Subtree number" TOOLTIP("Sequence number of subtree that contains this sample") "</th></tr></thead>"); } -static char *qcClassForIntMin(int n, int minExc, int minGood, int minMeh, int minBad) + +// Default QC thresholds for calling an input sequence excellent/good/fair/bad [/fail]: +static int qcThresholdsMinLength[] = { 29750, 29500, 29000, 28000 }; +static int qcThresholdsMaxNs[] = { 0, 5, 20, 100 }; +static int qcThresholdsMaxMixed[] = { 0, 5, 20, 100 }; +static int qcThresholdsMaxIndel[] = { 9, 18, 24, 36 }; +static int qcThresholdsMaxSNVs[] = { 25, 35, 45, 55 }; +static int qcThresholdsMaxMaskedSNVs[] = { 0, 1, 2, 3 }; +static int qcThresholdsMaxImputed[] = { 0, 5, 20, 100 }; +static int qcThresholdsMaxPlacements[] = { 1, 2, 3, 4 }; +static int qcThresholdsMaxPScore[] = { 0, 2, 5, 10 }; + +static void wordsToQcThresholds(char **words, int *thresholds) +/* Parse words from file into thresholds array. Caller must ensure words and thresholds each + * have 4 items. */ +{ +int i; +for (i = 0; i < 4; i++) + thresholds[i] = atoi(words[i]); +} + +static void readQcThresholds(char *db) +/* If config.ra specifies a file with QC thresholds for excellent/good/fair/bad [/fail], + * parse it and replace the default values in qcThresholds arrays. */ +{ +char *qcThresholdsFile = phyloPlaceDbSettingPath(db, "qcThresholds"); +if (isNotEmpty(qcThresholdsFile)) + { + if (fileExists(qcThresholdsFile)) + { + struct lineFile *lf = lineFileOpen(qcThresholdsFile, TRUE); + char *line; + while (lineFileNext(lf, &line, NULL)) + { + char *words[16]; + int wordCount = chopTabs(line, words); + lineFileExpectWords(lf, 5, wordCount); + if (sameWord(words[0], "length")) + wordsToQcThresholds(words+1, qcThresholdsMinLength); + else if (sameWord(words[0], "nCount")) + wordsToQcThresholds(words+1, qcThresholdsMaxNs); + else if (sameWord(words[0], "mixedCount")) + wordsToQcThresholds(words+1, qcThresholdsMaxMixed); + else if (sameWord(words[0], "indelCount")) + wordsToQcThresholds(words+1, qcThresholdsMaxIndel); + else if (sameWord(words[0], "snvCount")) + wordsToQcThresholds(words+1, qcThresholdsMaxSNVs); + else if (sameWord(words[0], "maskedSnvCount")) + wordsToQcThresholds(words+1, qcThresholdsMaxMaskedSNVs); + else if (sameWord(words[0], "imputedBases")) + wordsToQcThresholds(words+1, qcThresholdsMaxImputed); + else if (sameWord(words[0], "placementCount")) + wordsToQcThresholds(words+1, qcThresholdsMaxPlacements); + else if (sameWord(words[0], "parsimony")) + wordsToQcThresholds(words+1, qcThresholdsMaxPScore); + else + warn("qcThresholds file %s: unrecognized parameter '%s', skipping", + qcThresholdsFile, words[0]); + } + lineFileClose(&lf); + } + else + warn("qcThresholds %s: file not found", qcThresholdsFile); + } +} + +static char *qcClassForIntMin(int n, int thresholds[]) /* Return {qcExcellent, qcGood, qcMeh, qcBad or qcFail} depending on how n compares to the * thresholds. Don't free result. */ { -if (n >= minExc) +if (n >= thresholds[0]) return "qcExcellent"; -else if (n >= minGood) +else if (n >= thresholds[1]) return "qcGood"; -else if (n >= minMeh) +else if (n >= thresholds[2]) return "qcMeh"; -else if (n >= minBad) +else if (n >= thresholds[3]) return "qcBad"; else return "qcFail"; } static char *qcClassForLength(int length) /* Return qc class for length of sequence. */ { -return qcClassForIntMin(length, 29750, 29500, 29000, 28000); +return qcClassForIntMin(length, qcThresholdsMinLength); } -static char *qcClassForIntMax(int n, int maxExc, int maxGood, int maxMeh, int maxBad) +static char *qcClassForIntMax(int n, int thresholds[]) /* Return {qcExcellent, qcGood, qcMeh, qcBad or qcFail} depending on how n compares to the * thresholds. Don't free result. */ { -if (n <= maxExc) +if (n <= thresholds[0]) return "qcExcellent"; -else if (n <= maxGood) +else if (n <= thresholds[1]) return "qcGood"; -else if (n <= maxMeh) +else if (n <= thresholds[2]) return "qcMeh"; -else if (n <= maxBad) +else if (n <= thresholds[3]) return "qcBad"; else return "qcFail"; } static char *qcClassForNs(int nCount) /* Return qc class for #Ns in sample. */ { -return qcClassForIntMax(nCount, 0, 5, 20, 100); +return qcClassForIntMax(nCount, qcThresholdsMaxNs); } static char *qcClassForMixed(int mixedCount) /* Return qc class for #ambiguous bases in sample. */ { -return qcClassForIntMax(mixedCount, 0, 5, 20, 100); +return qcClassForIntMax(mixedCount, qcThresholdsMaxMixed); } static char *qcClassForIndel(int indelCount) /* Return qc class for #inserted or deleted bases. */ { -return qcClassForIntMax(indelCount, 0, 2, 5, 10); +return qcClassForIntMax(indelCount, qcThresholdsMaxIndel); } static char *qcClassForSNVs(int snvCount) /* Return qc class for #SNVs in sample. */ { -return qcClassForIntMax(snvCount, 10, 20, 30, 40); +return qcClassForIntMax(snvCount, qcThresholdsMaxSNVs); } static char *qcClassForMaskedSNVs(int maskedCount) /* Return qc class for #SNVs at problematic sites. */ { -return qcClassForIntMax(maskedCount, 0, 1, 2, 3); +return qcClassForIntMax(maskedCount, qcThresholdsMaxMaskedSNVs); } static char *qcClassForImputedBases(int imputedCount) /* Return qc class for #ambiguous bases for which UShER imputed values based on placement. */ { -return qcClassForMixed(imputedCount); +return qcClassForIntMax(imputedCount, qcThresholdsMaxImputed); } static char *qcClassForPlacements(int placementCount) /* Return qc class for number of equally parsimonious placements. */ { -return qcClassForIntMax(placementCount, 1, 2, 3, 4); +return qcClassForIntMax(placementCount, qcThresholdsMaxPlacements); } static char *qcClassForPScore(int parsimonyScore) /* Return qc class for parsimonyScore. */ { -return qcClassForIntMax(parsimonyScore, 0, 2, 5, 10); +return qcClassForIntMax(parsimonyScore, qcThresholdsMaxPScore); } static void printTooltip(char *text) /* Print a tooltip with explanatory text. */ { printf(TOOLTIP("%s"), text); } static void appendExcludingNs(struct dyString *dy, struct seqInfo *si) /* Append a note to dy about how many N bases and start and/or end are excluded from statistic. */ { dyStringAppend(dy, "excluding "); if (si->nCountStart) dyStringPrintf(dy, "%d N bases at start", si->nCountStart); if (si->nCountStart && si->nCountEnd) @@ -1769,30 +1835,31 @@ if (isNotEmpty(lf->fileName)) warn("Sorry, can't recognize your file %s as FASTA or VCF.\n", lf->fileName); else warn("Sorry, can't recognize your uploaded data as FASTA or VCF.\n"); } lineFileClose(&lf); if (vcfTn) { fflush(stdout); int seqCount = slCount(seqInfoList); struct usherResults *results = runUsher(usherPath, usherAssignmentsPath, vcfTn->forCgi, subtreeSize, sampleIds, bigTree->condensedNodes, &startTime); if (results->subtreeInfoList || seqCount > MAX_SEQ_DETAILS) { + readQcThresholds(db); int subtreeCount = slCount(results->subtreeInfoList); // Sort subtrees by number of user samples (largest first). slSort(&results->subtreeInfoList, subTreeInfoUserSampleCmp); // Make Nextstrain/auspice JSON file for each subtree. char *bigGenePredFile = phyloPlaceDbSettingPath(db, "bigGenePredFile"); struct geneInfo *geneInfoList = getGeneInfoList(bigGenePredFile, refGenome); struct seqWindow *gSeqWin = chromSeqWindowNew(db, chrom, 0, chromSize); struct hash *sampleMetadata = getSampleMetadata(metadataFile); struct tempName *jsonTns[subtreeCount]; struct subtreeInfo *ti; int ix; for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { AllocVar(jsonTns[ix]); trashDirFile(jsonTns[ix], "ct", "subtreeAuspice", ".json");