5956c40955854fd83b75627124f6420518a78960 angie Wed Feb 24 20:51:42 2021 -0800 Add more columns from summary table to TSV summary file. Move PSL indel-tallying code upstream & store results for summaries. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 6f49fe1..5efe19e 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -1340,31 +1340,30 @@ dyStringPrintf(dy, "%d N bases at end", si->nCountEnd); } static void summarizeSequences(struct seqInfo *seqInfoList, boolean isFasta, struct usherResults *ur, struct tempName *jsonTns[], struct hash *sampleMetadata, struct mutationAnnotatedTree *bigTree, struct dnaSeq *refGenome) /* Show a table with composition & alignment stats for each sequence that passed basic QC. */ { if (seqInfoList) { puts("
%s", replaceChars(si->seq->name, "|", " | ")); if (isFasta) { if (si->nCountStart || si->nCountEnd) { int effectiveLength = si->seq->size - (si->nCountStart + si->nCountEnd); dyStringClear(dy); dyStringPrintf(dy, "%d ", effectiveLength); appendExcludingNs(dy, si); dyStringPrintf(dy, " (original size %d)", si->seq->size); printf(" | %d", qcClassForLength(effectiveLength), effectiveLength); @@ -1405,98 +1404,41 @@ si->ambigCount - alignedAmbigCount); printTooltip(dy->string); } printf(" | "); if (isFasta) { struct psl *psl = si->psl; if (psl) { int aliCount = psl->match + psl->misMatch + psl->repMatch; printf("%d ", qcClassForLength(aliCount), aliCount); dyStringClear(dy); dyStringPrintf(dy, "bases %d - %d align to reference bases %d - %d", psl->qStart+1, psl->qEnd, psl->tStart+1, psl->tEnd); printTooltip(dy->string); - int insBases = 0, insCount = 0, delBases = 0, delCount = 0; - if (psl->qBaseInsert || psl->tBaseInsert) - { - // Tally up actual insertions and deletions; ignore skipped N bases. - dyStringClear(dy); - dyStringClear(dyExtra); - int ix; - for (ix = 0; ix < psl->blockCount - 1; ix++) - { - int qGapStart = psl->qStarts[ix] + psl->blockSizes[ix]; - int qGapEnd = psl->qStarts[ix+1]; - int qGapLen = qGapEnd - qGapStart; - int tGapStart = psl->tStarts[ix] + psl->blockSizes[ix]; - int tGapEnd = psl->tStarts[ix+1]; - int tGapLen = tGapEnd - tGapStart; - if (qGapLen > tGapLen) - { - insCount++; - int insLen = qGapLen - tGapLen; - insBases += insLen; - if (isNotEmpty(dy->string)) - dyStringAppend(dy, ", "); - if (insLen <= 12) - { - char insSeq[insLen+1]; - safencpy(insSeq, sizeof insSeq, si->seq->dna + qGapEnd - insLen, - insLen); - touppers(insSeq); - dyStringPrintf(dy, "%d-%d:%s", - tGapEnd, tGapEnd+1, insSeq); - } - else - dyStringPrintf(dy, "%d-%d:%d bases", - tGapEnd, tGapEnd+1, insLen); - } - else if (tGapLen > qGapLen) - { - delCount++; - int delLen = tGapLen - qGapLen;; - delBases += delLen; - if (isNotEmpty(dyExtra->string)) - dyStringAppend(dyExtra, ", "); - if (delLen <= 12) - { - char delSeq[delLen+1]; - safencpy(delSeq, sizeof delSeq, refGenome->dna + tGapEnd - delLen, - delLen); - touppers(delSeq); - dyStringPrintf(dyExtra, "%d-%d:%s", - tGapEnd - delLen + 1, tGapEnd, delSeq); - } - else - dyStringPrintf(dyExtra, "%d-%d:%d bases", - tGapEnd - delLen + 1, tGapEnd, delLen); - } - } - } printf(" | %d ", - qcClassForIndel(insBases), insBases); - if (insBases) + qcClassForIndel(si->insBases), si->insBases); + if (si->insBases) { - printTooltip(dy->string); + printTooltip(si->insRanges); } printf(" | %d ", - qcClassForIndel(delBases), delBases); - if (delBases) + qcClassForIndel(si->delBases), si->delBases); + if (si->delBases) { - printTooltip(dyExtra->string); + printTooltip(si->delRanges); } printf(" | "); } else printf("not alignable | ", qcClassForLength(0)); } int snvCount = slCount(si->sncList) - alignedAmbigCount; printf("%d", qcClassForSNVs(snvCount), snvCount);
if (snvCount > 0)
{
dyStringClear(dy);
struct singleNucChange *snc;
for (snc = si->sncList; snc != NULL; snc = snc->next)
{
@@ -1618,31 +1560,32 @@
struct baseVal *bv;
for (bv = imputedBases; bv != NULL; bv = bv->next)
{
struct singleNucChange *snc;
AllocVar(snc);
snc->chromStart = bv->chromStart;
snc->refBase = '?';
snc->newBase = bv->val[0];
slAddHead(&sncList, snc);
}
slReverse(&sncList);
return sncList;
}
static void writeOneTsvRow(FILE *f, char *sampleId, struct usherResults *results,
- struct geneInfo *geneInfoList, struct seqWindow *gSeqWin)
+ struct hash *seqInfoHash, struct geneInfo *geneInfoList,
+ struct seqWindow *gSeqWin)
/* Write one row of tab-separate summary for sampleId. */
{
struct placementInfo *info = hashFindVal(results->samplePlacements, sampleId);
// sample name / ID
fprintf(f, "%s\t", sampleId);
// nucleotide mutations
struct slName *mut;
for (mut = info->sampleMuts; mut != NULL; mut = mut->next)
{
if (mut != info->sampleMuts)
fputc(',', f);
fputs(mut->name, f);
}
fputc('\t', f);
// AA mutations
@@ -1663,73 +1606,125 @@
fprintf(f, "%s:%s", geneAaMut->name, aaMut->name);
}
}
fputc('\t', f);
// imputed bases (if any)
struct baseVal *bv;
for (bv = info->imputedBases; bv != NULL; bv = bv->next)
{
if (bv != info->imputedBases)
fputc(',', f);
fprintf(f, "%d%s", bv->chromStart+1, bv->val);
}
fputc('\t', f);
// path through tree to sample
printVariantPathNoNodeNames(f, info->variantPath);
+ // number of equally parsimonious placements
+ fprintf(f, "\t%d", info->bestNodeCount);
+ // parsimony score
+ fprintf(f, "\t%d", info->parsimonyScore);
+ struct seqInfo *si = hashFindVal(seqInfoHash, sampleId);
+ if (si)
+ {
+ if (si->psl)
+ {
+ // length
+ fprintf(f, "\t%d", si->seq->size);
+ struct psl *psl = si->psl;
+ // aligned bases, indel counts & ranges
+ int aliCount = psl->match + psl->misMatch + psl->repMatch;
+ fprintf(f, "\t%d\t%d\t%s\t%d\t%s",
+ aliCount, si->insBases, emptyForNull(si->insRanges),
+ si->delBases, emptyForNull(si->delRanges));
+ }
+ else
+ fprintf(f, "\t\t\t\t\t\t");
+ // SNVs that were masked (Problematic Sites track), not used in placement
+ fputc('\t', f);
+ struct singleNucChange *snc;
+ for (snc = si->maskedSncList; snc != NULL; snc = snc->next)
+ {
+ if (snc != si->maskedSncList)
+ fputc(',', f);
+ fprintf(f, "%c%d%c", snc->refBase, snc->chromStart+1, snc->newBase);
+ }
+ }
+ else
+ {
+ warn("writeOneTsvRow: no sequenceInfo for sample '%s'", sampleId);
+ fprintf(f, "\t\t\t\t\t\t\t");
+ }
fputc('\n', f);
}
static void rWriteTsvSummaryTreeOrder(struct phyloTree *node, FILE *f, struct usherResults *results,
- struct geneInfo *geneInfoList, struct seqWindow *gSeqWin)
+ struct hash *seqInfoHash, struct geneInfo *geneInfoList,
+ struct seqWindow *gSeqWin)
/* As we encounter leaves (user-uploaded samples) in depth-first search order, write out a line
* of TSV summary for each one. */
{
if (node->numEdges)
{
int i;
for (i = 0; i < node->numEdges; i++)
- rWriteTsvSummaryTreeOrder(node->edges[i], f, results, geneInfoList, gSeqWin);
+ rWriteTsvSummaryTreeOrder(node->edges[i], f, results, seqInfoHash, geneInfoList, gSeqWin);
}
else
{
- writeOneTsvRow(f, node->ident->name, results, geneInfoList, gSeqWin);
+ writeOneTsvRow(f, node->ident->name, results, seqInfoHash, geneInfoList, gSeqWin);
}
}
+static struct hash *hashFromSeqInfoList(struct seqInfo *seqInfoList)
+/* Hash sequence name to seqInfo for quick lookup. */
+{
+struct hash *hash = hashNew(0);
+struct seqInfo *si;
+for (si = seqInfoList; si != NULL; si = si->next)
+ hashAdd(hash, si->seq->name, si);
+return hash;
+}
+
static struct tempName *writeTsvSummary(struct usherResults *results, struct phyloTree *sampleTree,
- struct slName *sampleIds, struct geneInfo *geneInfoList,
- struct seqWindow *gSeqWin, int *pStartTime)
+ struct slName *sampleIds, struct seqInfo *seqInfoList,
+ struct geneInfo *geneInfoList, struct seqWindow *gSeqWin,
+ int *pStartTime)
/* Write a tab-separated summary file for download. If the user uploaded enough samples to make
* a tree, then write out samples in tree order; otherwise use sampleIds list. */
{
struct tempName *tsvTn = NULL;
AllocVar(tsvTn);
trashDirFile(tsvTn, "ct", "usher", ".tsv");
FILE *f = mustOpen(tsvTn->forCgi, "w");
-fprintf(f, "name\tnuc_mutations\taa_mutations\timputed_bases\tmutation_path\n");
+fprintf(f, "name\tnuc_mutations\taa_mutations\timputed_bases\tmutation_path"
+ "\tplacement_count\tparsimony_score_increase\tlength\taligned_bases"
+ "\tins_bases\tins_ranges\tdel_bases\tdel_ranges\tmasked_mutations"
+ "\n");
+struct hash *seqInfoHash = hashFromSeqInfoList(seqInfoList);
if (sampleTree)
{
- rWriteTsvSummaryTreeOrder(sampleTree, f, results, geneInfoList, gSeqWin);
+ rWriteTsvSummaryTreeOrder(sampleTree, f, results, seqInfoHash, geneInfoList, gSeqWin);
}
else
{
struct slName *sample;
for (sample = sampleIds; sample != NULL; sample = sample->next)
- writeOneTsvRow(f, sample->name, results, geneInfoList, gSeqWin);
+ writeOneTsvRow(f, sample->name, results, seqInfoHash, geneInfoList, gSeqWin);
}
carefulClose(&f);
+hashFree(&seqInfoHash);
reportTiming(pStartTime, "write tsv summary");
return tsvTn;
}
static struct slName **getProblematicSites(char *db)
/* If config.ra specfies maskFile them return array of lists (usually NULL) of reasons that
* masking is recommended, one per position in genome; otherwise return NULL. */
{
struct slName **pSites = NULL;
char *pSitesFile = phyloPlaceDbSettingPath(db, "maskFile");
if (isNotEmpty(pSitesFile) && fileExists(pSitesFile))
{
AllocArray(pSites, chromSize);
struct bbiFile *bbi = bigBedFileOpen(pSitesFile);
struct lm *lm = lmInit(0);
@@ -1917,32 +1912,32 @@
}
reportTiming(&startTime, "describe placements");
}
else
printf(" (Skipping details and subtrees; " "you uploaded %d sequences, and details/subtrees are shown only when " "you upload at most %d sequences.) \n", seqCount, MAX_SEQ_DETAILS); // Make custom tracks for uploaded samples and subtree(s). struct phyloTree *sampleTree = NULL; struct tempName *ctTn = writeCustomTracks(vcfTn, results, sampleIds, bigTree->tree, source, fontHeight, &sampleTree, &startTime); // Make a TSV summary file - struct tempName *tsvTn = writeTsvSummary(results, sampleTree, sampleIds, geneInfoList, - gSeqWin, &startTime); + struct tempName *tsvTn = writeTsvSummary(results, sampleTree, sampleIds, seqInfoList, + geneInfoList, gSeqWin, &startTime); // Offer big tree w/new samples for download puts("Downloads"); puts("
|
---|