5956c40955854fd83b75627124f6420518a78960 angie Wed Feb 24 20:51:42 2021 -0800 Add more columns from summary table to TSV summary file. Move PSL indel-tallying code upstream & store results for summaries. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 6f49fe1..5efe19e 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -1340,31 +1340,30 @@ dyStringPrintf(dy, "%d N bases at end", si->nCountEnd); } static void summarizeSequences(struct seqInfo *seqInfoList, boolean isFasta, struct usherResults *ur, struct tempName *jsonTns[], struct hash *sampleMetadata, struct mutationAnnotatedTree *bigTree, struct dnaSeq *refGenome) /* Show a table with composition & alignment stats for each sequence that passed basic QC. */ { if (seqInfoList) { puts("<table class='seqSummary'>"); printSummaryHeader(isFasta); puts("<tbody>"); struct dyString *dy = dyStringNew(0); - struct dyString *dyExtra = dyStringNew(0); struct seqInfo *si; for (si = seqInfoList; si != NULL; si = si->next) { puts("<tr>"); printf("<th>%s</td>", replaceChars(si->seq->name, "|", " | ")); if (isFasta) { if (si->nCountStart || si->nCountEnd) { int effectiveLength = si->seq->size - (si->nCountStart + si->nCountEnd); dyStringClear(dy); dyStringPrintf(dy, "%d ", effectiveLength); appendExcludingNs(dy, si); dyStringPrintf(dy, " (original size %d)", si->seq->size); printf("<td class='%s'>%d", qcClassForLength(effectiveLength), effectiveLength); @@ -1405,98 +1404,41 @@ si->ambigCount - alignedAmbigCount); printTooltip(dy->string); } printf("</td>"); if (isFasta) { struct psl *psl = si->psl; if (psl) { int aliCount = psl->match + psl->misMatch + psl->repMatch; printf("<td class='%s'>%d ", qcClassForLength(aliCount), aliCount); dyStringClear(dy); dyStringPrintf(dy, "bases %d - %d align to reference bases %d - %d", psl->qStart+1, psl->qEnd, psl->tStart+1, psl->tEnd); printTooltip(dy->string); - int insBases = 0, insCount = 0, delBases = 0, delCount = 0; - if (psl->qBaseInsert || psl->tBaseInsert) - { - // Tally up actual insertions and deletions; ignore skipped N bases. - dyStringClear(dy); - dyStringClear(dyExtra); - int ix; - for (ix = 0; ix < psl->blockCount - 1; ix++) - { - int qGapStart = psl->qStarts[ix] + psl->blockSizes[ix]; - int qGapEnd = psl->qStarts[ix+1]; - int qGapLen = qGapEnd - qGapStart; - int tGapStart = psl->tStarts[ix] + psl->blockSizes[ix]; - int tGapEnd = psl->tStarts[ix+1]; - int tGapLen = tGapEnd - tGapStart; - if (qGapLen > tGapLen) - { - insCount++; - int insLen = qGapLen - tGapLen; - insBases += insLen; - if (isNotEmpty(dy->string)) - dyStringAppend(dy, ", "); - if (insLen <= 12) - { - char insSeq[insLen+1]; - safencpy(insSeq, sizeof insSeq, si->seq->dna + qGapEnd - insLen, - insLen); - touppers(insSeq); - dyStringPrintf(dy, "%d-%d:%s", - tGapEnd, tGapEnd+1, insSeq); - } - else - dyStringPrintf(dy, "%d-%d:%d bases", - tGapEnd, tGapEnd+1, insLen); - } - else if (tGapLen > qGapLen) - { - delCount++; - int delLen = tGapLen - qGapLen;; - delBases += delLen; - if (isNotEmpty(dyExtra->string)) - dyStringAppend(dyExtra, ", "); - if (delLen <= 12) - { - char delSeq[delLen+1]; - safencpy(delSeq, sizeof delSeq, refGenome->dna + tGapEnd - delLen, - delLen); - touppers(delSeq); - dyStringPrintf(dyExtra, "%d-%d:%s", - tGapEnd - delLen + 1, tGapEnd, delSeq); - } - else - dyStringPrintf(dyExtra, "%d-%d:%d bases", - tGapEnd - delLen + 1, tGapEnd, delLen); - } - } - } printf("</td><td class='%s'>%d ", - qcClassForIndel(insBases), insBases); - if (insBases) + qcClassForIndel(si->insBases), si->insBases); + if (si->insBases) { - printTooltip(dy->string); + printTooltip(si->insRanges); } printf("</td><td class='%s'>%d ", - qcClassForIndel(delBases), delBases); - if (delBases) + qcClassForIndel(si->delBases), si->delBases); + if (si->delBases) { - printTooltip(dyExtra->string); + printTooltip(si->delRanges); } printf("</td>"); } else printf("<td colspan=3 class='%s'> not alignable </td>", qcClassForLength(0)); } int snvCount = slCount(si->sncList) - alignedAmbigCount; printf("<td class='%s'>%d", qcClassForSNVs(snvCount), snvCount); if (snvCount > 0) { dyStringClear(dy); struct singleNucChange *snc; for (snc = si->sncList; snc != NULL; snc = snc->next) { @@ -1618,31 +1560,32 @@ struct baseVal *bv; for (bv = imputedBases; bv != NULL; bv = bv->next) { struct singleNucChange *snc; AllocVar(snc); snc->chromStart = bv->chromStart; snc->refBase = '?'; snc->newBase = bv->val[0]; slAddHead(&sncList, snc); } slReverse(&sncList); return sncList; } static void writeOneTsvRow(FILE *f, char *sampleId, struct usherResults *results, - struct geneInfo *geneInfoList, struct seqWindow *gSeqWin) + struct hash *seqInfoHash, struct geneInfo *geneInfoList, + struct seqWindow *gSeqWin) /* Write one row of tab-separate summary for sampleId. */ { struct placementInfo *info = hashFindVal(results->samplePlacements, sampleId); // sample name / ID fprintf(f, "%s\t", sampleId); // nucleotide mutations struct slName *mut; for (mut = info->sampleMuts; mut != NULL; mut = mut->next) { if (mut != info->sampleMuts) fputc(',', f); fputs(mut->name, f); } fputc('\t', f); // AA mutations @@ -1663,73 +1606,125 @@ fprintf(f, "%s:%s", geneAaMut->name, aaMut->name); } } fputc('\t', f); // imputed bases (if any) struct baseVal *bv; for (bv = info->imputedBases; bv != NULL; bv = bv->next) { if (bv != info->imputedBases) fputc(',', f); fprintf(f, "%d%s", bv->chromStart+1, bv->val); } fputc('\t', f); // path through tree to sample printVariantPathNoNodeNames(f, info->variantPath); + // number of equally parsimonious placements + fprintf(f, "\t%d", info->bestNodeCount); + // parsimony score + fprintf(f, "\t%d", info->parsimonyScore); + struct seqInfo *si = hashFindVal(seqInfoHash, sampleId); + if (si) + { + if (si->psl) + { + // length + fprintf(f, "\t%d", si->seq->size); + struct psl *psl = si->psl; + // aligned bases, indel counts & ranges + int aliCount = psl->match + psl->misMatch + psl->repMatch; + fprintf(f, "\t%d\t%d\t%s\t%d\t%s", + aliCount, si->insBases, emptyForNull(si->insRanges), + si->delBases, emptyForNull(si->delRanges)); + } + else + fprintf(f, "\t\t\t\t\t\t"); + // SNVs that were masked (Problematic Sites track), not used in placement + fputc('\t', f); + struct singleNucChange *snc; + for (snc = si->maskedSncList; snc != NULL; snc = snc->next) + { + if (snc != si->maskedSncList) + fputc(',', f); + fprintf(f, "%c%d%c", snc->refBase, snc->chromStart+1, snc->newBase); + } + } + else + { + warn("writeOneTsvRow: no sequenceInfo for sample '%s'", sampleId); + fprintf(f, "\t\t\t\t\t\t\t"); + } fputc('\n', f); } static void rWriteTsvSummaryTreeOrder(struct phyloTree *node, FILE *f, struct usherResults *results, - struct geneInfo *geneInfoList, struct seqWindow *gSeqWin) + struct hash *seqInfoHash, struct geneInfo *geneInfoList, + struct seqWindow *gSeqWin) /* As we encounter leaves (user-uploaded samples) in depth-first search order, write out a line * of TSV summary for each one. */ { if (node->numEdges) { int i; for (i = 0; i < node->numEdges; i++) - rWriteTsvSummaryTreeOrder(node->edges[i], f, results, geneInfoList, gSeqWin); + rWriteTsvSummaryTreeOrder(node->edges[i], f, results, seqInfoHash, geneInfoList, gSeqWin); } else { - writeOneTsvRow(f, node->ident->name, results, geneInfoList, gSeqWin); + writeOneTsvRow(f, node->ident->name, results, seqInfoHash, geneInfoList, gSeqWin); } } +static struct hash *hashFromSeqInfoList(struct seqInfo *seqInfoList) +/* Hash sequence name to seqInfo for quick lookup. */ +{ +struct hash *hash = hashNew(0); +struct seqInfo *si; +for (si = seqInfoList; si != NULL; si = si->next) + hashAdd(hash, si->seq->name, si); +return hash; +} + static struct tempName *writeTsvSummary(struct usherResults *results, struct phyloTree *sampleTree, - struct slName *sampleIds, struct geneInfo *geneInfoList, - struct seqWindow *gSeqWin, int *pStartTime) + struct slName *sampleIds, struct seqInfo *seqInfoList, + struct geneInfo *geneInfoList, struct seqWindow *gSeqWin, + int *pStartTime) /* Write a tab-separated summary file for download. If the user uploaded enough samples to make * a tree, then write out samples in tree order; otherwise use sampleIds list. */ { struct tempName *tsvTn = NULL; AllocVar(tsvTn); trashDirFile(tsvTn, "ct", "usher", ".tsv"); FILE *f = mustOpen(tsvTn->forCgi, "w"); -fprintf(f, "name\tnuc_mutations\taa_mutations\timputed_bases\tmutation_path\n"); +fprintf(f, "name\tnuc_mutations\taa_mutations\timputed_bases\tmutation_path" + "\tplacement_count\tparsimony_score_increase\tlength\taligned_bases" + "\tins_bases\tins_ranges\tdel_bases\tdel_ranges\tmasked_mutations" + "\n"); +struct hash *seqInfoHash = hashFromSeqInfoList(seqInfoList); if (sampleTree) { - rWriteTsvSummaryTreeOrder(sampleTree, f, results, geneInfoList, gSeqWin); + rWriteTsvSummaryTreeOrder(sampleTree, f, results, seqInfoHash, geneInfoList, gSeqWin); } else { struct slName *sample; for (sample = sampleIds; sample != NULL; sample = sample->next) - writeOneTsvRow(f, sample->name, results, geneInfoList, gSeqWin); + writeOneTsvRow(f, sample->name, results, seqInfoHash, geneInfoList, gSeqWin); } carefulClose(&f); +hashFree(&seqInfoHash); reportTiming(pStartTime, "write tsv summary"); return tsvTn; } static struct slName **getProblematicSites(char *db) /* If config.ra specfies maskFile them return array of lists (usually NULL) of reasons that * masking is recommended, one per position in genome; otherwise return NULL. */ { struct slName **pSites = NULL; char *pSitesFile = phyloPlaceDbSettingPath(db, "maskFile"); if (isNotEmpty(pSitesFile) && fileExists(pSitesFile)) { AllocArray(pSites, chromSize); struct bbiFile *bbi = bigBedFileOpen(pSitesFile); struct lm *lm = lmInit(0); @@ -1917,32 +1912,32 @@ } reportTiming(&startTime, "describe placements"); } else printf("<p>(Skipping details and subtrees; " "you uploaded %d sequences, and details/subtrees are shown only when " "you upload at most %d sequences.)</p>\n", seqCount, MAX_SEQ_DETAILS); // Make custom tracks for uploaded samples and subtree(s). struct phyloTree *sampleTree = NULL; struct tempName *ctTn = writeCustomTracks(vcfTn, results, sampleIds, bigTree->tree, source, fontHeight, &sampleTree, &startTime); // Make a TSV summary file - struct tempName *tsvTn = writeTsvSummary(results, sampleTree, sampleIds, geneInfoList, - gSeqWin, &startTime); + struct tempName *tsvTn = writeTsvSummary(results, sampleTree, sampleIds, seqInfoList, + geneInfoList, gSeqWin, &startTime); // Offer big tree w/new samples for download puts("<h3>Downloads</h3>"); puts("<ul>"); printf("<li><a href='%s' download>SARS-CoV-2 phylogenetic tree " "with your samples (Newick file)</a>\n", results->bigTreePlusTn->forHtml); printf("<li><a href='%s' download>TSV summary of sequences and placements</a>\n", tsvTn->forHtml); for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds); printf("<li><a href='%s' download>Subtree with %s", ti->subtreeTn->forHtml, ti->subtreeUserSampleIds->name); if (subtreeUserSampleCount > 10) printf(" and %d other samples", subtreeUserSampleCount - 1);