dc33f1e79e44f3d7ac0c8bc63c3c43847325c548 braney Thu May 26 12:40:57 2022 -0700 add a new click handler for TOGA tracks from Michael Hiller's group diff --git src/hg/hgc/togaClick.c src/hg/hgc/togaClick.c index 62b7627..e3d53c8 100644 --- src/hg/hgc/togaClick.c +++ src/hg/hgc/togaClick.c @@ -1,24 +1,60 @@ /* togaClick - click handling for TOGA tracks */ #include "common.h" #include "hgc.h" #include "togaClick.h" #include "string.h" #include "htmshell.h" #include "chromAlias.h" +struct togaDataBB *togaDataBBLoad(char **row) +/* Load a togaData from row fetched with select * from togaData + * from database. Dispose of this with togaDataFree(). */ +{ + struct togaDataBB *ret; + AllocVar(ret); + ret->projection = cloneString(row[0]); + ret->ref_trans_id = cloneString(row[1]); + ret->ref_region = cloneString(row[2]); + ret->query_region = cloneString(row[3]); + ret->chain_score = cloneString(row[4]); + + ret->chain_synteny = cloneString(row[5]); + ret->chain_flank = cloneString(row[6]); + ret->chain_gl_cds_fract = cloneString(row[7]); + ret->chain_loc_cds_fract = cloneString(row[8]); + ret->chain_exon_cov = cloneString(row[9]); + + ret->chain_intron_cov = cloneString(row[10]); + ret->status = cloneString(row[11]); + ret->perc_intact_ign_M = cloneString(row[12]); + ret->perc_intact_int_M = cloneString(row[13]); + ret->intact_codon_prop = cloneString(row[14]); + + ret->ouf_prop = cloneString(row[15]); + ret->mid_intact = cloneString(row[16]); + ret->mid_pres = cloneString(row[17]); + ret->prot_alignment = cloneString(row[18]); + ret->svg_line = cloneString(row[19]); + ret->ref_link = cloneString(row[20]); + ret->inact_mut_html_table = cloneString(row[21]); + ret->exon_ali_html = cloneString(row[22]); + return ret; +} + + struct togaData *togaDataLoad(char **row) /* Load a togaData from row fetched with select * from togaData * from database. Dispose of this with togaDataFree(). */ { struct togaData *ret; AllocVar(ret); ret->projection = cloneString(row[0]); ret->ref_trans_id = cloneString(row[1]); ret->ref_region = cloneString(row[2]); ret->query_region = cloneString(row[3]); ret->chain_score = cloneString(row[4]); ret->chain_synteny = cloneString(row[5]); ret->chain_flank = cloneString(row[6]); ret->chain_gl_cds_fract = cloneString(row[7]); @@ -28,30 +64,67 @@ ret->chain_intron_cov = cloneString(row[10]); ret->status = cloneString(row[11]); ret->perc_intact_ign_M = cloneString(row[12]); ret->perc_intact_int_M = cloneString(row[13]); ret->intact_codon_prop = cloneString(row[14]); ret->ouf_prop = cloneString(row[15]); ret->mid_intact = cloneString(row[16]); ret->mid_pres = cloneString(row[17]); ret->prot_alignment = cloneString(row[18]); ret->svg_line = cloneString(row[19]); return ret; } +void togaDataBBFree(struct togaDataBB **pEl) +/* Free a single dynamically allocated togaDatasuch as created + * with togaDataLoad(). */ +{ + struct togaDataBB *el; + + if ((el = *pEl) == NULL) return; + freeMem(el->projection); + freeMem(el->ref_trans_id); + freeMem(el->ref_region); + freeMem(el->query_region); + freeMem(el->chain_score); + + freeMem(el->chain_synteny); + freeMem(el->chain_flank); + freeMem(el->chain_gl_cds_fract); + freeMem(el->chain_loc_cds_fract); + freeMem(el->chain_exon_cov); + + freeMem(el->chain_intron_cov); + freeMem(el->status); + freeMem(el->perc_intact_ign_M); + freeMem(el->perc_intact_int_M); + freeMem(el->intact_codon_prop); + + freeMem(el->ouf_prop); + freeMem(el->mid_intact); + freeMem(el->mid_pres); + freeMem(el->prot_alignment); + freeMem(el->svg_line); + freeMem(el->ref_link); + freeMem(el->inact_mut_html_table); + freeMem(el->exon_ali_html); + freez(pEl); +} + + void togaDataFree(struct togaData **pEl) /* Free a single dynamically allocated togaDatasuch as created * with togaDataLoad(). */ { struct togaData *el; if ((el = *pEl) == NULL) return; freeMem(el->projection); freeMem(el->ref_trans_id); freeMem(el->ref_region); freeMem(el->query_region); freeMem(el->chain_score); freeMem(el->chain_synteny); freeMem(el->chain_flank); @@ -158,233 +231,240 @@ { int suff_len = strlen(suffix); if (suff_len <= HLTOGA_BED_PREFIX_LEN) // we cannot chop first PREFIX_LEN characters { // TODO: NOT SURE IF IT WORKS; but this must not happen char empty[5] = { '\0' }; strcpy(suffix, empty); } else { // just start the string 11 characters upstream memmove(suffix, suffix + HLTOGA_BED_PREFIX_LEN, suff_len - HLTOGA_BED_PREFIX_LEN + 1); } } +void HLprintQueryProtSeqForAli(char *proteinAlignment) { + // take protein sequence alignment + // print only the query sequence + char *str = proteinAlignment; + int printed_char_num = 0; + while ((str = strstr(str, "que:")) != NULL) + { + str += 10; + char ch; + while ((ch = *str++) != '<') { + if (ch != '-') { + putchar(ch); + ++printed_char_num; + } + if (printed_char_num == 80) { + printed_char_num = 0; + printf("<BR>"); + } + } + } +} + + + void doHillerLabTOGAGeneBig(char *database, struct trackDb *tdb, char *item, char *table_name) /* Put up TOGA Gene track info. */ +// To think about -> put into a single bigBed +// string: HTML formatted inact mut +// string: HTML formatted exon ali section { int start = cartInt(cart, "o"); int end = cartInt(cart, "t"); char *chrom = cartString(cart, "c"); char *fileName = bbiNameFromSettingOrTable(tdb, NULL, tdb->table); struct bbiFile *bbi = bigBedFileOpenAlias(hReplaceGbdb(fileName), chromAliasFindAliases); struct lm *lm = lmInit(0); struct bigBedInterval *bbList = bigBedIntervalQuery(bbi, chrom, start, end, 0, lm); struct bigBedInterval *bb; char *fields[bbi->fieldCount]; for (bb = bbList; bb != NULL; bb = bb->next) { if (!(bb->start == start && bb->end == end)) continue; // our names are unique char *name = cloneFirstWordByDelimiterNoSkip(bb->rest, '\t'); boolean match = (isEmpty(name) && isEmpty(item)) || sameOk(name, item); if (!match) continue; char startBuf[16], endBuf[16]; bigBedIntervalToRow(bb, chrom, startBuf, endBuf, fields, bbi->fieldCount); break; } printf("<h3>Projection %s</h3><BR>\n", item); -struct togaData *info = togaDataLoad(&fields[11]); -// fill HTML template: -printf("<B>Projected via: </B><A HREF=\"http://www.ensembl.org/Homo_sapiens/transview?transcript=%s\" target=_blank>%s</A><BR>", - info->ref_trans_id, info->ref_trans_id); -printf("<B>Region in reference: </B>%s<BR>\n", info->ref_region); -printf("<B>Region in query: </B>%s<BR>\n", info->query_region); +struct togaDataBB *info = togaDataBBLoad(&fields[11]); // Bogdan: why 11? 0-11 are bed-like fields likely -printf("<B>Projection class: </B>%s<BR>\n", info->status); -printf("<B>Chain score: </B>%s<BR>\n", info->chain_score); +printf("<B>Reference transcript: </B>%s<BR>", info->ref_link); +printf("<B>Genomic locus in reference: </B>%s<BR>\n", info->ref_region); +printf("<B>Genomic locus in query: </B>%s<BR>\n", info->query_region); + +printf("<B>Projection classification: </B>%s<BR>\n", info->status); +printf("<B>Probability that query locus is orthologous: </B>%s<BR>\n", info->chain_score); // list of chain features (for orthology classification) -printf("<a data-toggle=\"collapse\" href=\"#collapseChain\">Show chain features for classification</a>\n"); +printf("<a data-toggle=\"collapse\" href=\"#collapseChain\">Show features used for ortholog probability</a>\n"); printf("<div id=\"collapseChain\" class=\"panel-collapse collapse\">\n"); printf("<ul>\n"); -printf("<li>Synteny: %s</li>\n", info->chain_synteny); +printf("<li>Synteny (log10 value): %s</li>\n", info->chain_synteny); printf("<li>Global CDS fraction: %s</li>\n", info->chain_gl_cds_fract); printf("<li>Local CDS fraction: %s</li>\n", info->chain_loc_cds_fract); printf("<li>Local intron fraction: %s</li>\n", info->chain_intron_cov); printf("<li>Local CDS coverage: %s</li>\n", info->chain_exon_cov); printf("<li>Flank fraction: %s</li>\n", info->chain_flank); +printf("</ul>\n"); + +printf("<br>\n<b>Feature description:</b>\n"); +printf("For each projection (one reference transcript and one overlapping chain),\n"); +printf("TOGA computes the following features by intersecting the reference coordinates of aligning\n"); +printf("blocks in the chain with different gene parts (coding exons, UTR (untranslated region) exons, introns)\n"); +printf("and the respective intergenic regions.\n<br>\n"); + +printf("We define the following variables:\n<ul>\n"); +printf("<li>c: number of reference bases in the intersection between chain blocks and coding exons of the gene under consideration.</li>\n"); +printf("<li>C: number of reference bases in the intersection between chain blocks and coding exons of all genes. </li>\n"); +printf("<li>a: number of reference bases in the intersection between chain blocks and coding exons and introns of the gene under consideration. </li>\n"); +printf("<li>A: number of reference bases in the intersection between chain blocks and coding exons and introns of all genes and the intersection\n"); +printf("between chain blocks and intergenic regions (excludes UTRs). </li>\n"); +printf("<li>f: number of reference bases in chain blocks overlapping the 10 kb flanks of the gene under consideration.\n"); +printf("Alignment blocks overlapping exons of another gene that is located in these 10 kb flanks are ignored. </li>\n"); +printf("<li>i: number of reference bases in the intersection between chain blocks and introns of the gene under consideration. </li>\n"); +printf("<li>CDS (coding sequence): length of the coding region of the gene under consideration. </li>\n"); +printf("<li>I: sum of all intron lengths of the gene under consideration. </li>\n"); +printf("</ul>\n"); +printf("Using these variables, TOGA computes the following features:\n"); +printf("<ul>\n"); +printf("<li>“global CDS fraction” as C / A. Chains with a high value have alignments that largely overlap coding exons,"); +printf("which is a hallmark of paralogous or processed pseudogene chains. In contrast, chains with a low value also align many "); +printf("intronic and intergenic regions, which is a hallmark of orthologous chains. </li>\n"); +printf("<li>“local CDS fraction” as c / a. Orthologous chains tend to have a lower value, as intronic "); +printf("regions partially align. This feature is not computed for single-exon genes. </li>\n"); +printf("<li>“local intron fraction” as i / I. Orthologous chains tend to have a higher value."); +printf("This feature is not computed for single-exon genes. </li>\n"); +printf("<li>“flank fraction” as f / 20,000. Orthologous chains tend to have higher values,"); +printf("as flanking intergenic regions partially align. This feature is important to detect orthologous loci of single-exon genes. </li>\n"); +printf("<li>“synteny” as log10 of the number of genes, whose coding exons overlap by at least one base aligning"); +printf("blocks of this chain. Orthologous chains tend to cover several genes located in a conserved order, resulting in higher synteny values. </li>\n"); +printf("<li>“local CDS coverage” as c / CDS, which is only used for single-exon genes. </li>\n"); +printf("</ul>\n"); + + printf("</ul>\n</div>\n<BR>\n"); htmlHorizontalLine(); // show inact mut plot -printf("<h4>Inactivating mutations plot</h4>\n"); +printf("<h4>Visualization of inactivating mutations on exon-intron structure</h4>\n"); printf("%s<BR>\n", info->svg_line); +printf("<BR>Exons shown in grey are missing (often overlap assembly gaps).\nExons shown in"); +printf(" red or blue are deleted or do not align at all.\nRed indicates that the exon deletion "); +printf("shifts the reading frame, while blue indicates that exon deletion(s) are framepreserving.<br>\n"); // GLP features -printf("<a data-toggle=\"collapse\" href=\"#collapseGLP\">Show GLP features</a>\n"); +printf("<a data-toggle=\"collapse\" href=\"#collapseGLP\">Show features used for transcript classification</a>\n"); printf("<div id=\"collapseGLP\" class=\"panel-collapse collapse\">\n"); printf("<ul>\n"); -printf("<li>Percent intact ignoring missing seq: %s</li>\n", info->perc_intact_ign_M); -printf("<li>Percent intact (miss == intact): %s</li>\n", info->perc_intact_int_M); -printf("<li>Intact codon proportion %s</li>\n", info->intact_codon_prop); -printf("<li>Out of chain proportion: %s</li>\n", info->ouf_prop); +printf("<li>Percent intact, ignoring missing sequence: %s</li>\n", info->perc_intact_ign_M); +printf("<li>Percent intact, treating missing as intact sequence: %s</li>\n", info->perc_intact_int_M); +printf("<li>Proportion of intact codons: %s</li>\n", info->intact_codon_prop); +printf("<li>Percent of CDS not covered by this chain (0 unless the chain covers only a part of the gene): %s</li>\n", info->ouf_prop); if (sameWord(info->mid_intact, ONE_)) { - printf("<li>Middle 80 percent intact: %s</li>\n", YES_); + printf("<li>Middle 80 percent of CDS intact: %s</li>\n", YES_); } else { - printf("<li>Middle 80 percent intact: %s</li>\n", NO_); + printf("<li>Middle 80 percent of CDS intact: %s</li>\n", NO_); } if (sameWord(info->mid_pres, ONE_)) { - printf("<li>Middle 80 percent present: %s</li>\n", YES_); + printf("<li>Middle 80 percent of CDS present: %s</li>\n", YES_); } else { - printf("<li>Middle 80 percent present: %s</li>\n", NO_); + printf("<li>Middle 80 percent of CDS present: %s</li>\n", NO_); } printf("</ul>\n</div>\n<BR>\n"); -// and show protein sequence + htmlHorizontalLine(); -printf("<h4>Protein sequence</h4><BR>\n"); -printf("<a data-toggle=\"collapse\" href=\"#collapseProt\">Show protein alignment</a>\n"); + +printf("<h4>Predicted protein sequence</h4><BR>\n"); + +printf("<a data-toggle=\"collapse\" href=\"#collapseProt\">Show protein sequence of query</a>\n"); printf("<div id=\"collapseProt\" class=\"panel-collapse collapse\">\n"); +// printf("<TT>{protein seq of the query without dashes or other things. Should end with *}\n"); +printf("<TT>"); +HLprintQueryProtSeqForAli(info->prot_alignment); +printf("\n<BR>\n</TT>\n</div>\n"); + +// and show protein sequence +htmlHorizontalLine(); +printf("<h4>Protein sequence alignment</h4><BR>\n"); +printf("<a data-toggle=\"collapse\" href=\"#collapseProtAli\">Show alignment between reference and query</a>\n"); +printf("<div id=\"collapseProtAli\" class=\"panel-collapse collapse\">\n"); printf("<TT>%s</TT><BR>\n", info->prot_alignment); printf("</div>\n<BR><BR>\n"); // show inactivating mutations if required -printf("<h4>Inactivating mutations</h4><BR>\n"); +printf("<h4>List of inactivating mutations</h4><BR>\n"); printf("<a data-toggle=\"collapse\" href=\"#collapseMuts\">Show inactivating mutations</a>\n"); printf("<div id=\"collapseMuts\" class=\"panel-collapse collapse\">\n"); printf("<table border = \"1\" width = \"640\">\n"); // init table -printf("<tr><th>exon</th><th>pos</th><th>m_class</th><th>mut</th><th>is_inact</th><th>mut_id</th>\n"); +printf("<tr><th>Exon number</th><th>Codon number</th><th>Mutation class</th><th>Mutation</th><th>Treated as inactivating</th><th>Mutation ID</th>\n"); printf("</tr>\n"); -fileName = trackDbSetting(tdb, "inactMutUrl"); -bbi = bigBedFileOpenAlias(hReplaceGbdb(fileName), chromAliasFindAliases); -//struct lm *lm = lmInit(0); -bbList = bigBedIntervalQuery(bbi, chrom, start, end, 0, lm); -for (bb = bbList; bb != NULL; bb = bb->next) - { - if (!(bb->start == start && bb->end == end)) - continue; - - // our names are unique - char *name = cloneFirstWordByDelimiterNoSkip(bb->rest, '\t'); - boolean match = (isEmpty(name) && isEmpty(item)) || sameOk(name, item); - if (!match) - continue; - - char startBuf[16], endBuf[16]; - bigBedIntervalToRow(bb, chrom, startBuf, endBuf, fields, bbi->fieldCount); - struct togaInactMut *info = NULL; - info = togaInactMutLoad(&fields[3]); - printf("<tr>\n"); - printf("<td>%s</td>\n", info->exon_num); - printf("<td>%s</td>\n", info->position); - printf("<td>%s</td>\n", info->mut_class); - printf("<td>%s</td>\n", info->mutation); - if (sameWord(info->is_inact, ONE_)){ - printf("<td>%s</td>\n", YES_); - } else { - printf("<td>%s</td>\n", NO_); - } - printf("<td>%s</td>\n", info->mut_id); - printf("</tr>\n"); - togaInactMutFree(&info); - } - //sqlFreeResult(&sr); +printf("%s\n", info->inact_mut_html_table); printf("</table>\n"); printf("</div>\n<BR>\n"); // show exons data htmlHorizontalLine(); -printf("<h4>Exons data</h4><BR>\n"); +printf("<h4>Exon alignments</h4><BR>\n"); -printf("<a data-toggle=\"collapse\" href=\"#collapseExons\">Show exon sequences and features</a>\n"); +printf("<a data-toggle=\"collapse\" href=\"#collapseExons\">Show exon sequences and features</a><BR><BR>\n"); printf("<div id=\"collapseExons\" class=\"panel-collapse collapse\">\n"); -fileName = trackDbSetting(tdb, "nuclUrl"); -bbi = bigBedFileOpenAlias(hReplaceGbdb(fileName), chromAliasFindAliases); -//struct lm *lm = lmInit(0); -bbList = bigBedIntervalQuery(bbi, chrom, start, end, 0, lm); -for (bb = bbList; bb != NULL; bb = bb->next) - { - if (!(bb->start == start && bb->end == end)) - continue; - - // our names are unique - char *name = cloneFirstWordByDelimiterNoSkip(bb->rest, '\t'); - boolean match = (isEmpty(name) && isEmpty(item)) || sameOk(name, item); - if (!match) - continue; - - char startBuf[16], endBuf[16]; - bigBedIntervalToRow(bb, chrom, startBuf, endBuf, fields, bbi->fieldCount); - struct togaNucl *info = NULL; - info = togaNuclLoad(&fields[3]); - printf("<h5>Exon number: %s</h5><BR>\n", info->exon_num); - printf("<B>Exon region:</B> %s<BR>\n", info->exon_region); - printf("<B>Nucleotide percent identity:</B> %s | <B>BLOSUM:</B> %s <BR>\n", info->pid, info->blosum); - if (sameWord(info->gaps, ONE_)){ - printf("<B>Intersects assembly gaps:</B> %s<BR>\n", YES_); - } else { - printf("<B>Intersects assembly gaps:</B> %s<BR>\n", NO_); - } - printf("<B>Exon alignment class:</B> %s<BR>\n", info->ali_class); - if (sameWord(info->in_exp_region, ONE_)){ - printf("<B>Detected within expected region:</B> %s<BR>\n", YES_); - } else { - printf("<B>Detected within expected region:</B> %s<BR>\n", NO_); - } - printf("<B>Expected region:</B> %s<BR>\n", info->exp_region); - printf("<BR>\n"); - printf("<B>Sequence alignment:</B><BR>\n"); - printf("%s<BR>\n", info->alignment); - togaNuclFree(&info); - } - - //sqlFreeResult(&sr); - printf("</div>\n<BR><BR>\n"); +// printf("%s\n", info->exon_ali_string); +printf("%s\n", info->exon_ali_html); htmlHorizontalLine(); // TODO: check whether I need this printf("%s", hgTracksPathAndSettings()); hPrintf("<link rel=\"stylesheet\" href=\"https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css\">"); hPrintf("<script src=\"https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js\"></script>"); hPrintf("<script src=\"https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js\"></script>"); printTrackHtml(tdb); // and do I need this? } + void doHillerLabTOGAGene(char *database, struct trackDb *tdb, char *item, char *table_name) /* Put up TOGA Gene track info. */ { //int start = cartInt(cart, "o"); char headerTitle[512]; char suffix[512]; strcpy(suffix, table_name); extractHLTOGAsuffix(suffix); safef(headerTitle, sizeof(headerTitle), "%s", item); genericHeader(tdb, headerTitle); printf("<h2>TOGA gene annotation</h2>\n"); // htmlHorizontalLine(); + if (startsWith("bigBed", tdb->type)) { doHillerLabTOGAGeneBig(database, tdb, item, table_name); return; } struct sqlConnection *conn = hAllocConn(database); // define TOGA table names: initate with pre-defined prefixes char togaDataTableName[256]; char togaNuclTableName[256]; char togaInactMutTableName[256]; strcpy(togaDataTableName, HLTOGA_DATA_PREFIX); strcpy(togaNuclTableName, HLTOGA_NUCL_PREFIX); strcpy(togaInactMutTableName, HLTOGA_INACT_PREFIX); @@ -396,175 +476,220 @@ if (hTableExists(database, togaDataTableName)) { printf("<h3>Projection %s</h3><BR>\n", item); char query[256]; struct sqlResult *sr = NULL; char **row; struct togaData *info = NULL; sqlSafef(query, sizeof(query), "select * from %s where transcript='%s'", togaDataTableName, item); sr = sqlGetResult(conn, query); if ((row = sqlNextRow(sr)) != NULL) { info = togaDataLoad(row); // parse sql output // fill HTML template: - printf("<B>Projected via: </B><A HREF=\"http://www.ensembl.org/Homo_sapiens/transview?transcript=%s\" target=_blank>%s</A><BR>", + printf("<B>Reference transcript: </B><A HREF=\"http://www.ensembl.org/Homo_sapiens/transview?transcript=%s\" target=_blank>%s</A><BR>", info->ref_trans_id, info->ref_trans_id); - printf("<B>Region in reference: </B>%s<BR>\n", info->ref_region); - printf("<B>Region in query: </B>%s<BR>\n", info->query_region); + printf("<B>Genomic locus in reference: </B>%s<BR>\n", info->ref_region); + printf("<B>Genomic locus in query: </B>%s<BR>\n", info->query_region); - printf("<B>Projection class: </B>%s<BR>\n", info->status); - printf("<B>Chain score: </B>%s<BR>\n", info->chain_score); + printf("<B>Projection classification: </B>%s<BR>\n", info->status); + printf("<B>Probability that query locus is orthologous: </B>%s<BR>\n", info->chain_score); // list of chain features (for orthology classification) - printf("<a data-toggle=\"collapse\" href=\"#collapseChain\">Show chain features for classification</a>\n"); + printf("<a data-toggle=\"collapse\" href=\"#collapseChain\">Show features used for ortholog probability</a>\n"); printf("<div id=\"collapseChain\" class=\"panel-collapse collapse\">\n"); printf("<ul>\n"); - printf("<li>Synteny: %s</li>\n", info->chain_synteny); + printf("<li>Synteny (log10 value): %s</li>\n", info->chain_synteny); printf("<li>Global CDS fraction: %s</li>\n", info->chain_gl_cds_fract); printf("<li>Local CDS fraction: %s</li>\n", info->chain_loc_cds_fract); printf("<li>Local intron fraction: %s</li>\n", info->chain_intron_cov); printf("<li>Local CDS coverage: %s</li>\n", info->chain_exon_cov); printf("<li>Flank fraction: %s</li>\n", info->chain_flank); + printf("</ul>\n"); + + printf("<br>\n<b>Feature description:</b>\n"); + printf("For each projection (one reference transcript and one overlapping chain),\n"); + printf("TOGA computes the following features by intersecting the reference coordinates of aligning\n"); + printf("blocks in the chain with different gene parts (coding exons, UTR (untranslated region) exons, introns)\n"); + printf("and the respective intergenic regions.\n<br>\n"); + + printf("We define the following variables:\n<ul>\n"); + printf("<li>c: number of reference bases in the intersection between chain blocks and coding exons of the gene under consideration.</li>\n"); + printf("<li>C: number of reference bases in the intersection between chain blocks and coding exons of all genes. </li>\n"); + printf("<li>a: number of reference bases in the intersection between chain blocks and coding exons and introns of the gene under consideration. </li>\n"); + printf("<li>A: number of reference bases in the intersection between chain blocks and coding exons and introns of all genes and the intersection\n"); + printf("between chain blocks and intergenic regions (excludes UTRs). </li>\n"); + printf("<li>f: number of reference bases in chain blocks overlapping the 10 kb flanks of the gene under consideration.\n"); + printf("Alignment blocks overlapping exons of another gene that is located in these 10 kb flanks are ignored. </li>\n"); + printf("<li>i: number of reference bases in the intersection between chain blocks and introns of the gene under consideration. </li>\n"); + printf("<li>CDS (coding sequence): length of the coding region of the gene under consideration. </li>\n"); + printf("<li>I: sum of all intron lengths of the gene under consideration. </li>\n"); + printf("</ul>\n"); + printf("Using these variables, TOGA computes the following features:\n"); + printf("<ul>\n"); + printf("<li>“global CDS fraction” as C / A. Chains with a high value have alignments that largely overlap coding exons,"); + printf("which is a hallmark of paralogous or processed pseudogene chains. In contrast, chains with a low value also align many "); + printf("intronic and intergenic regions, which is a hallmark of orthologous chains. </li>\n"); + printf("<li>“local CDS fraction” as c / a. Orthologous chains tend to have a lower value, as intronic "); + printf("regions partially align. This feature is not computed for single-exon genes. </li>\n"); + printf("<li>“local intron fraction” as i / I. Orthologous chains tend to have a higher value."); + printf("This feature is not computed for single-exon genes. </li>\n"); + printf("<li>“flank fraction” as f / 20,000. Orthologous chains tend to have higher values,"); + printf("as flanking intergenic regions partially align. This feature is important to detect orthologous loci of single-exon genes. </li>\n"); + printf("<li>“synteny” as log10 of the number of genes, whose coding exons overlap by at least one base aligning"); + printf("blocks of this chain. Orthologous chains tend to cover several genes located in a conserved order, resulting in higher synteny values. </li>\n"); + printf("<li>“local CDS coverage” as c / CDS, which is only used for single-exon genes. </li>\n"); + printf("</ul>\n"); + + printf("</ul>\n</div>\n<BR>\n"); htmlHorizontalLine(); // show inact mut plot - printf("<h4>Inactivating mutations plot</h4>\n"); + printf("<h4>Visualization of inactivating mutations on exon-intron structure</h4>\n"); printf("%s<BR>\n", info->svg_line); + printf("<BR>Exons shown in grey are missing (often overlap assembly gaps).\nExons shown in"); + printf(" red or blue are deleted or do not align at all.\nRed indicates that the exon deletion "); + printf("shifts the reading frame, while blue indicates that exon deletion(s) are framepreserving.<br>\n"); // GLP features - printf("<a data-toggle=\"collapse\" href=\"#collapseGLP\">Show GLP features</a>\n"); + printf("<a data-toggle=\"collapse\" href=\"#collapseGLP\">Show features used for transcript classification</a>\n"); printf("<div id=\"collapseGLP\" class=\"panel-collapse collapse\">\n"); printf("<ul>\n"); - printf("<li>Percent intact ignoring missing seq: %s</li>\n", info->perc_intact_ign_M); - printf("<li>Percent intact (miss == intact): %s</li>\n", info->perc_intact_int_M); - printf("<li>Intact codon proportion %s</li>\n", info->intact_codon_prop); - printf("<li>Out of chain proportion: %s</li>\n", info->ouf_prop); + printf("<li>Percent intact, ignoring missing sequence: %s</li>\n", info->perc_intact_ign_M); + printf("<li>Percent intact, treating missing as intact sequence: %s</li>\n", info->perc_intact_int_M); + printf("<li>Proportion of intact codons: %s</li>\n", info->intact_codon_prop); + printf("<li>Percent of CDS not covered by this chain (0 unless the chain covers only a part of the gene): %s</li>\n", info->ouf_prop); if (sameWord(info->mid_intact, ONE_)) { - printf("<li>Middle 80 percent intact: %s</li>\n", YES_); + printf("<li>Middle 80 percent of CDS intact: %s</li>\n", YES_); } else { - printf("<li>Middle 80 percent intact: %s</li>\n", NO_); + printf("<li>Middle 80 percent of CDS intact: %s</li>\n", NO_); } if (sameWord(info->mid_pres, ONE_)) { - printf("<li>Middle 80 percent present: %s</li>\n", YES_); + printf("<li>Middle 80 percent of CDS present: %s</li>\n", YES_); } else { - printf("<li>Middle 80 percent present: %s</li>\n", NO_); + printf("<li>Middle 80 percent of CDS present: %s</li>\n", NO_); } printf("</ul>\n</div>\n<BR>\n"); + printf("<HR ALIGN=\"CENTER\"><h4>Query protein sequence</h4><BR>"); + + printf("<a data-toggle=\"collapse\" href=\"#collapseProt\">Show protein sequence of query</a>\n"); + printf("<div id=\"collapseProt\" class=\"panel-collapse collapse\">\n"); + printf("<TT>{protein seq of the query without dashes or other things. Should end with *}\n"); + printf("<BR>\n</TT>\n</div>\n"); // and show protein sequence htmlHorizontalLine(); - printf("<h4>Protein sequence</h4><BR>\n"); - printf("<a data-toggle=\"collapse\" href=\"#collapseProt\">Show protein alignment</a>\n"); - printf("<div id=\"collapseProt\" class=\"panel-collapse collapse\">\n"); + printf("<h4>Protein sequence alignment</h4><BR>\n"); + printf("<a data-toggle=\"collapse\" href=\"#collapseProtAli\">Show alignment between reference and query</a>\n"); + printf("<div id=\"collapseProtAli\" class=\"panel-collapse collapse\">\n"); printf("<TT>%s</TT><BR>\n", info->prot_alignment); printf("</div>\n<BR><BR>\n"); // do not forget to free toga data struct togaDataFree(&info); } else { // no data found, need to report this printf("Not found data for %s\n", item); } sqlFreeResult(&sr); } // show inactivating mutations if required - printf("<h4>Inactivating mutations</h4><BR>\n"); + printf("<h4>List of inactivating mutations</h4><BR>\n"); if (hTableExists(database, togaInactMutTableName)) { char query[256]; struct sqlResult *sr = NULL; char **row; sqlSafef(query, sizeof(query), "select * from %s where transcript='%s'", togaInactMutTableName, item); sr = sqlGetResult(conn, query); printf("<a data-toggle=\"collapse\" href=\"#collapseMuts\">Show inactivating mutations</a>\n"); printf("<div id=\"collapseMuts\" class=\"panel-collapse collapse\">\n"); printf("<table border = \"1\" width = \"640\">\n"); // init table - printf("<tr><th>exon</th><th>pos</th><th>m_class</th><th>mut</th><th>is_inact</th><th>mut_id</th>\n"); + printf("<tr><th>Exon number</th><th>Codon number</th><th>Mutation class</th><th>Mutation</th><th>Treated as inactivating</th><th>Mutation ID</th>\n"); printf("</tr>\n"); while ((row = sqlNextRow(sr)) != NULL) { struct togaInactMut *info = NULL; info = togaInactMutLoad(row); printf("<tr>\n"); printf("<td>%s</td>\n", info->exon_num); printf("<td>%s</td>\n", info->position); printf("<td>%s</td>\n", info->mut_class); printf("<td>%s</td>\n", info->mutation); if (sameWord(info->is_inact, ONE_)){ printf("<td>%s</td>\n", YES_); } else { printf("<td>%s</td>\n", NO_); } printf("<td>%s</td>\n", info->mut_id); printf("</tr>\n"); togaInactMutFree(&info); } sqlFreeResult(&sr); printf("</table>\n"); printf("</div>\n<BR>\n"); } else { printf("<B>Sorry, cannot find TOGAInactMut table.</B><BR>\n"); } // show exons data htmlHorizontalLine(); - printf("<h4>Exons data</h4><BR>\n"); + printf("<h4>Exon alignments</h4><BR>\n"); if (hTableExists(database, togaNuclTableName)) { char query[256]; struct sqlResult *sr = NULL; char **row; printf("<a data-toggle=\"collapse\" href=\"#collapseExons\">Show exon sequences and features</a>\n"); printf("<div id=\"collapseExons\" class=\"panel-collapse collapse\">\n"); sqlSafef(query, sizeof(query), "select * from %s where transcript='%s'", togaNuclTableName, item); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { struct togaNucl *info = NULL; info = togaNuclLoad(row); printf("<h5>Exon number: %s</h5><BR>\n", info->exon_num); printf("<B>Exon region:</B> %s<BR>\n", info->exon_region); printf("<B>Nucleotide percent identity:</B> %s | <B>BLOSUM:</B> %s <BR>\n", info->pid, info->blosum); if (sameWord(info->gaps, ONE_)){ printf("<B>Intersects assembly gaps:</B> %s<BR>\n", YES_); } else { printf("<B>Intersects assembly gaps:</B> %s<BR>\n", NO_); } printf("<B>Exon alignment class:</B> %s<BR>\n", info->ali_class); if (sameWord(info->in_exp_region, ONE_)){ - printf("<B>Detected within expected region:</B> %s<BR>\n", YES_); + printf("<B>Detected within expected region (%s):</B> %s<BR>\n", info->exp_region, YES_); } else { - printf("<B>Detected within expected region:</B> %s<BR>\n", NO_); + printf("<B>Detected within expected region (%s):</B> %s<BR>\n", info->exp_region, NO_); } - printf("<B>Expected region:</B> %s<BR>\n", info->exp_region); + // printf("<B>Expected region:</B> %s<BR>\n", info->exp_region); printf("<BR>\n"); - printf("<B>Sequence alignment:</B><BR>\n"); + printf("<B>Sequence alignment between reference and query exon:</B><BR>\n"); printf("%s<BR>\n", info->alignment); togaNuclFree(&info); } sqlFreeResult(&sr); printf("</div>\n<BR><BR>\n"); } else { printf("<B>Sorry, cannot find TOGANucl table.</B><BR>\n"); } htmlHorizontalLine(); // TODO: check whether I need this printf("%s", hgTracksPathAndSettings()); hPrintf("<link rel=\"stylesheet\" href=\"https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css\">"); hPrintf("<script src=\"https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js\"></script>"); hPrintf("<script src=\"https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js\"></script>"); printTrackHtml(tdb); // and do I need this? hFreeConn(&conn); } -