000226be7ecaa7674455d3968af3590ff2a047b3 kate Thu Apr 28 13:13:40 2016 -0700 Add GTEx gene expression boxplot to hg38 and hg19 hgGene pages. Remove old GTEx link. refs #17244 diff --git src/hg/hgc/gtexClick.c src/hg/hgc/gtexClick.c index 8c37bad..1a73b20 100644 --- src/hg/hgc/gtexClick.c +++ src/hg/hgc/gtexClick.c @@ -1,277 +1,132 @@ /* Details pages for GTEx tracks */ /* Copyright (C) 2015 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "hash.h" #include "hdb.h" #include "hvGfx.h" #include "trashDir.h" #include "hgc.h" #include "hCommon.h" #include "gtexGeneBed.h" #include "gtexTissue.h" -#include "gtexSampleData.h" #include "gtexUi.h" #include "gtexInfo.h" -struct tissueSampleVals -/* RPKM expression values for multiple samples */ - { - struct tissueSampleVals *next; - char *name; /* GTEx tissue name */ - char *description; /* GTEx tissue description */ - int color; /* GTEx tissue color */ - int count; /* number of samples */ - double *vals; /* RPKM values */ - double min, max; /* minimum, maximum value */ - double q1, median, q3; /* quartiles */ - struct slDouble *valList; /* used to create val array */ - }; - char *geneClassColorCode(char *geneClass) /* Get HTML color code used by GENCODE for transcript class * WARNING: should share code with gene color handling in hgTracks */ { char *unknown = "#010101"; if (geneClass == NULL) return unknown; if (sameString(geneClass, "coding")) return "#0C0C78"; if (sameString(geneClass, "nonCoding")) return "#006400"; if (sameString(geneClass, "pseudo")) return "#FF33FF"; if (sameString(geneClass, "problem")) return "#FE0000"; return unknown; } -/********************************************************/ -/* R implementation. Invokes R script */ - -boolean drawGtexRBoxplot(char *geneName, struct tissueSampleVals *tsvList, - boolean doLogTransform, char *version, struct tempName *pngTn) -/* Draw a box-and-whiskers plot from GTEx sample data, using R boxplot */ -{ -/* Create R data frame. This is a tab-sep file, one row per sample, - * with columns for sample, tissue, rpkm */ -struct tempName dfTn; -trashDirFile(&dfTn, "hgc", "gtexGene", ".df.txt"); -FILE *f = fopen(dfTn.forCgi, "w"); -if (f == NULL) - errAbort("can't create temp file %s", dfTn.forCgi); -fprintf(f, "sample\ttissue\trpkm\n"); -struct tissueSampleVals *tsv; -int sampleId=1; -int i; -for (tsv = tsvList; tsv != NULL; tsv = tsv->next) - { - int count = tsv->count; - // remove trailing parenthesized phrases as not worth label length - chopSuffixAt(tsv->description, '('); - for (i=0; idescription, tsv->vals[i]); - } -fclose(f); - -// Plot to PNG file -if (!pngTn) - return FALSE; -trashDirFile(pngTn, "hgc", "gtexGene", ".png"); -char cmd[256]; - -/* Exec R in quiet mode, without reading/saving environment or workspace */ -safef(cmd, sizeof(cmd), "Rscript --vanilla --slave hgcData/gtexBoxplot.R %s %s %s %s %s %s", - geneName, dfTn.forCgi, pngTn->forHtml, - doLogTransform ? "log=TRUE" : "log=FALSE", "order=alpha", version); -//NOTE: use "order=score" to order bargraph by median RPKM, descending - -int ret = system(cmd); -if (ret == 0) - return TRUE; -return FALSE; -} static struct gtexGeneBed *getGtexGene(char *item, char *chrom, int start, int end, char *table) /* Retrieve gene info for this item from the main track table */ { struct gtexGeneBed *gtexGene = NULL; struct sqlConnection *conn = hAllocConn(database); char **row; char query[512]; struct sqlResult *sr; if (sqlTableExists(conn, table)) { sqlSafef(query, sizeof query, "select * from %s where name = '%s' and chrom = '%s' " " and chromStart = %d and chromEnd = %d", table, item, chrom, start, end); sr = sqlGetResult(conn, query); row = sqlNextRow(sr); if (row != NULL) { gtexGene = gtexGeneBedLoad(row); } sqlFreeResult(&sr); } hFreeConn(&conn); return gtexGene; } -struct tissueSampleVals *getTissueSampleVals(char *geneId, boolean doLogTransform, - char *version, double *maxValRet) -/* Get sample data for the gene. Optionally log10 it. Return maximum value seen */ -{ -struct hash *tsHash = hashNew(0); -struct tissueSampleVals *tsv; -struct hashEl *hel; -struct slDouble *val; -double maxVal = 0; -struct gtexSampleData *sd = NULL; -char query[256]; -char **row; -char buf[256]; -char *sampleDataTable = "gtexSampleData"; -safef(buf, sizeof(buf), "%s%s", sampleDataTable, gtexVersionSuffixFromVersion(version)); -struct sqlConnection *conn = hAllocConn("hgFixed"); -assert(sqlTableExists(conn, buf)); -sqlSafef(query, sizeof(query), "select * from %s where geneId='%s'", buf, geneId); -struct sqlResult *sr = sqlGetResult(conn, query); -while ((row = sqlNextRow(sr)) != NULL) - { - sd = gtexSampleDataLoad(row); - if ((hel = hashLookup(tsHash, sd->tissue)) == NULL) - { - AllocVar(tsv); - hashAdd(tsHash, sd->tissue, tsv); - } - else - tsv = (struct tissueSampleVals *)hel->val; - maxVal = max(maxVal, sd->score); - val = slDoubleNew(sd->score); - slAddHead(&tsv->valList, val); - } -/* Fill in tissue descriptions, fill values array and calculate stats for plotting - Then make a list, suitable for sorting by tissue or score - NOTE: Most of this not needed for R implementation */ -struct gtexTissue *tis = NULL, *tissues = gtexGetTissues(version); -struct tissueSampleVals *tsList = NULL; -int i; -if (doLogTransform) - maxVal = log10(maxVal+1.0); -for (tis = tissues; tis != NULL; tis = tis->next) - { - tsv = hashFindVal(tsHash, tis->name); - if (tsv == NULL) - { - /* no non-zero values for this tissue/gene */ - AllocVar(tsv); - val = slDoubleNew(0.0); - slAddHead(&tsv->valList, val); - } - tsv->name = tis->name; - tsv->description = tis->description; - tsv->color = tis->color; - int count = tsv->count = slCount(tsv->valList); - double *vals = AllocArray(tsv->vals, count); - for (i=0; ivalList); - if (doLogTransform) - vals[i] = log10(val->val+1.0); - else - vals[i] = val->val; - } - doubleBoxWhiskerCalc(tsv->count, tsv->vals, - &tsv->min, &tsv->q1, &tsv->median, &tsv->q3, &tsv->max); - slAddHead(&tsList, tsv); - } -if (maxValRet != NULL) - *maxValRet = maxVal; -return tsList; -} - -boolean gtexGeneBoxplot(char *geneId, char *geneName, char *version, - boolean doLogTransform, struct tempName *pngTn) -/* Create a png temp file with boxplot of GTEx expression values for this gene. - * GeneId is the Ensembl gene ID. GeneName is the HUGO name, used for graph title; - * If NULL, label with the Ensembl gene ID */ -{ -struct tissueSampleVals *tsvs; -tsvs = getTissueSampleVals(geneId, doLogTransform, version, NULL); -char *label = geneName ? geneName : geneId; -return drawGtexRBoxplot(label, tsvs, doLogTransform, version, pngTn); -} - char *getGeneDescription(struct gtexGeneBed *gtexGene) /* Get description for gene. Needed because knownGene table semantics have changed in hg38 */ { char query[256]; if (sameString(database, "hg38")) { char *geneId = cloneString(gtexGene->geneId); chopSuffix(geneId); sqlSafef(query, sizeof(query), "select kgXref.description from kgXref, knownCanonical where knownCanonical.protein like '%%%s%%' and knownCanonical.transcript=kgXref.kgID", geneId); } else { sqlSafef(query, sizeof(query), "select kgXref.description from kgXref where geneSymbol='%s'", gtexGene->name); } struct sqlConnection *conn = hAllocConn(database); char *desc = sqlQuickString(conn, query); hFreeConn(&conn); return desc; } void doGtexGeneExpr(struct trackDb *tdb, char *item) /* Details of GTEx gene expression item */ { int start = cartInt(cart, "o"); int end = cartInt(cart, "t"); struct gtexGeneBed *gtexGene = getGtexGene(item, seqName, start, end, tdb->table); if (gtexGene == NULL) errAbort("Can't find gene %s in GTEx gene table %s\n", item, tdb->table); genericHeader(tdb, item); printf("Gene: "); char *desc = getGeneDescription(gtexGene); if (desc == NULL) printf("%s
\n", gtexGene->name); else { printf("%s
\n", hgGeneName(), database, gtexGene->name, gtexGene->name); printf("Description: %s
\n", desc); } printf("Ensembl gene ID: %s
\n", gtexGene->geneId); // The actual transcript model is a union, so this identification is approximate // (used just to find a transcript class) char *geneClass = gtexGeneClass(gtexGene); printf("GENCODE biotype: %s
\n", gtexGene->geneType); printf("Gene class: %s
\n", geneClassColorCode(geneClass), geneClass); printf("Total median expression: %0.2f RPKM
\n", gtexGeneTotalMedianExpression(gtexGene)); printf("Score: %d
\n", gtexGene->score); printf("Genomic position: %s %s:%d-%d
\n", database, hgTracksPathAndSettings(), database, gtexGene->chrom, gtexGene->chromStart+1, gtexGene->chromEnd, gtexGene->chrom, gtexGene->chromStart+1, gtexGene->chromEnd); puts("

"); // set gtexDetails (e.g. to 'log') to show log transformed details page // if hgTracks is log-transformed boolean doLogTransform = (trackDbSetting(tdb, "gtexDetails") && cartUsualBooleanClosestToHome(cart, tdb, FALSE, GTEX_LOG_TRANSFORM, GTEX_LOG_TRANSFORM_DEFAULT)); char *version = gtexVersion(tdb->table); struct tempName pngTn; if (gtexGeneBoxplot(gtexGene->geneId, gtexGene->name, version, doLogTransform, &pngTn)) printf("
\n", pngTn.forHtml); -printf("
View at GTEx portal\n", gtexGene->name); +gtexPortalLink(gtexGene->geneId); printTrackHtml(tdb); }