44ccfacbe3a3d4b300f80d48651c77837a4b571e galt Tue Apr 26 11:12:02 2022 -0700 SQL INJECTION Prevention Version 2 - this improves our methods by making subclauses of SQL that get passed around be both easy and correct to use. The way that was achieved was by getting rid of the obscure and not well used functions sqlSafefFrag and sqlDyStringPrintfFrag and replacing them with the plain versions of those functions, since these are not needed anymore. The new version checks for NOSQLINJ in unquoted %-s which is used to include SQL clauses, and will give an error if the NOSQLINJ clause is not present, and this will automatically require the correct behavior by developers. sqlDyStringPrint is a very useful function, however because it was not enforced, users could use various other dyString functions and they operated without any awareness or checking for correct SQL use. Now those dyString functions are prohibited and it will produce an error if you try to use a dyString function on a SQL string, which is simply detected by the presence of the NOSQLINJ prefix. diff --git src/hg/hgc/gencodeClick.c src/hg/hgc/gencodeClick.c index aa2389b..bab8f88 100644 --- src/hg/hgc/gencodeClick.c +++ src/hg/hgc/gencodeClick.c @@ -1,960 +1,960 @@ /* gencodeClick - click handling for GENCODE tracks */ /* Copyright (C) 2014 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. 
*/ #include "common.h" #include "hgc.h" #include "gencodeClick.h" #include "ccdsClick.h" #include "genePred.h" #include "genePredReader.h" #include "ensFace.h" #include "htmshell.h" #include "jksql.h" #include "regexHelper.h" #include "encode/wgEncodeGencodeAttrs.h" #include "encode/wgEncodeGencodeGeneSource.h" #include "encode/wgEncodeGencodePdb.h" #include "encode/wgEncodeGencodePubMed.h" #include "encode/wgEncodeGencodeRefSeq.h" #include "encode/wgEncodeGencodeTag.h" #include "encode/wgEncodeGencodeTranscriptSource.h" #include "encode/wgEncodeGencodeTranscriptSupport.h" #include "encode/wgEncodeGencodeExonSupport.h" #include "encode/wgEncodeGencodeUniProt.h" #include "encode/wgEncodeGencodeEntrezGene.h" #include "encode/wgEncodeGencodeAnnotationRemark.h" #include "encode/wgEncodeGencodeTranscriptionSupportLevel.h" /* * General notes: * - this will be integrated into hgGene at some point, however this was * done as part of hgc for timing reasons and to allow more time to design * the hgGene part. * - Tables below will output at least one row even if no data is available. * */ /* Various URLs and URL templates. At one time, these were in the ra file, * but that didn't prove that helpful and end up requiring updated the ra * files for every GENCODE version if a URL was added or changed. 
*/ //FIXME: clean up RA files when CGIs no longer need them static char *ensemblTranscriptIdUrl = "http://www.ensembl.org/%s/Transcript/Summary?db=core;t=%s"; static char *ensemblGeneIdUrl = "http://www.ensembl.org/%s/Gene/Summary?db=core;t=%s"; static char *ensemblProteinIdUrl = "http://www.ensembl.org/%s/Transcript/ProteinSummary?db=core;t=%s"; static char *ensemblSupportingEvidUrl = "http://www.ensembl.org/%s/Transcript/SupportingEvidence?db=core;t=%s"; static char *ensemblH37TranscriptIdUrl = "http://grch37.ensembl.org/%s/Transcript/Summary?db=core;t=%s"; static char *ensemblH37GeneIdUrl = "http://grch37.ensembl.org/%s/Gene/Summary?db=core;t=%s"; static char *ensemblH37ProteinIdUrl = "http://grch37.ensembl.org/%s/Transcript/ProteinSummary?db=core;t=%s"; static char *ensemblH37SupportingEvidUrl = "http://grch37.ensembl.org/%s/Transcript/SupportingEvidence?db=core;t=%s"; static char *gencodeBiotypesUrl = "http://www.gencodegenes.org/pages/biotypes.html"; static char *gencodeTagsUrl = "http://www.gencodegenes.org/pages/tags.html"; static char *yalePseudoUrl = "http://tables.pseudogene.org/%s"; static char *hgncUrl = " https://www.genenames.org/data/gene-symbol-report/#!/symbol/%s"; static char *geneCardsUrl = "http://www.genecards.org/cgi-bin/carddisp.pl?gene=%s"; static char *apprisHomeUrl = "http://appris-tools.org/"; static char *apprisGeneUrl = "http://appris-tools.org/#/database/id/%s/%s?sc=ensembl"; static char* UNKNOWN = "unknown"; static char *getBaseAcc(char *acc, char *accBuf, int accBufSize) /* get the accession with version number dropped. */ { safecpy(accBuf, accBufSize, acc); char *dot = strchr(accBuf, '.'); if (dot != NULL) *dot = '\0'; return accBuf; } static bool haveGencodeTable(struct trackDb *tdb, char *tableBase) /* determine if table is in settings and thus in this gencode release */ { return trackDbSetting(tdb, tableBase) != NULL; } static char *getGencodeTable(struct trackDb *tdb, char *tableBase) /* get a table name from the settings. 
*/ { return trackDbRequiredSetting(tdb, tableBase); } static char* getGencodeVersion(struct trackDb *tdb) /* get the GENCODE version or NULL for < V7, which is not supported * by this module. */ { return trackDbSetting(tdb, "wgEncodeGencodeVersion"); } static boolean isGrcH37Native(struct trackDb *tdb) /* Is this GENCODE GRCh37 native build, which requires a different Ensembl site. */ { // check for non-lifted GENCODE on GRCh37/hg19 if (sameString(database, "hg19")) return stringIn("lift37", getGencodeVersion(tdb)) == NULL; else return FALSE; } static boolean isFakeGeneSymbol(char* sym) /* is this a static gene symbol? */ { static const char *regexp = "^AC[0-9]+\\.[0-9]+$"; return regexMatch(sym, regexp); } static int transAnnoCmp(const void *va, const void *vb) /* Compare genePreds, sorting to keep select gene first. The only cases * that annotations will be duplicated is if they are in the PAR and thus * on different chroms. */ { const struct genePred *a = *((struct genePred **)va); const struct genePred *b = *((struct genePred **)vb); if (sameString(a->name, seqName)) return -1; else if (sameString(b->name, seqName)) return 1; else return strcmp(a->name, b->name); } static bool isProteinCodingTrans(struct wgEncodeGencodeAttrs *transAttrs) /* is a transcript protein coding? */ { return sameString(transAttrs->transcriptClass, "coding"); } static struct genePred *transAnnoLoad(struct sqlConnection *conn, struct trackDb *tdb, char *gencodeId) /* load the gencode annotations and sort the one corresponding to the one that was clicked on is * first. Should only have one or two. 
*/ { // must check chrom due to PAR char where[256]; -sqlSafefFrag(where, sizeof(where), "(chrom = \"%s\") and (name = \"%s\")", seqName, gencodeId); +sqlSafef(where, sizeof(where), "(chrom = \"%s\") and (name = \"%s\")", seqName, gencodeId); struct genePred *transAnno = genePredReaderLoadQuery(conn, tdb->track, where); slSort(&transAnno, transAnnoCmp); return transAnno; } static struct wgEncodeGencodeAttrs *transAttrsLoad(struct trackDb *tdb, struct sqlConnection *conn, char *gencodeId) /* load the gencode attributes */ { char query[1024]; sqlSafef(query, sizeof(query), "select * from %s where transcriptId = \"%s\"", getGencodeTable(tdb, "wgEncodeGencodeAttrs"), gencodeId); struct sqlResult *sr = sqlGetResult(conn, query); char **row = sqlNextRow(sr); if (row == NULL) errAbort("gencode transcript %s not found in %s", gencodeId, getGencodeTable(tdb, "wgEncodeGencodeAttrs")); // older version don't have proteinId column. struct wgEncodeGencodeAttrs *transAttrs = wgEncodeGencodeAttrsLoad(row, sqlCountColumns(sr)); sqlFreeResult(&sr); return transAttrs; } static void getGeneBounds(struct trackDb *tdb, struct sqlConnection *conn, struct genePred *transAnno, int *geneChromStart, int *geneChromEnd) /* find bounds for the gene */ { // must check chrom due to PAR char where[256]; -sqlSafefFrag(where, sizeof(where), "(chrom = \"%s\") and (name2 = \"%s\")", seqName, transAnno->name2); +sqlSafef(where, sizeof(where), "(chrom = \"%s\") and (name2 = \"%s\")", seqName, transAnno->name2); struct genePred *geneAnnos = genePredReaderLoadQuery(conn, tdb->track, where); struct genePred *geneAnno; *geneChromStart = transAnno->txStart; *geneChromEnd = transAnno->txEnd; for (geneAnno = geneAnnos; geneAnno != NULL; geneAnno = geneAnno->next) { *geneChromStart = min(*geneChromStart, geneAnno->txStart); *geneChromEnd = max(*geneChromEnd, transAnno->txEnd); } genePredFreeList(&geneAnnos); } static void *metaDataLoad(struct trackDb *tdb, struct sqlConnection *conn, char *gencodeId, char 
*tableBase, char *keyCol, unsigned queryOpts, sqlLoadFunc loadFunc) /* load autoSql objects for gencode meta data. */ { return sqlQueryObjs(conn, loadFunc, queryOpts, "select * from %s where %s = \"%s\"", getGencodeTable(tdb, tableBase), keyCol, gencodeId); } static int uniProtDatasetCmp(const void *va, const void *vb) /* Compare wgEncodeGencodeUniProt by dateset */ { const struct wgEncodeGencodeUniProt *a = *((struct wgEncodeGencodeUniProt **)va); const struct wgEncodeGencodeUniProt *b = *((struct wgEncodeGencodeUniProt **)vb); return a->dataset - b->dataset; } static char *getMethodDesc(char *source) /* return the annotation method name based gene or transcript source */ { // sometimes backmap doesn't get every entry method entry mapped. Until that // is fixed, allow it to be missing // looks for being havana and/or ensembl // classifies other sources as automatic (mt_genbank_import ncrna ncrna_pseudogene) bool hasHav = containsStringNoCase(source, "havana") != NULL; bool hasEns = containsStringNoCase(source, "ensembl") != NULL; if (hasHav && hasEns) return "manual & automatic"; else if (hasHav) return "manual"; else return "automatic"; } static char *getLevelDesc(int level) /* return english description for level */ { if (level == 1) return "validated"; else if (level == 2) return "manual"; else if (level == 3) return "automatic"; else return "unknown"; } static char *getSupportLevelDesc(struct wgEncodeGencodeTranscriptionSupportLevel *tsl) /* return description for level */ { static char buf[32]; if ((tsl == NULL) || (tsl->level <= 0)) return "tslNA"; else { safef(buf, sizeof(buf), "tsl%d", tsl->level); return buf; } } static char* getScientificNameSym(void) /* get the scientific name of an organism in the form "Homo_sapiens" * WARNING: static return */ { static char sciNameSym[128]; char *sciName = hScientificName(database); if (sciName == NULL) errAbort("can't get scientific name for %s", database); safecpy(sciNameSym, sizeof(sciNameSym), sciName); 
freeMem(sciName); subChar(sciNameSym, ' ', '_'); return sciNameSym; } static void prExtIdAnchor(char *id, char *urlTemplate) /* if an id to an external database is not empty, print an HTML anchor to it */ { if (!isEmpty(id)) { char urlBuf[512]; safef(urlBuf, sizeof(urlBuf), urlTemplate, id); printf("<a href=\"%s\" target=_blank>%s</a>", urlBuf, id); } } #if UNUSED static void prTdExtIdAnchor(char *id, char *urlTemplate) /* print a table data element with an anchor for a id */ { printf("<td>"); prExtIdAnchor(id, urlTemplate); } #endif static void prEnsIdAnchor(char *id, char *urlTemplate) /* if an id to an ensembl database is not empty, print an HTML anchor to it */ { if (!isEmpty(id)) { char idBuf[64], urlBuf[512]; /* The lift37 releases append a '_N' modifier to the ids to indicate the are * mapped. N is an integer mapping version. Don't include this in link if it exists. */ safecpy(idBuf, sizeof(idBuf), id); char *p = strchr(idBuf, '_'); if (p != NULL) *p = '\0'; safef(urlBuf, sizeof(urlBuf), urlTemplate, getScientificNameSym(), idBuf); printf("<a href=\"%s\" target=_blank>%s</a>", urlBuf, id); } } static void prTdEnsIdAnchor(char *id, char *urlTemplate) /* print a table data element with an ensembl anchor for a id */ { printf("<td>"); prEnsIdAnchor(id, urlTemplate); } static void prApprisTdAnchor(char *id, char *label, char *urlTemplate) /* print a gene or transcript link to APPRIS */ { // under bar separated, lower case species name. 
char *speciesArg = hScientificName(database); toLowerN(speciesArg, strlen(speciesArg)); subChar(speciesArg, ' ', '_'); char accBuf[64]; printf("<td><a href=\""); printf(urlTemplate, speciesArg, getBaseAcc(id, accBuf, sizeof(accBuf))); printf("\" target=_blank>%s</a>", label); freeMem(speciesArg); } static void writePosLink(char *chrom, int chromStart, int chromEnd) /* write link to a genomic position */ { printf("<a href=\"%s&db=%s&position=%s%%3A%d-%d\">%s:%d-%d</A>", hgTracksPathAndSettings(), database, chrom, chromStart, chromEnd, chrom, chromStart+1, chromEnd); } static bool geneHasApprisTranscripts(struct trackDb *tdb, struct sqlConnection *conn, struct wgEncodeGencodeAttrs *transAttrs) /* check if any transcript in a gene has an APPRIS tags */ { char query[1024]; -sqlSafefFrag(query, sizeof(query), +sqlSafef(query, sizeof(query), "%s tag where tag.tag like 'appris%%' and transcriptId in " "(select transcriptId from %s where geneId='%s')", getGencodeTable(tdb, "wgEncodeGencodeTag"), getGencodeTable(tdb, "wgEncodeGencodeAttrs"), transAttrs->geneId); return sqlRowCount(conn, query) > 0; } static char* findApprisTag(struct wgEncodeGencodeTag *tags) /* search list for APPRIS tag or NULL */ { struct wgEncodeGencodeTag *tag; for (tag = tags; tag != NULL; tag = tag->next) { if (startsWith("appris_", tag->tag)) return tag->tag; } return NULL; } static char* apprisTagToSymbol(char* tag) /* convert APPRIS tag to the symbol use by APPRIS. WARNING static return. */ { // appris_principal_1 -> PRINCIPAL:1 static char buf[64]; safecpy(buf, sizeof(buf), tag+7); touppers(buf); subChar(buf, '_', ':'); return buf; } static void writeAprrisRow(struct sqlConnection *conn, struct trackDb *tdb, struct wgEncodeGencodeAttrs *transAttrs, struct wgEncodeGencodeTag *tags) /* write row for APPRIS */ { // Get labels to use. if transcript has an appris tag, then we link to the transcript. 
// if it doesn;t have a appris tag, we can still link to the gene if any of the transcripts // have appris tags char* apprisTag = findApprisTag(tags); char* transLabel = (apprisTag != NULL) ? apprisTagToSymbol(apprisTag) : NULL; char *geneLabel = ((apprisTag != NULL) || geneHasApprisTranscripts(tdb, conn, transAttrs)) ? transAttrs->geneName : NULL; // APPRIS gene and transcript now go to the same location printf("<tr><th><a href=\"%s\" target=_blank>APPRIS</a>\n", apprisHomeUrl); if (transLabel != NULL) prApprisTdAnchor(transAttrs->geneId, transLabel, apprisGeneUrl); else printf("<td> "); if (geneLabel != NULL) prApprisTdAnchor(transAttrs->geneId, geneLabel, apprisGeneUrl); else printf("<td> "); printf("</tr>\n"); } static void writeBasicInfoHtml(struct sqlConnection *conn, struct trackDb *tdb, char *gencodeId, struct genePred *transAnno, struct wgEncodeGencodeAttrs *transAttrs, int geneChromStart, int geneChromEnd, struct wgEncodeGencodeGeneSource *geneSource, struct wgEncodeGencodeTranscriptSource *transcriptSource, struct wgEncodeGencodeTag *tags, bool haveTsl, struct wgEncodeGencodeTranscriptionSupportLevel *tsl) /* write basic HTML info for all genes */ { // basic gene and transcript information printf("<table class=\"hgcCcds\" style=\"white-space: nowrap;\"><thead>\n"); printf("<tr><th><th>Transcript<th>Gene</tr>\n"); printf("</thead><tbody>\n"); printf("<tr><th>GENCODE id"); prTdEnsIdAnchor(transAttrs->transcriptId, (isGrcH37Native(tdb) ? ensemblH37TranscriptIdUrl: ensemblTranscriptIdUrl)); prTdEnsIdAnchor(transAttrs->geneId, (isGrcH37Native(tdb) ? ensemblH37GeneIdUrl : ensemblGeneIdUrl)); printf("</tr>\n"); if (transAttrs->proteinId != NULL) { // protein id in database, maybe not for this transcript printf("<tr><th>Protein id"); if (strlen(transAttrs->proteinId) > 0) prTdEnsIdAnchor(transAttrs->proteinId, (isGrcH37Native(tdb) ? 
ensemblH37ProteinIdUrl: ensemblProteinIdUrl)); else printf("<td> "); printf("<td>"); printf("</tr>\n"); } printf("<tr><th>HAVANA manual id"); printf("<td>%s", transAttrs->havanaTranscriptId); printf("<td>%s", transAttrs->havanaGeneId); printf("</tr>\n"); printf("<tr><th>Position"); printf("<td>"); writePosLink(transAnno->chrom, transAnno->txStart, transAnno->txEnd); printf("<td>"); writePosLink(transAnno->chrom, geneChromStart, geneChromEnd); printf("</tr>\n"); printf("<tr><th>Strand<td>%s<td></tr>\n", transAnno->strand); printf("<tr><th><a href=\"%s\" target = _blank>Biotype</a><td>%s<td>%s</tr>\n", gencodeBiotypesUrl, transAttrs->transcriptType, transAttrs->geneType); printf("<tr><th>Annotation Level<td>%s (%d)<td></tr>\n", getLevelDesc(transAttrs->level), transAttrs->level); char *transSrcDesc = (transcriptSource != NULL) ? getMethodDesc(transcriptSource->source) : UNKNOWN; char *geneSrcDesc = (geneSource != NULL) ? getMethodDesc(geneSource->source) : UNKNOWN; printf("<tr><th>Annotation Method<td>%s<td>%s</tr>\n", transSrcDesc, geneSrcDesc); if (haveTsl) { char *tslDesc = getSupportLevelDesc(tsl); printf("<tr><th><a href=\"#tsl\">Transcription Support Level</a><td><a href=\"#%s\">%s</a><td></tr>\n", tslDesc, tslDesc); } printf("<tr><th>HGNC gene symbol<td colspan=2>"); if (!isFakeGeneSymbol(transAttrs->geneName)) prExtIdAnchor(transAttrs->geneName, hgncUrl); printf("</tr>\n"); printf("<tr><th>CCDS<td>"); if (!isEmpty(transAttrs->ccdsId)) { printf("<a href=\""); printCcdsExtUrl(transAttrs->ccdsId); printf("\" target=_blank>%s</a>", transAttrs->ccdsId); } printf("<td></tr>\n"); printf("<tr><th>GeneCards<td colspan=2>"); if (!isFakeGeneSymbol(transAttrs->geneName)) prExtIdAnchor(transAttrs->geneName, geneCardsUrl); printf("</tr>\n"); if (isProteinCodingTrans(transAttrs)) writeAprrisRow(conn, tdb, transAttrs, tags); // FIXME: add sequence here?? 
printf("</tbody></table>\n"); } static void writeSequenceHtml(struct trackDb *tdb, char *gencodeId, struct genePred *transAnno) /* write links to get sequences */ { printf("<table class=\"hgcCcds\"><thead>\n"); printf("<tr><th colspan=\"2\">Sequences</tr>\n"); printf("</thead><tbody>\n"); if (transAnno->cdsStart < transAnno->cdsEnd) { // protein coding printf("<tr><td width=\"50%%\">"); hgcAnchorSomewhere("htcGeneMrna", gencodeId, tdb->table, seqName); printf("Predicted mRNA</a>"); printf("<td width=\"50%%\">"); hgcAnchorSomewhere("htcTranslatedPredMRna", gencodeId, "translate", seqName); printf("Predicted protein</a></tr>\n"); } else { // non-protein coding printf("<tr><td width=\"50%%\">"); hgcAnchorSomewhere("htcGeneMrna", gencodeId, tdb->table, seqName); printf("Predicted mRNA</a><td width=\"50%%\"></tr>\n"); } printf("</tbody></table>\n"); } static void writeAnnotationRemarkHtml(struct wgEncodeGencodeAnnotationRemark *remarks) /* write HTML links to remarks */ { printf("<table class=\"hgcCcds\"><thead>\n"); printf("<tr><th colspan=\"1\">Annotation Remarks</tr>\n"); printf("</thead><tbody>\n"); // make sure at least one empty row in printed if (remarks == NULL) printf("<tr><td></td></tr>\n"); struct wgEncodeGencodeAnnotationRemark *remark; for (remark = remarks; remark != NULL; remark = remark->next) { char *encRemark = htmlEncode(remark->remark); printf("<tr><td>%s</td></tr>\n", encRemark); freeMem(encRemark); } printf("</tbody></table>\n"); } static void writePdbLinkHtml(struct wgEncodeGencodePdb *pdbs) /* write HTML links to PDB */ { printf("<table class=\"hgcCcds\"><thead>\n"); printf("<tr><th colspan=\"3\">Protein Data Bank</tr>\n"); printf("</thead><tbody>\n"); struct wgEncodeGencodePdb *pdb = pdbs; int i, rowCnt = 0; while ((pdb != NULL) || (rowCnt == 0)) { printf("<tr>"); for (i = 0; i < 3; i++) { printf("<td width=\"33.33%%\">"); if (pdb != NULL) { printf("<a href=\"http://www.rcsb.org/pdb/cgi/explore.cgi?job=graphics&pdbId=%s\" target=_blank>%s</a>", 
pdb->pdbId, pdb->pdbId); pdb = pdb->next; } } printf("</tr>\n"); rowCnt++; } printf("</tbody></table>\n"); } static void writePubMedEntry(struct wgEncodeGencodePubMed *pubMed) /* write HTML table entry for a pubMed */ { printf("<td width=\"33.33%%\"><a href=\""); printEntrezPubMedUidUrl(stdout, pubMed->pubMedId); printf("\" target=_blank>%d</a>", pubMed->pubMedId); } static void writePubMedLinkHtml(struct wgEncodeGencodePubMed *pubMeds) /* write HTML links to PubMed */ { printf("<table class=\"hgcCcds\"><thead>\n"); printf("<tr><th colspan=\"3\">PubMed</tr>\n"); printf("</thead><tbody>\n"); struct wgEncodeGencodePubMed *pubMed = pubMeds; int i, rowCnt = 0; while ((pubMed != NULL) || (rowCnt == 0)) { printf("<tr>"); for (i = 0; i < 3; i++) { if (pubMed != NULL) { writePubMedEntry(pubMed); pubMed = pubMed->next; } else printf("<td width=\"33.33%%\">"); } printf("</tr>\n"); rowCnt++; } printf("</tbody></table>\n"); } static void writeEntrezGeneEntry(struct wgEncodeGencodeEntrezGene *entrezGene) /* write HTML table entry for a entrezGene */ { printf("<td width=\"33.33%%\"><a href=\""); printEntrezGeneUrl(stdout, entrezGene->entrezGeneId); printf("\" target=_blank>%d</a>", entrezGene->entrezGeneId); } static void writeEntrezGeneLinkHtml(struct wgEncodeGencodeEntrezGene *entrezGenes) /* write HTML links to EntrezGene */ { printf("<table class=\"hgcCcds\"><thead>\n"); printf("<tr><th colspan=\"3\">Entrez Gene</tr>\n"); printf("</thead><tbody>\n"); struct wgEncodeGencodeEntrezGene *entrezGene = entrezGenes; int i, rowCnt = 0; while ((entrezGene != NULL) || (rowCnt == 0)) { printf("<tr>"); for (i = 0; i < 3; i++) { if (entrezGene != NULL) { writeEntrezGeneEntry(entrezGene); entrezGene = entrezGene->next; } else printf("<td width=\"33.33%%\">"); } printf("</tr>\n"); rowCnt++; } printf("</tbody></table>\n"); } static void writeRefSeqEntry(struct wgEncodeGencodeRefSeq *refSeq) /* write HTML table entry for a RefSeq */ { printf("<td width=\"50%%\"><a href=\""); 
printEntrezNucleotideUrl(stdout, refSeq->rnaAcc); printf("\" target=_blank>%s</a>", refSeq->rnaAcc); printf("<td width=\"50%%\">"); if (!isEmpty(refSeq->pepAcc)) { printf("<a href=\""); printEntrezProteinUrl(stdout, refSeq->pepAcc); printf("\" target=_blank>%s</a>", refSeq->pepAcc); } } static void writeRefSeqLinkHtml(struct wgEncodeGencodeRefSeq *refSeqs) /* write HTML links to RefSeq */ { printf("<table class=\"hgcCcds\"><thead>\n"); printf("<tr><th colspan=\"2\">RefSeq</tr>\n"); printf("<tr class=\"hgcCcdsSub\"><th>RNA<th>Protein</tr>\n"); printf("</thead><tbody>\n"); struct wgEncodeGencodeRefSeq *refSeq = refSeqs; int rowCnt = 0; while ((refSeq != NULL) || (rowCnt == 0)) { printf("<tr>"); if (refSeq != NULL) { writeRefSeqEntry(refSeq); refSeq = refSeq->next; } else printf("<td width=\"50%%\"><td width=\"50%%\">"); printf("</tr>\n"); rowCnt++; } printf("</tbody></table>\n"); } static void writeUniProtEntry(struct wgEncodeGencodeUniProt *uniProt) /* write HTML table entry for a UniProt */ { printf("<td width=\"15%%\">%s", (uniProt->dataset == wgEncodeGencodeUniProtSwissProt) ? 
"SwissProt" : "TrEMBL"); printf("<td width=\"15%%\"><a href=\""); printSwissProtAccUrl(stdout, uniProt->acc); printf("\" target=_blank>%s</a>", uniProt->acc); printf("<td width=\"20%%\"><a href=\""); printSwissProtAccUrl(stdout, uniProt->name); printf("\" target=_blank>%s</a>", uniProt->name); } static void writeUniProtLinkHtml(struct wgEncodeGencodeUniProt *uniProts) /* write HTML links to UniProt */ { printf("<table class=\"hgcCcds\"><thead>\n"); printf("<tr><th colspan=\"6\">UniProt</tr>\n"); printf("<tr class=\"hgcCcdsSub\"><th>Data set<th>Accession<th>Name<th>Data set<th>Accession<th>Name</tr>\n"); printf("</thead><tbody>\n"); int i, rowCnt = 0; struct wgEncodeGencodeUniProt *uniProt = uniProts; while ((uniProt != NULL) || (rowCnt == 0)) { printf("<tr>"); for (i = 0; i < 2; i++) { if (uniProt != NULL) { writeUniProtEntry(uniProt); uniProt = uniProt->next; } else printf("<td width=\"50%%\" colspan=3>"); } printf("</tr>\n"); rowCnt++; } printf("</tbody></table>\n"); } struct supportEvid /* temporary struct for subset of supporting information displayed */ { struct supportEvid *next; char *seqId; /* sequence id (memory not owned) */ char *seqSrc; /* evidence source database (memory not owned) */ }; static int supportEvidCmp(const void *va, const void *vb) /* Compare two supportEvid objects. 
*/ { const struct supportEvid *a = *((struct supportEvid **)va); const struct supportEvid *b = *((struct supportEvid **)vb); int diff = strcmp(a->seqSrc, b->seqSrc); if (diff == 0) diff = strcmp(a->seqId, b->seqId); return diff; } static void transcriptSupportToSupportEvid(struct supportEvid **supportEvids, struct wgEncodeGencodeTranscriptSupport *transcriptSupports) /* convert transcriptSupport to common structure */ { struct wgEncodeGencodeTranscriptSupport *transcriptSupport; for (transcriptSupport = transcriptSupports; transcriptSupport != NULL; transcriptSupport = transcriptSupport->next) { struct supportEvid *supportEvid; AllocVar(supportEvid); supportEvid->seqId = transcriptSupport->seqId; supportEvid->seqSrc = transcriptSupport->seqSrc; slAddHead(supportEvids, supportEvid); } } static void exonSupportToSupportEvid(struct supportEvid **supportEvids, struct wgEncodeGencodeExonSupport *exonSupports) /* convert exonSupport to common structure */ { struct wgEncodeGencodeExonSupport *exonSupport; for (exonSupport = exonSupports; exonSupport != NULL; exonSupport = exonSupport->next) { struct supportEvid *supportEvid; AllocVar(supportEvid); supportEvid->seqId = exonSupport->seqId; supportEvid->seqSrc = exonSupport->seqSrc; slAddHead(supportEvids, supportEvid); } } static void sortUniqSupportExidence(struct supportEvid **supportEvids) /* sort support evidence and make unique */ { struct supportEvid *supportEvid, *supportEvids2 = NULL; slSort(supportEvids, supportEvidCmp); // make unique while ((supportEvid = slPopHead(supportEvids)) != NULL) { if ((supportEvids2 == NULL) || (supportEvidCmp(&supportEvid, &supportEvids2) != 0)) slAddHead(&supportEvids2, supportEvid); else freeMem(supportEvid); } slReverse(&supportEvids2); *supportEvids = supportEvids2; } static struct supportEvid *loadSupportEvid(struct wgEncodeGencodeTranscriptSupport *transcriptSupports, struct wgEncodeGencodeExonSupport *exonSupports) /* load transcript and supporting evidence into a common 
structure */ { struct supportEvid *supportEvids = NULL; transcriptSupportToSupportEvid(&supportEvids, transcriptSupports); exonSupportToSupportEvid(&supportEvids, exonSupports); sortUniqSupportExidence(&supportEvids); return supportEvids; } static void writeSupportEvidenceEntry(struct supportEvid *supportEvid) /* write HTML table entry for a supporting evidence */ { // FIXME: should link to sources when possible printf("<td width=\"25%%\">%s", supportEvid->seqSrc); printf("<td width=\"25%%\">%s", supportEvid->seqId); } static void writeSupportingEvidenceLinkHtml(struct trackDb *tdb, char *gencodeId, struct wgEncodeGencodeTranscriptSupport *transcriptSupports, struct wgEncodeGencodeExonSupport *exonSupports) /* write HTML links to supporting evidence */ { struct supportEvid *supportEvids = loadSupportEvid(transcriptSupports, exonSupports); printf("<table class=\"hgcCcds\"><thead>\n"); printf("<tr><th colspan=\"4\">Supporting Evidence ("); prEnsIdAnchor(gencodeId, (isGrcH37Native(tdb) ? ensemblH37SupportingEvidUrl: ensemblSupportingEvidUrl)); printf(")</tr>\n"); printf("<tr class=\"hgcCcdsSub\"><th>Source<th>Sequence<th>Source<th>Sequence</tr>\n"); printf("</thead><tbody>\n"); struct supportEvid *supportEvid = supportEvids; int i, rowCnt = 0; while ((supportEvid != NULL) || (rowCnt == 0)) { printf("<tr>"); for (i = 0; i < 2; i++) { if (supportEvid != NULL) { writeSupportEvidenceEntry(supportEvid); supportEvid = supportEvid->next; } else printf("<td colspan=\"2\" width=\"50%%\">"); } printf("</tr>\n"); rowCnt++; } printf("</tbody></table>\n"); slFreeList(&supportEvids); } static void writeTagEntry(struct wgEncodeGencodeTag *tag) /* write HTML table entry for a Tag */ { // FIXME: link to help once gencodegenes.org has it printf("<td width=\"33.33%%\">%s", tag->tag); } static void writeTagLinkHtml(struct wgEncodeGencodeTag *tags) /* write HTML links to Tag */ { printf("<table class=\"hgcCcds\"><thead>\n"); printf("<tr><th colspan=3><a href=\"%s\" 
target=_blank>Tags</a></tr>\n", gencodeTagsUrl); printf("</thead><tbody>\n"); int i, rowCnt = 0; struct wgEncodeGencodeTag *tag = tags; while ((tag != NULL) || (rowCnt == 0)) { printf("<tr>"); for (i = 0; i < 3; i++) { if (tag != NULL) { writeTagEntry(tag); tag = tag->next; } else printf("<td width=\"33.33%%\">"); } printf("</tr>\n"); rowCnt++; } printf("</tbody></table>\n"); } static void doGencodeGeneTrack(struct trackDb *tdb, char *gencodeId, struct sqlConnection *conn, struct genePred *transAnno) /* Process click on a GENCODE gene annotation track. */ { struct wgEncodeGencodeAttrs *transAttrs = transAttrsLoad(tdb, conn, gencodeId); char *gencodeGeneId = transAttrs->geneId; struct wgEncodeGencodeGeneSource *geneSource = metaDataLoad(tdb, conn, gencodeGeneId, "wgEncodeGencodeGeneSource", "geneId", sqlQuerySingle, (sqlLoadFunc)wgEncodeGencodeGeneSourceLoad); struct wgEncodeGencodeTranscriptSource *transcriptSource = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeTranscriptSource", "transcriptId", sqlQuerySingle, (sqlLoadFunc)wgEncodeGencodeTranscriptSourceLoad); bool haveRemarks = haveGencodeTable(tdb, "wgEncodeGencodeAnnotationRemark"); struct wgEncodeGencodeAnnotationRemark *remarks = haveRemarks ? metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeAnnotationRemark", "transcriptId", 0, (sqlLoadFunc)wgEncodeGencodeAnnotationRemarkLoad) : NULL; struct wgEncodeGencodePdb *pdbs = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodePdb", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodePdbLoad); struct wgEncodeGencodePubMed *pubMeds = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodePubMed", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodePubMedLoad); bool haveEntrezGene = haveGencodeTable(tdb, "wgEncodeGencodeEntrezGene"); struct wgEncodeGencodeEntrezGene *entrezGenes = haveEntrezGene ? 
metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeEntrezGene", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodeEntrezGeneLoad) : NULL; struct wgEncodeGencodeRefSeq *refSeqs = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeRefSeq", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodeRefSeqLoad); struct wgEncodeGencodeTag *tags = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeTag", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodeTagLoad); struct wgEncodeGencodeTranscriptSupport *transcriptSupports = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeTranscriptSupport", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodeTranscriptSupportLoad); struct wgEncodeGencodeExonSupport *exonSupports = NULL; // exonSupports not available in back mapped GENCODE releases if (haveGencodeTable(tdb, "wgEncodeGencodeExonSupport")) exonSupports = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeExonSupport", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodeExonSupportLoad); struct wgEncodeGencodeUniProt *uniProts = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeUniProt", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodeUniProtLoad); slSort(&uniProts, uniProtDatasetCmp); bool haveTsl = haveGencodeTable(tdb, "wgEncodeGencodeTranscriptionSupportLevel"); struct wgEncodeGencodeTranscriptionSupportLevel *tsl = haveTsl ? 
metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeTranscriptionSupportLevel", "transcriptId", 0, (sqlLoadFunc)wgEncodeGencodeTranscriptionSupportLevelLoad) : NULL; int geneChromStart, geneChromEnd; getGeneBounds(tdb, conn, transAnno, &geneChromStart, &geneChromEnd); char title[256]; safef(title, sizeof(title), "GENCODE V%s Transcript Annotation", getGencodeVersion(tdb)); char header[256]; safef(header, sizeof(header), "%s %s", title, gencodeId); if (!isEmpty(transAttrs->geneName)) safef(header, sizeof(header), "%s %s (%s)", title, gencodeId, transAttrs->geneName); else safef(header, sizeof(header), "%s %s", title, gencodeId); cartWebStart(cart, database, "%s", header); printf("<H2>%s</H2>\n", header); writeBasicInfoHtml(conn, tdb, gencodeId, transAnno, transAttrs, geneChromStart, geneChromEnd, geneSource, transcriptSource, tags, haveTsl, tsl); writeTagLinkHtml(tags); writeSequenceHtml(tdb, gencodeId, transAnno); if (haveRemarks) writeAnnotationRemarkHtml(remarks); if (isProteinCodingTrans(transAttrs)) writePdbLinkHtml(pdbs); writePubMedLinkHtml(pubMeds); if (haveEntrezGene) writeEntrezGeneLinkHtml(entrezGenes); writeRefSeqLinkHtml(refSeqs); if (isProteinCodingTrans(transAttrs)) writeUniProtLinkHtml(uniProts); writeSupportingEvidenceLinkHtml(tdb, gencodeId, transcriptSupports, exonSupports); wgEncodeGencodeAttrsFree(&transAttrs); wgEncodeGencodeAnnotationRemarkFreeList(&remarks); wgEncodeGencodeGeneSourceFreeList(&geneSource); wgEncodeGencodeTranscriptSourceFreeList(&transcriptSource); wgEncodeGencodePdbFreeList(&pdbs); wgEncodeGencodePubMedFreeList(&pubMeds); wgEncodeGencodeEntrezGeneFreeList(&entrezGenes); wgEncodeGencodeRefSeqFreeList(&refSeqs); wgEncodeGencodeTranscriptSupportFreeList(&transcriptSupports); wgEncodeGencodeExonSupportFreeList(&exonSupports); wgEncodeGencodeUniProtFreeList(&uniProts); wgEncodeGencodeTranscriptionSupportLevelFreeList(&tsl); } static void doGencodeGene2WayPseudo(struct trackDb *tdb, char *gencodeId, struct sqlConnection *conn, 
struct genePred *pseudoAnno) /* Process click on a GENCODE two-way pseudogene annotation track. */ { char header[256]; safef(header, sizeof(header), "GENCODE 2-way consensus pseudogene %s", gencodeId); cartWebStart(cart, database, "%s", header); printf("<H2>%s</H2>\n", header); printf("<b>Yale id:</b> "); prExtIdAnchor(gencodeId, yalePseudoUrl); printf("<br>"); printPos(pseudoAnno->chrom, pseudoAnno->txStart, pseudoAnno->txEnd, pseudoAnno->strand, FALSE, NULL); } static void doGencodeGenePolyA(struct trackDb *tdb, char *gencodeId, struct sqlConnection *conn, struct genePred *polyAAnno) /* Process click on a GENCODE poly-A annotation track. */ { char header[256]; safef(header, sizeof(header), "GENCODE PolyA Annotation %s (%s)", polyAAnno->name2, gencodeId); cartWebStart(cart, database, "%s", header); printf("<H2>%s</H2>\n", header); printf("<b>Annotation id:</b> %s<br>", gencodeId); printf("<b>Annotation Type:</b> %s<br>",polyAAnno->name2); printPos(polyAAnno->chrom, polyAAnno->txStart, polyAAnno->txEnd, polyAAnno->strand, FALSE, NULL); } void doGencodeGene(struct trackDb *tdb, char *gencodeId) /* Process click on a GENCODE annotation. 
*/
{
struct sqlConnection *conn = hAllocConn(database);
struct genePred *anno = transAnnoLoad(conn, tdb, gencodeId);
// every handler below dereferences the annotation, so fail with a message
// instead of passing NULL on
if (anno == NULL)
    errAbort("doGencodeGene: annotation \"%s\" not found in track \"%s\" on %s",
             gencodeId, tdb->track, seqName);
// dispatch on the track name prefix
if (startsWith("wgEncodeGencodeBasic", tdb->track)
    || startsWith("wgEncodeGencodeComp", tdb->track)
    || startsWith("wgEncodeGencodePseudoGene", tdb->track))
    doGencodeGeneTrack(tdb, gencodeId, conn, anno);
else if (startsWith("wgEncodeGencode2wayConsPseudo", tdb->track))
    doGencodeGene2WayPseudo(tdb, gencodeId, conn, anno);
else if (startsWith("wgEncodeGencodePolya", tdb->track))
    doGencodeGenePolyA(tdb, gencodeId, conn, anno);
else
    errAbort("doGencodeGene: track not handled: \"%s\"", tdb->track);
htmlHorizontalLine();
printTrackHtml(tdb);

genePredFreeList(&anno);
hFreeConn(&conn);
}

bool isNewGencodeGene(struct trackDb *tdb)
/* is this a new-style gencode (>= V7) track, as indicated by
 * the presence of the wgEncodeGencodeVersion setting */
{
return getGencodeVersion(tdb) != NULL;
}