fcdf5c401c80840d28c346409c4fbb527544fac7
markd
Tue Jun 9 13:59:33 2020 -0700
make gencode hgc robust against metadata that is sometimes not mapped in the backmap releases
diff --git src/hg/hgc/gencodeClick.c src/hg/hgc/gencodeClick.c
index 953ccc9..e31305c 100644
--- src/hg/hgc/gencodeClick.c
+++ src/hg/hgc/gencodeClick.c
@@ -46,30 +46,32 @@
/* URL templates for external sites.  The %s placeholders are substituted when
 * a link is generated (for example, hgncUrl is handed to prExtIdAnchor along
 * with the gene name). */
static char *ensemblH37TranscriptIdUrl = "http://grch37.ensembl.org/%s/Transcript/Summary?db=core;t=%s";
static char *ensemblH37GeneIdUrl = "http://grch37.ensembl.org/%s/Gene/Summary?db=core;t=%s";
static char *ensemblH37ProteinIdUrl = "http://grch37.ensembl.org/%s/Transcript/ProteinSummary?db=core;t=%s";
static char *ensemblH37SupportingEvidUrl = "http://grch37.ensembl.org/%s/Transcript/SupportingEvidence?db=core;t=%s";
static char *gencodeBiotypesUrl = "http://www.gencodegenes.org/pages/biotypes.html";
static char *gencodeTagsUrl = "http://www.gencodegenes.org/pages/tags.html";
static char *yalePseudoUrl = "http://tables.pseudogene.org/%s";
/* no leading blank: the template must start with the URL scheme */
static char *hgncUrl = "https://www.genenames.org/data/gene-symbol-report/#!/symbol/%s";
static char *geneCardsUrl = "http://www.genecards.org/cgi-bin/carddisp.pl?gene=%s";
static char *apprisHomeUrl = "http://appris-tools.org/";
static char *apprisGeneUrl = "http://appris-tools.org/#/database/id/%s/%s?sc=ensembl";
+static char* UNKNOWN = "unknown";
+
static char *getBaseAcc(char *acc, char *accBuf, int accBufSize)
/* Copy acc into accBuf (of size accBufSize) and cut the copy off at the
 * first '.', dropping the version suffix.  Returns accBuf. */
{
safecpy(accBuf, accBufSize, acc);
char *versionSep = strchr(accBuf, '.');
if (versionSep)
    versionSep[0] = '\0';  // terminate the string where the version starts
return accBuf;
}
static bool haveGencodeTable(struct trackDb *tdb, char *tableBase)
/* Report whether tableBase is present in the settings of tdb, which means
 * the table is part of this GENCODE release. */
{
const bool isConfigured = (trackDbSetting(tdb, tableBase) != NULL);
return isConfigured;
}
@@ -179,30 +181,32 @@
return sqlQueryObjs(conn, loadFunc, queryOpts, "select * from %s where %s = \"%s\"",
getGencodeTable(tdb, tableBase), keyCol, gencodeId);
}
static int uniProtDatasetCmp(const void *va, const void *vb)
/* Comparison function ordering wgEncodeGencodeUniProt records by dataset;
 * va and vb each point at a pointer to a record. */
{
struct wgEncodeGencodeUniProt *const *lhsRef = va;
struct wgEncodeGencodeUniProt *const *rhsRef = vb;
const struct wgEncodeGencodeUniProt *lhs = *lhsRef;
const struct wgEncodeGencodeUniProt *rhs = *rhsRef;
return lhs->dataset - rhs->dataset;
}
static char *getMethodDesc(char *source)
/* Return the annotation method name ("manual", "automatic" or
 * "manual & automatic") based on a gene or transcript source string.
 * NOTE(review): source is passed straight to containsStringNoCase; NULL
 * handling is not visible here, so confirm callers never pass NULL. */
{
+// Sometimes the backmap doesn't get every method entry mapped. Until that
+// is fixed, allow it to be missing.
// Looks for havana and/or ensembl in the source;
// classifies other sources as automatic (mt_genbank_import ncrna ncrna_pseudogene).
bool hasHav = containsStringNoCase(source, "havana") != NULL;
bool hasEns = containsStringNoCase(source, "ensembl") != NULL;
if (hasHav && hasEns)
return "manual & automatic";
else if (hasHav)
return "manual";
else
return "automatic";
}
static char *getLevelDesc(int level)
/* return english description for level */
{
@@ -410,31 +414,33 @@
printf("\n");
printf("
Position");
printf(" | ");
writePosLink(transAnno->chrom, transAnno->txStart, transAnno->txEnd);
printf(" | ");
writePosLink(transAnno->chrom, geneChromStart, geneChromEnd);
printf(" |
\n");
printf("Strand | %s | |
\n", transAnno->strand);
printf("Biotype | %s | %s |
\n", gencodeBiotypesUrl, transAttrs->transcriptType, transAttrs->geneType);
printf("Annotation Level | %s (%d) | |
\n", getLevelDesc(transAttrs->level), transAttrs->level);
-printf("Annotation Method | %s | %s |
\n", getMethodDesc(transcriptSource->source), getMethodDesc(geneSource->source));
+char *transSrcDesc = (transcriptSource != NULL) ? getMethodDesc(transcriptSource->source) : UNKNOWN;
+char *geneSrcDesc = (geneSource != NULL) ? getMethodDesc(geneSource->source) : UNKNOWN;
+printf("Annotation Method | %s | %s |
\n", transSrcDesc, geneSrcDesc);
if (haveTsl)
{
char *tslDesc = getSupportLevelDesc(tsl);
printf("Transcription Support Level | %s | |
\n", tslDesc, tslDesc);
}
printf("HGNC gene symbol | ");
if (!isFakeGeneSymbol(transAttrs->geneName))
prExtIdAnchor(transAttrs->geneName, hgncUrl);
printf(" |
\n");
printf("CCDS | ");
if (!isEmpty(transAttrs->ccdsId))
{
printf("");
}
printf(" |
\n");
rowCnt++;
}
printf("\n");
}
static void doGencodeGeneTrack(struct trackDb *tdb, char *gencodeId, struct sqlConnection *conn, struct genePred *transAnno)
/* Process click on a GENCODE gene annotation track. */
{
struct wgEncodeGencodeAttrs *transAttrs = transAttrsLoad(tdb, conn, gencodeId);
char *gencodeGeneId = transAttrs->geneId;
-struct wgEncodeGencodeGeneSource *geneSource = metaDataLoad(tdb, conn, gencodeGeneId, "wgEncodeGencodeGeneSource", "geneId", sqlQueryMust|sqlQuerySingle, (sqlLoadFunc)wgEncodeGencodeGeneSourceLoad);
-struct wgEncodeGencodeTranscriptSource *transcriptSource = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeTranscriptSource", "transcriptId", sqlQueryMust|sqlQuerySingle, (sqlLoadFunc)wgEncodeGencodeTranscriptSourceLoad);
+struct wgEncodeGencodeGeneSource *geneSource = metaDataLoad(tdb, conn, gencodeGeneId, "wgEncodeGencodeGeneSource", "geneId", sqlQuerySingle, (sqlLoadFunc)wgEncodeGencodeGeneSourceLoad);
+struct wgEncodeGencodeTranscriptSource *transcriptSource = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeTranscriptSource", "transcriptId", sqlQuerySingle, (sqlLoadFunc)wgEncodeGencodeTranscriptSourceLoad);
bool haveRemarks = haveGencodeTable(tdb, "wgEncodeGencodeAnnotationRemark");
struct wgEncodeGencodeAnnotationRemark *remarks = haveRemarks ? metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeAnnotationRemark", "transcriptId", 0, (sqlLoadFunc)wgEncodeGencodeAnnotationRemarkLoad) : NULL;
struct wgEncodeGencodePdb *pdbs = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodePdb", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodePdbLoad);
struct wgEncodeGencodePubMed *pubMeds = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodePubMed", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodePubMedLoad);
bool haveEntrezGene = haveGencodeTable(tdb, "wgEncodeGencodeEntrezGene");
struct wgEncodeGencodeEntrezGene *entrezGenes = haveEntrezGene ? metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeEntrezGene", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodeEntrezGeneLoad) : NULL;
struct wgEncodeGencodeRefSeq *refSeqs = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeRefSeq", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodeRefSeqLoad);
struct wgEncodeGencodeTag *tags = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeTag", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodeTagLoad);
struct wgEncodeGencodeTranscriptSupport *transcriptSupports = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeTranscriptSupport", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodeTranscriptSupportLoad);
struct wgEncodeGencodeExonSupport *exonSupports = NULL;
// exonSupports not available in back mapped GENCODE releases
if (haveGencodeTable(tdb, "wgEncodeGencodeExonSupport"))
exonSupports = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeExonSupport", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodeExonSupportLoad);
struct wgEncodeGencodeUniProt *uniProts = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeUniProt", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodeUniProtLoad);
slSort(&uniProts, uniProtDatasetCmp);