fcdf5c401c80840d28c346409c4fbb527544fac7 markd Tue Jun 9 13:59:33 2020 -0700 make gencode hgc robust against metadata that is sometimes not mapped in the backmap releases diff --git src/hg/hgc/gencodeClick.c src/hg/hgc/gencodeClick.c index 953ccc9..e31305c 100644 --- src/hg/hgc/gencodeClick.c +++ src/hg/hgc/gencodeClick.c @@ -46,30 +46,32 @@ static char *ensemblH37TranscriptIdUrl = "http://grch37.ensembl.org/%s/Transcript/Summary?db=core;t=%s"; static char *ensemblH37GeneIdUrl = "http://grch37.ensembl.org/%s/Gene/Summary?db=core;t=%s"; static char *ensemblH37ProteinIdUrl = "http://grch37.ensembl.org/%s/Transcript/ProteinSummary?db=core;t=%s"; static char *ensemblH37SupportingEvidUrl = "http://grch37.ensembl.org/%s/Transcript/SupportingEvidence?db=core;t=%s"; static char *gencodeBiotypesUrl = "http://www.gencodegenes.org/pages/biotypes.html"; static char *gencodeTagsUrl = "http://www.gencodegenes.org/pages/tags.html"; static char *yalePseudoUrl = "http://tables.pseudogene.org/%s"; static char *hgncUrl = " https://www.genenames.org/data/gene-symbol-report/#!/symbol/%s"; static char *geneCardsUrl = "http://www.genecards.org/cgi-bin/carddisp.pl?gene=%s"; static char *apprisHomeUrl = "http://appris-tools.org/"; static char *apprisGeneUrl = "http://appris-tools.org/#/database/id/%s/%s?sc=ensembl"; +static char* UNKNOWN = "unknown"; + static char *getBaseAcc(char *acc, char *accBuf, int accBufSize) /* get the accession with version number dropped. */ { safecpy(accBuf, accBufSize, acc); char *dot = strchr(accBuf, '.'); if (dot != NULL) *dot = '\0'; return accBuf; } static bool haveGencodeTable(struct trackDb *tdb, char *tableBase) /* determine if table is in settings and thus in this gencode release */ { return trackDbSetting(tdb, tableBase) != NULL; } @@ -179,30 +181,32 @@ return sqlQueryObjs(conn, loadFunc, queryOpts, "select * from %s where %s = \"%s\"", getGencodeTable(tdb, tableBase), keyCol, gencodeId); } static int uniProtDatasetCmp(const void *va, const void *vb) /* Compare wgEncodeGencodeUniProt by dateset */ { const struct wgEncodeGencodeUniProt *a = *((struct wgEncodeGencodeUniProt **)va); const struct wgEncodeGencodeUniProt *b = *((struct wgEncodeGencodeUniProt **)vb); return a->dataset - b->dataset; } static char *getMethodDesc(char *source) /* return the annotation method name based gene or transcript source */ { +// sometimes backmap doesn't get every entry method entry mapped. Until that +// is fixed, allow it to be missing // looks for being havana and/or ensembl // classifies other sources as automatic (mt_genbank_import ncrna ncrna_pseudogene) bool hasHav = containsStringNoCase(source, "havana") != NULL; bool hasEns = containsStringNoCase(source, "ensembl") != NULL; if (hasHav && hasEns) return "manual & automatic"; else if (hasHav) return "manual"; else return "automatic"; } static char *getLevelDesc(int level) /* return english description for level */ { @@ -410,31 +414,33 @@ printf("\n"); printf("Position"); printf(""); writePosLink(transAnno->chrom, transAnno->txStart, transAnno->txEnd); printf(""); writePosLink(transAnno->chrom, geneChromStart, geneChromEnd); printf("\n"); printf("Strand%s\n", transAnno->strand); printf("Biotype%s%s\n", gencodeBiotypesUrl, transAttrs->transcriptType, transAttrs->geneType); printf("Annotation Level%s (%d)\n", getLevelDesc(transAttrs->level), transAttrs->level); -printf("Annotation Method%s%s\n", getMethodDesc(transcriptSource->source), getMethodDesc(geneSource->source)); +char *transSrcDesc = (transcriptSource != NULL) ? getMethodDesc(transcriptSource->source) : UNKNOWN; +char *geneSrcDesc = (geneSource != NULL) ? getMethodDesc(geneSource->source) : UNKNOWN; +printf("Annotation Method%s%s\n", transSrcDesc, geneSrcDesc); if (haveTsl) { char *tslDesc = getSupportLevelDesc(tsl); printf("Transcription Support Level%s\n", tslDesc, tslDesc); } printf("HGNC gene symbol"); if (!isFakeGeneSymbol(transAttrs->geneName)) prExtIdAnchor(transAttrs->geneName, hgncUrl); printf("\n"); printf("CCDS"); if (!isEmpty(transAttrs->ccdsId)) { printf(""); } printf("\n"); rowCnt++; } printf("\n"); } static void doGencodeGeneTrack(struct trackDb *tdb, char *gencodeId, struct sqlConnection *conn, struct genePred *transAnno) /* Process click on a GENCODE gene annotation track. */ { struct wgEncodeGencodeAttrs *transAttrs = transAttrsLoad(tdb, conn, gencodeId); char *gencodeGeneId = transAttrs->geneId; -struct wgEncodeGencodeGeneSource *geneSource = metaDataLoad(tdb, conn, gencodeGeneId, "wgEncodeGencodeGeneSource", "geneId", sqlQueryMust|sqlQuerySingle, (sqlLoadFunc)wgEncodeGencodeGeneSourceLoad); -struct wgEncodeGencodeTranscriptSource *transcriptSource = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeTranscriptSource", "transcriptId", sqlQueryMust|sqlQuerySingle, (sqlLoadFunc)wgEncodeGencodeTranscriptSourceLoad); +struct wgEncodeGencodeGeneSource *geneSource = metaDataLoad(tdb, conn, gencodeGeneId, "wgEncodeGencodeGeneSource", "geneId", sqlQuerySingle, (sqlLoadFunc)wgEncodeGencodeGeneSourceLoad); +struct wgEncodeGencodeTranscriptSource *transcriptSource = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeTranscriptSource", "transcriptId", sqlQuerySingle, (sqlLoadFunc)wgEncodeGencodeTranscriptSourceLoad); bool haveRemarks = haveGencodeTable(tdb, "wgEncodeGencodeAnnotationRemark"); struct wgEncodeGencodeAnnotationRemark *remarks = haveRemarks ? metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeAnnotationRemark", "transcriptId", 0, (sqlLoadFunc)wgEncodeGencodeAnnotationRemarkLoad) : NULL; struct wgEncodeGencodePdb *pdbs = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodePdb", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodePdbLoad); struct wgEncodeGencodePubMed *pubMeds = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodePubMed", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodePubMedLoad); bool haveEntrezGene = haveGencodeTable(tdb, "wgEncodeGencodeEntrezGene"); struct wgEncodeGencodeEntrezGene *entrezGenes = haveEntrezGene ? metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeEntrezGene", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodeEntrezGeneLoad) : NULL; struct wgEncodeGencodeRefSeq *refSeqs = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeRefSeq", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodeRefSeqLoad); struct wgEncodeGencodeTag *tags = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeTag", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodeTagLoad); struct wgEncodeGencodeTranscriptSupport *transcriptSupports = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeTranscriptSupport", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodeTranscriptSupportLoad); struct wgEncodeGencodeExonSupport *exonSupports = NULL; // exonSupports not available in back mapped GENCODE releases if (haveGencodeTable(tdb, "wgEncodeGencodeExonSupport")) exonSupports = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeExonSupport", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodeExonSupportLoad); struct wgEncodeGencodeUniProt *uniProts = metaDataLoad(tdb, conn, gencodeId, "wgEncodeGencodeUniProt", "transcriptId", sqlQueryMulti, (sqlLoadFunc)wgEncodeGencodeUniProtLoad); slSort(&uniProts, uniProtDatasetCmp);