16720e7dfab79a7f60e91f0cb102a213c3e4738a max Fri Apr 28 15:39:08 2017 -0700 first big commit for hgGeneGraph. Others will follow as QA progresses. refs #13634 diff --git src/hg/hgGeneGraph/hgGeneGraph src/hg/hgGeneGraph/hgGeneGraph new file mode 100755 index 0000000..023818d --- /dev/null +++ src/hg/hgGeneGraph/hgGeneGraph @@ -0,0 +1,1951 @@ +#!/usr/bin/env python2.7 + +# Protein Interaction Viewer for the Genome Browser + +# query tables with prefix "gg" in hgFixed, writes the results to a dot file, +# runs graphviz's "dot" program to create a pathway map from it and write html +# and mapfiles to the trash directory. + +# CGI params: gene=(HGNCsymbol) or link=sym1:sym2 +# optional params: addNeighbors + +# colors: + +# grey+thickness = only text mining data +# light blue, dashed = only high-throughput data + +# light blue, thickness = high-throughput data + text +# dark blue, dashed = only low-throughput data + +# dark blue, thickness = low-throughput data + text +# dark blue + dashed = only pathway data + +# code review +# - os.system is not a security risk here, no variables go into the cmd line +# - mysql statements are not escaped, instead all CGI vars are checked for non-alpha letters + +# hgFixed tables required for this script: ggLink (main table with gene-gene links), +# ggLinkEvent (details about link), ggEventDb (details about links from databases), +# ggEventText (details about links from text mining), ggDoc (details about documents for ggEventText) +# ggGeneName (symbols), ggGeneClass (HPRD/Panther class) + +# these are default python modules on python 2.7, no errors expected here +import sys, cgi, os, string, urllib, operator, hashlib +from sys import exit +from collections import defaultdict, namedtuple +from os.path import * + +# import the UCSC-specific library +sys.path.append("pylib") +from hgLib import cgiArgs, cgiSetup, cgiString, printContentType, printMenuBar, \ + sqlConnect, sqlQuery, errAbort, cfgOption, runCmd, cgiGetAll, printHgcHeader, \ + printHgcSection, webStartGbNoBanner, htmlPageEnd, hConnectCentral, sqlTableExists + +# not using feedback button for now. Fan would have liked it, but not sure how we can write +# to any form of database. +#skimpyGimpyLoaded=False +#try: + #from skimpyGimpy import skimpyAPI + #skimpyGimpyLoaded=True +#except: + #pass + +# the list of allowed chars in cgi args: digits, letters and dashes +legalChars = set(string.digits) +legalChars.update(set(string.letters)) +legalChars.update("_-./: ") + +# number of genes to show on graph by default +DEFGENECOUNT="25" +# ignore all text mining data with less than X abstracts +MINSUPP=2 + +# minResCount is used throughout the code. For a given interaction, it is the minimal +# number of interactions from all documents linked to this interaction. +# E.g. minResCount of 5 means that the interaction is based on at least one document +# that contained not more than 5 interactions. + +# Cutoff on minResCount: +# maximum number of pairs a study can have to be considered "low-throughput" +# only interactions with at least one low-throughput study are colord in dark +# In essence, this gets rid of curated papers that describe huge complexes +LTCUTOFF=10 + +# color for edges with only text-mining data ("text") +TEXTCOLOR="#BBBBBB" +# color for edges with low-throughput data (=pwy or (ppi and LTCUTOFF)) +LTCOLOR="#000099" +# color for edges with high-throughput data (=ppi) +HTCOLOR="#8080CC" +# transparency for the gene graph edges +TRANSPARENCY="C0" + +# url of the user's manual +MANUALURL="../goldenPath/help/hgGeneGraph.html" + +# database where the tables are stored +GGDB="hgFixed" + +# ==== GLOBALS ===== + +# CGI parameters as a FieldStorage object +# args = None + +# external DB information +dbData = { +"kegg" : ("KEGG", "http://www.kegg.jp/kegg-bin/show_pathway?%s"), +"wikipathways" : ("WikiPathways", "http://www.wikipathways.org/index.php/Pathway:%s"), +"iref" : ("Iref", "http://wodaklab.org/iRefWeb/interaction/show/%s"), +#"pid" : ("NCI Pathway Database", "http://pid.nci.nih.gov/search/InteractionPage?atomid=%s"), +#PID server is down, linking to NDex now, adapted the IDs, see ggPidToTables +"pid" : ("NCI Pathway Database", "http://www.ndexbio.org/#/search?searchType=All&searchString=labels%%253A%s"), +"reactome" : ("Reactome", "http://www.reactome.org/content/detail/%s"), +"go" : ("Gene Ontology Complexes", "http://www.ebi.ac.uk/QuickGO/GTerm?id=%s#info=2"), +"fastforward" : ("FastForward", "http://fastforward.sys-bio.net/popup.php?name_target=%s"), +"argdb" : ("ARGDB", "http://argdb.fudan.edu.cn/geneshow_id.php?gene_id=%s"), +"corum" : ("MIPS CORUM", "http://mips.helmholtz-muenchen.de/genre/proj/corum/complexdetails.html?id=%s"), +"string" : ("STRING", "http://string-db.org/newstring_cgi/show_network_section.pl?multiple_input_items=%s&multi_input=1&multiple_input_type=multi_identifier&limit=0&input_page_type=multiple_identifiers&have_user_input=2&species_text=Homo%%20sapiens&input_query_species=auto_detect&flash=15&required_score=400") +} + +# mime types to send in http header for other downloads +mimeTypes = { +"svg" : "image/svg+xml", +"pdf" : "application/pdf", +"json" : "application/json", +"sif" : "application/octet-stream" +} + +# gbdb file names and descriptions +geneAnnotFiles = [ +("none", None, "No Annotation", "Do not annotate color genes on graph by external information"), +("gnf2", "gnf2Avg.tab", "GNF2 Expression", "Gene Expression Atlas 2 average across tissues"), +("drugbank", "drugbank.tab", "DrugBank", "DrugBank. Black = gene is targetable with a drug. Mouse-over shows drug names"), +("cosmic" , "cosmicCensus.tab", "Cancer Gene Census", "COSMIC Cancer Gene Census Tumor Types. Black = gene is in cancer gene census. Mouse-over shows cancer type"), +("tcgaMut" , "tcgaMut.tab", "Pan-Cancer Mutations", "TCGA PanCan12 samples with non-silent mutations - Gene mouse-over shows count") +] + +# ==== FUNCTIONS === + +# +# +# + +# +# +# + +def printInlineAndStyles(): + #print('') + print('') + + + + print(""" + + + + + """) + +def htmlHeader(): + " print start of page " + webStartGbNoBanner("", "Genome Browser Gene Interaction Graph") + + print('
') + printMenuBar() + + db = getCgiVar("db", "hg19") + printHgcHeader(db, "Protein Interactions Track", "Protein interactions and pathways from curated databases and text-mining", addGoButton=False, infoUrl=MANUALURL) + + +def mustBeClean(str): + """ make sure a string contains only letters and digits """ + if str==None: + return str + + str = urllib.unquote(str) + str = str.strip() + + for s in str: + if s not in legalChars: + errAbort("illegal character in CGI parameter") + return str + +def getCgiVar(name, default=None, allowAnyChar=False, maxLen=30): + " get named cgi variable as a string " + val = cgiString(name, default=default) + if not allowAnyChar: + mustBeClean(val) + + if val != None and len(val) > maxLen: + errAbort("CGI arg %s cannot be longer than %d characters" % (name, maxLen)) + return val + +def mergeCgiParamDicts(data, changes, reset=False): + """ given a data dict and a 2nd one with key=val changes, return a new dict with + changes merged into data. If changes has key=None, remove the key from data. + + Skips some typical hgTracks-specific settings, like hgt.*, l, r or pix. + """ + cgiArgs = cgiGetAll() + newArgs = {} + # copy all existing CGI vars into new dict + if not reset: + for key in data.keys(): + if key.startswith("hgt") or key.startswith("dink") or key=="c" or key=="l" or key=="r" or key=="pix" or key=="position": + continue + val = cgiArgs.getfirst(key) + newArgs[key] = val + + # remove or add the changes + for key, val in changes.iteritems(): + if val==None: + if key in newArgs: + del newArgs[key] + else: + newArgs[key] = str(val) + return newArgs + + +def makeSelfUrl(changes, clear=False): + " return a url to myself, keep all CGI vars, but change/append addParam=addValue " + # construct the new link + newArgs = mergeCgiParamDicts(cgiGetAll(), changes, clear) + paramStr = urllib.urlencode(newArgs) + myName = basename(__file__) + url = "%s?%s" % (myName, paramStr) + return url + +def printSelfHiddenVars(paramDict, clear=False, skipList=[]): + " like makeSelfUrl, but write out all current CGI vars as hidden form fields " + newArgs = mergeCgiParamDicts(cgiGetAll(), paramDict, clear) + for key, val in newArgs.iteritems(): + if key not in skipList and not key=="submit": + print '' % (key, val) + +def makeSelfLink(linkName, paramDict, clear=False, anchor=None, className=None, title=None, style=None, dataToggle=None): + """ make a href link to myself, keep all CGI vars, but change/append addParam=addValue + styleDict is a list of css styles + """ + url = makeSelfUrl(paramDict, clear=clear) + if anchor!=None: + url += "#"+anchor + + classStr = "" + if className!=None: + classStr='class="%s" ' % className + + dataToggleStr = "" + if dataToggle!=None: + dataToggleStr='data-toggle="%s" data-placement="right"' % dataToggle + + styleStr = "" + if style!=None: + styleStr='style="%s" ' % style + + titleStr = "" + if title!=None: + titleStr = ' title="%s"' % title.replace('"', ' ') + return '%s' % (titleStr, classStr, dataToggleStr, url, styleStr, linkName) + +def saltedHash(word, length=5): + " return first 5 chars of salted hash " + # pretty simple salt: PITX2, salting is just for the captcha + hashStr = "".join(hashlib.sha1(word+"PITX2").hexdigest()[:length]).lower() + return hashStr + +def htmlCaptcha(word): + " return html that encodes captcha word " + # copied from http://skimpygimpy.sourceforge.net/ + if not skimpyGimpyLoaded: + return None + + HTMLSPECKLE = 0.1 + HTMLSCALE = 1.5 + HTMLCOLOR = "000000" + + # create an HTML generator + htmlGenerator = skimpyAPI.Pre(word, + speckle=HTMLSPECKLE, # optional + scale=HTMLSCALE, # optional + color=HTMLCOLOR, # optional + ) + # store the preformatted text as htmlText + htmlText = htmlGenerator.data() + return htmlText + +def reqMinSupp(links, minArtSupp, maxResCount, targetGene): + """ remove all 'text mining only' links with less than minArtSupp supporting documents + The only exception is targetGene which we always want to stay connected + + Also remove links that are PPI-only and have a high minResCount. + """ + newLinks = defaultdict(set) + genes = set() + targetConns = {} + for genePair, linkData in links.iteritems(): + docCount, dbCount, tagSet, minResCount = linkData[:4] + if targetGene in genePair: + targetConns[genePair] = linkData + # remove text-mining links with only one article + if "text" in tagSet and docCount < minArtSupp: + continue + # remove noisy PPI links + if len(tagSet)==1 and "ppi" in tagSet and minResCount > maxResCount: + continue + genes.update(genePair) + newLinks[genePair] = linkData + + # is the target gene still connected to something? If not add it back and + # accept that these links are less than minSupp + if targetGene not in genes: + for genePair, linkData in targetConns.iteritems(): + newLinks[genePair] = linkData + return newLinks + +def scorePair(docCount, tagSet): + " return the score for a gene pair " + # pairs that have no text mining results get assigned artifical + # article counts, based on this query: + # hgsql publications -e "select * from ggLink where linkTypes like '%ppi%'" -NB | tawk '($4!=0 && $5!=0) { print $4+$5}' | avg + # the avg for PPI was 14 and the one for pwy 22 + # but PPI scores were mostly 0, so I lower it manually + geneScore = docCount + if geneScore==0 and "ppi" in tagSet: + geneScore = 2 + if geneScore==0 and "pwy" in tagSet: + geneScore = 4 + return geneScore + +def iterLinkScores(links): + """ given a dict (gene1, gene2) -> [score1, score2, ...] + yield tuples (gene1, gene2), totalScore + """ + for genes, linkData in links.iteritems(): + docCount, dbCount, tagSet = linkData[:3] + geneScore = scorePair(docCount, tagSet) + yield genes, geneScore + + +def limitGenes(links, maxRank, targetGene, lowLinks=None): + """ get the links for the top X genes and return two lists of links, highLinks and lowLinks + The targetGene is always in the high list. + """ + # create a list of genes sorted by article count in links + geneScores = defaultdict(int) + for genes, geneScore in iterLinkScores(links): + for g in genes: + geneScores[g] += geneScore + + geneScores = geneScores.items() + geneScores.sort(key=operator.itemgetter(1), reverse=True) + + # keep only the best genes + sortedGenes = [x for x,y in geneScores] + highGenes = set(sortedGenes[:maxRank]) + lowGenes = set(sortedGenes[maxRank:]) + + # the target gene itself always has to be in the top list + if targetGene not in highGenes: + highGenes.add(targetGene) + if targetGene in lowGenes: + lowGenes.remove(targetGene) + + # filter the links into high and low + highLinks = defaultdict(set) + if lowLinks==None: + lowLinks = defaultdict(set) + for genes, linkData in links.iteritems(): + g1, g2 = genes + if g1 in highGenes and g2 in highGenes: + highLinks[genes] = linkData + else: + lowLinks[genes] = linkData + return highLinks, lowLinks + +def limitLinks(graphLinks, lowLinks, maxRank, targetGene): + """ filter the high links to keep only the best X and return them, move the + others into the lowLinks dict. Always keep all links to/from targetGene. + """ + # ignoring link direction, get all PMIDs per link + linkScores = {} + for genes, geneScore in iterLinkScores(graphLinks): + linkScores[genes] = geneScore + linkScores= linkScores.items() # convert to list + linkScores.sort(key=operator.itemgetter(1), reverse=True) # sort + linkScores = [x for x,y in linkScores] # keep only the gene pairs + + highPairs = set(linkScores[:maxRank]) # keep only the best pairs + lowPairs = set(linkScores[maxRank:]) + + # now filter all links and keep only the those of the pairs + highLinksFiltered = defaultdict(set) + for pair, linkData in graphLinks.iteritems(): + g1, g2 = pair + if pair in highPairs or g1==targetGene or g2==targetGene: + #print "high", pair + highLinksFiltered[pair] = linkData + else: + #print "low" + lowLinks[pair] = linkData + return highLinksFiltered, lowLinks + + +def splitHighLowLinks(links, gene, minSupp, lowLinks, geneCount): + """ split into two sets of links: high = best geneCount genes and best 2*geneCount links + between them, low = all the others""" + links = reqMinSupp(links, minSupp, LTCUTOFF, gene) + + highLinks, lowLinks = limitGenes(links, geneCount, gene, lowLinks) + #print "lowLinks", lowLinks, "" + #print "highLinks", highLinks, "
" + + highLinks, lowLinks = limitLinks(highLinks, lowLinks, 2*geneCount, gene) + #print "bestLinks", highLinks, "
" + + return highLinks, lowLinks + +def queryLinks(conn, gene=None, genes=None): + """ query the mysql table ggLink for either all links to gene + or all links between all pairs of genes + Return result as a dict (gene1, gene2) -> + (docCount, tagSet, snippet) + tags can be any subset of ppi, text, pwy, pwyRev, low + """ + assert (gene!=None or genes!=None) + query = "SELECT gene1, gene2, linkTypes, docCount, minResCount, dbList, snippet FROM ggLink " + + if gene!=None: + query += "WHERE gene1='%s' OR gene2='%s'" % (gene, gene) + else: + geneList = ["'"+g+"'" for g in genes] + listStr = '(%s)' % ",".join(geneList) + query += "WHERE gene1 IN %s AND gene2 IN %s" % (listStr, listStr) + + rows = sqlQuery(conn, query) + + links = {} + for row in rows: + pair = (row.gene1, row.gene2) + links[pair] = (int(row.docCount), row.dbList.split("|"), \ + row.linkTypes.split(","), int(row.minResCount), row.snippet) + return links + +def filterLinks(links): + " remove links depending on the CGI param 'supportLevel'. Can show all, only with PPI/pathway or only with pathway " + showTags = getFilterStatus() + if len(showTags)==3: + # no filtering + return links + + # filter links + filtLinks = {} + for pair, pairData in links.iteritems(): + docCount, dbList, tagSet = pairData[:3] + if len(showTags.intersection(tagSet))!=0: + filtLinks[pair] = pairData + return filtLinks + +def buildGraph(conn, gene, geneCount, minSupp, addNeighbors): + """ get (gene,gene) links from database and annotate with weights and tags + only get links with minSupp articles for each link + """ + links = queryLinks(conn, gene=gene) + + if len(links)==0: + errAbort("Sorry, the gene %s is not a valid gene symbol or is not present in any gene interaction database." % cgi.escape(gene)) + + links = filterLinks(links) + + lowLinks = defaultdict(set) + graphLinks, lowLinks = splitHighLowLinks(links, gene, minSupp, lowLinks, geneCount) + + if addNeighbors: + # create the links between all other genes, be less stringent about these + otherGenes = set() + for genes, pmids in graphLinks.iteritems(): + otherGenes.update(genes) + if gene in otherGenes: + otherGenes.remove(gene) + # get links between all other genes but keep the limit on the graphed links + otherLinks = queryLinks(conn, genes=otherGenes) + otherLinks = filterLinks(otherLinks) + otherLinks = reqMinSupp(otherLinks, 2, 999999999, gene) # we still require two text mining abstracts + otherLinks, lowLinks = limitLinks(otherLinks, lowLinks, 35, gene) + # add the high links back to the graph + for pair, linkData in otherLinks.iteritems(): + assert(pair not in graphLinks) + graphLinks[pair] = linkData + + return graphLinks, lowLinks + +#def queryLinkColors(conn, links): + #" figure out if links have annotations in pathway commons " + #colors = {} + #for gene1, gene2 in links: + #if sqlQueryExists(conn, "SELECT causeGene, themeGene FROM pathCommons where causeGene='%s' and themeGene='%s'" % (gene1,gene2)): + #colors[(gene1, gene2)] = "blue" + #return colors + +def pmidCountToThick(pmidCount): + " convert pmid count to line thickness " + if pmidCount > 50: + thick = 4.5 + elif pmidCount > 5: + thick = 3.5 + elif pmidCount > 1: + thick = 2.5 + else: + thick = 1 + return thick + +def minResToThick(count): + " convert minResCount to line thickness, for description of minResCount see start of file " + if count > 100: + thick = 1 + elif count > 50: + thick = 2 + elif count > 10: + thick = 3.5 + else: + thick = 2 + return thick + +def runDot(fname, alg, format="png"): + " run dot and return tuple outFname, outMapFname " + outFname = splitext(fname)[0]+"."+format + outMap = splitext(fname)[0]+".map" + binPath = cfgOption("graphvizPath", "dot") + #cmd = "%s -Gdpi=96 -Gsize=12,5 -Gratio=fill -K%s %s -T%s -o %s" % (binPath, alg, fname, format, outFname) + cmd = [binPath, "-Gdpi=96", "-Gsize=12,5", "-Gratio=fill", "-K"+alg, "-T"+format, fname, "-o",outFname] + # create a html map for png format + if format=="png": + cmd.extend(["-Tcmapx", "-o", outMap]) + runCmd(cmd) + return outFname, outMap + +def dictToDot(d): + """ reformat a dictionary to a string like [key1="value1"; key2="value2"] """ + if len(d)==0: + return "" + + strList = [] + for key, val in d.iteritems(): + strList.append('%s="%s"' % (key, val.replace('"', ''))) + return "[%s]" % (";".join(strList)) + +def writeDot(allGenes, links, fname, targetGene, geneDescs, annotLabel, geneAnnots, linkSnips): + """ write a description of the graph to fname in dot format. + targetGene is highlighted, geneDescs are on mouseovers, geneAnnots are used to color the genes. + geneSnips are added to mouseovers. + """ + + ofh = open(fname, "w") + ofh.write("digraph test {\n") + ofh.write("graph [bgcolor=transparent; esep=0.4];\n") + #ofh.write("rankdir=LR;\n") + ofh.write("overlap=voronoi;\n") + ofh.write('size="7,7";\n') + #ofh.write('splines=polyline;\n') + ofh.write('splines=true;\n') + #ofh.write('nodesep=2.0;\n') + #ofh.write('pack=true;\n') + #ofh.write('edge [arrowhead=vee, color=grey];\n') + #ofh.write('node [color=none; shape=plaintext; fixedsize=true,width=0.9,fontname="Helvetica"];\n') + ofh.write('edge [color="%s"; weight=0.2; arrowsize=0.7];\n' % (TEXTCOLOR+TRANSPARENCY)) + ofh.write('node [penwidth=0; style=filled; fontcolor="#ffffff"; fillcolor="#111177"; shape=ellipse; fontsize=11; fixedsize=true; width=0.8; height=0.3; fontname="Helvetica"];\n') + #ofh.write('"%s" [fontcolor="#00000"; color="transparent"; style=filled; fillcolor="#ffff00"];\n' % targetGene) + + # write out the genes + for g in allGenes: + d = {} + ttLines = geneDescs[g] + if g in geneAnnots: + #d["color"]="#3636E2" + geneAnnotVal, geneAnnotGrey = geneAnnots[g] + transVal = 200 - (geneAnnotGrey*25) # scale to 55-255 + transHex = "%0.2X" % transVal # convert to hex + d["fillcolor"]="#"+"".join([transHex]*3) # concat three times, creates a grey-value + d["fontcolor"] = "white" + ttLines.append("%s: %s" % (annotLabel, geneAnnotVal)) + d["tooltip"] = " <br> ".join(ttLines) # separate with encoded html linebreaks + url = makeSelfUrl({"gene":g}) + url = url.replace("&", "&") # otherwise invalid SVG will result + d["URL"] = url + if g==targetGene: + d["penwidth"] = "1" + d["fontcolor"] = "black" + d["fillcolor"] = "yellow" + #d["style"] = "bold" + d["color"] = "blue" + ofh.write('"%s" %s;\n' % (g, dictToDot(d))) + + # write the links between genes + for gene1, gene2, score, docCount, dbs, tags, minResCount, snippet in links: + dbs = [x.upper() for x in dbs if x!=''] + + addStr = "" + + # based on http://www.w3schools.com/tags/ref_colorpicker.asp?colorhex=F0F8FF + color = None + if "pwy" in tags or ("ppi" in tags and minResCount<=LTCUTOFF): + color = LTCOLOR+TRANSPARENCY #"#000099A0" + elif "ppi" in tags: + color = HTCOLOR+TRANSPARENCY # "#8080CCA0" + + if color: + addStr = 'color="%s";' % color + + if gene2==gene1: + addStr += 'len="5"; ' + thick = pmidCountToThick(score) + + if "fwd" in tags and "rev" in tags: + arrDir = "both" + elif "fwd" in tags: + arrDir = "forward" + elif "rev" in tags: + arrDir = "back" + else: + arrDir = "none" + + if "text" in tags: + style = "solid" + else: + style = "dashed" + thick = 1 + + tooltipLines = ["%s-%s " % (gene1, gene2)] + tooltipLines.append("