2b9a3a726493d66c526e3d8256178ff3e12c7e3c max Fri Apr 28 16:48:03 2017 -0700 renaming pylib to pyLib after Angie's suggestion (thanks) and adding a better error message when graphviz was not found. refs #13634 diff --git src/hg/hgGeneGraph/hgGeneGraph src/hg/hgGeneGraph/hgGeneGraph index 023818d..8b39204 100755 --- src/hg/hgGeneGraph/hgGeneGraph +++ src/hg/hgGeneGraph/hgGeneGraph @@ -1,1951 +1,1986 @@ #!/usr/bin/env python2.7 # Protein Interaction Viewer for the Genome Browser # query tables with prefix "gg" in hgFixed, writes the results to a dot file, # runs graphviz's "dot" program to create a pathway map from it and write html # and mapfiles to the trash directory. # CGI params: gene=(HGNCsymbol) or link=sym1:sym2 # optional params: addNeighbors # colors: # grey+thickness = only text mining data # light blue, dashed = only high-throughput data # light blue, thickness = high-throughput data + text # dark blue, dashed = only low-throughput data # dark blue, thickness = low-throughput data + text # dark blue + dashed = only pathway data # code review # - os.system is not a security risk here, no variables go into the cmd line # - mysql statements are not escaped, instead all CGI vars are checked for non-alpha letters # hgFixed tables required for this script: ggLink (main table with gene-gene links), # ggLinkEvent (details about link), ggEventDb (details about links from databases), # ggEventText (details about links from text mining), ggDoc (details about documents for ggEventText) # ggGeneName (symbols), ggGeneClass (HPRD/Panther class) # these are default python modules on python 2.7, no errors expected here import sys, cgi, os, string, urllib, operator, hashlib from sys import exit from collections import defaultdict, namedtuple from os.path import * # import the UCSC-specific library -sys.path.append("pylib") +sys.path.append("pyLib") +try: from hgLib import cgiArgs, cgiSetup, cgiString, printContentType, printMenuBar, \ sqlConnect, sqlQuery, errAbort, cfgOption, runCmd, cgiGetAll, printHgcHeader, \ printHgcSection, webStartGbNoBanner, htmlPageEnd, hConnectCentral, sqlTableExists +except: + print("Content-type: text/html\n") + print("Cannot find the directory cgi-bin/pyLib in Apache. This is an installation error.") + print("All all parts of cgi-bin installed? Did you do 'make' in kent/src/hg/pyLib?") # not using feedback button for now. Fan would have liked it, but not sure how we can write # to any form of database. #skimpyGimpyLoaded=False #try: #from skimpyGimpy import skimpyAPI #skimpyGimpyLoaded=True #except: #pass # the list of allowed chars in cgi args: digits, letters and dashes legalChars = set(string.digits) legalChars.update(set(string.letters)) legalChars.update("_-./: ") # number of genes to show on graph by default DEFGENECOUNT="25" # ignore all text mining data with less than X abstracts MINSUPP=2 # minResCount is used throughout the code. For a given interaction, it is the minimal # number of interactions from all documents linked to this interaction. # E.g. minResCount of 5 means that the interaction is based on at least one document # that contained not more than 5 interactions. # Cutoff on minResCount: # maximum number of pairs a study can have to be considered "low-throughput" # only interactions with at least one low-throughput study are colord in dark # In essence, this gets rid of curated papers that describe huge complexes LTCUTOFF=10 # color for edges with only text-mining data ("text") TEXTCOLOR="#BBBBBB" # color for edges with low-throughput data (=pwy or (ppi and LTCUTOFF)) LTCOLOR="#000099" # color for edges with high-throughput data (=ppi) HTCOLOR="#8080CC" # transparency for the gene graph edges TRANSPARENCY="C0" # url of the user's manual MANUALURL="../goldenPath/help/hgGeneGraph.html" # database where the tables are stored GGDB="hgFixed" # ==== GLOBALS ===== # CGI parameters as a FieldStorage object # args = None # external DB information dbData = { "kegg" : ("KEGG", "http://www.kegg.jp/kegg-bin/show_pathway?%s"), "wikipathways" : ("WikiPathways", "http://www.wikipathways.org/index.php/Pathway:%s"), "iref" : ("Iref", "http://wodaklab.org/iRefWeb/interaction/show/%s"), #"pid" : ("NCI Pathway Database", "http://pid.nci.nih.gov/search/InteractionPage?atomid=%s"), #PID server is down, linking to NDex now, adapted the IDs, see ggPidToTables "pid" : ("NCI Pathway Database", "http://www.ndexbio.org/#/search?searchType=All&searchString=labels%%253A%s"), "reactome" : ("Reactome", "http://www.reactome.org/content/detail/%s"), "go" : ("Gene Ontology Complexes", "http://www.ebi.ac.uk/QuickGO/GTerm?id=%s#info=2"), "fastforward" : ("FastForward", "http://fastforward.sys-bio.net/popup.php?name_target=%s"), "argdb" : ("ARGDB", "http://argdb.fudan.edu.cn/geneshow_id.php?gene_id=%s"), "corum" : ("MIPS CORUM", "http://mips.helmholtz-muenchen.de/genre/proj/corum/complexdetails.html?id=%s"), "string" : ("STRING", "http://string-db.org/newstring_cgi/show_network_section.pl?multiple_input_items=%s&multi_input=1&multiple_input_type=multi_identifier&limit=0&input_page_type=multiple_identifiers&have_user_input=2&species_text=Homo%%20sapiens&input_query_species=auto_detect&flash=15&required_score=400") } # mime types to send in http header for other downloads mimeTypes = { "svg" : "image/svg+xml", "pdf" : "application/pdf", "json" : "application/json", "sif" : "application/octet-stream" } # gbdb file names and descriptions geneAnnotFiles = [ ("none", None, "No Annotation", "Do not annotate color genes on graph by external information"), ("gnf2", "gnf2Avg.tab", "GNF2 Expression", "Gene Expression Atlas 2 average across tissues"), ("drugbank", "drugbank.tab", "DrugBank", "DrugBank. Black = gene is targetable with a drug. Mouse-over shows drug names"), ("cosmic" , "cosmicCensus.tab", "Cancer Gene Census", "COSMIC Cancer Gene Census Tumor Types. Black = gene is in cancer gene census. Mouse-over shows cancer type"), ("tcgaMut" , "tcgaMut.tab", "Pan-Cancer Mutations", "TCGA PanCan12 samples with non-silent mutations - Gene mouse-over shows count") ] # ==== FUNCTIONS === # # # # # # def printInlineAndStyles(): #print('') print('') print(""" """) def htmlHeader(): " print start of page " webStartGbNoBanner("", "Genome Browser Gene Interaction Graph") print('
') printMenuBar() db = getCgiVar("db", "hg19") printHgcHeader(db, "Protein Interactions Track", "Protein interactions and pathways from curated databases and text-mining", addGoButton=False, infoUrl=MANUALURL) def mustBeClean(str): """ make sure a string contains only letters and digits """ if str==None: return str str = urllib.unquote(str) str = str.strip() for s in str: if s not in legalChars: errAbort("illegal character in CGI parameter") return str def getCgiVar(name, default=None, allowAnyChar=False, maxLen=30): " get named cgi variable as a string " val = cgiString(name, default=default) if not allowAnyChar: mustBeClean(val) if val != None and len(val) > maxLen: errAbort("CGI arg %s cannot be longer than %d characters" % (name, maxLen)) return val def mergeCgiParamDicts(data, changes, reset=False): """ given a data dict and a 2nd one with key=val changes, return a new dict with changes merged into data. If changes has key=None, remove the key from data. Skips some typical hgTracks-specific settings, like hgt.*, l, r or pix. """ cgiArgs = cgiGetAll() newArgs = {} # copy all existing CGI vars into new dict if not reset: for key in data.keys(): if key.startswith("hgt") or key.startswith("dink") or key=="c" or key=="l" or key=="r" or key=="pix" or key=="position": continue val = cgiArgs.getfirst(key) newArgs[key] = val # remove or add the changes for key, val in changes.iteritems(): if val==None: if key in newArgs: del newArgs[key] else: newArgs[key] = str(val) return newArgs def makeSelfUrl(changes, clear=False): " return a url to myself, keep all CGI vars, but change/append addParam=addValue " # construct the new link newArgs = mergeCgiParamDicts(cgiGetAll(), changes, clear) paramStr = urllib.urlencode(newArgs) myName = basename(__file__) url = "%s?%s" % (myName, paramStr) return url def printSelfHiddenVars(paramDict, clear=False, skipList=[]): " like makeSelfUrl, but write out all current CGI vars as hidden form fields " newArgs = mergeCgiParamDicts(cgiGetAll(), paramDict, clear) for key, val in newArgs.iteritems(): if key not in skipList and not key=="submit": print '' % (key, val) def makeSelfLink(linkName, paramDict, clear=False, anchor=None, className=None, title=None, style=None, dataToggle=None): """ make a href link to myself, keep all CGI vars, but change/append addParam=addValue styleDict is a list of css styles """ url = makeSelfUrl(paramDict, clear=clear) if anchor!=None: url += "#"+anchor classStr = "" if className!=None: classStr='class="%s" ' % className dataToggleStr = "" if dataToggle!=None: dataToggleStr='data-toggle="%s" data-placement="right"' % dataToggle styleStr = "" if style!=None: styleStr='style="%s" ' % style titleStr = "" if title!=None: titleStr = ' title="%s"' % title.replace('"', ' ') return '%s' % (titleStr, classStr, dataToggleStr, url, styleStr, linkName) def saltedHash(word, length=5): " return first 5 chars of salted hash " # pretty simple salt: PITX2, salting is just for the captcha hashStr = "".join(hashlib.sha1(word+"PITX2").hexdigest()[:length]).lower() return hashStr def htmlCaptcha(word): " return html that encodes captcha word " # copied from http://skimpygimpy.sourceforge.net/ if not skimpyGimpyLoaded: return None HTMLSPECKLE = 0.1 HTMLSCALE = 1.5 HTMLCOLOR = "000000" # create an HTML generator htmlGenerator = skimpyAPI.Pre(word, speckle=HTMLSPECKLE, # optional scale=HTMLSCALE, # optional color=HTMLCOLOR, # optional ) # store the preformatted text as htmlText htmlText = htmlGenerator.data() return htmlText def reqMinSupp(links, minArtSupp, maxResCount, targetGene): """ remove all 'text mining only' links with less than minArtSupp supporting documents The only exception is targetGene which we always want to stay connected Also remove links that are PPI-only and have a high minResCount. """ newLinks = defaultdict(set) genes = set() targetConns = {} for genePair, linkData in links.iteritems(): docCount, dbCount, tagSet, minResCount = linkData[:4] if targetGene in genePair: targetConns[genePair] = linkData # remove text-mining links with only one article if "text" in tagSet and docCount < minArtSupp: continue # remove noisy PPI links if len(tagSet)==1 and "ppi" in tagSet and minResCount > maxResCount: continue genes.update(genePair) newLinks[genePair] = linkData # is the target gene still connected to something? If not add it back and # accept that these links are less than minSupp if targetGene not in genes: for genePair, linkData in targetConns.iteritems(): newLinks[genePair] = linkData return newLinks def scorePair(docCount, tagSet): " return the score for a gene pair " # pairs that have no text mining results get assigned artifical # article counts, based on this query: # hgsql publications -e "select * from ggLink where linkTypes like '%ppi%'" -NB | tawk '($4!=0 && $5!=0) { print $4+$5}' | avg # the avg for PPI was 14 and the one for pwy 22 # but PPI scores were mostly 0, so I lower it manually geneScore = docCount if geneScore==0 and "ppi" in tagSet: geneScore = 2 if geneScore==0 and "pwy" in tagSet: geneScore = 4 return geneScore def iterLinkScores(links): """ given a dict (gene1, gene2) -> [score1, score2, ...] yield tuples (gene1, gene2), totalScore """ for genes, linkData in links.iteritems(): docCount, dbCount, tagSet = linkData[:3] geneScore = scorePair(docCount, tagSet) yield genes, geneScore def limitGenes(links, maxRank, targetGene, lowLinks=None): """ get the links for the top X genes and return two lists of links, highLinks and lowLinks The targetGene is always in the high list. """ # create a list of genes sorted by article count in links geneScores = defaultdict(int) for genes, geneScore in iterLinkScores(links): for g in genes: geneScores[g] += geneScore geneScores = geneScores.items() geneScores.sort(key=operator.itemgetter(1), reverse=True) # keep only the best genes sortedGenes = [x for x,y in geneScores] highGenes = set(sortedGenes[:maxRank]) lowGenes = set(sortedGenes[maxRank:]) # the target gene itself always has to be in the top list if targetGene not in highGenes: highGenes.add(targetGene) if targetGene in lowGenes: lowGenes.remove(targetGene) # filter the links into high and low highLinks = defaultdict(set) if lowLinks==None: lowLinks = defaultdict(set) for genes, linkData in links.iteritems(): g1, g2 = genes if g1 in highGenes and g2 in highGenes: highLinks[genes] = linkData else: lowLinks[genes] = linkData return highLinks, lowLinks def limitLinks(graphLinks, lowLinks, maxRank, targetGene): """ filter the high links to keep only the best X and return them, move the others into the lowLinks dict. Always keep all links to/from targetGene. """ # ignoring link direction, get all PMIDs per link linkScores = {} for genes, geneScore in iterLinkScores(graphLinks): linkScores[genes] = geneScore linkScores= linkScores.items() # convert to list linkScores.sort(key=operator.itemgetter(1), reverse=True) # sort linkScores = [x for x,y in linkScores] # keep only the gene pairs highPairs = set(linkScores[:maxRank]) # keep only the best pairs lowPairs = set(linkScores[maxRank:]) # now filter all links and keep only the those of the pairs highLinksFiltered = defaultdict(set) for pair, linkData in graphLinks.iteritems(): g1, g2 = pair if pair in highPairs or g1==targetGene or g2==targetGene: #print "high", pair highLinksFiltered[pair] = linkData else: #print "low" lowLinks[pair] = linkData return highLinksFiltered, lowLinks def splitHighLowLinks(links, gene, minSupp, lowLinks, geneCount): """ split into two sets of links: high = best geneCount genes and best 2*geneCount links between them, low = all the others""" links = reqMinSupp(links, minSupp, LTCUTOFF, gene) highLinks, lowLinks = limitGenes(links, geneCount, gene, lowLinks) #print "lowLinks", lowLinks, "" #print "highLinks", highLinks, "
" highLinks, lowLinks = limitLinks(highLinks, lowLinks, 2*geneCount, gene) #print "bestLinks", highLinks, "
" return highLinks, lowLinks def queryLinks(conn, gene=None, genes=None): """ query the mysql table ggLink for either all links to gene or all links between all pairs of genes Return result as a dict (gene1, gene2) -> (docCount, tagSet, snippet) tags can be any subset of ppi, text, pwy, pwyRev, low """ assert (gene!=None or genes!=None) query = "SELECT gene1, gene2, linkTypes, docCount, minResCount, dbList, snippet FROM ggLink " if gene!=None: query += "WHERE gene1='%s' OR gene2='%s'" % (gene, gene) else: geneList = ["'"+g+"'" for g in genes] listStr = '(%s)' % ",".join(geneList) query += "WHERE gene1 IN %s AND gene2 IN %s" % (listStr, listStr) rows = sqlQuery(conn, query) links = {} for row in rows: pair = (row.gene1, row.gene2) links[pair] = (int(row.docCount), row.dbList.split("|"), \ row.linkTypes.split(","), int(row.minResCount), row.snippet) return links def filterLinks(links): " remove links depending on the CGI param 'supportLevel'. Can show all, only with PPI/pathway or only with pathway " showTags = getFilterStatus() if len(showTags)==3: # no filtering return links # filter links filtLinks = {} for pair, pairData in links.iteritems(): docCount, dbList, tagSet = pairData[:3] if len(showTags.intersection(tagSet))!=0: filtLinks[pair] = pairData return filtLinks def buildGraph(conn, gene, geneCount, minSupp, addNeighbors): """ get (gene,gene) links from database and annotate with weights and tags only get links with minSupp articles for each link """ links = queryLinks(conn, gene=gene) if len(links)==0: errAbort("Sorry, the gene %s is not a valid gene symbol or is not present in any gene interaction database." % cgi.escape(gene)) links = filterLinks(links) lowLinks = defaultdict(set) graphLinks, lowLinks = splitHighLowLinks(links, gene, minSupp, lowLinks, geneCount) if addNeighbors: # create the links between all other genes, be less stringent about these otherGenes = set() for genes, pmids in graphLinks.iteritems(): otherGenes.update(genes) if gene in otherGenes: otherGenes.remove(gene) # get links between all other genes but keep the limit on the graphed links otherLinks = queryLinks(conn, genes=otherGenes) otherLinks = filterLinks(otherLinks) otherLinks = reqMinSupp(otherLinks, 2, 999999999, gene) # we still require two text mining abstracts otherLinks, lowLinks = limitLinks(otherLinks, lowLinks, 35, gene) # add the high links back to the graph for pair, linkData in otherLinks.iteritems(): assert(pair not in graphLinks) graphLinks[pair] = linkData return graphLinks, lowLinks #def queryLinkColors(conn, links): #" figure out if links have annotations in pathway commons " #colors = {} #for gene1, gene2 in links: #if sqlQueryExists(conn, "SELECT causeGene, themeGene FROM pathCommons where causeGene='%s' and themeGene='%s'" % (gene1,gene2)): #colors[(gene1, gene2)] = "blue" #return colors def pmidCountToThick(pmidCount): " convert pmid count to line thickness " if pmidCount > 50: thick = 4.5 elif pmidCount > 5: thick = 3.5 elif pmidCount > 1: thick = 2.5 else: thick = 1 return thick def minResToThick(count): " convert minResCount to line thickness, for description of minResCount see start of file " if count > 100: thick = 1 elif count > 50: thick = 2 elif count > 10: thick = 3.5 else: thick = 2 return thick +def which(binPath): + " find full path of binary " + import distutils.spawn + return distutils.spawn.find_executable(binPath) + def runDot(fname, alg, format="png"): " run dot and return tuple outFname, outMapFname " outFname = splitext(fname)[0]+"."+format outMap = splitext(fname)[0]+".map" binPath = cfgOption("graphvizPath", "dot") - #cmd = "%s -Gdpi=96 -Gsize=12,5 -Gratio=fill -K%s %s -T%s -o %s" % (binPath, alg, fname, format, outFname) + + if which(binPath) is None: + print("Could not find the command %s
" % binPath)
+ print("This usually means that on this UCSC Genome Browser, GraphViz is not installed.")
+ print("To install GraphViz, ask the Administrator of this server to run one of the following commands:
")
+ print("sudo apt-get install graphviz
")
+ print("sudo wget http://www.graphviz.org/graphviz-rhel.repo -o /etc/yum/repos.d/graphviz.rhel.repo; sudo yum install graphviz*
")
+ print("
")
+ print("Alternatively, you can compile GraphViz from source and specify the full path to 'dot' with the option 'graphvizPath' in cgi-bin/hg.conf.")
+ exit(0)
+
cmd = [binPath, "-Gdpi=96", "-Gsize=12,5", "-Gratio=fill", "-K"+alg, "-T"+format, fname, "-o",outFname]
# create a html map for png format
if format=="png":
cmd.extend(["-Tcmapx", "-o", outMap])
- runCmd(cmd)
+ ret = runCmd(cmd, mustRun=False)
+
+ if ret!=0:
+ print("Could not run the command '%s'.
" % "".join(cmd))
+ print("This means that on this UCSC Genome Browser Installation too old.")
+ print("To update GraphViz to a more recent version, ask the Administrator of this ")
+ print("server to try running one of these commands:
")
+ print("sudo apt-get install graphviz
")
+ print("sudo wget http://www.graphviz.org/graphviz-rhel.repo -o ")
+ print("/etc/yum/repos.d/graphviz.rhel.repo;")
+ print("sudo yum install graphviz*
")
+ print("
") + print("If this does not work, you may have to compile GraphViz from source and") + print("specify the 'dot' binary with the option graphvizPath cgi-bin/hg.conf.") + exit(0) + return outFname, outMap def dictToDot(d): """ reformat a dictionary to a string like [key1="value1"; key2="value2"] """ if len(d)==0: return "" strList = [] for key, val in d.iteritems(): strList.append('%s="%s"' % (key, val.replace('"', ''))) return "[%s]" % (";".join(strList)) def writeDot(allGenes, links, fname, targetGene, geneDescs, annotLabel, geneAnnots, linkSnips): """ write a description of the graph to fname in dot format. targetGene is highlighted, geneDescs are on mouseovers, geneAnnots are used to color the genes. geneSnips are added to mouseovers. """ ofh = open(fname, "w") ofh.write("digraph test {\n") ofh.write("graph [bgcolor=transparent; esep=0.4];\n") #ofh.write("rankdir=LR;\n") ofh.write("overlap=voronoi;\n") ofh.write('size="7,7";\n') #ofh.write('splines=polyline;\n') ofh.write('splines=true;\n') #ofh.write('nodesep=2.0;\n') #ofh.write('pack=true;\n') #ofh.write('edge [arrowhead=vee, color=grey];\n') #ofh.write('node [color=none; shape=plaintext; fixedsize=true,width=0.9,fontname="Helvetica"];\n') ofh.write('edge [color="%s"; weight=0.2; arrowsize=0.7];\n' % (TEXTCOLOR+TRANSPARENCY)) ofh.write('node [penwidth=0; style=filled; fontcolor="#ffffff"; fillcolor="#111177"; shape=ellipse; fontsize=11; fixedsize=true; width=0.8; height=0.3; fontname="Helvetica"];\n') #ofh.write('"%s" [fontcolor="#00000"; color="transparent"; style=filled; fillcolor="#ffff00"];\n' % targetGene) # write out the genes for g in allGenes: d = {} ttLines = geneDescs[g] if g in geneAnnots: #d["color"]="#3636E2" geneAnnotVal, geneAnnotGrey = geneAnnots[g] transVal = 200 - (geneAnnotGrey*25) # scale to 55-255 transHex = "%0.2X" % transVal # convert to hex d["fillcolor"]="#"+"".join([transHex]*3) # concat three times, creates a grey-value d["fontcolor"] = "white" ttLines.append("%s: %s" % (annotLabel, geneAnnotVal)) d["tooltip"] = " <br> ".join(ttLines) # separate with encoded html linebreaks url = makeSelfUrl({"gene":g}) url = url.replace("&", "&") # otherwise invalid SVG will result d["URL"] = url if g==targetGene: d["penwidth"] = "1" d["fontcolor"] = "black" d["fillcolor"] = "yellow" #d["style"] = "bold" d["color"] = "blue" ofh.write('"%s" %s;\n' % (g, dictToDot(d))) # write the links between genes for gene1, gene2, score, docCount, dbs, tags, minResCount, snippet in links: dbs = [x.upper() for x in dbs if x!=''] addStr = "" # based on http://www.w3schools.com/tags/ref_colorpicker.asp?colorhex=F0F8FF color = None if "pwy" in tags or ("ppi" in tags and minResCount<=LTCUTOFF): color = LTCOLOR+TRANSPARENCY #"#000099A0" elif "ppi" in tags: color = HTCOLOR+TRANSPARENCY # "#8080CCA0" if color: addStr = 'color="%s";' % color if gene2==gene1: addStr += 'len="5"; ' thick = pmidCountToThick(score) if "fwd" in tags and "rev" in tags: arrDir = "both" elif "fwd" in tags: arrDir = "forward" elif "rev" in tags: arrDir = "back" else: arrDir = "none" if "text" in tags: style = "solid" else: style = "dashed" thick = 1 tooltipLines = ["%s-%s " % (gene1, gene2)] tooltipLines.append("