fded6ce46cb167faaae559ff93b050c2b7d18ff1 max Mon Jun 26 08:59:00 2023 -0700 Porting hgGeneGraph to python3. refs #31563 diff --git src/hg/hgGeneGraph/hgGeneGraph src/hg/hgGeneGraph/hgGeneGraph index 96bd4bc..b5df5d0 100755 --- src/hg/hgGeneGraph/hgGeneGraph +++ src/hg/hgGeneGraph/hgGeneGraph @@ -1,16 +1,16 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 # Gene Interaction Viewer for the Genome Browser # query tables with prefix "gg" in hgFixed, writes the results to a dot file, # runs graphviz's "dot" program to create a pathway map from it and write html # and mapfiles to the trash directory. # CGI params: gene=(HGNCsymbol) or link=sym1:sym2 # optional params: addNeighbors # colors: # grey+thickness = only text mining data # light blue, dashed = only high-throughput data @@ -18,59 +18,59 @@ # dark blue, dashed = only low-throughput data # dark blue, thickness = low-throughput data + text # dark blue + dashed = only pathway data # code review # - os.system is not a security risk here, no variables go into the cmd line # - mysql statements are not escaped, instead all CGI vars are checked for non-alpha letters # hgFixed tables required for this script: ggLink (main table with gene-gene links), # ggLinkEvent (details about link), ggEventDb (details about links from databases), # ggEventText (details about links from text mining), ggDoc (details about documents for ggEventText) # ggGeneName (symbols), ggGeneClass (HPRD/Panther class) # these are default python modules on python 2.7, no errors expected here -import sys, cgi, os, string, urllib, operator, hashlib +import sys, cgi, os, string, urllib.request, urllib.parse, urllib.error, operator, hashlib from sys import exit from collections import defaultdict, namedtuple from os.path import * import cgitb cgitb.enable() # import the UCSC-specific library sys.path.append(join(dirname(__file__), "pyLib")) try: - from hgLib import cgiArgs, cgiSetup, cgiString, printContentType, printMenuBar, \ + from hgLib3 import cgiArgs, cgiSetup, cgiString, printContentType, printMenuBar, \ sqlConnect, sqlQuery, errAbort, cfgOption, runCmd, cgiGetAll, printHgcHeader, \ printHgcSection, getNonce, getCspMetaHeader, jsOnEventById, \ jsInlineFinish, webStartGbNoBanner, htmlPageEnd, hConnectCentral, \ sqlTableExists, readSmallFile except: print("Content-type: text/html\n") print("Cannot find the directory cgi-bin/pyLib in Apache. This is an installation error.") print("All all parts of cgi-bin installed? Did you do 'make' in kent/src/hg/pyLib?") -import MySQLdb +import pymysql.cursors # not using feedback button for now. Fan would have liked it, but not sure how we can write # to any form of database. # the list of allowed chars in cgi args: digits, letters and dashes legalChars = set(string.digits) -legalChars.update(set(string.letters)) +legalChars.update(set(string.ascii_letters)) legalChars.update("_-./: ") # number of genes to show on graph by default DEFGENECOUNT="25" # ignore all text mining data with less than X abstracts MINSUPP=2 # minResCount is used throughout the code. For a given interaction, it is the minimal # number of interactions from all documents linked to this interaction. # E.g. minResCount of 5 means that the interaction is based on at least one document # that contained not more than 5 interactions. # Cutoff on minResCount: # maximum number of pairs a study can have to be considered "low-throughput" # only interactions with at least one low-throughput study are colord in dark @@ -137,31 +137,31 @@ # # # # # # def printInlineAndStyles(): #print('') print('') - print(""" + print((""" ' - print ' ' + print(' ') - print "" - print ' ' + print("") + print(' ') #print '' def openGeneAnnotFile(dataName): " return name and lines in gbdb tab sep file " annotData = None for annotTuple in geneAnnotFiles: if dataName==annotTuple[0]: annotData = annotTuple break if annotData==None: errAbort("Could not find annotations with name %s" % dataName) _, annotFname, annotName, annotLongName = annotData inPath = join("/gbdb", "hgFixed", "geneGraph", annotFname) ifh = readSmallFile(inPath) @@ -1181,125 +1183,128 @@ selfLink = makeSelfLink("Remove all filters", {"supportLevel": None}) errAbort("Sorry, there are no direct interactions with %s that fulfill your " "filter criteria. %s" % (geneName, selfLink)) allGenes = set() sifLines = [] for linkRow in weightedLinks: gene1, gene2 = linkRow[:2] allGenes.add(gene1) allGenes.add(gene2) if format=="sif": sifLines.append("%s pp %s" % (gene1, gene2)) if format in ["sif", "json"]: printHttpHead(format) if format=="sif": - print "\n".join(sifLines) + print("\n".join(sifLines)) else: - print jsonStr + print(jsonStr) return geneDescs = queryGeneDescs(conn, allGenes) linkSnips = querySnippets(conn, weightedLinks) annotLabel, geneAnnots = getGeneAnnots() writeDot(allGenes, weightedLinks, tmpName, targetGene, geneDescs, annotLabel, geneAnnots, linkSnips) if len(allGenes)>=100: alg = "fdp" picName, mapName = runDot(tmpName, alg, format) if format!="png": printHttpHead(format) - sys.stdout.write(open(picName,"rb").read()) + sys.stdout.flush() + sys.stdout.buffer.write(open(picName,"rb").read()) # binary data must use sys.stdout.buffer sys.stdout.flush() return # text above graph print("Mouse over or click genes or lines for details. Dashed lines indicate interactions without text mining support. ") print("Click any gene to make it the new center. Click any line to show details about the interaction. ") - print("Only %s-interacting genes and only the most-mentioned/most-curated interactions are shown in the graph. " % (targetGene)) + print(("Only %s-interacting genes and only the most-mentioned/most-curated interactions are shown in the graph. " % (targetGene))) print("See the Help Page for details.
") # menu above graph # background #fffef5 would be an alternive - print '
' + print('
') printGraphMenu(conn, targetGene, addNeighbors) - print "

" + print("

") # graph itself - print '' % picName - map = open(mapName).read() - print map + print('' % picName) + mapData = open(mapName, "rb").read() + sys.stdout.flush() + sys.stdout.buffer.write(mapData) # only way to get binary data to stdout. Problem may be that data is mixed latin1/utf8 in tables from Stanford + sys.stdout.flush() - print '

' - print "

" + print('

') + print("

") def printPmidSearchForm(): " print a little form that allows to search for a PMID " - print "


" - print "Search for a PMID: " - print '
' - print ' ' - print ' ' - print '

' + print("


") + print("Search for a PMID: ") + print('
') + print(' ') + print(' ') + print('

') def printDisclaimer(): - print ''' + print('''

NOTE:
Gene interactions that are not highlighted in blue were obtained with text mining software. They include errors. Please read the original source text before relying on an interaction.

- ''' + ''') def printInfo(): - print """Please see the Gene Interactions Track Manual.""" % MANUALURL - print "
 
" + print("""Please see the Gene Interactions Track Manual.""" % MANUALURL) + print("
 
") def printLowLinksTable(gene, lowLinks, sortByCount): " print the other links as a table " if len(lowLinks)==0: return lowGeneList = countTargetLinks(lowLinks, gene) if len(lowGeneList)==0: return # sort lowGenes by either count or name if sortByCount: lowGeneList.sort(key=operator.itemgetter(1), reverse=True) selfLink = makeSelfLink("sort alphabetically", {"sortByCount": None}, anchor="table") currentSortDesc = "Sorted by article count" else: lowGeneList.sort(key=operator.itemgetter(0)) selfLink = makeSelfLink("sort by article count", {"sortByCount": "1"}, anchor="table") currentSortDesc = "Sorted alphabetically" geneCount = int(getCgiVar("geneCount", DEFGENECOUNT)) title = 'Less-frequently mentioned interactions with %s, not among the Top %d' % (gene, geneCount) printHgcSection(title, "", id='table') - print "Other genes interacting with %s. Mouse-over to show number of abstracts or databases. %s (%s).
" % \ - (gene, currentSortDesc, selfLink) + print("Other genes interacting with %s. Mouse-over to show number of abstracts or databases. %s (%s).
" % \ + (gene, currentSortDesc, selfLink)) print("Like above, interactions are colored by support. Grey:only text mining, light-blue:interaction database, blue:pathway database

") #print("Click any gene to make it the new center. Click number of articles to show sentences.

") - print '' - print '' + print('
') + print('') i = 0 rowSize = 7 # columns per row for g, artCount, dbCount, tags in lowGeneList: # to set the cell width, browsers use only the first row cellStyle = "" if i <= rowSize: cellStyle=' style="width:140px"' color = None if "low" in tags or "pwy" in tags: color = LTCOLOR elif "ppi" in tags: color = HTCOLOR elif "text" in tags: @@ -1311,67 +1316,67 @@ newGeneLink = makeSelfLink("▸", {"gene": g}, title="Center graph on %s" % g, style='text-decoration:none', dataToggle="tooltip") if dbCount == 1 and artCount != 0: detailsText = "interaction %s-%s mentioned by %d articles and %d database" % (gene, g, artCount, dbCount) elif dbCount != 0 and artCount != 0: detailsText = "interaction %s-%s mentioned by %d articles and %d databases" % (gene, g, artCount, dbCount) elif artCount != 0: detailsText = "interaction %s-%s mentioned by %d articles" % (gene, g, artCount) elif dbCount != 0: detailsText = "interaction %s-%s mentioned by %d databases" % (gene, g, dbCount) detailsLink = makeSelfLink(g, {"gene": None, "lastGene": gene,"link":"%s:%s" % (g, gene)}, style=linkStyle, title=detailsText, dataToggle="tooltip") - print( '%s %s' % (cellStyle, detailsLink, newGeneLink)) + print(( '%s %s' % (cellStyle, detailsLink, newGeneLink))) i += 1 if i % rowSize == 0: - print "" + print("") - print "" - print "
" + print("") + print("") def listToMysqlList(l): " convert a python list of strings to a mysql list, like ('a', 'b') " newList = [] for s in l: s = s.replace("'", "") newList.append("'"+s+"'") return "(%s)" % ",".join(newList) def querySnippets(conn, pairs): """ get the little text snippets from the ggLink table for a list of pairs return as a dict (gene1, gene2) -> snippet """ # convert list of pairs to mysql quoted string list newList = [] for pairData in pairs: g1, g2 = pairData[:2] pairQuote = "('%s', '%s')" % (g1, g2) newList.append(pairQuote) listStr = "(%s)" % ",".join(newList) query = "SELECT gene1, gene2, snippet FROM ggLink "\ "WHERE (gene1, gene2) in %s" % listStr rows = sqlQuery(conn, query) pairSnips = {} for g1, g2, snip in rows: - pairSnips[(g1,g2)]=snip + pairSnips[(g1,g2)]=snip.decode("utf8") return pairSnips def makeHgGeneLink(sym, db): return "%s" % (db, sym, sym) def showGraphBrowser(): " run graphviz on gene graph around gene, print html img and html map " gene, alg, addNeighbors, sortByCount, geneCount = parseGraphArgs() conn = sqlConnect(GGDB) graphLinks, lowLinks = buildGraph(conn, gene, geneCount, MINSUPP, addNeighbors) weightedLinks, minAbsCount = flattenLink(graphLinks) humanDb = getCgiVar("db", "hg19") @@ -1453,116 +1458,116 @@ geneStr = ", ".join(eGenes) if eName=="": return "%s" % geneStr else: return "%s (%s)" % (eName, geneStr) if eType == "family": geneStr = "/".join(eGenes) if eName=="": return "%s" % geneStr else: return "%s (%s)" % (eName, geneStr) def printDbRows(conn, rows, onlyDoc=None): " print a row from the ggEventDb table as html " - print '

") def showPwyPpiInfo(conn, gene1, gene2): " print pathway and PPI info about link " gene1, gene2 = sorted([gene1, gene2]) q = "SELECT ggEventDb.* FROM ggLinkEvent, ggEventDb " \ "WHERE gene1='%s' and gene2='%s' AND ggEventDb.eventId=ggLinkEvent.eventId" % (gene1,gene2) rows = sqlQuery(conn, q) # split db rows into ppi and pathway rows pwyRows, ppiRows = [], [] for r in rows: if r[0].startswith("ppi_"): ppiRows.append(r) else: pwyRows.append(r) if len(pwyRows)!=0: - print "

Pathways - manually collected, often from reviews:

" + print("

Pathways - manually collected, often from reviews:

") printDbRows(conn, pwyRows) if len(ppiRows)!=0: - print "

Protein-Protein interactions - manually collected from original source literature:

" - print '

Studies that report less than %d interactions are marked with *

' % LTCUTOFF + print("

Protein-Protein interactions - manually collected from original source literature:

") + print('

Studies that report less than %d interactions are marked with *

' % LTCUTOFF) printDbRows(conn, ppiRows) def markupSentence(row): " given a MSR-textmining row, print a sentence with the various detected words marked up in bold " sent = row.sentence tStart, tEnd = int(row.themeTokenStart), int(row.themeTokenEnd) cStart, cEnd = int(row.causeTokenStart), int(row.causeTokenEnd) trigToken = int(row.triggerTokenId) sentId = int(row.sentenceId) # put into temporary var to be able to replace " , " later parts = [] for i, word in enumerate(sent.split()): if i==tStart or i==cStart: if i==cStart: @@ -1597,45 +1602,45 @@ doneDocs.add(docInfo) yield row def printDocNlpResults(rows): " print text mining results for document-centered view " for row in iterUniqueInteractions(rows): causeGene, themeGene = row.causeGenes, row.themeGenes if "Negative" in row.relType: sym = "8867" # unicode right-tack elif "Positive" in row.relType: sym = "8594" # unicode right arrow else: sym = "8212" # unicode long dash - print '%s &#%s; %s: "' % (row.causeName, sym, row.themeName), - print markupSentence(row), - print '"

' + print('%s &#%s; %s: "' % (row.causeName, sym, row.themeName)) + print(markupSentence(row)) + print('"

') def printMsrNlpRows(rows): " print msrNlp table rows with marked-up snippets " phrases = [] for row in iterUniqueInteractions(rows): causeGene, themeGene = row.causeGenes, row.themeGenes #if addGenes: #print "%s → %s :" % (causeGene, themeGene) line = markupSentence(row) phrases.append(line) - print " ... ".join(phrases) + print(" ... ".join(phrases)) # the blacklist ist not used right now, as we cannot write to the database. # pending further discussions with senior engineers #def readBlackList(conn): #""" return flagged interactions as a set of (cause, theme, pmid (as str)) """ #centralDb = cfgOption("central.db") #blackList = set() #rows = sqlQuery(conn, "SELECT causeGene, themeGene, pmid from %s.ggFeedback" % centralDb) #for row in rows: #blackList.add( (row.causeGene, row.themeGene, str(row.pmid)) ) #return blackList def queryEventText(conn, gene1, gene2): " return rows from the ggEventText table " @@ -1682,114 +1687,114 @@ if showStar and int(row.resCount)<=LTCUTOFF: note = '*' #note = "*" text = "%s %s, %s %s" % (fAu, suffix, row.journal, row.year) mouseOver = None if row.title!="": mouseOver = row.title.replace('"', "") return docLink(row.docId, text=text, mouseOver=mouseOver)+note def showSnipsLink(conn, gene1, gene2): " show snippets for a gene pair " rows = queryEventText(conn, gene1, gene2) if len(rows)!=0: - print '

Text-mined interactions from Literome

' % (gene1, gene2) + print('

Text-mined interactions from Literome

' % (gene1, gene2)) byDoc = defaultdict(list) for row in rows: byDoc[row.docId].append(row) - for docId, rows in byDoc.iteritems(): - print prettyDocLink(rows[0], showStar=False), + for docId, rows in byDoc.items(): + print(prettyDocLink(rows[0], showStar=False)) disSuffix = "" if rows[0].context!="": contexts = rows[0].context.split("|") if len(contexts)>1: suffix = "..." else: suffix = "" disSuffix = "(%s%s)" % (contexts[0], suffix) - print "%s :" % disSuffix + print("%s :" % disSuffix) printMsrNlpRows(rows) - print "
" + print("
") def showLink(link): " print details page with db info and snippets for a gene interaction " if ":" not in link: errAbort("'link' CGI parameter has to contain a colon-separated pair of genes, like PITX2:TBX5") gene1, gene2 = link.split(":") gene1, gene2 = sorted([gene1, gene2]) gene1 = gene1.upper() gene2 = gene2.upper() lastGene = getCgiVar("lastGene") if lastGene is not None: backUrl = makeSelfUrl({"gene":lastGene, "link":None}) - print "

◀ Back to %s

" % (backUrl, lastGene) + print("

◀ Back to %s

" % (backUrl, lastGene)) - print "

%s — %s

" % (gene1, gene2)# unicode long dash + print("

%s — %s

" % (gene1, gene2))# unicode long dash conn = sqlConnect(GGDB) #flagLink = makeSelfLink("Report data error", {"flag":"%s:%s" % (gene1, gene2)}, clear=True) #print ('%s

' % flagLink) showPwyPpiInfo(conn, gene1, gene2) showSnipsLink(conn, gene1, gene2) def makeRefString(articleData): """ prepare a string that describes the citation: vol, issue, page, etc of journal """ refParts = [articleData.journal] if articleData.year!="": refParts[0] += (" "+articleData.year) #if articleData.vol!="": #refParts.append("Vol "+articleData.vol) #if articleData.issue!="": #refParts.append("Issue "+articleData.issue) #if articleData.page!="": #refParts.append("Page "+articleData.page) return ", ".join(refParts) def showArtInfo(conn, pmid): " show basic pubmed metadata for article " q = "SELECT * from ggDoc where docId='%s'" % (str(pmid)) rows = sqlQuery(conn, q) if len(rows)==0: - print "No metadata for docId PMID %s" % str(pmid) + print("No metadata for docId PMID %s" % str(pmid)) return p = rows[0] - print "%s, " % makeRefString(p) - print pubmedLink(p.docId) - print "

" - print '

%s

' % (pubmedLink(pmid, p.title)) - print "%s

" % p.authors - print '

' - print p.abstract - print '

' + print("%s, " % makeRefString(p)) + print(pubmedLink(p.docId)) + print("

") + print('

%s

' % (pubmedLink(pmid, p.title))) + print("%s

" % p.authors) + print('

') + print(p.abstract) + print('

') if p.context!="": - print "Diseases/Pathways annotated by Medline MESH: ", p.context.replace("|", ", "), "
" + print("Diseases/Pathways annotated by Medline MESH: ", p.context.replace("|", ", "), "
") - print "Document information provided by NCBI PubMed

" + print("Document information provided by NCBI PubMed

") return p.resCount #def printPwyRows(rows, showPmids=True): #dbToTypePmids = defaultdict(dict) # db -> intType -> list of pmid #for row in rows: #dbToTypePmids[row.db].setdefault(row.intType, []).append( (row.pmid, row.causeGene, row.themeGene )) # print "