fded6ce46cb167faaae559ff93b050c2b7d18ff1
max
Mon Jun 26 08:59:00 2023 -0700
Porting hgGeneGraph to python3. refs #31563
diff --git src/hg/hgGeneGraph/hgGeneGraph src/hg/hgGeneGraph/hgGeneGraph
index 96bd4bc..b5df5d0 100755
--- src/hg/hgGeneGraph/hgGeneGraph
+++ src/hg/hgGeneGraph/hgGeneGraph
@@ -1,16 +1,16 @@
-#!/usr/bin/env python2
+#!/usr/bin/env python3
# Gene Interaction Viewer for the Genome Browser
# query tables with prefix "gg" in hgFixed, writes the results to a dot file,
# runs graphviz's "dot" program to create a pathway map from it and write html
# and mapfiles to the trash directory.
# CGI params: gene=(HGNCsymbol) or link=sym1:sym2
# optional params: addNeighbors
# colors:
# grey+thickness = only text mining data
# light blue, dashed = only high-throughput data
@@ -18,59 +18,59 @@
# dark blue, dashed = only low-throughput data
# dark blue, thickness = low-throughput data + text
# dark blue + dashed = only pathway data
# code review
# - os.system is not a security risk here, no variables go into the cmd line
# - mysql statements are not escaped, instead all CGI vars are checked for non-alpha letters
# hgFixed tables required for this script: ggLink (main table with gene-gene links),
# ggLinkEvent (details about link), ggEventDb (details about links from databases),
# ggEventText (details about links from text mining), ggDoc (details about documents for ggEventText)
# ggGeneName (symbols), ggGeneClass (HPRD/Panther class)
# these are default python modules on python 2.7, no errors expected here
-import sys, cgi, os, string, urllib, operator, hashlib
+import sys, cgi, os, string, urllib.request, urllib.parse, urllib.error, operator, hashlib
from sys import exit
from collections import defaultdict, namedtuple
from os.path import *
import cgitb
cgitb.enable()
# import the UCSC-specific library
sys.path.append(join(dirname(__file__), "pyLib"))
try:
- from hgLib import cgiArgs, cgiSetup, cgiString, printContentType, printMenuBar, \
+ from hgLib3 import cgiArgs, cgiSetup, cgiString, printContentType, printMenuBar, \
sqlConnect, sqlQuery, errAbort, cfgOption, runCmd, cgiGetAll, printHgcHeader, \
printHgcSection, getNonce, getCspMetaHeader, jsOnEventById, \
jsInlineFinish, webStartGbNoBanner, htmlPageEnd, hConnectCentral, \
sqlTableExists, readSmallFile
except:
print("Content-type: text/html\n")
print("Cannot find the directory cgi-bin/pyLib in Apache. This is an installation error.")
print("All all parts of cgi-bin installed? Did you do 'make' in kent/src/hg/pyLib?")
-import MySQLdb
+import pymysql.cursors
# not using feedback button for now. Fan would have liked it, but not sure how we can write
# to any form of database.
# the list of allowed chars in cgi args: digits, letters and dashes
legalChars = set(string.digits)
-legalChars.update(set(string.letters))
+legalChars.update(set(string.ascii_letters))
legalChars.update("_-./: ")
# number of genes to show on graph by default
DEFGENECOUNT="25"
# ignore all text mining data with less than X abstracts
MINSUPP=2
# minResCount is used throughout the code. For a given interaction, it is the minimal
# number of interactions from all documents linked to this interaction.
# E.g. minResCount of 5 means that the interaction is based on at least one document
# that contained not more than 5 interactions.
# Cutoff on minResCount:
# maximum number of pairs a study can have to be considered "low-throughput"
# only interactions with at least one low-throughput study are colored in dark
@@ -137,31 +137,31 @@
#
#
#
#
#
#
def printInlineAndStyles():
#print('')
print('')
- print("""
+ print(("""
'
- print ' '
+ print(' ')
- print ""
- print ' '
+ print("")
+ print(' ')
#print ''
def openGeneAnnotFile(dataName):
" return name and lines in gbdb tab sep file "
annotData = None
for annotTuple in geneAnnotFiles:
if dataName==annotTuple[0]:
annotData = annotTuple
break
if annotData==None:
errAbort("Could not find annotations with name %s" % dataName)
_, annotFname, annotName, annotLongName = annotData
inPath = join("/gbdb", "hgFixed", "geneGraph", annotFname)
ifh = readSmallFile(inPath)
@@ -1181,125 +1183,128 @@
selfLink = makeSelfLink("Remove all filters", {"supportLevel": None})
errAbort("Sorry, there are no direct interactions with %s that fulfill your "
"filter criteria. %s" % (geneName, selfLink))
allGenes = set()
sifLines = []
for linkRow in weightedLinks:
gene1, gene2 = linkRow[:2]
allGenes.add(gene1)
allGenes.add(gene2)
if format=="sif":
sifLines.append("%s pp %s" % (gene1, gene2))
if format in ["sif", "json"]:
printHttpHead(format)
if format=="sif":
- print "\n".join(sifLines)
+ print("\n".join(sifLines))
else:
- print jsonStr
+ print(jsonStr)
return
geneDescs = queryGeneDescs(conn, allGenes)
linkSnips = querySnippets(conn, weightedLinks)
annotLabel, geneAnnots = getGeneAnnots()
writeDot(allGenes, weightedLinks, tmpName, targetGene, geneDescs, annotLabel, geneAnnots, linkSnips)
if len(allGenes)>=100:
alg = "fdp"
picName, mapName = runDot(tmpName, alg, format)
if format!="png":
printHttpHead(format)
- sys.stdout.write(open(picName,"rb").read())
+ sys.stdout.flush()
+ sys.stdout.buffer.write(open(picName,"rb").read()) # binary data must use sys.stdout.buffer
sys.stdout.flush()
return
# text above graph
print("Mouse over or click genes or lines for details. Dashed lines indicate interactions without text mining support. ")
print("Click any gene to make it the new center. Click any line to show details about the interaction. ")
- print("Only %s-interacting genes and only the most-mentioned/most-curated interactions are shown in the graph. " % (targetGene))
+ print(("Only %s-interacting genes and only the most-mentioned/most-curated interactions are shown in the graph. " % (targetGene)))
print("See the Help Page for details.
")
# menu above graph
# background #fffef5 would be an alternative
- print '
" + print("
")
# graph itself
- print '' % picName
- map = open(mapName).read()
- print map
+ print('
' % picName)
+ mapData = open(mapName, "rb").read()
+ sys.stdout.flush()
+ sys.stdout.buffer.write(mapData) # only way to get binary data to stdout. Problem may be that data is mixed latin1/utf8 in tables from Stanford
+ sys.stdout.flush()
- print '
" + print('
") def printPmidSearchForm(): " print a little form that allows to search for a PMID " - print "
' + print("
') def printDisclaimer(): - print ''' + print('''
NOTE:
Gene interactions that are not highlighted in blue were obtained with text mining software.
They include errors. Please read the original source text before relying on an interaction.
") #print("Click any gene to make it the new center. Click number of articles to show sentences.
") - print '
%s %s | ' % (cellStyle, detailsLink, newGeneLink)) + print(( '%s %s | ' % (cellStyle, detailsLink, newGeneLink))) i += 1 if i % rowSize == 0: - print "
Studies that report less than %d interactions are marked with *
' % LTCUTOFF + print("Studies that report less than %d interactions are marked with *
' % LTCUTOFF) printDbRows(conn, ppiRows) def markupSentence(row): " given a MSR-textmining row, print a sentence with the various detected words marked up in bold " sent = row.sentence tStart, tEnd = int(row.themeTokenStart), int(row.themeTokenEnd) cStart, cEnd = int(row.causeTokenStart), int(row.causeTokenEnd) trigToken = int(row.triggerTokenId) sentId = int(row.sentenceId) # put into temporary var to be able to replace " , " later parts = [] for i, word in enumerate(sent.split()): if i==tStart or i==cStart: if i==cStart: @@ -1597,45 +1602,45 @@ doneDocs.add(docInfo) yield row def printDocNlpResults(rows): " print text mining results for document-centered view " for row in iterUniqueInteractions(rows): causeGene, themeGene = row.causeGenes, row.themeGenes if "Negative" in row.relType: sym = "8867" # unicode right-tack elif "Positive" in row.relType: sym = "8594" # unicode right arrow else: sym = "8212" # unicode long dash - print '%s %s; %s: "' % (row.causeName, sym, row.themeName), - print markupSentence(row), - print '"' + print('%s %s; %s: "' % (row.causeName, sym, row.themeName)) + print(markupSentence(row)) + print('"
') def printMsrNlpRows(rows): " print msrNlp table rows with marked-up snippets " phrases = [] for row in iterUniqueInteractions(rows): causeGene, themeGene = row.causeGenes, row.themeGenes #if addGenes: #print "%s → %s :" % (causeGene, themeGene) line = markupSentence(row) phrases.append(line) - print " ... ".join(phrases) + print(" ... ".join(phrases)) # the blacklist ist not used right now, as we cannot write to the database. # pending further discussions with senior engineers #def readBlackList(conn): #""" return flagged interactions as a set of (cause, theme, pmid (as str)) """ #centralDb = cfgOption("central.db") #blackList = set() #rows = sqlQuery(conn, "SELECT causeGene, themeGene, pmid from %s.ggFeedback" % centralDb) #for row in rows: #blackList.add( (row.causeGene, row.themeGene, str(row.pmid)) ) #return blackList def queryEventText(conn, gene1, gene2): " return rows from the ggEventText table " @@ -1682,114 +1687,114 @@ if showStar and int(row.resCount)<=LTCUTOFF: note = '*' #note = "*" text = "%s %s, %s %s" % (fAu, suffix, row.journal, row.year) mouseOver = None if row.title!="": mouseOver = row.title.replace('"', "") return docLink(row.docId, text=text, mouseOver=mouseOver)+note def showSnipsLink(conn, gene1, gene2): " show snippets for a gene pair " rows = queryEventText(conn, gene1, gene2) if len(rows)!=0: - print '
' % flagLink) showPwyPpiInfo(conn, gene1, gene2) showSnipsLink(conn, gene1, gene2) def makeRefString(articleData): """ prepare a string that describes the citation: vol, issue, page, etc of journal """ refParts = [articleData.journal] if articleData.year!="": refParts[0] += (" "+articleData.year) #if articleData.vol!="": #refParts.append("Vol "+articleData.vol) #if articleData.issue!="": #refParts.append("Issue "+articleData.issue) #if articleData.page!="": #refParts.append("Page "+articleData.page) return ", ".join(refParts) def showArtInfo(conn, pmid): " show basic pubmed metadata for article " q = "SELECT * from ggDoc where docId='%s'" % (str(pmid)) rows = sqlQuery(conn, q) if len(rows)==0: - print "No metadata for docId PMID %s" % str(pmid) + print("No metadata for docId PMID %s" % str(pmid)) return p = rows[0] - print "%s, " % makeRefString(p) - print pubmedLink(p.docId) - print "
" - print '
" % p.authors - print '
") + print('
" % p.authors) + print('
" + print("Document information provided by NCBI PubMed
") return p.resCount #def printPwyRows(rows, showPmids=True): #dbToTypePmids = defaultdict(dict) # db -> intType -> list of pmid #for row in rows: #dbToTypePmids[row.db].setdefault(row.intType, []).append( (row.pmid, row.causeGene, row.themeGene )) # print "
") print ("Thank you for reporting errors, e.g.") print("
")
print ("Optional comment:
")
- print ('
") lastGene = getCgiVar("lastGene") linkUrl = makeSelfUrl({"gene":None, "link":"%s:%s" % (causeGene, themeGene), "lastGene":lastGene}) - print ('Return to the Interaction page
' % linkUrl) + print(('Return to the Interaction page' % linkUrl)) def parseGraphArgs(): " get the arguments to build a graph from the CGI parameters " gene = getCgiVar("gene") if gene is None: gene = "MTOR" gene = gene.rstrip(":") gene = gene.split()[0] gene = gene.upper() alg = getCgiVar("alg") if alg==None: alg= "neato" @@ -1949,45 +1954,45 @@ for row in sqlQuery(conn, q): for g1 in row.causeGenes.split("|"): for g2 in row.themeGenes.split("|"): pair = tuple(sorted([g1, g2])) pairs[row.sourceDb].add(pair) q = "SELECT docId, causeGenes, themeGenes from ggEventText" pairToPmids = defaultdict(set) for row in sqlQuery(conn, q): for g1 in row.causeGenes.split("|"): for g2 in row.themeGenes.split("|"): pair = tuple(sorted([g1, g2])) pairToPmids[pair].add(row.docId) pairs["literome"].add(pair) - for pair, pmids in pairToPmids.iteritems(): + for pair, pmids in pairToPmids.items(): if len(pmids)>1: pairs["literome (>= 2 PMIDs)"].add(pair) return pairs def showStats(): " " - print "