src/hg/hgGeneGraph/hgGeneGraph ea7173bd838a0f4795a1222eb5b49c85defbc4cf

ea7173bd838a0f4795a1222eb5b49c85defbc4cf
max
  Mon Jul 3 05:13:11 2023 -0700
the proper fix for hgGeneGraph unicode problems, taking back some of the earlier changes, refs #31563

diff --git src/hg/hgGeneGraph/hgGeneGraph src/hg/hgGeneGraph/hgGeneGraph
index 2fdac98..34adb6a 100755
--- src/hg/hgGeneGraph/hgGeneGraph
+++ src/hg/hgGeneGraph/hgGeneGraph
@@ -19,51 +19,54 @@
 # dark blue, dashed = only low-throughput data
 
 # dark blue, thickness = low-throughput data + text
 # dark blue + dashed  = only pathway data
 
 # code review
 # - os.system is not a security risk here, no variables go into the cmd line
 # - mysql statements are not escaped, instead all CGI vars are checked for non-alpha letters
 
 # hgFixed tables required for this script: ggLink (main table with gene-gene links), 
 # ggLinkEvent (details about link), ggEventDb (details about links from databases), 
 # ggEventText (details about links from text mining), ggDoc (details about documents for ggEventText)
 # ggGeneName (symbols), ggGeneClass (HPRD/Panther class)
 
 # these are default python modules on python 2.7, no errors expected here
-import sys, cgi, os, string, urllib.request, urllib.parse, urllib.error, operator, hashlib, codecs
+import sys, cgi, os, string, urllib.request, urllib.parse, urllib.error, operator, hashlib
 from sys import exit
 from collections import defaultdict, namedtuple
 from os.path import *
 
 import cgitb
 cgitb.enable()
 
 # import the UCSC-specific library
 sys.path.append(join(dirname(__file__), "pyLib"))
 try:
     from hgLib3 import cgiArgs, cgiSetup, cgiString, printContentType, printMenuBar, \
             sqlConnect, sqlQuery, errAbort, cfgOption, runCmd, cgiGetAll, printHgcHeader, \
             printHgcSection, getNonce, getCspMetaHeader, jsOnEventById, \
             jsInlineFinish, webStartGbNoBanner, htmlPageEnd, hConnectCentral, \
-            sqlTableExists, readSmallFile
+            sqlTableExists, readSmallFile, forceUnicode
 except:
     print("Content-type: text/html\n")
     print("Cannot find the directory cgi-bin/pyLib in Apache. This is an installation error.")
     print("All all parts of cgi-bin installed? Did you do 'make' in kent/src/hg/pyLib?")
 
+import hgLib3
+hgLib3.forceUnicode = True
+
 import pymysql.cursors
 
 # not using feedback button for now. Fan would have liked it, but not sure how we can write
 # to any form of database.
 
 # the list of allowed chars in cgi args: digits, letters and dashes
 legalChars = set(string.digits)
 legalChars.update(set(string.ascii_letters))
 legalChars.update("_-./: ")
 
 # number of genes to show on graph by default
 DEFGENECOUNT="25"
 # ignore all text mining data with less than X abstracts
 MINSUPP=2
 
@@ -124,44 +127,33 @@
 "json" : "application/json",
 "sif" : "application/octet-stream"
 }
 
 # gbdb file names and descriptions
 geneAnnotFiles = [
 ("none", None, "No Annotation", "Do not annotate color genes on graph by external information"),
 ("gnf2", "gnf2Avg.tab", "GNF2 Expression", "Gene Expression Atlas 2 average across tissues"),
 ("drugbank", "drugbank.tab", "DrugBank", "DrugBank. Black = gene is targetable with a drug. Mouse-over shows drug names"),
 ("cosmic" , "cosmicCensus.tab", "Cancer Gene Census", "COSMIC Cancer Gene Census Tumor Types. Black = gene is in cancer gene census. Mouse-over shows cancer type"),
 ("tcgaMut" , "tcgaMut.tab", "Pan-Cancer Mutations", "TCGA PanCan12 samples with non-silent mutations - Gene mouse-over shows count")
 ]
 
 # ==== FUNCTIONS ===
 
-#<script src="//code.jquery.com/jquery-1.10.2.js"></script>
-#<script src="//code.jquery.com/ui/1.11.0/jquery-ui.js"></script>
-#<script src="../js/readmore.min.js"></script>
-
-#<link rel="stylesheet" href="//code.jquery.com/ui/1.11.0/themes/smoothness/jquery-ui.css">
-#<link rel="stylesheet" href="../style/HGStyle.css" type="text/css" />
-#<link rel='stylesheet' href='../style/nice_menu.css' type='text/css' />
-
 def printInlineAndStyles():
-    #print('<script src="//code.jquery.com/ui/1.11.0/jquery-ui.js"></script>')
     print('<script src="//cdn.rawgit.com/jedfoster/Readmore.js/master/readmore.min.js"></script>')
 
-
-
     print(("""
 <script type="text/javascript" nonce='%s'>
   $(function() {
     //$( document ).uitooltip();
     //$( document ).uitooltip();
     $('[data-toggle="tooltip"]').bsTooltip(); // bootstrap does not really allow HTML in the title attribute
 
     // use jquery ui tooltips for the graph
     var opt = {
         items: "area",
         track : true,
         content: function() {return $(this).prop('title')}
     };
     $("area").uitooltip(opt);
 
@@ -1220,31 +1212,35 @@
     # text above graph
     print("Mouse over or click genes or lines for details. Dashed lines indicate interactions without text mining support. ")
     print("Click any gene to make it the new center. Click any line to show details about the interaction. ")
     print(("Only %s-interacting genes and only the most-mentioned/most-curated interactions are shown in the graph. " % (targetGene)))
     print("See the <a href='../../goldenPath/help/hgGeneGraph.html'>Help Page</a> for details.<br>")
 
     # menu above graph
     # background #fffef5 would be an alternive
     print('<div style="display:inline-block"> <!-- graph -->')
     printGraphMenu(conn, targetGene, addNeighbors)
     print("<p>")
 
     # graph itself
     print('<img src="%s" usemap="#test">' % picName)
     mapData = open(mapName, "rb").read()
-    print(mapData.decode("latin1")) # graphviz seems to use latin1 encoding for its output file?
+    # the data in the graphviz map file seems to be sometimes utf8 and sometimes other codepages (not latin1). So passing it through as a byte string
+    # to the byte-buffer underneath sys.stdout. I have not found another way to make this work. Not sure why web browsers can show it just fine.
+    sys.stdout.flush()
+    sys.stdout.buffer.write(mapData)
+    sys.stdout.flush()
 
     print('</div> <!-- graph -->')
     print("<p>")
 
 def printPmidSearchForm():
     " print a little form that allows to search for a PMID "
     print("<hr>")
     print("Search for a PMID: ")
     print('<form action="hgGeneGraph" method="get">')
     print('  <input type="text" name="search">')
     print('  <input type="submit" name="Search">')
     print('</form><p>')
 
 def printDisclaimer():
     print('''
@@ -1351,31 +1347,31 @@
     # convert list of pairs to mysql quoted string list
     newList = []
     for pairData in pairs:
         g1, g2 = pairData[:2]
         pairQuote = "('%s', '%s')" % (g1, g2)
         newList.append(pairQuote)
     listStr = "(%s)" % ",".join(newList)
 
     query = "SELECT gene1, gene2, snippet FROM ggLink "\
         "WHERE (gene1, gene2) in %s" % listStr
 
     rows = sqlQuery(conn, query)
 
     pairSnips = {}
     for g1, g2, snip in rows:
-        pairSnips[(g1,g2)]=snip.decode("utf8")
+        pairSnips[(g1,g2)]=snip
     return pairSnips
 
 def makeHgGeneLink(sym, db):
     return "<a title='Link to Gene Details page' href='hgGene?db=%s&hgg_gene=%s'>%s</a>" % (db, sym, sym)
 
 def showGraphBrowser():
     " run graphviz on gene graph around gene, print html img and html map "
     gene, alg, addNeighbors, sortByCount, geneCount = parseGraphArgs()
 
     conn = sqlConnect(GGDB)
 
     graphLinks, lowLinks = buildGraph(conn, gene, geneCount, MINSUPP, addNeighbors)
     weightedLinks, minAbsCount = flattenLink(graphLinks)
 
     humanDb = getCgiVar("db", "hg19")
@@ -1463,37 +1459,30 @@
     if eType == "family":
         geneStr = "/".join(eGenes)
         if eName=="":
             return "%s"  % geneStr
         else:
             return "%s (%s)"  % (eName, geneStr)
 
 def printDbRows(conn, rows, onlyDoc=None):
     " print a row from the ggEventDb table as html "
     print('<ul class="more">')
     # sort by database name
     for row in sorted(rows, key=operator.itemgetter(9)):
         eventId, causeType, causeName, causeGenes, themeType, themeName, themeGenes, \
             relType, relSubtype, sourceDb, sourceId, sourceDesc, docIds, evidence = row
 
-        # in Python3, 'longblob' fields get converted to byte strings
-        causeGenes = causeGenes.decode("utf8")
-        themeGenes = themeGenes.decode("utf8")
-        sourceId = sourceId.decode("utf8")
-        docIds = docIds.decode("utf8")
-        evidence = evidence.decode("utf8")
-
         if eventId.startswith("ppi_"):
             isPpi = True
         else:
             isPpi = False
 
         docSet = set(docIds.split("|"))
 
         if eventId.startswith("ppi_iref"):
             dbName, dbUrlPat = dbData["iref"]
             dbName = "IRef %s" % sourceDb.capitalize()
         else:
             dbName, dbUrlPat = dbData[sourceDb]
 
 
         url = dbUrlPat % sourceId
@@ -1554,104 +1543,100 @@
             ppiRows.append(r)
         else:
             pwyRows.append(r)
 
     if len(pwyRows)!=0:
         print("<h3>Pathways - manually collected, often from reviews:</h3>")
         printDbRows(conn, pwyRows)
 
     if len(ppiRows)!=0:
         print("<h3>Protein-Protein interactions - manually collected from original source literature:</h3>")
         print('<p>Studies that report less than %d interactions are marked with *</p>' % LTCUTOFF)
         printDbRows(conn, ppiRows)
 
 def markupSentence(row):
-    " given a MSR-textmining row, print a sentence with the various detected words marked up in bold "
+    " given a MSR-textmining row, return its sentence as an HTML string with the detected genes and the trigger word marked up "
-    sent = row.sentence.decode("utf8")
+    sent = row.sentence
     tStart, tEnd = int(row.themeTokenStart), int(row.themeTokenEnd)
     cStart, cEnd = int(row.causeTokenStart), int(row.causeTokenEnd)
     trigToken = int(row.triggerTokenId)
     sentId = int(row.sentenceId)
 
     # put into temporary var to be able to replace " , " later
     parts = []
+    themeGenes = row.themeGenes # same for every token, so read once before the loop
+    causeGenes = row.causeGenes
     for i, word in enumerate(sent.split()):
-        themeGenes = row.themeGenes.decode("utf8") # longblob = byte string
-        causeGenes = row.causeGenes.decode("utf8")
 
         if i==tStart or i==cStart:
             if i==cStart:
                 genes = themeGenes # XX bug in MsrNlp tab file: Inversed?
             else:
                 genes = causeGenes
             geneDisp = genes.replace("|", ", ")
             gene1 = genes.split("|")[0]
             parts.append( '<b><i><a href="http://www.genecards.org/cgi-bin/carddisp.pl?gene=%s" data-toggle="tooltip" data-placement="top" title="recognized genes: %s, click to go to Gene Cards" target=_blank>' % \
                 (gene1, geneDisp))
         if i==trigToken:
             parts.append( '<i class="trigger">')
         if i==tEnd or i==cEnd:
             parts.append( "</a></i></b>")
         if i==trigToken+1:
-            parts.append('</i class="trigger">')
+            parts.append('</i>') # HTML end tags cannot carry attributes
         parts.append(word)
 
     line = " ".join(parts)
     line = line.replace(" , ", ", ").replace(" . ", ". ").rstrip(". ")
     line = line.replace("-LRB-", "(").replace("-RRB-", ")")
     return line
 
 def iterUniqueInteractions(rows):
     " iterate over msrNlp rows, but remove interactions we already had (=ignore direction) "
     doneDocs = set()
     for row in rows:
         # make sure to skip duplicated info (=same genes + same document + same sentence)
-        causeGenes = row.causeGenes.decode("utf8") # longblob = byte string
-        themeGenes = row.themeGenes.decode("utf8")
+        causeGenes = row.causeGenes
+        themeGenes = row.themeGenes
         docInfo = tuple(sorted([causeGenes, themeGenes, row.docId, row.sentenceId]))
         if docInfo in doneDocs:
             continue
         doneDocs.add(docInfo)
         yield row
 
 def printDocNlpResults(rows):
     " print text mining results for document-centered view "
     for row in iterUniqueInteractions(rows):
-        causeGene, themeGene = row.causeGenes, row.themeGenes
-
         if "Negative" in row.relType:
             sym = "8867" # unicode right-tack
         elif "Positive" in row.relType:
             sym = "8594" # unicode right arrow
         else:
             sym = "8212" # unicode long dash
 
         print('%s &#%s; %s: "' % (row.causeName, sym, row.themeName))
         print(markupSentence(row))
         print('"<p>')
 
 def printMsrNlpRows(rows):
     " print msrNlp table rows with marked-up snippets "
     phrases = []
     for row in iterUniqueInteractions(rows):
-        causeGene, themeGene = row.causeGenes, row.themeGenes
-        #if addGenes:
-            #print "%s &#8594; %s :" % (causeGene, themeGene)
         line = markupSentence(row)
         phrases.append(line)
 
-    print(" ... ".join(phrases))
+    lines = " ... ".join(phrases)
+    print(lines)
 
 # the blacklist ist not used right now, as we cannot write to the database. 
 # pending further discussions with senior engineers
 
 #def readBlackList(conn):
     #""" return flagged interactions as a set of (cause, theme, pmid (as str)) """
     #centralDb = cfgOption("central.db")
     #blackList = set()
     #rows = sqlQuery(conn, "SELECT causeGene, themeGene, pmid from %s.ggFeedback" % centralDb)
     #for row in rows:
         #blackList.add( (row.causeGene, row.themeGene, str(row.pmid)) )
     #return blackList
 
 def queryEventText(conn, gene1, gene2):
     " return rows from the ggEventText table "
@@ -1672,68 +1657,68 @@
 def prettyDocLinks(conn, pmids):
     " given a list of pmids, return a list of nice links to articles on our own site "
     quoteList = ['"%s"' % pmid for pmid in pmids]
     idStr = "(%s)" % (",".join(quoteList))
     # XX remove distinct if not needed anymore
     query = "SELECT authors, title, journal, year, docId, resCount FROM ggDoc WHERE docId IN %s" % idStr
     rows = sqlQuery(conn, query)
 
     links = []
     for row in rows:
         links.append(prettyDocLink(row))
     return ", ".join(links)
 
 def prettyDocLink(row, showStar=True):
     " given a row that includes author/title/year/docId fields, return a nice link to our doc view "
-    authors = row.authors.decode("utf8").split("; ") # mysql longblob => byte string
+    authors = row.authors.split("; ") # no decode needed anymore, but keep one list entry per author: len() below must count authors, not characters
     if len(authors)>0:
         fAu = authors[0].split(",")[0]
     else:
         fAu = ""
 
     suffix = ""
     if len(authors) > 1:
         suffix = " et al."
 
     note = ""
     if showStar and int(row.resCount)<=LTCUTOFF:
         note = '<a href="#" title="Low-throughput publication">*</a>'
         #note = "*"
     text = "%s %s, %s %s" % (fAu, suffix, row.journal, row.year)
 
     mouseOver = None
-    title = row.title.decode("utf8") # longBlob = byte string
+    title = row.title
     if title!="":
         mouseOver = title.replace('"', "")
     return docLink(row.docId, text=text, mouseOver=mouseOver)+note
 
 def showSnipsLink(conn, gene1, gene2):
     " show snippets for a gene pair "
     rows = queryEventText(conn, gene1, gene2)
 
     if len(rows)!=0:
         print('<h3>Text-mined interactions from <A HREF="http://literome.azurewebsites.net/Network?gene1=%s&gene2=%s">Literome</A></h3>' % (gene1, gene2))
 
     byDoc = defaultdict(list)
     for row in rows:
         byDoc[row.docId].append(row)
 
     for docId, rows in byDoc.items():
         print(prettyDocLink(rows[0], showStar=False))
 
         disSuffix = ""
-        context = rows[0].context.decode("utf8") # longblob = byte string
+        context = rows[0].context
         if context!="":
             contexts = context.split("|")
             if len(contexts)>1:
                 suffix = "..."
             else:
                 suffix = ""
             disSuffix = "(%s%s)" % (contexts[0], suffix)
 
         print("%s :" % disSuffix)
         printMsrNlpRows(rows)
         print("<br>")
 
 
 def showLink(link):
     " print details page with db info and snippets for a gene interaction "
@@ -1978,30 +1963,31 @@
                 pair = tuple(sorted([g1, g2]))
                 pairToPmids[pair].add(row.docId)
                 pairs["literome"].add(pair)
 
     for pair, pmids in pairToPmids.items():
         if len(pmids)>1:
             pairs["literome (>= 2 PMIDs)"].add(pair)
     return pairs
 
 def showStats():
     " "
     print("<h3>Databases - number of unique pairs</h3>")
     print("<ul>")
 
     conn = sqlConnect(GGDB)
+
     pairs = uniquePairs(conn)
     for db, pairSet in sorted(pairs.items()):
         print("<li>%s: %d" % (db, len(pairSet)))
     print("</ul>")
 
 def htmlMiddle():
     " print html middle part "
     sys.stdout.flush()
 
     flag = getCgiVar("flag")
     if flag!=None:
         flagInteraction(flag)
         exit(0)
         
     remove = getCgiVar("remove")
@@ -2028,31 +2014,33 @@
 
     showGraphBrowser()
 
 def main():
     cgiSetup()
 
     format = getCgiVar("format")
     if format in ["pdf", "svg", "sif", "json"]:
         conn = sqlConnect(GGDB)
         gene, alg, addNeighbors, sortByCount, geneCount = parseGraphArgs()
         graphLinks, lowLinks = buildGraph(conn, gene, geneCount, MINSUPP, addNeighbors)
         weightedLinks, minAbsCount = flattenLink(graphLinks)
         printGraph(conn, weightedLinks, alg, addNeighbors, gene, format)
         sys.exit(0)
 
+
     # Apache doesn't set LANG, so the default encoding of stdout is ASCII: Change it to utf8
+    import codecs
     sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)
 
     printContentType()
 
     if cgiString("debug") is not None:
         global DEBUG
         DEBUG = True
 
     htmlHeader()
     printInlineAndStyles()
     htmlMiddle()
     jsInlineFinish()
     htmlPageEnd()
 
 main()