src/hg/hgGeneGraph/hgGeneGraph ecb21defee4e9278c2d6b80036e2fc7ab1f9f838

ecb21defee4e9278c2d6b80036e2fc7ab1f9f838
max
  Fri Jun 30 06:50:47 2023 -0700
more fixes for hgGeneGraph, refs #31563

diff --git src/hg/hgGeneGraph/hgGeneGraph src/hg/hgGeneGraph/hgGeneGraph
index b5df5d0..a05d236 100755
--- src/hg/hgGeneGraph/hgGeneGraph
+++ src/hg/hgGeneGraph/hgGeneGraph
@@ -1,19 +1,20 @@
 #!/usr/bin/env python3
 
 # Gene Interaction Viewer for the Genome Browser
 
+
 # query tables with prefix "gg" in hgFixed, writes the results to a dot file,
 # runs graphviz's "dot" program to create a pathway map from it and write html
 # and mapfiles to the trash directory.
 
 # CGI params: gene=(HGNCsymbol) or link=sym1:sym2
 # optional params: addNeighbors
 
 # colors:
 
 # grey+thickness = only text mining data
 # light blue, dashed = only high-throughput data
 
 # light blue, thickness = high-throughput data + text
 # dark blue, dashed = only low-throughput data
 
@@ -1219,33 +1220,31 @@
     # text above graph
     print("Mouse over or click genes or lines for details. Dashed lines indicate interactions without text mining support. ")
     print("Click any gene to make it the new center. Click any line to show details about the interaction. ")
     print(("Only %s-interacting genes and only the most-mentioned/most-curated interactions are shown in the graph. " % (targetGene)))
     print("See the <a href='../../goldenPath/help/hgGeneGraph.html'>Help Page</a> for details.<br>")
 
     # menu above graph
     # background #fffef5 would be an alternative
     print('<div style="display:inline-block"> <!-- graph -->')
     printGraphMenu(conn, targetGene, addNeighbors)
     print("<p>")
 
     # graph itself
     print('<img src="%s" usemap="#test">' % picName)
     mapData = open(mapName, "rb").read()
-    sys.stdout.flush()
-    sys.stdout.buffer.write(mapData) # only way to get binary data to stdout. Problem may be that data is mixed latin1/utf8 in tables from Stanford
-    sys.stdout.flush()
+    print(mapData.decode("latin1")) # graphviz seems to use latin1 encoding for its output file?
 
     print('</div> <!-- graph -->')
     print("<p>")
 
 def printPmidSearchForm():
     " print a little form that allows searching for a PMID "
     print("<hr>")
     print("Search for a PMID: ")
     print('<form action="hgGeneGraph" method="get">')
     print('  <input type="text" name="search">')
     print('  <input type="submit" name="Search">')
     print('</form><p>')
 
 def printDisclaimer():
     print('''
@@ -1464,30 +1463,37 @@
     if eType == "family":
         geneStr = "/".join(eGenes)
         if eName=="":
             return "%s"  % geneStr
         else:
             return "%s (%s)"  % (eName, geneStr)
 
 def printDbRows(conn, rows, onlyDoc=None):
     " print a row from the ggEventDb table as html "
     print('<ul class="more">')
     # sort by database name
     for row in sorted(rows, key=operator.itemgetter(9)):
         eventId, causeType, causeName, causeGenes, themeType, themeName, themeGenes, \
             relType, relSubtype, sourceDb, sourceId, sourceDesc, docIds, evidence = row
 
+        # in Python3, 'longblob' fields get converted to byte strings
+        causeGenes = causeGenes.decode("utf8")
+        themeGenes = themeGenes.decode("utf8")
+        sourceId = sourceId.decode("utf8")
+        docIds = docIds.decode("utf8")
+        evidence = evidence.decode("utf8")
+
         if eventId.startswith("ppi_"):
             isPpi = True
         else:
             isPpi = False
 
         docSet = set(docIds.split("|"))
 
         if eventId.startswith("ppi_iref"):
             dbName, dbUrlPat = dbData["iref"]
             dbName = "IRef %s" % sourceDb.capitalize()
         else:
             dbName, dbUrlPat = dbData[sourceDb]
 
 
         url = dbUrlPat % sourceId
@@ -1548,67 +1554,72 @@
             ppiRows.append(r)
         else:
             pwyRows.append(r)
 
     if len(pwyRows)!=0:
         print("<h3>Pathways - manually collected, often from reviews:</h3>")
         printDbRows(conn, pwyRows)
 
     if len(ppiRows)!=0:
         print("<h3>Protein-Protein interactions - manually collected from original source literature:</h3>")
         print('<p>Studies that report less than %d interactions are marked with *</p>' % LTCUTOFF)
         printDbRows(conn, ppiRows)
 
 def markupSentence(row):
     " given an MSR-textmining row, return a sentence with the various detected words marked up in bold "
-    sent = row.sentence
+    sent = row.sentence.decode("utf8")
     tStart, tEnd = int(row.themeTokenStart), int(row.themeTokenEnd)
     cStart, cEnd = int(row.causeTokenStart), int(row.causeTokenEnd)
     trigToken = int(row.triggerTokenId)
     sentId = int(row.sentenceId)
 
     # put into temporary var to be able to replace " , " later
     parts = []
     for i, word in enumerate(sent.split()):
+        themeGenes = row.themeGenes.decode("utf8") # longblob = byte string
+        causeGenes = row.causeGenes.decode("utf8")
+
         if i==tStart or i==cStart:
             if i==cStart:
-                genes = row.themeGenes # XX bug in MsrNlp tab file: Inversed?
+                genes = themeGenes # XX bug in MsrNlp tab file: Inversed?
             else:
-                genes = row.causeGenes
+                genes = causeGenes
             geneDisp = genes.replace("|", ", ")
             gene1 = genes.split("|")[0]
             parts.append( '<b><i><a href="http://www.genecards.org/cgi-bin/carddisp.pl?gene=%s" data-toggle="tooltip" data-placement="top" title="recognized genes: %s, click to go to Gene Cards" target=_blank>' % \
                 (gene1, geneDisp))
         if i==trigToken:
             parts.append( '<i class="trigger">')
         if i==tEnd or i==cEnd:
             parts.append( "</a></i></b>")
         if i==trigToken+1:
             parts.append('</i class="trigger">')
         parts.append(word)
 
     line = " ".join(parts)
     line = line.replace(" , ", ", ").replace(" . ", ". ").rstrip(". ")
     line = line.replace("-LRB-", "(").replace("-RRB-", ")")
     return line
 
 def iterUniqueInteractions(rows):
     " iterate over msrNlp rows, but remove interactions we already had (=ignore direction) "
     doneDocs = set()
     for row in rows:
         # make sure to skip duplicated info (=same genes + same document + same sentence)
-        docInfo = tuple(sorted([row.causeGenes, row.themeGenes, row.docId, row.sentenceId]))
+        causeGenes = row.causeGenes.decode("utf8") # longblob = byte string
+        themeGenes = row.themeGenes.decode("utf8")
+        docInfo = tuple(sorted([causeGenes, themeGenes, row.docId, row.sentenceId]))
         if docInfo in doneDocs:
             continue
         doneDocs.add(docInfo)
         yield row
 
 def printDocNlpResults(rows):
     " print text mining results for document-centered view "
     for row in iterUniqueInteractions(rows):
         causeGene, themeGene = row.causeGenes, row.themeGenes
 
         if "Negative" in row.relType:
             sym = "8867" # unicode right-tack
         elif "Positive" in row.relType:
             sym = "8594" # unicode right arrow
         else:
@@ -1661,68 +1672,70 @@
 def prettyDocLinks(conn, pmids):
     " given a list of pmids, return a list of nice links to articles on our own site "
     quoteList = ['"%s"' % pmid for pmid in pmids]
     idStr = "(%s)" % (",".join(quoteList))
     # XX remove distinct if not needed anymore
     query = "SELECT authors, title, journal, year, docId, resCount FROM ggDoc WHERE docId IN %s" % idStr
     rows = sqlQuery(conn, query)
 
     links = []
     for row in rows:
         links.append(prettyDocLink(row))
     return ", ".join(links)
 
 def prettyDocLink(row, showStar=True):
     " given a row that includes author/title/year/docId fields, return a nice link to our doc view "
-    authors = row.authors.split("; ")
+    authors = row.authors.decode("utf8").split("; ") # mysql longblob => byte string
     if len(authors)>0:
         fAu = authors[0].split(",")[0]
     else:
         fAu = ""
 
     suffix = ""
     if len(authors)>1:
         suffix = " et al."
 
     note = ""
     if showStar and int(row.resCount)<=LTCUTOFF:
         note = '<a href="#" title="Low-throughput publication">*</a>'
         #note = "*"
     text = "%s %s, %s %s" % (fAu, suffix, row.journal, row.year)
 
     mouseOver = None
-    if row.title!="":
-        mouseOver = row.title.replace('"', "")
+    title = row.title.decode("utf8") # longblob = byte string
+    if title!="":
+        mouseOver = title.replace('"', "")
     return docLink(row.docId, text=text, mouseOver=mouseOver)+note
 
 def showSnipsLink(conn, gene1, gene2):
     " show snippets for a gene pair "
     rows = queryEventText(conn, gene1, gene2)
 
     if len(rows)!=0:
         print('<h3>Text-mined interactions from <A HREF="http://literome.azurewebsites.net/Network?gene1=%s&gene2=%s">Literome</A></h3>' % (gene1, gene2))
 
     byDoc = defaultdict(list)
     for row in rows:
         byDoc[row.docId].append(row)
 
     for docId, rows in byDoc.items():
         print(prettyDocLink(rows[0], showStar=False))
 
         disSuffix = ""
-        if rows[0].context!="":
-            contexts = rows[0].context.split("|")
+        context = rows[0].context.decode("utf8") # longblob = byte string
+        if context!="":
+            contexts = context.split("|")
             if len(contexts)>1:
                 suffix = "..."
             else:
                 suffix = ""
             disSuffix = "(%s%s)" % (contexts[0], suffix)
 
         print("%s :" % disSuffix)
         printMsrNlpRows(rows)
         print("<br>")
 
 
 def showLink(link):
     " print details page with db info and snippets for a gene interaction "
     if ":" not in link:
         errAbort("'link' CGI parameter has to contain a colon-separated pair of genes, like PITX2:TBX5")