84b876bd80b78018f4f71360a464c43dfe699431 max Thu Dec 12 02:45:06 2024 -0800 porting hgBeacon to python3, refs #34921

diff --git src/hg/hgBeacon/hgBeacon src/hg/hgBeacon/hgBeacon
index 3ba0977..2832067 100755
--- src/hg/hgBeacon/hgBeacon
+++ src/hg/hgBeacon/hgBeacon
@@ -1,21 +1,21 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # A beacon allows very limited queries against a set of variants without allowing someone
 # to download the list of variants
 # see ga4gh.org/#/beacon (UCSC redmine 14393)
-import cgi, subprocess, sys, cgitb, os, socket, time, json, glob, urlparse, time
+import cgi, subprocess, sys, cgitb, os, socket, time, json, glob, urllib.parse, time
 import sqlite3, gzip, optparse, gc, string, re, socket
 from os.path import join, isfile, dirname, isdir, basename, exists

 # general info about this beacon
 hostName = os.environ.get("HTTP_HOST", "localhost")
 homepage = "http://%s/cgi-bin/hgBeacon" % hostName
 beaconDesc = { "id":"ucsc-browser", \
     "name":"Genome Browser", \
     "organization" : "UCSC", \
     "description" : "UCSC Genome Browser",
     "api" : "0.2",
     "homepage": homepage }
 # the list of genome references we support
@@ -35,45 +35,45 @@
 DataSetDescs = {
     "hgmd" : "Human Gene Mutation Database, only single-nucleotide variants, public version, provided by Biobase",
     "lovd" : "Leiden Open Variation Database installations that agreed to share their variants, only single-nucleotide variants and deletions",
     "test" : "small test data on Chromosome 1, from ICGC",
     "test2" : "another piece of test data from Chromosome 1, also from ICGC"
 }

 # special case: some datasets do not have alt alleles. In this case, an overlap is enough to trigger a "true"
 NoAltDataSets = ["hgmd"]

 def queryBottleneck(host, port, ip):
     " contact UCSC-style bottleneck server to get current delay time "
     # send ip address
     s = socket.socket()
     s.connect((host, int(port)))
-    msg = ip
-    d = chr(len(msg))+msg
+    msg = ip.encode("latin1")
+    d = chr(len(msg)).encode("latin1")+msg
     s.send(d)

     # read delay time
     expLen = ord(s.recv(1))
     totalLen = 0
     buf = list()
     while True:
         resp = s.recv(1024)
         buf.append(resp)
         totalLen+= len(resp)
         if totalLen==expLen:
             break
-    return int("".join(buf))
+    return int(b"".join(buf))

 def parseConf(fname):
     " parse a hg.conf style file, return as dict key -> value (both are strings) "
     conf = {}
     for line in open(fname):
         line = line.strip()
         if line.startswith("#"):
             continue
         elif line.startswith("include "):
             inclFname = line.split()[1]
             inclPath = join(dirname(fname), inclFname)
             if isfile(inclPath):
                 inclDict = parseConf(inclPath)
                 conf.update(inclDict)
         elif "=" in line: # string search for "="
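The wire format used by queryBottleneck() is one length byte followed by the payload, in both directions. Below is a minimal sketch of a fake bottleneck server for exercising the ported function locally; it is not part of the commit, the port number and delay value are made up, and a real deployment takes host and port from hg.conf:

import socket, threading, time

def fakeBottleneck(port=12345, delayMs=b"250"):
    " answer one client with a fixed delay, using the same one-length-byte framing (illustration only) "
    srv = socket.socket()
    srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    srv.bind(("127.0.0.1", port))
    srv.listen(1)
    conn, _ = srv.accept()
    msgLen = conn.recv(1)[0]                     # length byte written by queryBottleneck
    conn.recv(msgLen)                            # the client's IP string, ignored here
    conn.send(bytes([len(delayMs)]) + delayMs)   # reply uses the same framing
    conn.close()
    srv.close()

threading.Thread(target=fakeBottleneck, daemon=True).start()
time.sleep(0.2)                                  # give the server a moment to bind
print(queryBottleneck("127.0.0.1", 12345, "192.0.2.1"))  # expected output: 250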
@@ -101,77 +101,77 @@
 def errAbort(errMsg=None):
     " exit with error message "
     if errMsg == None:
         sys.exit(0)
     helpUrl = "http://%s/cgi-bin/hgBeacon?page=help" % hostName
     ret = {"errormsg":errMsg,
         "more_info":"for a complete description of the parameters, read the help message at %s" % helpUrl}
     printJson(ret)
     sys.exit(0)

 def printHelp():
     " print help text to stdout and exit "
-    print "Content-Type: text/html\n"
-    print "<html><body>"
+    print("Content-Type: text/html\n")
+    print("<html><body>")
     host = hostName # convert from global to local var
     if host.endswith(".ucsc.edu"):
         helpDir = "/gbdb/hg19/beacon"
     else:
         helpDir = dirname(__file__)
     helpPath = join(helpDir, "help.txt")
     if not exists(helpPath):
         errAbort("no file %s found. The beacon is not activated on this machine" % helpPath)
     helpText = open(helpPath).read()
-    print helpText % locals()
-    print "</body></html>"
+    print(helpText % locals())
+    print("</body></html>")
     sys.exit(0)

 def dataSetResources():
     " Returns the list of DataSetResources "
     totalSize = 0
     dsrList = []
     conn = dbOpen(mustExist=True)
     for tableName in dbListTables(conn):
         rows = dbQuery(conn, "SELECT COUNT(*) from %s" % tableName, None)
         itemCount = rows[0][0]
         # the dataset ID is just the file basename without extension
         dsId = tableName
         dsr = (dsId, DataSetDescs.get(dsId, ""), itemCount)
         dsrList.append(dsr)
         totalSize += itemCount
     return totalSize, dsrList

 def beaconInfo():
     " return a beaconInfo dict "
     size, dsrList = dataSetResources()
     return \
         { "beacon": beaconDesc,
         "references": beaconReferences,
         "datasets": dsrList,
         "size": size }

 def printJson(data):
-    print "Content-Type: application/json\n"
-    print json.dumps(data, indent=4, sort_keys=True,separators=(',', ': '))
+    print("Content-Type: application/json\n")
+    print(json.dumps(data, indent=4, sort_keys=True,separators=(',', ': ')))

 def hgBotDelay():
     " implement bottleneck delay, get bottleneck server from hg.conf "
     global hgConf
     hgConf = parseHgConf("..")
     if "bottleneck.host" not in hgConf:
         return
     ip = os.environ["REMOTE_ADDR"]
     delay = queryBottleneck(hgConf["bottleneck.host"], hgConf["bottleneck.port"], ip)
     if delay>10000:
         time.sleep(delay/1000.0)
     if delay>20000:
         errAbort("IP blocked")
         sys.exit(0)
@@ -362,200 +362,200 @@
         if refIsOne and altIsOne:
             # single bp subst
             beaconAllele = alt
         elif not refIsOne and altIsOne:
             # deletion
             beaconAllele = "D"+str(len(ref)-1) # skip first nucleotide, VCF always adds one nucleotide
             pos += 1
         elif refIsOne and not altIsOne:
             # insertion
             beaconAllele = "I"+alt[1:]
             pos += 1
         elif not refIsOne and not altIsOne:
             skipCount +=1
         else:
-            print "Error: invalid VCF fields:", fields
+            print("Error: invalid VCF fields:", fields)
             sys.exit(1)

         if len(rows) % 500000 == 0:
-            print "Read %d rows..." % len(rows)
+            print("Read %d rows..." % len(rows))

         dataRow = (chrom, pos, beaconAllele)
         if dataRow in doneData:
             continue
         rows.append( dataRow )
         doneData.add( dataRow )

-    print "skipped %d VCF lines with empty ALT alleles" % emptyCount
-    print "skipped %d VCF lines with both ALT and REF alleles len!=1, cannot encode as beacon queries" % skipCount
+    print("skipped %d VCF lines with empty ALT alleles" % emptyCount)
+    print("skipped %d VCF lines with both ALT and REF alleles len!=1, cannot encode as beacon queries" % skipCount)
     return rows

 def readAllelesLovd(ifh):
     """ read the LOVD bed file and return in format (chrom, pos, altAllele)
     This function is only used internally at UCSC.
     """
     alleles = []
     count = 0
     skipCount = 0
     for line in ifh:
         if line.startswith("chrom"):
             continue
         chrom, start, end, desc = line.rstrip("\n").split("\t")[:4]
         if desc[-2]==">":
             mutDesc = desc[-3:]
             ref, _, alt = mutDesc
             assert(len(mutDesc)==3)
         elif desc.endswith("del"):
             alt = "D"+str(int(end)-int(start))
         else:
             skipCount += 1
             continue
         chrom = chrom.replace("chr", "")
         start = int(start)
         alleles.append( (chrom, start, alt) )
-    print "read %d alleles, skipped %d non-SNV or del alleles" % (len(alleles), skipCount)
+    print("read %d alleles, skipped %d non-SNV or del alleles" % (len(alleles), skipCount))
     return list(set(alleles))
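To illustrate the two LOVD description styles that readAllelesLovd() accepts (a trailing "X>Y" substitution and a trailing "del"), a small sketch; the input lines are made up for this example:

from io import StringIO

# hypothetical LOVD bed lines, header plus one SNV and one 3 bp deletion
lovdBed = StringIO("chrom\tstart\tend\tdesc\n"
                   "chr1\t100\t101\tPEX10:c.1A>G\n"
                   "chr1\t200\t203\tPEX10:c.2del\n")
print(readAllelesLovd(lovdBed))
# prints "read 2 alleles, skipped 0 non-SNV or del alleles" and returns
# ('1', 100, 'G') and ('1', 200, 'D3'), in set order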
""" # chr1 2338004 2338005 PEX10:CM090797 0 2338004 2338005 PEX10 CM090797 substitution alleles = [] count = 0 skipCount = 0 for line in ifh: fields = line.rstrip("\n").split("\t") chrom, start, end = fields[:3] desc = fields[11] start = int(start) end = int(end) if desc=="substitution" or desc=="splicing variant": assert(end-start==1) alt = "*" else: skipCount += 1 continue chrom = chrom.replace("chr", "") alleles.append( (chrom, start, alt) ) - print "read %d alleles, skipped %d non-SNV alleles" % (len(alleles), skipCount) + print("read %d alleles, skipped %d non-SNV alleles" % (len(alleles), skipCount)) return list(set(alleles)) def iterChunks(seq, size): " yields chunks of at most size elements from seq " for pos in range(0, len(seq), size): yield seq[pos:pos + size] def printTime(time1, time2, rowCount): timeDiff = time2 - time1 - print "Time: %f secs for %d rows, %d rows/sec" % (timeDiff, rowCount, rowCount/timeDiff) + print("Time: %f secs for %d rows, %d rows/sec" % (timeDiff, rowCount, rowCount/timeDiff)) def importFile(fileName, datasetName, format): """ open the sqlite db, create a table datasetName and write the data in fileName into it """ conn = dbOpen() dbMakeTable(conn, datasetName) # try to make sqlite writes as fast as possible conn.execute("PRAGMA synchronous=OFF") conn.execute("PRAGMA count_changes=OFF") # http://blog.quibb.org/2010/08/fast-bulk-inserts-into-sqlite/ conn.execute("PRAGMA cache_size=800000") # http://web.utk.edu/~jplyon/sqlite/SQLite_optimization_FAQ.html conn.execute("PRAGMA journal_mode=OFF") # http://www.sqlite.org/pragma.html#pragma_journal_mode conn.execute("PRAGMA temp_store=memory") conn.commit() if fileName.endswith(".gz"): ifh = gzip.open(fileName) else: ifh = open(fileName) # see http://stackoverflow.com/questions/1711631/improve-insert-per-second-performance-of-sqlite # for background why I do it like this - print "Reading %s into database table %s" % (fileName, datasetName) + print("Reading %s into database table %s" % (fileName, datasetName)) rowCount = 0 startTime = time.time() if format=="vcf": alleles = readAllelesVcf(ifh) elif format=="lovd": alleles = readAllelesLovd(ifh) elif format=="hgmd": alleles = readAllelesHgmd(ifh) loadTime = time.time() printTime(startTime, loadTime, len(alleles)) - print "Loading alleles into database" + print("Loading alleles into database") for rows in iterChunks(alleles, 50000): sql = "INSERT INTO %s (chrom, pos, allele) VALUES (?,?,?)" % datasetName conn.executemany(sql, rows) conn.commit() rowCount += len(rows) insertTime = time.time() printTime(loadTime, insertTime, len(alleles)) - print "Indexing database table" + print("Indexing database table") conn.execute("CREATE UNIQUE INDEX '%s_index' ON '%s' ('chrom', 'pos', 'allele')" % \ (datasetName, datasetName)) indexTime = time.time() printTime(insertTime, indexTime, len(alleles)) def mainCommandLine(): " main function if called from command line " args, options = parseArgs() fileName = args[0] if len(args)==2: datasetName = args[1] else: datasetName = "defaultDataset" importFile(fileName, datasetName, options.format) def mainCgi(): url = os.environ["REQUEST_URI"] - parsedUrl = urlparse.urlparse(url) + parsedUrl = urllib.parse.urlparse(url) # get CGI parameters form = cgi.FieldStorage() # react based on symlink that was used to call this script page = parsedUrl[2].split("/")[-1] # last part of path is REST endpoint if page=="info": printJson(beaconInfo()) sys.exit(0) hgBotDelay() chrom = form.getfirst("chromosome") pos = form.getfirst("position") refBases = 
 def mainCgi():
     url = os.environ["REQUEST_URI"]
-    parsedUrl = urlparse.urlparse(url)
+    parsedUrl = urllib.parse.urlparse(url)

     # get CGI parameters
     form = cgi.FieldStorage()

     # react based on symlink that was used to call this script
     page = parsedUrl[2].split("/")[-1] # last part of path is REST endpoint
     if page=="info":
         printJson(beaconInfo())
         sys.exit(0)

     hgBotDelay()

     chrom = form.getfirst("chromosome")
     pos = form.getfirst("position")
     refBases = form.getfirst("referenceBases")
     altBases = form.getfirst("alternateBases")
     reference = form.getfirst("reference")
     dataset = form.getfirst("dataset")
     format = form.getfirst("format")

     if chrom==None and pos==None and altBases==None:
         printHelp()
         sys.exit(0)

     ret = lookupAlleleJson(chrom, pos, altBases, refBases, reference, dataset)
     if format=="text":
-        print "Content-Type: text/html\n"
-        print ret["response"]["exists"]
+        print("Content-Type: text/html\n")
+        print(ret["response"]["exists"])
     else:
         printJson(ret)

 if __name__=="__main__":
     # deactivate this on the RR, but useful for debugging: prints a http header
     # on errors
     main()
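A sketch of querying the ported CGI from Python 3; the hostname, path suffix, and variant are made up, the parameter names are the ones mainCgi() reads above, and the reply shape follows the ret["response"]["exists"] access in the text-format branch:

import urllib.request, json

# hypothetical deployment; only the "info" suffix is special-cased by mainCgi()
url = ("http://localhost/cgi-bin/hgBeacon/query?"
       "chromosome=1&position=1234567&alternateBases=A&dataset=test")
with urllib.request.urlopen(url) as resp:
    reply = json.load(resp)
print(reply["response"]["exists"])   # True if the variant is in the dataset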