84b876bd80b78018f4f71360a464c43dfe699431 max Thu Dec 12 02:45:06 2024 -0800 porting hgBeacon to python3, refs #34921

diff --git src/hg/hgBeacon/hgBeacon src/hg/hgBeacon/hgBeacon
index 3ba0977..2832067 100755
--- src/hg/hgBeacon/hgBeacon
+++ src/hg/hgBeacon/hgBeacon
@@ -1,21 +1,21 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 # A beacon allows very limited queries against a set of variants without allowing someone
 # to download the list of variants
 # see ga4gh.org/#/beacon (UCSC redmine 14393)
-import cgi, subprocess, sys, cgitb, os, socket, time, json, glob, urlparse, time
+import cgi, subprocess, sys, cgitb, os, socket, time, json, glob, urllib.parse, time
 import sqlite3, gzip, optparse, gc, string, re, socket
 from os.path import join, isfile, dirname, isdir, basename, exists

 # general info about this beacon
 hostName = os.environ.get("HTTP_HOST", "localhost")
 homepage = "http://%s/cgi-bin/hgBeacon" % hostName
 beaconDesc = { "id":"ucsc-browser", \
     "name":"Genome Browser", \
     "organization" : "UCSC", \
     "description" : "UCSC Genome Browser",
     "api" : "0.2",
     "homepage": homepage }
 # the list of genome references we support
@@ -35,45 +35,45 @@
 DataSetDescs = {
     "hgmd" : "Human Gene Mutation Database, only single-nucleotide variants, public version, provided by Biobase",
     "lovd" : "Leiden Open Variation Database installations that agreed to share their variants, only single-nucleotide variants and deletions",
     "test" : "small test data on Chromosome 1, from ICGC",
     "test2" : "another piece of test data from Chromosome 1, also from ICGC"
 }

 # special case: some datasets do not have alt alleles. In this case, an overlap is enough to trigger a "true"
 NoAltDataSets = ["hgmd"]

 def queryBottleneck(host, port, ip):
     " contact UCSC-style bottleneck server to get current delay time "
     # send ip address
     s = socket.socket()
     s.connect((host, int(port)))
-    msg = ip
-    d = chr(len(msg))+msg
+    msg = ip.encode("latin1")
+    d = chr(len(msg)).encode("latin1")+msg
     s.send(d)

     # read delay time
     expLen = ord(s.recv(1))
     totalLen = 0
     buf = list()
     while True:
         resp = s.recv(1024)
         buf.append(resp)
         totalLen+= len(resp)
         if totalLen==expLen:
             break
-    return int("".join(buf))
+    return int(b"".join(buf))

 def parseConf(fname):
     " parse a hg.conf style file, return as dict key -> value (both are strings) "
     conf = {}
     for line in open(fname):
         line = line.strip()
         if line.startswith("#"):
             continue
         elif line.startswith("include "):
             inclFname = line.split()[1]
             inclPath = join(dirname(fname), inclFname)
             if isfile(inclPath):
                 inclDict = parseConf(inclPath)
                 conf.update(inclDict)
         elif "=" in line: # string search for "="
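The wire format used by queryBottleneck() is one length byte followed by the payload, in both directions. Below is a minimal sketch of a fake bottleneck server for exercising the ported function locally; it is not part of the commit, the port number and delay value are made up, and a real deployment takes host and port from hg.conf:

import socket, threading, time

def fakeBottleneck(port=12345, delayMs=b"250"):
    " answer one client with a fixed delay, using the same one-length-byte framing (illustration only) "
    srv = socket.socket()
    srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    srv.bind(("127.0.0.1", port))
    srv.listen(1)
    conn, _ = srv.accept()
    msgLen = conn.recv(1)[0]                     # length byte written by queryBottleneck
    conn.recv(msgLen)                            # the client's IP string, ignored here
    conn.send(bytes([len(delayMs)]) + delayMs)   # reply uses the same framing
    conn.close()
    srv.close()

threading.Thread(target=fakeBottleneck, daemon=True).start()
time.sleep(0.2)                                  # give the server a moment to bind
print(queryBottleneck("127.0.0.1", 12345, "192.0.2.1"))  # expected output: 250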
@@ -101,77 +101,77 @@
 def errAbort(errMsg=None):
     " exit with error message "
     if errMsg == None:
         sys.exit(0)
     helpUrl = "http://%s/cgi-bin/hgBeacon?page=help" % hostName
     ret = {"errormsg":errMsg,
         "more_info":"for a complete description of the parameters, read the help message at %s" % helpUrl}
     printJson(ret)
     sys.exit(0)

 def printHelp():
     " print help text to stdout and exit "
-    print "Content-Type: text/html\n"
-    print "<html><body>"
+    print("Content-Type: text/html\n")
+    print("<html><body>")
     host = hostName # convert from global to local var
     if host.endswith(".ucsc.edu"):
         helpDir = "/gbdb/hg19/beacon"
     else:
         helpDir = dirname(__file__)
     helpPath = join(helpDir, "help.txt")
     if not exists(helpPath):
         errAbort("no file %s found. The beacon is not activated on this machine" % helpPath)
     helpText = open(helpPath).read()
-    print helpText % locals()
-    print "</body></html>"
+    print(helpText % locals())
+    print("</body></html>")
     sys.exit(0)

 def dataSetResources():
     " Returns the list of DataSetResources "
     totalSize = 0
     dsrList = []
     conn = dbOpen(mustExist=True)
     for tableName in dbListTables(conn):
         rows = dbQuery(conn, "SELECT COUNT(*) from %s" % tableName, None)
         itemCount = rows[0][0]
         # the dataset ID is just the file basename without extension
         dsId = tableName
         dsr = (dsId, DataSetDescs.get(dsId, ""), itemCount)
         dsrList.append(dsr)
         totalSize += itemCount
     return totalSize, dsrList

 def beaconInfo():
     " return a beaconInfo dict "
     size, dsrList = dataSetResources()
     return \
         { "beacon": beaconDesc,
         "references": beaconReferences,
         "datasets": dsrList,
         "size": size }

 def printJson(data):
-    print "Content-Type: application/json\n"
-    print json.dumps(data, indent=4, sort_keys=True,separators=(',', ': '))
+    print("Content-Type: application/json\n")
+    print(json.dumps(data, indent=4, sort_keys=True,separators=(',', ': ')))

 def hgBotDelay():
     " implement bottleneck delay, get bottleneck server from hg.conf "
     global hgConf
     hgConf = parseHgConf("..")
     if "bottleneck.host" not in hgConf:
         return
     ip = os.environ["REMOTE_ADDR"]
     delay = queryBottleneck(hgConf["bottleneck.host"], hgConf["bottleneck.port"], ip)
     if delay>10000:
         time.sleep(delay/1000.0)
     if delay>20000:
         errAbort("IP blocked")
         sys.exit(0)
@@ -362,200 +362,200 @@
         if refIsOne and altIsOne:
             # single bp subst
             beaconAllele = alt
         elif not refIsOne and altIsOne:
             # deletion
             beaconAllele = "D"+str(len(ref)-1) # skip first nucleotide, VCF always adds one nucleotide
             pos += 1
         elif refIsOne and not altIsOne:
             # insertion
             beaconAllele = "I"+alt[1:]
             pos += 1
         elif not refIsOne and not altIsOne:
             skipCount +=1
         else:
-            print "Error: invalid VCF fields:", fields
+            print("Error: invalid VCF fields:", fields)
             sys.exit(1)

         if len(rows) % 500000 == 0:
-            print "Read %d rows..." % len(rows)
+            print("Read %d rows..." % len(rows))

         dataRow = (chrom, pos, beaconAllele)
         if dataRow in doneData:
             continue
         rows.append( dataRow )
         doneData.add( dataRow )

-    print "skipped %d VCF lines with empty ALT alleles" % emptyCount
-    print "skipped %d VCF lines with both ALT and REF alleles len!=1, cannot encode as beacon queries" % skipCount
+    print("skipped %d VCF lines with empty ALT alleles" % emptyCount)
+    print("skipped %d VCF lines with both ALT and REF alleles len!=1, cannot encode as beacon queries" % skipCount)
     return rows

 def readAllelesLovd(ifh):
     """ read the LOVD bed file and return in format (chrom, pos, altAllele)
     This function is only used internally at UCSC.
     """
     alleles = []
     count = 0
     skipCount = 0
     for line in ifh:
         if line.startswith("chrom"):
             continue
         chrom, start, end, desc = line.rstrip("\n").split("\t")[:4]
         if desc[-2]==">":
             mutDesc = desc[-3:]
             ref, _, alt = mutDesc
             assert(len(mutDesc)==3)
         elif desc.endswith("del"):
             alt = "D"+str(int(end)-int(start))
         else:
             skipCount += 1
             continue
         chrom = chrom.replace("chr", "")
         start = int(start)
         alleles.append( (chrom, start, alt) )
-    print "read %d alleles, skipped %d non-SNV or del alleles" % (len(alleles), skipCount)
+    print("read %d alleles, skipped %d non-SNV or del alleles" % (len(alleles), skipCount))
     return list(set(alleles))
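To illustrate the two LOVD description styles that readAllelesLovd() accepts (a trailing "X>Y" substitution and a trailing "del"), a small sketch; the input lines are made up for this example:

from io import StringIO

# hypothetical LOVD bed lines, header plus one SNV and one 3 bp deletion
lovdBed = StringIO("chrom\tstart\tend\tdesc\n"
                   "chr1\t100\t101\tPEX10:c.1A>G\n"
                   "chr1\t200\t203\tPEX10:c.2del\n")
print(readAllelesLovd(lovdBed))
# prints "read 2 alleles, skipped 0 non-SNV or del alleles" and returns
# ('1', 100, 'G') and ('1', 200, 'D3'), in set order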
""" # chr1 2338004 2338005 PEX10:CM090797 0 2338004 2338005 PEX10 CM090797 substitution alleles = [] count = 0 skipCount = 0 for line in ifh: fields = line.rstrip("\n").split("\t") chrom, start, end = fields[:3] desc = fields[11] start = int(start) end = int(end) if desc=="substitution" or desc=="splicing variant": assert(end-start==1) alt = "*" else: skipCount += 1 continue chrom = chrom.replace("chr", "") alleles.append( (chrom, start, alt) ) - print "read %d alleles, skipped %d non-SNV alleles" % (len(alleles), skipCount) + print("read %d alleles, skipped %d non-SNV alleles" % (len(alleles), skipCount)) return list(set(alleles)) def iterChunks(seq, size): " yields chunks of at most size elements from seq " for pos in range(0, len(seq), size): yield seq[pos:pos + size] def printTime(time1, time2, rowCount): timeDiff = time2 - time1 - print "Time: %f secs for %d rows, %d rows/sec" % (timeDiff, rowCount, rowCount/timeDiff) + print("Time: %f secs for %d rows, %d rows/sec" % (timeDiff, rowCount, rowCount/timeDiff)) def importFile(fileName, datasetName, format): """ open the sqlite db, create a table datasetName and write the data in fileName into it """ conn = dbOpen() dbMakeTable(conn, datasetName) # try to make sqlite writes as fast as possible conn.execute("PRAGMA synchronous=OFF") conn.execute("PRAGMA count_changes=OFF") # http://blog.quibb.org/2010/08/fast-bulk-inserts-into-sqlite/ conn.execute("PRAGMA cache_size=800000") # http://web.utk.edu/~jplyon/sqlite/SQLite_optimization_FAQ.html conn.execute("PRAGMA journal_mode=OFF") # http://www.sqlite.org/pragma.html#pragma_journal_mode conn.execute("PRAGMA temp_store=memory") conn.commit() if fileName.endswith(".gz"): ifh = gzip.open(fileName) else: ifh = open(fileName) # see http://stackoverflow.com/questions/1711631/improve-insert-per-second-performance-of-sqlite # for background why I do it like this - print "Reading %s into database table %s" % (fileName, datasetName) + print("Reading %s into database table %s" % (fileName, datasetName)) rowCount = 0 startTime = time.time() if format=="vcf": alleles = readAllelesVcf(ifh) elif format=="lovd": alleles = readAllelesLovd(ifh) elif format=="hgmd": alleles = readAllelesHgmd(ifh) loadTime = time.time() printTime(startTime, loadTime, len(alleles)) - print "Loading alleles into database" + print("Loading alleles into database") for rows in iterChunks(alleles, 50000): sql = "INSERT INTO %s (chrom, pos, allele) VALUES (?,?,?)" % datasetName conn.executemany(sql, rows) conn.commit() rowCount += len(rows) insertTime = time.time() printTime(loadTime, insertTime, len(alleles)) - print "Indexing database table" + print("Indexing database table") conn.execute("CREATE UNIQUE INDEX '%s_index' ON '%s' ('chrom', 'pos', 'allele')" % \ (datasetName, datasetName)) indexTime = time.time() printTime(insertTime, indexTime, len(alleles)) def mainCommandLine(): " main function if called from command line " args, options = parseArgs() fileName = args[0] if len(args)==2: datasetName = args[1] else: datasetName = "defaultDataset" importFile(fileName, datasetName, options.format) def mainCgi(): url = os.environ["REQUEST_URI"] - parsedUrl = urlparse.urlparse(url) + parsedUrl = urllib.parse.urlparse(url) # get CGI parameters form = cgi.FieldStorage() # react based on symlink that was used to call this script page = parsedUrl[2].split("/")[-1] # last part of path is REST endpoint if page=="info": printJson(beaconInfo()) sys.exit(0) hgBotDelay() chrom = form.getfirst("chromosome") pos = form.getfirst("position") refBases = 
 def mainCgi():
     url = os.environ["REQUEST_URI"]
-    parsedUrl = urlparse.urlparse(url)
+    parsedUrl = urllib.parse.urlparse(url)

     # get CGI parameters
     form = cgi.FieldStorage()

     # react based on symlink that was used to call this script
     page = parsedUrl[2].split("/")[-1] # last part of path is REST endpoint
     if page=="info":
         printJson(beaconInfo())
         sys.exit(0)

     hgBotDelay()

     chrom = form.getfirst("chromosome")
     pos = form.getfirst("position")
     refBases = form.getfirst("referenceBases")
     altBases = form.getfirst("alternateBases")
     reference = form.getfirst("reference")
     dataset = form.getfirst("dataset")
     format = form.getfirst("format")

     if chrom==None and pos==None and altBases==None:
         printHelp()
         sys.exit(0)

     ret = lookupAlleleJson(chrom, pos, altBases, refBases, reference, dataset)
     if format=="text":
-        print "Content-Type: text/html\n"
-        print ret["response"]["exists"]
+        print("Content-Type: text/html\n")
+        print(ret["response"]["exists"])
     else:
         printJson(ret)

 if __name__=="__main__":
     # deactivate this on the RR, but useful for debugging: prints a http header
     # on errors
     main()
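A sketch of querying the ported CGI from Python 3; the hostname, path suffix, and variant are made up, the parameter names are the ones mainCgi() reads above, and the reply shape follows the ret["response"]["exists"] access in the text-format branch:

import urllib.request, json

# hypothetical deployment; only the "info" suffix is special-cased by mainCgi()
url = ("http://localhost/cgi-bin/hgBeacon/query?"
       "chromosome=1&position=1234567&alternateBases=A&dataset=test")
with urllib.request.urlopen(url) as resp:
    reply = json.load(resp)
print(reply["response"]["exists"])   # True if the variant is in the dataset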