#!/usr/bin/env python
# A beacon allows very limited queries against a set of variants without allowing someone
# to download the list of variants
# see ga4gh.org/#/beacon (UCSC redmine 14393)

import cgi, subprocess, sys, cgitb, os, socket, time, json, glob
import sqlite3, gzip, optparse, gc, string, re
from os.path import join, isfile, dirname, isdir, basename, exists

# urlparse was renamed in Python 3; keep the module importable under both
try:
    import urlparse
except ImportError:
    import urllib.parse as urlparse

# general info about this beacon
hostName = os.environ.get("HTTP_HOST", "localhost")
homepage = "http://%s/cgi-bin/hgBeacon" % hostName
beaconDesc = {
    "id":"ucsc-browser",
    "name":"Genome Browser",
    "organization" : "UCSC",
    "description" : "UCSC Genome Browser",
    "api" : "0.2",
    "homepage": homepage
    }
# the list of genome references we support
beaconReferences = ["GRCh37"]

# sqlite database
SQLITEDB = "beaconData.sqlite"

# dir of the CGIs relative to REQUEST_URI
#cgiDir = ".."  # symlinks are one level below the main cgi-bin dir
cgiDir = "."    # symlinks are in the current dir, for GA4GH

# cache of hg.conf dict
hgConf = None

# descriptions of datasets that this beacon is serving
DataSetDescs = {
    "hgmd" : "Human Genome Variation Database, only single-nucleotide variants, public version, provided by Biobase",
    "lovd" : "Leiden Open Variation Database installations that agreed to share their variants, only single-nucleotide variants and deletions",
    "test" : "small test data on Chromosome 1, from ICGC",
    "test2" : "another piece of test data from Chromosome 1, also from ICGC"
}

# special case: some datasets do not have alt alleles.
# In this case, an overlap is enough to trigger a "true"
NoAltDataSets = ["hgmd"]

def queryBottleneck(host, port, ip):
    """ contact UCSC-style bottleneck server to get the current delay time (msecs)
    for a client IP. Wire format: a single length byte followed by the payload,
    in both directions. NOTE(review): the str-based framing is Python-2-only. """
    # send ip address
    s = socket.socket()
    s.connect((host, int(port)))
    msg = ip
    d = chr(len(msg))+msg
    s.send(d)

    # read delay time
    expLen = ord(s.recv(1))
    totalLen = 0
    buf = list()
    while True:
        resp = s.recv(1024)
        buf.append(resp)
        totalLen += len(resp)
        if totalLen==expLen:
            break
    return int("".join(buf))

def parseConf(fname):
    """ parse a hg.conf style file, return as dict key -> value (both are strings).
    Lines starting with '#' are comments; 'include <file>' lines are followed
    recursively relative to fname's directory. """
    conf = {}
    for line in open(fname):
        line = line.strip()
        if line.startswith("#"):
            continue
        elif line.startswith("include "):
            inclFname = line.split()[1]
            inclPath = join(dirname(fname), inclFname)
            if isfile(inclPath):
                inclDict = parseConf(inclPath)
                conf.update(inclDict)
        elif "=" in line: # string search for "="
            # split only on the FIRST '=' so values may themselves contain '='
            # (a plain split("=") crashed on such lines)
            key, value = line.split("=", 1)
            conf[key] = value
    return conf

def parseHgConf(confDir="."):
    """ return hg.conf as dict key:value; the parsed result is cached in the
    module-global hgConf so the file is read at most once """
    global hgConf
    if hgConf is not None:
        return hgConf

    hgConf = dict() # python dict = hash table

    currDir = dirname(__file__)
    fname = join(currDir, confDir, "hg.conf")
    if not isfile(fname):
        fname = join(currDir, "hg.conf")
    if not isfile(fname):
        return {}
    hgConf = parseConf(fname)
    return hgConf

def errAbort(errMsg=None):
    """ print errMsg as a JSON error document and exit the CGI.
    Exits silently (no output) if errMsg is None. """
    if errMsg == None:
        sys.exit(0)

    helpUrl = "http://%s/cgi-bin/hgBeacon?page=help" % hostName
    ret = {"errormsg":errMsg,
        "more_info":"for a complete description of the parameters, read the help message at %s" % helpUrl}
    printJson(ret)
    sys.exit(0)

def printHelp():
    " print help text to stdout and exit "
    print("Content-Type: text/html\n")
    print("<html><body>")
    host = hostName # convert from global to local var
    if host.endswith(".ucsc.edu"):
        helpDir = "/gbdb/hg19/beacon"
    else:
        helpDir = dirname(__file__)
    helpPath = join(helpDir, "help.txt")
    if not exists(helpPath):
        errAbort("no file %s found. The beacon is not activated on this machine" % helpPath)
    helpText = open(helpPath).read()
    # help.txt is a %-template filled with the local variables (host, ...)
    print(helpText % locals())
    print("</body></html>")
    sys.exit(0)

def dataSetResources():
    """ Returns (totalSize, list of DataSetResources), one (id, description,
    itemCount) tuple per table in the sqlite db """
    totalSize = 0
    dsrList = []
    conn = dbOpen(mustExist=True)
    for tableName in dbListTables(conn):
        rows = dbQuery(conn, "SELECT COUNT(*) from %s" % tableName, None)
        itemCount = rows[0][0]

        # the dataset ID is just the file basename without extension
        dsId = tableName
        dsr = (dsId, DataSetDescs.get(dsId, ""), itemCount)
        dsrList.append(dsr)
        totalSize += itemCount
    return totalSize, dsrList

def beaconInfo():
    " return a beaconInfo dict "
    size, dsrList = dataSetResources()
    return \
        {
        "beacon": beaconDesc,
        "references": beaconReferences,
        "datasets": dsrList,
        "size": size
        }

def printJson(data):
    " print data as a JSON document, preceded by a CGI content-type header "
    print("Content-Type: application/json\n")
    print(json.dumps(data, indent=4, sort_keys=True, separators=(',', ': ')))

def hgBotDelay():
    """ implement bottleneck delay: ask the bottleneck server configured in
    hg.conf how long to sleep for this client IP, and abort entirely for
    heavily abusive IPs. No-op if no bottleneck server is configured. """
    global hgConf
    hgConf = parseHgConf("..")
    if "bottleneck.host" not in hgConf:
        return
    ip = os.environ["REMOTE_ADDR"]
    delay = queryBottleneck(hgConf["bottleneck.host"], hgConf["bottleneck.port"], ip)
    if delay>10000:
        time.sleep(delay/1000.0)
    if delay>20000:
        errAbort("IP blocked") # errAbort exits the process

def checkParams(chrom, pos, allele, reference, track):
    """ make sure the parameters follow the spec; returns the normalized tuple
    (chrom, pos, allele, reference, track) or aborts with a JSON error.
    pos is converted to int, allele is upper-cased. """
    if reference==None or reference=="":
        reference="GRCh37"
    if reference not in beaconReferences:
        errAbort("invalid 'reference' parameter, valid ones are %s" % ",".join(beaconReferences))

    if chrom==None or chrom=="":
        errAbort("missing chromosome parameter")

    if allele==None or allele=="":
        errAbort("missing alternateBases parameter")
    allele = allele.upper()
    # anchor the patterns with '$': re.match only anchors the start of the
    # string, so inputs like "ACTGZ" used to slip through the validation
    if not (re.match("[ACTG]+$", allele)!=None or \
            re.match("I[ACTG]+$", allele)!=None or \
            re.match("D[0-9]+$", allele)!=None):
        errAbort("invalid alternateBases parameter, can only be a [ACTG]+ or I[ACTG]+ or D[0-9]+")

    if track is not None and track!="":
        if not track.isalnum():
            errAbort("'dataset' parameter must contain only alphanumeric characters")
        if len(track)>100:
            errAbort("'dataset' parameter must not be longer than 100 chars")

    if pos==None or not pos.isdigit():
        errAbort("'position' parameter is not a number")
    pos = int(pos)

    # convert chrom to hg19 format
    if chrom==None:
        errAbort( "missing chromosome parameter")
    if not ((chrom.isdigit() and int(chrom)>=1 and int(chrom)<=22) or chrom in ["X","Y","M","test"]):
        errAbort( "invalid chromosome name %s" % chrom)

    return chrom, pos, allele, reference, track

def lookupAllele(chrom, pos, allele, reference, dataset):
    """ check if an allele is present in the sqlite DB, either in the given
    dataset (= table) or in any table. Returns the string "true" or "false". """
    conn = dbOpen(mustExist=True)
    tableList = dbListTables(conn)
    if dataset!=None and dataset!="":
        if dataset not in tableList:
            errAbort("dataset %s is not present on this server" % dataset)
        tableList = [dataset]

    for tableName in tableList:
        cur = conn.cursor()
        if tableName in NoAltDataSets:
            # some datasets don't have alt alleles, e.g. HGMD: a position
            # overlap alone counts as a hit
            sql = "SELECT * from %s WHERE chrom=? AND pos=?" % tableName
            cur.execute(sql, (chrom, pos))
        else:
            sql = "SELECT * from %s WHERE chrom=? AND pos=? AND allele=?" % tableName
            cur.execute(sql, (chrom, pos, allele))
        row = cur.fetchone()
        if row!=None:
            return "true"
    return "false"

def lookupAlleleJson(chrom, pos, altBases, refBases, reference, dataset):
    " call lookupAllele and wrap the result into dictionaries "
    chrom, pos, altBases, reference, dataset = \
        checkParams(chrom, pos, altBases, reference, dataset)

    exists = lookupAllele(chrom, pos, altBases, reference, dataset)

    # canned response so monitoring can probe the CGI without real data
    if chrom=="test" and pos==0:
        exists = "true"

    query = {
        "alternateBases": altBases,
        "referenceBases" : refBases,
        "chromosome": chrom.replace("chr",""),
        "position": pos,
        "reference": reference,
        "dataset": dataset
        }
    ret = {"beacon" : beaconDesc, "query" : query, "response" : {"exists":exists} }
    return ret

def main():
    # detect if running under apache or was run from command line
    if 'REQUEST_METHOD' in os.environ:
        if not (hostName.startswith("hgw") and hostName.endswith("ucsc.edu")) \
            or hostName.startswith("hgwdev."):
            # enable special CGI error handler not on the RR, but on hgwdev
            cgitb.enable()
        mainCgi()
    else:
        mainCommandLine()

def parseArgs():
    " parse command line options into args and options "
    parser = optparse.OptionParser("""usage: %prog [options] filename [datasetName] - import VCF or BED files into the beacon database.
    - parameter 'datasetName' is optional and defaults to 'defaultDataset'.
    - any existing dataset of the same name will be overwritten
    - the data is written to beaconData.sqlite. You can use 'sqlite3' to inspect the data file.
    - the input file can be gzipped

    e.g. to load HGMD:
    /usr/local/apache/cgi-bin/hgBeacon -f hgmd /tmp/temp.bed hgmd
    """)
    parser.add_option("-d", "--debug", dest="debug", \
        action="store_true", help="show debug messages")
    parser.add_option("-f", "--format", dest="format", \
        action="store", help="format of input file, one of vcf, lovd, hgmd. default %default", \
        default = "vcf")
    (options, args) = parser.parse_args()

    if len(args)==0:
        parser.print_help()
        sys.exit(0)
    return args, options

def dbMakeTable(conn, tableName):
    " create an empty table with chrom/pos/allele fields, dropping any existing one "
    conn.execute("DROP TABLE IF EXISTS %s" % tableName)
    conn.commit()

    _tableDef = (
    'CREATE TABLE IF NOT EXISTS %s '
    '('
    ' chrom text,' # chromosome
    ' pos int,' # start position, 0-based
    ' allele text' # alternate allele, can also be IATG = insertion of ATG or D15 = deletion of 15 bp
    ')')
    conn.execute(_tableDef % tableName)
    conn.commit()
    #' PRIMARY KEY (chrom, pos, allele) '

def dbOpen(mustExist=False):
    """ open the sqlite db and return a DB connection object.
    If mustExist is True and the db file is missing, abort with a JSON error. """
    dbDir = dirname(__file__) # directory where script is located
    fqdn = socket.getfqdn()
    if hostName.endswith("ucsc.edu") or fqdn.endswith(".ucsc.edu") or fqdn.endswith(".sdsc.edu"):
        # at UCSC, the data is not in the CGI-BIN directory
        dbDir = "/gbdb/hg19/beacon/"

    dbName = join(dbDir, SQLITEDB)
    # NOTE(review): a stray debug print of dbName was removed here - under CGI
    # it was emitted before the Content-Type header and corrupted the response
    if not exists(dbName) and mustExist:
        errAbort("Database file %s does not exist. This beacon is not serving any data." % dbName)
    conn = sqlite3.Connection(dbName)
    return conn

def dbListTables(conn):
    " return list of tables in sqlite db "
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    rows = cursor.fetchall()
    tables = []
    for row in rows:
        tables.append(row[0])
    return tables

def dbQuery(conn, query, params):
    " run query with optional bind parameters (params=None for none), return all rows "
    cursor = conn.cursor()
    if params==None:
        cursor.execute(query)
    else:
        cursor.execute(query, params)
    return cursor.fetchall()

def readAllelesVcf(ifh):
    """ read alleles in VCF file, return a list of unique (chrom, pos, allele)
    tuples. Positions are converted from VCF 1-based to beacon 0-based;
    deletions are encoded as D<len>, insertions as I<bases>. MNVs (both REF and
    ALT longer than one bp) cannot be encoded as beacon queries and are skipped. """
    doneData = set() # copy of all data, to check for duplicates
    rows = []
    skipCount = 0
    emptyCount = 0
    gc.disable() # millions of small appends: don't let the cycle collector slow us down
    for line in ifh:
        if line.startswith("#"):
            continue
        fields = line.rstrip("\n").split("\t", 5)
        chrom, pos, varId, ref, alt = fields[:5]
        pos = int(pos)-1 # VCF is 1-based, beacon is 0-based

        if alt==".":
            emptyCount += 1
            continue

        refIsOne = len(ref)==1
        altIsOne = len(alt)==1

        if refIsOne and altIsOne:
            # single bp subst
            beaconAllele = alt
        elif not refIsOne and altIsOne:
            # deletion
            beaconAllele = "D"+str(len(ref)-1) # skip first nucleotide, VCF always adds one nucleotide
            pos += 1
        elif refIsOne and not altIsOne:
            # insertion
            beaconAllele = "I"+alt[1:]
            pos += 1
        elif not refIsOne and not altIsOne:
            # MNV: cannot be expressed as a beacon query. The 'continue' is
            # essential: without it the previous line's beaconAllele was reused
            # and a bogus row appended (or NameError on the first data line)
            skipCount += 1
            continue
        else:
            print("Error: invalid VCF fields: %s" % fields)
            sys.exit(1)

        # progress message; guard against len(rows)==0 which is also % 500000 == 0
        if len(rows)!=0 and len(rows) % 500000 == 0:
            print("Read %d rows..." % len(rows))

        dataRow = (chrom, pos, beaconAllele)
        if dataRow in doneData:
            continue
        rows.append( dataRow )
        doneData.add( dataRow )

    print("skipped %d VCF lines with empty ALT alleles" % emptyCount)
    print("skipped %d VCF lines with both ALT and REF alleles len!=1, cannot encode as beacon queries" % skipCount)
    return rows

def readAllelesLovd(ifh):
    """ read the LOVD bed file and return in format (chrom, pos, altAllele).
    Keeps only SNVs (description like 'A>G') and deletions (description ending
    in 'del'); everything else is counted and skipped.
    This function is only used internally at UCSC. """
    alleles = []
    count = 0
    skipCount = 0
    for line in ifh:
        if line.startswith("chrom"):
            continue
        chrom, start, end, desc = line.rstrip("\n").split("\t")[:4]

        if desc[-2]==">":
            mutDesc = desc[-3:]
            ref, _, alt = mutDesc
            assert(len(mutDesc)==3)
        elif desc.endswith("del"):
            alt = "D"+str(int(end)-int(start))
        else:
            skipCount += 1
            continue

        chrom = chrom.replace("chr", "")
        start = int(start)
        alleles.append( (chrom, start, alt) )

    print("read %d alleles, skipped %d non-SNV or del alleles" % (len(alleles), skipCount))
    return list(set(alleles))

def readAllelesHgmd(ifh):
    """ read the HGMD bed file and return in format (chrom, pos, altAllele).
    Only substitutions and splicing variants (both single-bp) are kept; HGMD
    carries no alt alleles, so the allele is stored as "*".
    This function is only used internally at UCSC. """
    # chr1 2338004 2338005 PEX10:CM090797 0 2338004 2338005 PEX10 CM090797 substitution
    alleles = []
    count = 0
    skipCount = 0
    for line in ifh:
        fields = line.rstrip("\n").split("\t")
        chrom, start, end = fields[:3]
        desc = fields[11]
        start = int(start)
        end = int(end)

        if desc=="substitution" or desc=="splicing variant":
            assert(end-start==1)
            alt = "*"
        else:
            skipCount += 1
            continue

        chrom = chrom.replace("chr", "")
        alleles.append( (chrom, start, alt) )

    print("read %d alleles, skipped %d non-SNV alleles" % (len(alleles), skipCount))
    return list(set(alleles))

def iterChunks(seq, size):
    " yields chunks of at most size elements from seq "
    for pos in range(0, len(seq), size):
        yield seq[pos:pos + size]

def printTime(time1, time2, rowCount):
    " print wall-clock throughput between two time.time() stamps "
    # clamp to avoid ZeroDivisionError when the two stamps are identical
    timeDiff = max(time2 - time1, 1e-9)
    print("Time: %f secs for %d rows, %d rows/sec" % (timeDiff, rowCount, rowCount/timeDiff))

def importFile(fileName, datasetName, format):
    """ open the sqlite db, create a table datasetName and write the data in
    fileName into it. format is one of vcf, lovd, hgmd. """
    conn = dbOpen()
    dbMakeTable(conn, datasetName)

    # try to make sqlite writes as fast as possible
    conn.execute("PRAGMA synchronous=OFF")
    conn.execute("PRAGMA count_changes=OFF") # http://blog.quibb.org/2010/08/fast-bulk-inserts-into-sqlite/
    conn.execute("PRAGMA cache_size=800000") # http://web.utk.edu/~jplyon/sqlite/SQLite_optimization_FAQ.html
    conn.execute("PRAGMA journal_mode=OFF") # http://www.sqlite.org/pragma.html#pragma_journal_mode
    conn.execute("PRAGMA temp_store=memory")
    conn.commit()

    if fileName.endswith(".gz"):
        ifh = gzip.open(fileName)
    else:
        ifh = open(fileName)

    # see http://stackoverflow.com/questions/1711631/improve-insert-per-second-performance-of-sqlite
    # for background why I do it like this
    print("Reading %s into database table %s" % (fileName, datasetName))
    rowCount = 0
    startTime = time.time()
    if format=="vcf":
        alleles = readAllelesVcf(ifh)
    elif format=="lovd":
        alleles = readAllelesLovd(ifh)
    elif format=="hgmd":
        alleles = readAllelesHgmd(ifh)
    loadTime = time.time()
    printTime(startTime, loadTime, len(alleles))

    print("Loading alleles into database")
    for rows in iterChunks(alleles, 50000):
        sql = "INSERT INTO %s (chrom, pos, allele) VALUES (?,?,?)" % datasetName
        conn.executemany(sql, rows)
        conn.commit()
        rowCount += len(rows)
    insertTime = time.time()
    printTime(loadTime, insertTime, len(alleles))

    print("Indexing database table")
    conn.execute("CREATE UNIQUE INDEX '%s_index' ON '%s' ('chrom', 'pos', 'allele')" % \
        (datasetName, datasetName))
    indexTime = time.time()
    printTime(insertTime, indexTime, len(alleles))

def mainCommandLine():
    " main function if called from command line "
    args, options = parseArgs()
    fileName = args[0]

    if len(args)==2:
        datasetName = args[1]
    else:
        datasetName = "defaultDataset"

    importFile(fileName, datasetName, options.format)

def mainCgi():
    " main function if called as a CGI: dispatch on the REST endpoint and query parameters "
    url = os.environ["REQUEST_URI"]
    parsedUrl = urlparse.urlparse(url)

    # get CGI parameters
    form = cgi.FieldStorage()

    # react based on symlink that was used to call this script
    page = parsedUrl[2].split("/")[-1] # last part of path is REST endpoint
    if page=="info":
        printJson(beaconInfo())
        sys.exit(0)

    hgBotDelay()

    chrom = form.getfirst("chromosome")
    pos = form.getfirst("position")
    refBases = form.getfirst("referenceBases")
    altBases = form.getfirst("alternateBases")
    reference = form.getfirst("reference")
    dataset = form.getfirst("dataset")
    outFormat = form.getfirst("format") # renamed local: avoid shadowing builtin 'format'

    if chrom==None and pos==None and altBases==None:
        printHelp()
        sys.exit(0)

    ret = lookupAlleleJson(chrom, pos, altBases, refBases, reference, dataset)
    if outFormat=="text":
        print("Content-Type: text/html\n")
        print(ret["response"]["exists"])
    else:
        printJson(ret)

if __name__=="__main__":
    # deactivate this on the RR, but useful for debugging: prints a http header
    # on errors
    main()