c7da6212382f6efa86b8c2f4ca9f267d8273bfdc max Mon Mar 9 03:08:07 2015 -0700 fixing the help message and the tester in hgBeacon.

diff --git src/hg/hgBeacon/hgBeacon src/hg/hgBeacon/hgBeacon
index 6f2c39e..a3eeba1 100755
--- src/hg/hgBeacon/hgBeacon
+++ src/hg/hgBeacon/hgBeacon
@@ -1,552 +1,558 @@
 #!/usr/bin/env python
 # A beacon allows very limited queries against a set of variants without allowing someone
 # to download the list of variants
 # see ga4gh.org/#/beacon (UCSC redmine 14393)

 import cgi, subprocess, sys, cgitb, os, socket, time, json, glob, urlparse, time
 import sqlite3, gzip, optparse, gc, string, re, socket
-from os.path import join, isfile, dirname, isdir, basename
+from os.path import join, isfile, dirname, isdir, basename, exists

 # general info about this beacon
 hostName = os.environ.get("HTTP_HOST", "localhost")
 homepage = "http://%s/cgi-bin/hgBeacon" % hostName
 beaconDesc = {
     "id":"ucsc-browser", \
     "name":"Genome Browser", \
     "organization" : "UCSC", \
     "description" : "UCSC Genome Browser",
     "api" : "0.2",
     "homepage": homepage
     }
 # the list of genome references we support
 beaconReferences = ["GRCh37"]

 # sqlite database
 SQLITEDB = "beaconData.sqlite"

 # dir of the CGIs relative to REQUEST_URI
 #cgiDir = ".." # symlinks are one level below the main cgi-bin dir
 cgiDir = "." # symlinks are in the current dir, for GA4GH

 # cache of hg.conf dict
 hgConf = None

 # descriptions of datasets that this beacon is serving
 DataSetDescs = {
     "hgmd" : "Human Genome Variation Database, only single-nucleotide variants, public version, provided by Biobase",
     "lovd" : "Leiden Open Variation Database installations that agreed to share their variants, only single-nucleotide variants and deletions",
     "test" : "small test data on Chromosome 1, from ICGC",
     "test2" : "another piece of test data from Chromosome 1, also from ICGC"
 }

 # special case: some datasets do not have alt alleles. In this case, an overlap is enough to trigger a "true"
 NoAltDataSets = ["hgmd"]
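For orientation, a reply from this beacon (assembled in lookupAlleleJson() further down) carries the beacon description, an echo of the query, and a bare yes/no answer. A sketch of the shape, with made-up values:

    # illustrative shape of a beacon reply; all values here are made up
    exampleReply = {
        "beacon": {"id": "ucsc-browser", "name": "Genome Browser"},  # abridged beaconDesc
        "query": {
            "alternateBases": "A",
            "referenceBases": None,
            "chromosome": "1",
            "position": 1234,
            "reference": "GRCh37",
            "dataset": None,
        },
        "response": {"exists": "true"},
    }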
 def queryBottleneck(host, port, ip):
     " contact UCSC-style bottleneck server to get current delay time "
     # send ip address
     s = socket.socket()
     s.connect((host, int(port)))
     msg = ip
     d = chr(len(msg))+msg
     s.send(d)

     # read delay time
     expLen = ord(s.recv(1))
     totalLen = 0
     buf = list()
     while True:
         resp = s.recv(1024)
         buf.append(resp)
         totalLen+= len(resp)
         if totalLen==expLen:
             break
     return int("".join(buf))

 def parseConf(fname):
     " parse a hg.conf style file, return as dict key -> value (both are strings) "
     conf = {}
     for line in open(fname):
         line = line.strip()
         if line.startswith("#"):
             continue
         elif line.startswith("include "):
             inclFname = line.split()[1]
             inclPath = join(dirname(fname), inclFname)
             if isfile(inclPath):
                 inclDict = parseConf(inclPath)
                 conf.update(inclDict)
         elif "=" in line: # string search for "="
             key, value = line.split("=")
             conf[key] = value
     return conf

 def parseHgConf(confDir="."):
     """ return hg.conf as dict key:value """
     global hgConf
     if hgConf is not None:
         return hgConf

     hgConf = dict() # python dict = hash table
     currDir = dirname(__file__)
     fname = join(currDir, confDir, "hg.conf")
     if not isfile(fname):
         fname = join(currDir, "hg.conf")
     if not isfile(fname):
         return {}
     hgConf = parseConf(fname)
     return hgConf
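queryBottleneck() speaks a tiny length-prefixed protocol: one length byte followed by the payload in each direction, with the reply payload being the delay in milliseconds as a decimal string. For local testing without UCSC's bottleneck daemon, a toy stand-in might look like this (the port number is made up):

    import socket

    def toyBottleneckServer(port=17776):
        " toy stand-in for the bottleneck daemon: always replies with a 0 ms delay "
        srv = socket.socket()
        srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        srv.bind(("127.0.0.1", port))
        srv.listen(1)
        while True:
            conn, addr = srv.accept()
            ipLen = ord(conn.recv(1))         # first byte: length of the IP string
            ip = conn.recv(ipLen)             # the client's IP address, ignored here
            reply = "0"                       # delay in milliseconds, as a string
            conn.send(chr(len(reply))+reply)  # same framing: length byte + payload
            conn.close()

With the toy server running, queryBottleneck("127.0.0.1", 17776, "1.2.3.4") returns 0.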
 def errAbort(errMsg=None):
     " exit with error message "
     if errMsg == None:
         sys.exit(0)
     helpUrl = "http://%s/cgi-bin/hgBeacon?page=help" % hostName
     ret = {"errormsg":errMsg,
         "more_info":"for a complete description of the parameters, read the help message at %s" % helpUrl}
     printJson(ret)
     sys.exit(0)

 def printHelp():
     " print help text to stdout and exit "
+    print "Content-Type: text/html\n"
     print "<html><body>"
     host = hostName # convert from global to local var
     if host.endswith(".ucsc.edu"):
         helpDir = "/gbdb/hg19/beacon"
     else:
         helpDir = dirname(__file__)
     helpPath = join(helpDir, "help.txt")
-    if not isfile(helpPath):
+    if not exists(helpPath):
         errAbort("no file %s found. The beacon is not activated on this machine" % helpPath)
     helpText = open(helpPath).read()
     print helpText % locals()
     print "</body></html>"
     sys.exit(0)

 def dataSetResources():
     " Returns the list of DataSetResources "
     totalSize = 0
     dsrList = []
     conn = dbOpen(mustExist=True)
     for tableName in dbListTables(conn):
         rows = dbQuery(conn, "SELECT COUNT(*) from %s" % tableName, None)
         itemCount = rows[0][0]

         # the dataset ID is just the file basename without extension
         dsId = tableName
         dsr = (dsId, DataSetDescs.get(dsId, ""), itemCount)
         dsrList.append(dsr)
         totalSize += itemCount
     return totalSize, dsrList

 def beaconInfo():
     " return a beaconInfo dict "
     size, dsrList = dataSetResources()
     return \
         {
         "beacon": beaconDesc,
         "references": beaconReferences,
         "datasets": dsrList,
         "size": size
         }

 def printJson(data):
+    print "Content-Type: application/json\n"
     print json.dumps(data, indent=4, sort_keys=True,separators=(',', ': '))

 def hgBotDelay():
     " implement bottleneck delay, get bottleneck server from hg.conf "
     global hgConf
     hgConf = parseHgConf("..")
     if "bottleneck.host" not in hgConf:
         return
     ip = os.environ["REMOTE_ADDR"]
     delay = queryBottleneck(hgConf["bottleneck.host"], hgConf["bottleneck.port"], ip)
     if delay>10000:
         time.sleep(delay/1000.0)
     if delay>20000:
-        print("Blocked")
+        errAbort("IP blocked")
         sys.exit(0)

 def checkParams(chrom, pos, allele, reference, track):
     " make sure the parameters follow the spec "
     if reference==None or reference=="":
         reference="GRCh37"
     if reference not in beaconReferences:
         errAbort("invalid 'reference' parameter, valid ones are %s" % ",".join(beaconReferences))

     if chrom==None or chrom=="":
         errAbort("missing chromosome parameter")

     if allele==None or allele=="":
-        errAbort("missing allele parameter")
+        errAbort("missing alternateBases parameter")
     allele = allele.upper()
     if not (re.compile("[ACTG]+").match(allele)!=None or \
         re.compile("I[ACTG]+").match(allele)!=None or \
         re.compile("D[0-9]+").match(allele)!=None):
-        errAbort("invalid allele parameter, can only be a [ACTG]+ or I[ACTG]+ or D[0-9]+")
+        errAbort("invalid alternateBases parameter, can only be a [ACTG]+ or I[ACTG]+ or D[0-9]+")

-    if track is not None:
+    if track is not None and track!="":
         if not track.isalnum():
             errAbort("'dataset' parameter must contain only alphanumeric characters")
         if len(track)>100:
             errAbort("'dataset' parameter must not be longer than 100 chars")

     if pos==None or not pos.isdigit():
         errAbort("'position' parameter is not a number")
     pos = int(pos)

     # convert chrom to hg19 format
     if chrom==None:
         errAbort( "missing chromosome parameter")
     if not ((chrom.isdigit() and int(chrom)>=1 and int(chrom)<=22) or chrom in ["X","Y","M","test"]):
         errAbort( "invalid chromosome name %s" % chrom)

     return chrom, pos, allele, reference, track
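checkParams() accepts three allele shapes: plain alternate bases for a substitution ("A"), "I" plus the inserted bases for an insertion ("IACG"), and "D" plus the deleted length for a deletion ("D12"). A standalone sanity check mirroring those patterns (with end-of-string anchors added here for strictness; the match() calls above accept any string with a valid prefix):

    import re

    # the three accepted shapes, mirroring the patterns in checkParams()
    allelePatterns = [
        re.compile("[ACTG]+$"),   # substitution: the alternate bases, e.g. "A"
        re.compile("I[ACTG]+$"),  # insertion: "I" + inserted bases, e.g. "IACG"
        re.compile("D[0-9]+$"),   # deletion: "D" + deleted length, e.g. "D12"
    ]

    def isValidBeaconAllele(allele):
        " return True if the string is a valid beacon allele "
        allele = allele.upper()
        return any(p.match(allele) is not None for p in allelePatterns)

    assert isValidBeaconAllele("a")      # single-nucleotide variant
    assert isValidBeaconAllele("IACG")   # insertion of ACG
    assert isValidBeaconAllele("D15")    # deletion of 15 bp
    assert not isValidBeaconAllele("N")  # ambiguous bases are rejected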
 def lookupAllele(chrom, pos, allele, reference, dataset):
     " check if an allele is present in a sqlite DB "
     conn = dbOpen(mustExist=True)
     tableList = dbListTables(conn)
-    if dataset!=None:
+    if dataset!=None and dataset!="":
         if dataset not in tableList:
             errAbort("dataset %s is not present on this server" % dataset)
         tableList = [dataset]

     for tableName in tableList:
         cur = conn.cursor()
         if tableName in NoAltDataSets:
             # some datasets don't have alt alleles, e.g. HGMD
             sql = "SELECT * from %s WHERE chrom=? AND pos=?" % tableName
             cur.execute(sql, (chrom, pos))
         else:
             sql = "SELECT * from %s WHERE chrom=? AND pos=? AND allele=?" % tableName
             cur.execute(sql, (chrom, pos, allele))
         row = cur.fetchone()
         if row!=None:
             return "true"
     return "false"

-def lookupAlleleJson(chrom, pos, allele, reference, dataset):
+def lookupAlleleJson(chrom, pos, altBases, refBases, reference, dataset):
     " call lookupAllele and wrap the result into dictionaries "
-    chrom, pos, allele, reference, dataset = checkParams(chrom, pos, allele, reference, dataset)
+    chrom, pos, altBases, reference, dataset = checkParams(chrom, pos, altBases, reference, dataset)

-    exists = lookupAllele(chrom, pos, allele, reference, dataset)
+    exists = lookupAllele(chrom, pos, altBases, reference, dataset)

     if chrom=="test" and pos==0:
         exists = "true"

     query = {
-        "allele": allele,
+        "alternateBases": altBases,
+        "referenceBases" : refBases,
         "chromosome": chrom.replace("chr",""),
         "position": pos,
         "reference": reference,
         "dataset": dataset
     }
     ret = {"beacon" : beaconDesc, "query" : query, "response" : {"exists":exists} }
     return ret

 def main():
     # detect if running under apache or was run from command line
     if 'REQUEST_METHOD' in os.environ:
-        fqdn = socket.getfqdn()
-        if not (fqdn.startswith("hgw") and fqdn.endswith("ucsc.edu")) \
-            or fqdn.startswith("hgwdev."):
+        if not (hostName.startswith("hgw") and hostName.endswith("ucsc.edu")) \
+            or hostName.startswith("hgwdev."):
             # enable special CGI error handler not on the RR, but on hgwdev
             cgitb.enable()
         mainCgi()
     else:
         mainCommandLine()

 def parseArgs():
     " parse command line options into args and options "
     parser = optparse.OptionParser("""usage: %prog [options] filename [datasetName] - import VCF or BED files into the beacon database.

     - parameter 'datasetName' is optional and defaults to 'defaultDataset'.
     - any existing dataset of the same name will be overwritten
     - the data is written to beaconData.sqlite. You can use 'sqlite3' to inspect the data file.
     - the input file can be gzipped
     """)

     parser.add_option("-d", "--debug", dest="debug", \
         action="store_true", help="show debug messages")
     parser.add_option("-f", "--format", dest="format", \
         action="store", help="format of input file, one of vcf, lovd, hgmd. default %default", \
         default = "vcf")
     (options, args) = parser.parse_args()

     if len(args)==0:
         parser.print_help()
         sys.exit(0)
     return args, options

 def dbMakeTable(conn, tableName):
     " create an empty table with chrom/pos/allele fields "
     conn.execute("DROP TABLE IF EXISTS %s" % tableName)
     conn.commit()

     _tableDef = (
         'CREATE TABLE IF NOT EXISTS %s '
         '('
         ' chrom text,' # chromosome
         ' pos int,' # start position, 0-based
         ' allele text' # alternate allele, can also be IATG = insertion of ATG or D15 = deletion of 15 bp
         ')')
     conn.execute(_tableDef % tableName)
     conn.commit()
     #' PRIMARY KEY (chrom, pos, allele) '
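The whole data model is one (chrom, pos, allele) table per dataset, created by dbMakeTable(). The lookup can be tried without any CGI machinery against a throwaway in-memory database (table name "demo" is made up):

    import sqlite3

    conn = sqlite3.Connection(":memory:")   # throwaway DB instead of beaconData.sqlite
    conn.execute("CREATE TABLE demo (chrom text, pos int, allele text)")
    # one SNV on chromosome 1, 0-based position 1234, alternate allele A
    conn.execute("INSERT INTO demo (chrom, pos, allele) VALUES (?,?,?)", ("1", 1234, "A"))

    # the same parameterized query that lookupAllele() runs against each table
    cur = conn.cursor()
    cur.execute("SELECT * from demo WHERE chrom=? AND pos=? AND allele=?", ("1", 1234, "A"))
    print "true" if cur.fetchone() is not None else "false"   # prints: true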
 def dbOpen(mustExist=False):
     " open the sqlite db and return a DB connection object "
     dbDir = dirname(__file__) # directory where script is located
-    if hostName.endswith("ucsc.edu"): # data is not in CGI directory at UCSC
+    fqdn = socket.getfqdn()
+    if hostName.endswith("ucsc.edu") or fqdn.endswith(".ucsc.edu") or fqdn.endswith(".sdsc.edu"):
+        # at UCSC, the data is not in the CGI-BIN directory
         dbDir = "/gbdb/hg19/beacon/"
+
     dbName = join(dbDir, SQLITEDB)
-    if not isfile(dbName) and mustExist:
+    if not exists(dbName) and mustExist:
         errAbort("Database file %s does not exist. This beacon is not serving any data." % dbName)
     conn = sqlite3.Connection(dbName)
     return conn

 def dbListTables(conn):
     " return list of tables in sqlite db "
     cursor = conn.cursor()
     cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
     rows = cursor.fetchall()
     tables = []
     for row in rows:
         tables.append(row[0])
     return tables

 def dbQuery(conn, query, params):
     cursor = conn.cursor()
     if params==None:
         cursor.execute(query)
     else:
         cursor.execute(query, params)
     return cursor.fetchall()

 def readAllelesVcf(ifh):
     """ read alleles in VCF file
         return a list of chrom, pos, allele tuples
     """
     doneData = set() # copy of all data, to check for duplicates
     rows = []
     skipCount = 0
     emptyCount = 0
     gc.disable()
     for line in ifh:
         if line.startswith("#"):
             continue
         fields = string.split(line.rstrip("\n"), "\t", maxsplit=5)
         chrom, pos, varId, ref, alt = fields[:5]
         pos = int(pos)-1 # VCF is 1-based, beacon is 0-based

         if alt==".":
             emptyCount += 1
             continue

         refIsOne = len(ref)==1
         altIsOne = len(alt)==1

         if refIsOne and altIsOne:
             # single bp subst
             beaconAllele = alt
         elif not refIsOne and altIsOne:
             # deletion
             beaconAllele = "D"+str(len(ref)-1) # skip first nucleotide, VCF always adds one nucleotide
             pos += 1
         elif refIsOne and not altIsOne:
             # insertion
             beaconAllele = "I"+alt[1:]
             pos += 1
         elif not refIsOne and not altIsOne:
             # beacon cannot encode complex ref/alt combinations, skip the line
             skipCount +=1
             continue
         else:
             print "Error: invalid VCF fields:", fields
             sys.exit(1)

         if len(rows) % 500000 == 0:
             print "Read %d rows..." % len(rows)

         dataRow = (chrom, pos, beaconAllele)
         if dataRow in doneData:
             continue
         rows.append( dataRow )
         doneData.add( dataRow )

     print "skipped %d VCF lines with empty ALT alleles" % emptyCount
     print "skipped %d VCF lines with both ALT and REF alleles len!=1, cannot encode as beacon queries" % skipCount
     return rows
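readAllelesVcf() converts VCF's 1-based, anchor-base representation into the beacon's 0-based encoding: substitutions only shift the position down by one, while indels also drop the shared anchor nucleotide that VCF requires and move one base further right. A self-contained sketch of that conversion, with made-up inputs:

    def vcfToBeacon(chrom, pos, ref, alt):
        " convert one VCF variant to a beacon tuple, mirroring readAllelesVcf() "
        pos = int(pos)-1                              # VCF is 1-based, beacon is 0-based
        if len(ref)==1 and len(alt)==1:
            return chrom, pos, alt                    # single bp substitution
        if len(ref)>1 and len(alt)==1:
            return chrom, pos+1, "D"+str(len(ref)-1)  # deletion: skip VCF's anchor base
        if len(ref)==1 and len(alt)>1:
            return chrom, pos+1, "I"+alt[1:]          # insertion: skip VCF's anchor base
        return None                                   # complex variants cannot be encoded

    assert vcfToBeacon("1", "100", "G", "A")   == ("1", 99, "A")
    assert vcfToBeacon("1", "100", "GTC", "G") == ("1", 100, "D2")
    assert vcfToBeacon("1", "100", "G", "GAT") == ("1", 100, "IAT")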
""" # chr1 2338004 2338005 PEX10:CM090797 0 2338004 2338005 PEX10 CM090797 substitution alleles = [] count = 0 skipCount = 0 for line in ifh: fields = line.rstrip("\n").split("\t") chrom, start, end = fields[:3] desc = fields[10] start = int(start) end = int(end) if desc=="substitution": assert(end-start==1) alt = "*" else: skipCount += 1 continue chrom = chrom.replace("chr", "") alleles.append( (chrom, start, alt) ) print "read %d alleles, skipped %d non-SNV alleles" % (len(alleles), skipCount) return list(set(alleles)) def iterChunks(seq, size): " yields chunks of at most size elements from seq " for pos in range(0, len(seq), size): yield seq[pos:pos + size] def printTime(time1, time2, rowCount): timeDiff = time2 - time1 print "Time: %f secs for %d rows, %d rows/sec" % (timeDiff, rowCount, rowCount/timeDiff) def importFile(fileName, datasetName, format): """ open the sqlite db, create a table datasetName and write the data in fileName into it """ conn = dbOpen() dbMakeTable(conn, datasetName) # try to make sqlite writes as fast as possible conn.execute("PRAGMA synchronous=OFF") conn.execute("PRAGMA count_changes=OFF") # http://blog.quibb.org/2010/08/fast-bulk-inserts-into-sqlite/ conn.execute("PRAGMA cache_size=800000") # http://web.utk.edu/~jplyon/sqlite/SQLite_optimization_FAQ.html conn.execute("PRAGMA journal_mode=OFF") # http://www.sqlite.org/pragma.html#pragma_journal_mode conn.execute("PRAGMA temp_store=memory") conn.commit() if fileName.endswith(".gz"): ifh = gzip.open(fileName) else: ifh = open(fileName) # see http://stackoverflow.com/questions/1711631/improve-insert-per-second-performance-of-sqlite # for background why I do it like this print "Reading %s into database table %s" % (fileName, datasetName) rowCount = 0 startTime = time.time() if format=="vcf": alleles = readAllelesVcf(ifh) elif format=="lovd": alleles = readAllelesLovd(ifh) elif format=="hgmd": alleles = readAllelesHgmd(ifh) loadTime = time.time() printTime(startTime, loadTime, len(alleles)) print "Loading alleles into database" for rows in iterChunks(alleles, 50000): sql = "INSERT INTO %s (chrom, pos, allele) VALUES (?,?,?)" % datasetName conn.executemany(sql, rows) conn.commit() rowCount += len(rows) insertTime = time.time() printTime(loadTime, insertTime, len(alleles)) print "Indexing database table" conn.execute("CREATE UNIQUE INDEX '%s_index' ON '%s' ('chrom', 'pos', 'allele')" % \ (datasetName, datasetName)) indexTime = time.time() printTime(insertTime, indexTime, len(alleles)) def mainCommandLine(): " main function if called from command line " args, options = parseArgs() fileName = args[0] if len(args)==2: datasetName = args[1] else: datasetName = "defaultDataset" importFile(fileName, datasetName, options.format) def mainCgi(): - print "Content-Type: text/html\n" url = os.environ["REQUEST_URI"] parsedUrl = urlparse.urlparse(url) # get CGI parameters form = cgi.FieldStorage() # react based on symlink that was used to call this script page = parsedUrl[2].split("/")[-1] # last part of path is REST endpoint if page=="info": printJson(beaconInfo()) sys.exit(0) hgBotDelay() chrom = form.getfirst("chromosome") pos = form.getfirst("position") - allele = form.getfirst("allele") + refBases = form.getfirst("referenceBases") + altBases = form.getfirst("alternateBases") reference = form.getfirst("reference") dataset = form.getfirst("dataset") format = form.getfirst("format") - if chrom==None and pos==None and allele==None: + if chrom==None and pos==None and altBases==None: printHelp() sys.exit(0) - ret = 
 def mainCgi():
-    print "Content-Type: text/html\n"
     url = os.environ["REQUEST_URI"]
     parsedUrl = urlparse.urlparse(url)

     # get CGI parameters
     form = cgi.FieldStorage()

     # react based on symlink that was used to call this script
     page = parsedUrl[2].split("/")[-1] # last part of path is REST endpoint
     if page=="info":
         printJson(beaconInfo())
         sys.exit(0)

     hgBotDelay()

     chrom = form.getfirst("chromosome")
     pos = form.getfirst("position")
-    allele = form.getfirst("allele")
+    refBases = form.getfirst("referenceBases")
+    altBases = form.getfirst("alternateBases")
     reference = form.getfirst("reference")
     dataset = form.getfirst("dataset")
     format = form.getfirst("format")

-    if chrom==None and pos==None and allele==None:
+    if chrom==None and pos==None and altBases==None:
         printHelp()
         sys.exit(0)

-    ret = lookupAlleleJson(chrom, pos, allele, reference, dataset)
+    ret = lookupAlleleJson(chrom, pos, altBases, refBases, reference, dataset)
     if format=="text":
+        print "Content-Type: text/html\n"
         print ret["response"]["exists"]
     else:
         printJson(ret)

 if __name__=="__main__":
     # deactivate this on the RR, but useful for debugging: prints a http header
     # on errors
     main()
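Because this commit renames the CGI parameter from allele to alternateBases (and adds referenceBases), any tester has to use the new names. A smoke-test sketch, assuming a beacon reachable on localhost; the "test" chromosome at position 0 is hard-wired to answer true in lookupAlleleJson():

    import urllib, json

    params = urllib.urlencode({
        "chromosome": "test",
        "position": "0",
        "alternateBases": "A",    # new parameter name, replaces "allele"
        "referenceBases": "A",
    })
    url = "http://localhost/cgi-bin/hgBeacon?" + params   # hypothetical local install
    reply = json.loads(urllib.urlopen(url).read())
    print reply["response"]["exists"]                     # expected: "true"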