src/hg/pyLib/hgLib.py cf32bd93dfcf0e307c889aaef4e5b1845bbd5a98

cf32bd93dfcf0e307c889aaef4e5b1845bbd5a98
max
  Tue Jul 11 17:55:00 2017 -0700
Adding support for GBIB-style remote /gbdb files to hgGeneGraph. refs #19745

diff --git src/hg/pyLib/hgLib.py src/hg/pyLib/hgLib.py
index 5e74625..1f4206b 100644
--- src/hg/pyLib/hgLib.py
+++ src/hg/pyLib/hgLib.py
@@ -15,31 +15,31 @@
 # Non-standard imports. They need to be installed on the machine. 
 # We provide a pre-compiled library as part of our cgi-bin distribution as a
 # fallback in the "pyLib" directory. The idea of having pyLib be the last
 # directory in sys.path is that the system MySQLdb takes precedence.
 try:
     import MySQLdb
 except:
     print "Installation error - could not load MySQLdb for Python. Please tell your system administrator to run " \
         "one of these commands as root: 'yum install MySQL-python', 'apt-get install python-mysqldb' or 'pip install MySQL-python'."
     exit(0)
 
 # Imports from the Python 2.7 standard library
 # Please minimize global imports. Each library import can take up to 20msecs.
 import os, cgi, sys, logging
 
-from os.path import join, isfile, normpath, abspath, dirname
+from os.path import join, isfile, normpath, abspath, dirname, isdir, splitext
 from collections import namedtuple
 
 # activate debugging output only on dev
 import platform
 if "hgwdev" in platform.node():
     import cgitb
     cgitb.enable()
 
 # debug level: a number. the higher, the more debug info is printed
 verboseLevel = None
 
 cgiArgs = None
 
 # like in the kent tree, we keep track of whether we have already output the content-type line
 contentLineDone = False
@@ -443,30 +443,117 @@
     """ Parse text file in format key<tab>value<newline> and return as dict key->val.
     Does not abort on duplicate keys, for performance reasons. """
     import gzip
     d = {}
 
     if fname.endswith(".gz"):
         fh = gzip.open(fname)
     else:
         fh = open(fname)
 
     for line in fh:
         key, val = line.rstrip("\n").split("\t")
         d[key] = val
     return d
 
+def gbdbReplace(fname, hgConfSetting):
+    " replace /gbdb/ in fname with hgConfSetting "
+    if not fname.startswith("/gbdb/"):
+        return None
+
+    gbdbLoc = hgConf.get(hgConfSetting)
+    if gbdbLoc is None:
+        return None
+
+    return fname.replace("/gbdb/", gbdbLoc)
+
+def getUdcCacheDir():
+    " return the udc cache dir and create it if it doesn't exist "
+    udcDir = hgConf.get("udc.cacheDir", "/tmp/udcCache")
+    if not isdir(udcDir):
+        os.makedirs(udcDir)
+    return udcDir
+
+def hGbdbReplace(fname):
+    " hdb.c : get the local filename on disk or construct a URL to a /gbdb file and return it "
+    if not isfile(fname):
+    # try using gbdbLoc1
+        fname2 = gbdbReplace(fname, "gbdbLoc1")
+        if fname2 and isfile(fname2):
+            return fname2
+
+    if not isfile(fname):
+    # try using gbdbLoc2, which can be a URL
+        fname2 = gbdbReplace(fname, "gbdbLoc2")
+        return fname2
+    return fname
+
+def netUrlOpen(url):
+    " net.c: open a URL and return a file object "
+    import urllib2, time, errno
+    from socket import error as SocketError
+
+    # let our webservers know that we are not a Firefox
+    opener = urllib2.build_opener()
+    opener.addheaders = [('User-Agent', 'Genome Browser pyLib/hgLib.py:netUrlOpen()')]
+
+    resp = None
+    for x in range(5): # limit number of retries
+      try:
+        resp = opener.open(url)
+        break
+      except SocketError as e:
+        if e.errno != errno.ECONNRESET:
+          raise # re-raise any other error
+        time.sleep(1)
+    return resp
+
+def readSmallFile(fname):
+    """ read a small file, usually from /gbdb/, entirely into memory and return lines.
+    If the file doesn't exist, try gbdbLoc1, then gbdbLoc2
+    and keep a local copy if only found on gbdbLoc2.
+
+    This is similar but not the same as the UDC system in the kent C code, but
+    for small files and a complete read over it, the UDC system may be overkill
+    for Python.
+    """
+    fname = hGbdbReplace(fname)
+
+    if fname.startswith("http"):
+        # download to local disk; local filename is the hash of the URL
+        import hashlib
+        udcCacheDir = getUdcCacheDir()
+        m = hashlib.md5()
+        m.update(fname)
+        fileExt = splitext(fname)[-1]
+        tmpFname = join(udcCacheDir, "hgLibCache_"+m.hexdigest()+"."+fileExt)
+        if not isfile(tmpFname):
+            data = netUrlOpen(fname).read()
+            fh = open(tmpFname, "w")
+            fh.write(data)
+            fh.close()
+        fname = tmpFname
+
+    if fname.endswith(".gz"):
+        import gzip
+        fh = gzip.open(fname)
+    else:
+        fh = open(fname)
+
+    lines = fh.read().splitlines()
+    return lines
+
 def cgiString(name, default=None):
     """ Return the first value of the CGI variable 'name' as a string, like lib/cheapcgi.c.
     'default' is handed to cgiArgs.getfirst() as its default value.
     """
     return cgiArgs.getfirst(name, default=default)
 
 def cgiGetAll():
     " return the module-level cgiArgs object unchanged "
     # NOTE(review): cgiArgs starts out as None at module level; presumably it is
     # set to the parsed CGI arguments elsewhere before this is called - confirm.
     return cgiArgs
 
 def makeRandomKey(numBits=128+33):
     " copied line-by-line from kent/src/lib/htmlshell.c:makeRandomKey "
     import base64
     numBytes = (numBits + 7) / 8  # round up to nearest whole byte.
     numBytes = ((numBytes+2)/3)*3 # round up to the nearest multiple of 3 to avoid equals-char padding in base64 output
     f = open("/dev/urandom", "r") # open random system device for read-only access.
     binaryString = f.read(numBytes)