e940236873e8a6fe882adec2465f0aa45417db81 max Wed Sep 11 04:14:09 2019 -0700 adding cookie bottleneck logic to hgGeneGraph, refs #24094 diff --git src/hg/pyLib/hgLib.py src/hg/pyLib/hgLib.py index 2cbca3e..4339242 100644 --- src/hg/pyLib/hgLib.py +++ src/hg/pyLib/hgLib.py @@ -7,41 +7,44 @@ # The cart is currently read-only. More work is needed to allow writing a cart. # General rules for CGI in Python: # - never insert values into SQL queries. Write %s in the query and provide the # arguments to sqlQuery as a list. # - never print incoming HTTP argument as raw text. Run it through cgi.escape to # destroy javascript code in them. # Non-standard imports. They need to be installed on the machine. # We provide a pre-compiled library as part of our cgi-bin distribution as a # fallback in the "pyLib" directory. The idea of having pyLib be the last # directory in sys.path is that the system MySQLdb takes precedence. try: import MySQLdb except: - print "Installation error - could not load MySQLdb for Python. Please tell your system administrator to run " \ - "one of these commands as root: 'yum install MySQL-python', 'apt-get install python-mysqldb' or 'pip install MySQL-python'." + print("Installation error - could not load MySQLdb for Python. Please tell your system administrator to run " \ + "one of these commands as root: 'yum install MySQL-python', 'apt-get install python-mysqldb' or 'pip install MySQL-python'.") exit(0) # Imports from the Python 2.7 standard library # Please minimize global imports. Each library import can take up to 20msecs. -import os, cgi, sys, logging +import os, cgi, sys, logging, time from os.path import join, isfile, normpath, abspath, dirname, isdir, splitext from collections import namedtuple +# start to support both python2 and python3 +from six.moves import http_cookies, urllib + # activate debugging output output only on dev import platform if "hgwdev" in platform.node(): import cgitb cgitb.enable() # debug level: a number. the higher, the more debug info is printed # to see most debug messages, set to 1 # another way to change this variable is by setting the URL variable "debug" to 1 verboseLevel = 0 cgiArgs = None # like in the kent tree, we keep track of whether we have already output the content-type line contentLineDone = False @@ -52,31 +55,31 @@ botDelayMsecs = 0 # two global variables: the first is the botDelay limit after which the page is slowed down and a warning is shown # the second is the limit after which the page is not shown anymore botDelayWarn = 1000 botDelayBlock = 5000 jksqlTrace = False def warn(format, *args): print (format % args) def errAbort(msg, status=None, headers = None): " show msg and abort. Like errAbort.c " printContentType(status=status, headers=headers) - print msg + print(msg) exit(0) def debug(level, msg): " output debug message with a given verbosity level " if verboseLevel >= level: printContentType() print(msg+"
") sys.stdout.flush() def parseConf(fname): " parse a hg.conf style file, return as dict key -> value (all strings) " conf = {} for line in open(fname): line = line.strip() if line.startswith("#"): @@ -104,30 +107,35 @@ hgConf = dict() # python dict = hash table confDir = os.path.dirname(__file__) # look for hg.conf in parent dir fname = os.path.join(confDir, "..", "hg.conf") hgConf = parseConf(fname) if cfgOptionBoolean("JKSQL_TRACE"): global jksqlTrace jksqlTrace = True return hgConf def cfgOption(name, default=None): " return hg.conf option or default " + global hgConf + + if not hgConf: + parseHgConf() + return hgConf.get(name, default) def cfgOptionBoolean(name, default=False): " return True if option is set to 1, on or true, or default if not set " val = hgConf.get(name, default) in [True, "on", "1", "true"] return val def sqlConnect(db, host=None, user=None, passwd=None): """ connect to sql server specified in hg.conf with given db. Like jksql.c. """ cfg = parseHgConf() if host==None: host, user, passwd = cfg["db.host"], cfg["db.user"], cfg["db.password"] conn = MySQLdb.connect(host=host, user=user, passwd=passwd, db=db) # we will need this info later @@ -180,31 +188,31 @@ """ cursor = conn.cursor() if jksqlTrace: from datetime import datetime sys.stderr.write("SQL_QUERY 0 %s %s %s %s\n" % (conn.host, conn.db, query, args)) startTime = datetime.now() try: rows = cursor.execute(query, args) if jksqlTrace: timeDiff = _timeDeltaSeconds(datetime.now(), startTime) sys.stderr.write("SQL_TIME 0 %s %s %.3f\n" % (conn.host, conn.db, timeDiff)) - except MySQLdb.Error, errObj: + except MySQLdb.Error as errObj: # on table not found, try the secondary mysql connection, "slow-db" in hg.conf errCode, errDesc = errObj if errCode!=1146: # "table not found" error raise if conn.failoverConn == None: _sqlConnectFailover(conn) if not conn.failoverConn: raise # stay compatible with the jksql.c JKSQL_TRACE output format if jksqlTrace: sys.stderr.write("SQL_FAILOVER 0 %s %s db -> slow-db | %s %s\n" % \ (conn.host, conn.db, query, args)) @@ -216,32 +224,32 @@ data = cursor.fetchall() cursor.close() if jksqlTrace: timeDiff = _timeDeltaSeconds(datetime.now(), startTime) sys.stderr.write("SQL_FETCH 0 %s %s %.3f\n" % (conn.host, conn.db, timeDiff)) colNames = [desc[0] for desc in cursor.description] Rec = namedtuple("MysqlRow", colNames) recs = [Rec(*row) for row in data] return recs def htmlPageEnd(oldJquery=False): " close html body/page " - print "" - print "" + print("") + print("") def printMenuBar(oldJquery=False): baseDir = "../" " print the menubar. Mostly copied from src/hg/hgMenuBar.c " print ("\n") menuPath = "../htdocs/inc/globalNavBar.inc" navBarStr = open(menuPath, "r").read() print (navBarStr) # fixup old menubar, copied from hgGtexTrackSettings, for now print("") print("") @@ -368,82 +376,126 @@ print # this newline is essential, it means: end of header lines if doWarnBot: print ("
") print ("We have a suspicion that you are an automated web bot software, not a real user. ") print ("To keep our site fast for other users, we have slowed down this page. ") print ("The slowdown will gradually disappear. ") print ("If you think this is a mistake, please contact us at genome-www@soe.ucsc.edu. ") print ("Also note that all data for hgGeneGraph can be obtained through our public MySQL server and") print ("all our software source code is available and can be installed locally onto your own computer. ") print ("If you are unsure how to use these resources, do not hesitate to contact us.") print ("
") -def queryBottleneck(host, port, ip): - " contact UCSC-style bottleneck server to get current delay time. From hg/lib/botDelay.c " +def botDelayTime(host, port, botString): + " contact UCSC-style bottleneck server to get current delay time. From hg/lib/botDelay.c:botDelayTime()" # send ip address import socket s = socket.socket() s.connect((host, int(port))) - msg = ip + msg = botString d = chr(len(msg))+msg s.send(d) - # read delay time + # read delay time as ASCII chars expLen = ord(s.recv(1)) totalLen = 0 buf = list() while True: resp = s.recv(1024) buf.append(resp) totalLen+= len(resp) if totalLen==expLen: break return int("".join(buf)) -def hgBotDelay(): +# global variable, only used by findCookieData, local use without explicit 'global' will trigger a Python error +cookies = None + +def findCookieData(cookieName): + " return value of cookie or None is not set, port of lib/cheapcgi.c:findCookieData " + global cookies + if not cookies: + if "HTTP_COOKIE" in os.environ: + cookies = http_cookies.SimpleCookie(os.environ["HTTP_COOKIE"]) + else: + cookies = {} + + # unlike cheapcgi, Python does not even allow duplicate cookies, so no need to handle this case + cookie = cookies.get(cookieName) + if cookie: + return cookie.value + + return None + +def getCookieUser(): + " port of lib/botDelay.c:getCookieUser: get hguid cookie value " + user = None + centralCookie = cfgOption("central.cookie", default="hguid") + + if centralCookie: + user = findCookieData(centralCookie) + + return user + +def getBotCheckString(ip, fraction): + " port of lib/botDelay.c:getBotCheckString: compose user.ip fraction for bot check " + user = getCookieUser() + if user: + botCheckString = "%s.%s %f" % (user, ip, fraction) + else: + botCheckString = "%s %f" % (ip, fraction) + return botCheckString + +def hgBotDelay(fraction=1.0): """ Implement bottleneck delay, get bottleneck server from hg.conf. This behaves similar to the function src/hg/lib/botDelay.c:hgBotDelay It does not use the hgsid, currently it always uses the IP address. Using the hgsid makes little sense. It is more lenient than the C version. """ - import time - if "DOCUMENT_ROOT" not in os.environ: # skip if not called from Apache - return global hgConf global doWarnBot global botDelayMsecs - hgConf = parseHgConf() - if "bottleneck.host" not in hgConf: + + ip = os.environ.get("REMOTE_ADDR") + if not ip: # skip if not called from Apache + return + + host = cfgOption("bottleneck.host") + port = cfgOption("bottleneck.port") + + if not "bottleneck.host" or not "bottleneck.port" or not ip: return - ip = os.environ["REMOTE_ADDR"] - delay = queryBottleneck(hgConf["bottleneck.host"], hgConf["bottleneck.port"], ip) - debug(1, "Bottleneck delay: %d msecs" % delay) - botDelayMsecs = delay - if delay>botDelayBlock: + botCheckString = getBotCheckString(ip, fraction) + millis = botDelayTime(host, port, botCheckString) + debug(1, "Bottleneck delay: %d msecs" % millis) + botDelayMsecs = millis + + if millis>botDelayBlock: + # retry-after time factor 10 is based on the example in the bottleneck help message errAbort("Too many HTTP requests and not enough delay between them. " "Your IP has been blocked to keep this website responsive for other users. " "Please contact genome-www@soe.ucsc.edu to unblock your IP address. We can also help you obtain the data you need without " - "web crawling. ", status=429, headers = {"Retry-after" : str(botDelayMsecs / 1000)}) + "web crawling. ", status=429, headers = {"Retry-after" : str(millis / 10)}) sys.exit(0) - if delay>botDelayWarn: - time.sleep(delay/1000.0) + if millis>botDelayWarn: + time.sleep(millis/1000.0) doWarnBot = True # = show warning message later in printContentType() def parseRa(text): " Parse ra-style string and return as dict name -> value " import string lines = text.split("\n") data = dict() for l in lines: if len(l)==0: continue key, val = string.split(l, " ", maxsplit=1) data[key] = val return data def lineFileNextRow(inFile): @@ -462,31 +514,31 @@ else: fh = inFile line1 = fh.readline() line1 = line1.rstrip("\n").lstrip("#") headers = line1.split("\t") Record = namedtuple('tsvRec', headers) for line in fh: if line.startswith("#"): continue line = line.rstrip("\n") fields = string.split(line, "\t", maxsplit=len(headers)-1) try: rec = Record(*fields) - except Exception, msg: + except Exception as msg: logging.error("Exception occured while parsing line, %s" % msg) logging.error("Filename %s" % fh.name) logging.error("Line was: %s" % line) logging.error("Does number of fields match headers?") logging.error("Headers are: %s" % headers) raise Exception("header count: %d != field count: %d wrong field count in line %s" % (len(headers), len(fields), line)) yield rec def parseDict(fname): """ Parse text file in format keyvalue and return as dict key->val. Does not abort on duplicate keys, for performance reasons. """ import gzip d = {} @@ -522,35 +574,35 @@ " hdb.c : get the local filename on disk or construct a URL to a /gbdb file and return it " if not isfile(fname): # try using gbdbLoc1 fname2 = gbdbReplace(fname, "gbdbLoc1") if fname2 and isfile(fname2): return fname2 if not isfile(fname): # try using gbdbLoc2, which can be a URL fname2 = gbdbReplace(fname, "gbdbLoc2") return fname2 return fname def netUrlOpen(url): " net.c: open a URL and return a file object " - import urllib2, time, errno + import errno from socket import error as SocketError # let our webservers know that we are not a Firefox - opener = urllib2.build_opener() + opener = urllib.request.build_opener() opener.addheaders = [('User-Agent', 'Genome Browser pyLib/hgLib.py:netUrlOpen()')] resp = None for x in range(5): # limit number of retries try: resp = opener.open(url) break except SocketError as e: if e.errno != errno.ECONNRESET: raise # re-raise any other error time.sleep(1) return resp def readSmallFile(fname): """ read a small file, usually from /gbdb/, entirely into memory and return lines. @@ -740,31 +792,31 @@ jsInlineLines += javascript def jsInlineF(format, *args): " Add javascript text to output file or memory structure " jsInline(format % args) jsInlineFinishCalled = False; def jsInlineFinish(): " finish outputting accumulated inline javascript " global jsInlineFinishCalled global jsInlineLines if jsInlineFinishCalled: # jsInlineFinish can be called multiple times when generating framesets or genomeSpace. warn("jsInlineFinish() called already.") - print "\n" % (getNonce(), jsInlineLines) + print("\n" % (getNonce(), jsInlineLines)) jsInlineLines = "" jsInlineFinishCalled = True def jsInlineReset(): " used by genomeSpace to repeatedly output multiple pages to stdout " global jsInlineFinishCalled jsInlineFinishCalled = False jsEvents = [ "abort", "activate", "afterprint", "afterupdate", "beforeactivate", "beforecopy", @@ -876,49 +928,47 @@ " Add js mapping for inline event " checkValidEvent(eventName) jsInlineF("document.getElementById('%s').on%s = function(event) {if (!event) {event=window.event}; %s};\n", idText, eventName, jsText) def jsOnEventByIdF(eventName, idText, format, *args): " Add js mapping for inline event with printf formatting " checkValidEvent(eventName) jsInlineF("document.getElementById('%s').on%s = function(event) {if (!event) {event=window.event}; ", idText, eventName) jsInlineF(format, *args) jsInlineF("};\n") #============ END of javascript inline-separation routines =============== def cartDbLoadFromId(conn, table, cartId, oldCart): " Like src/hg/lib/cart.c, opens cart table and parses cart contents given a cartId of the format 123123_csctac " - import urlparse - if cartId==None: return {} cartFields = cartId.split("_") if len(cartFields)!=2: errAbort("Could not parse identifier %s for cart table %s" % (cgi.escape(cartId), table)) idStr, secureId = cartFields query = "SELECT contents FROM "+table+" WHERE id=%(id)s and sessionKey=%(sessionKey)s" rows = sqlQuery(conn, query, {"id":idStr, "sessionKey":secureId}) if len(rows)==0: # silently ignore invalid cart IDs for now. Future code may want to generate a new cart. return {} - cartList = urlparse.parse_qs(rows[0][0]) + cartList = urllib.parse.parse_qs(rows[0][0]) # by default, python returns a dict with key -> list of vals. We need only the first one - for key, vals in cartList.iteritems(): + for key, vals in cartList.items(): oldCart[key] =vals[0] return oldCart centralConn = None def hConnectCentral(): " similar to src/hg/lib/hdb.c:hConnectCentral. We use a much simpler cache, because we usually read all rows into memory. " global centralConn if centralConn: return centralConn centralDb = cfgOption("central.db") if centralDb is None: errAbort("Could not find central.db in hg.conf. Installation error.") @@ -942,43 +992,33 @@ def cartNew(conn, table): " create a new cart and return ID " sessionKey = makeRandomKey() sqlQuery(conn, "INSERT %s VALUES(0,'',0,now(),now(),0,%s" % (table, sessionKey)); sqlLastId = conn.insert_id() newId = "%d_%s" % (sqlLastId, sessionKey) return newId def cartAndCookieSimple(): """ Make the cart from the user cookie (user settings) and the hgsid CGI parameter (session settings, e.g. a browser tab). This is somewhat similar to cartAndCookie from hg/lib/cart.c Two important differences: this cart does not add all CGI arguments automatically. It also does not run cart.c:cartJustify, so track priorities are not applied. Also, if there is no hgsid parameter or no cookie, we do not create a new cart. """ - import Cookie - # no cgiApoptosis yet - maybe needed in the future. see cart.c / cartNew - if "HTTP_COOKIE" in os.environ: - cookies = Cookie.SimpleCookie(os.environ["HTTP_COOKIE"]) - cookieName = cfgOption("central.cookie", "hguid") - hguid = cookies.get(cookieName) - if hguid is not None: - hguid = hguid.value # cookies have values and other attributes. We only need the value - else: - hguid = None - + hguid = getCookieUser() hgsid = cgiString("hgsid") conn = hConnectCentral() cart = {} userInfo = cartDbLoadFromId(conn, "userDb", hguid, cart) sessionInfo = cartDbLoadFromId(conn, "sessionDb", hgsid, cart) return cart def cartString(cart, default=None): " Get a string from the cart. For better readability for programmers used to the C code. " return cart.get(cart, default) def cgiSetup(): """ do the usual browser CGI setup: parse the hg.conf file, parse the CGI