69053e55a7502bb79acb350f002a277ce330cad9
max
Tue Sep 10 03:26:43 2019 -0700
adding the retry-after header to hgLib's bottleneck when blocking, pending Jonathan's/Hiram's feedback, refs #24094
diff --git src/hg/pyLib/hgLib.py src/hg/pyLib/hgLib.py
index ee13bb9..2cbca3e 100644
--- src/hg/pyLib/hgLib.py
+++ src/hg/pyLib/hgLib.py
@@ -36,44 +36,46 @@
import cgitb
cgitb.enable()
# debug level: a number. the higher, the more debug info is printed
# to see most debug messages, set to 1
# another way to change this variable is by setting the URL variable "debug" to 1
verboseLevel = 0
cgiArgs = None
# like in the kent tree, we keep track of whether we have already output the content-type line
contentLineDone = False
# show the bot delay warning message before other printing is done?
doWarnBot = False
+# current bot delay in milliseconds
+botDelayMsecs = 0
# two global variables: the first is the botDelay limit after which the page is slowed down and a warning is shown
# the second is the limit after which the page is not shown anymore
botDelayWarn = 1000
botDelayBlock = 5000
jksqlTrace = False
def warn(format, *args):
print (format % args)
-def errAbort(msg, status=None):
+def errAbort(msg, status=None, headers = None):
" show msg and abort. Like errAbort.c "
- printContentType(status=status)
+ printContentType(status=status, headers=headers)
print msg
exit(0)
def debug(level, msg):
" output debug message with a given verbosity level "
if verboseLevel >= level:
printContentType()
print(msg+"
")
sys.stdout.flush()
def parseConf(fname):
" parse a hg.conf style file, return as dict key -> value (all strings) "
conf = {}
for line in open(fname):
line = line.strip()
@@ -327,51 +329,55 @@
"""
def webStartGbNoBanner(prefix, title):
" output the
part, largely copied from web.c / webStartGbNoBanner "
print (getGbHeader() % (prefix, title, getNonce(), getNonce()))
def runCmd(cmd, mustRun=True):
" wrapper around system() that prints error messages. cmd preferably a list, not just a string. "
import subprocess
ret = subprocess.call(cmd)
if ret!=0 and mustRun:
errAbort("Could not run command %s" % cmd)
return ret
-def printContentType(contType="text/html", status=None, fname=None):
+def printContentType(contType="text/html", status=None, fname=None, headers=None):
"""
print the HTTP Content-type header line with an optional file name for downloads.
Also optionally prints the bot delay note. The argument 'status' must be an int.
"""
global contentLineDone
if not contentLineDone:
contentLineDone = True
print("Content-type: %s; charset=utf-8" % contType)
if status:
if status==400:
print("Status: 400 Bad Request")
elif status==429:
print("Status: 429 Too Many Requests")
else:
raise Exception("Unknown status code, please update hgLib.py")
if fname is not None:
print("Content-Disposition: attachment; filename=%s" % fname)
+ if headers:
+ for key, val in headers.items():
+ print("%s: %s" % (key, val))
+
print # this newline is essential, it means: end of header lines
if doWarnBot:
print ("")
print ("We have a suspicion that you are an automated web bot software, not a real user. ")
print ("To keep our site fast for other users, we have slowed down this page. ")
print ("The slowdown will gradually disappear. ")
print ("If you think this is a mistake, please contact us at genome-www@soe.ucsc.edu. ")
print ("Also note that all data for hgGeneGraph can be obtained through our public MySQL server and")
print ("all our software source code is available and can be installed locally onto your own computer. ")
print ("If you are unsure how to use these resources, do not hesitate to contact us.")
print ("
")
def queryBottleneck(host, port, ip):
@@ -396,42 +402,44 @@
break
return int("".join(buf))
def hgBotDelay():
"""
Implement bottleneck delay, get bottleneck server from hg.conf.
This behaves similar to the function src/hg/lib/botDelay.c:hgBotDelay
It does not use the hgsid, currently it always uses the IP address.
Using the hgsid makes little sense. It is more lenient than the C version.
"""
import time
if "DOCUMENT_ROOT" not in os.environ: # skip if not called from Apache
return
global hgConf
global doWarnBot
+ global botDelayMsecs
hgConf = parseHgConf()
if "bottleneck.host" not in hgConf:
return
ip = os.environ["REMOTE_ADDR"]
delay = queryBottleneck(hgConf["bottleneck.host"], hgConf["bottleneck.port"], ip)
debug(1, "Bottleneck delay: %d msecs" % delay)
+ botDelayMsecs = delay
if delay>botDelayBlock:
errAbort("Too many HTTP requests and not enough delay between them. "
"Your IP has been blocked to keep this website responsive for other users. "
"Please contact genome-www@soe.ucsc.edu to unblock your IP address. We can also help you obtain the data you need without "
- "web crawling. ", status=429)
+ "web crawling. ", status=429, headers = {"Retry-after" : str(botDelayMsecs // 1000)}) # '//': Retry-After must be a whole number of seconds, also under true division
sys.exit(0)
if delay>botDelayWarn:
time.sleep(delay/1000.0)
doWarnBot = True # = show warning message later in printContentType()
def parseRa(text):
" Parse ra-style string and return as dict name -> value "
import string
lines = text.split("\n")
data = dict()
for l in lines:
if len(l)==0:
continue
key, val = string.split(l, " ", maxsplit=1)