a7a5686cb5b960d9dab9536fa40fcfcdf1a72393 max Wed Jan 17 08:03:49 2024 -0800 harden hgGeneGraph, ongoing build patch, no redmine ID yet diff --git src/hg/pyLib/hgLib3.py src/hg/pyLib/hgLib3.py index c188d35..b865f23 100644 --- src/hg/pyLib/hgLib3.py +++ src/hg/pyLib/hgLib3.py @@ -85,30 +85,53 @@ elif line.startswith("include "): inclFname = line.split()[1] absFname = normpath(join(dirname(fname), inclFname)) if os.path.isfile(absFname): inclDict = parseConf(absFname) conf.update(inclDict) elif "=" in line: # string search for "=" key, value = line.split("=",1) conf[key] = value return conf # cache of hg.conf contents hgConf = None +captchaQuestions = { + 1 : "How many chromosomes do humans have?", + 2 : "What is the family name of the UCSC Genome Browser's original creator?", + 3 : "The genome is stored in which type of molecule? It's a three letter acronym.", + 4 : "RNA encodes for which type of molecule?", + 5 : "In eukaryotes, transcripts are composed of exons and ...", + 6 : "What do you call the specific location of a gene on a chromosome?", + 7 : "A gene is composed of multiple .... ? (hint: starts with trans- and has exons and introns)", + 8 : "Enter the name of either one of the human sex chromosomes", + 9 : "This genome browser is hosted by a University named...", + } + +captchaAnswers = { + 1 : ["46", "23"], + 2 : ["kent", "jim"], + 3 : ["dna"], + 4 : ["proteins", "amino acids", "aa", "protein"], + 5 : ["introns"], + 6 : ["locus"], + 7 : ["transcripts", "transcript", "isoforms", "proteins"], + 8 : ["x", "y", "chrx", "chry"], + 9 : ["ucsc"], +} def parseHgConf(): """ return hg.conf as dict key:value. """ global hgConf if hgConf is not None: return hgConf hgConf = dict() # python dict = hash table confDir = os.path.dirname(__file__) # look for hg.conf in parent dir fname = os.path.join(confDir, "..", "hg.conf") hgConf = parseConf(fname) if cfgOptionBoolean("JKSQL_TRACE"): global jksqlTrace jksqlTrace = True @@ -452,65 +475,83 @@ if centralCookie: user = findCookieData(centralCookie) return user def getBotCheckString(ip, fraction): " port of lib/botDelay.c:getBotCheckString: compose user.ip fraction for bot check " user = getCookieUser() if user: botCheckString = "%s.%s %f" % (user, ip, fraction) else: botCheckString = "%s %f" % (ip, fraction) return botCheckString -def hgBotDelay(fraction=1.0): +def hgBotDelay(fraction=1.0, useBytes=None): """ Implement bottleneck delay, get bottleneck server from hg.conf. This behaves similar to the function src/hg/lib/botDelay.c:hgBotDelay It does not use the hgsid, currently it always uses the IP address. Using the hgsid makes little sense. It is more lenient than the C version. + + If useBytes is set, use only the first x bytes of the IP address. This helps + block bots that all use similar IP addresses, at the risk of blocking + entire institutes. """ global hgConf global doWarnBot global botDelayMsecs ip = os.environ.get("REMOTE_ADDR") if not ip: # skip if not called from Apache return + if useBytes is not None and ip.count(".")==3: # do not do this for ip6 addresses + ip = ".".join(ip.split(".")[:useBytes]) host = cfgOption("bottleneck.host") port = cfgOption("bottleneck.port") if not host or not port or not ip: return botCheckString = getBotCheckString(ip, fraction) millis = botDelayTime(host, port, botCheckString) debug(1, "Bottleneck delay: %d msecs" % millis) botDelayMsecs = millis - if millis>botDelayBlock: + captchaId = int(cgiString("captchaId", 0)) + if captchaId!=0: + captchaAnswer = cgiString("captchaAnswer", "").lower() + allowedAnswers = captchaAnswers.get(captchaId, []) + if captchaAnswer in allowedAnswers: + millis -= 10000 + + if millis > (botDelayBlock/fraction): # retry-after time factor 10 is based on the example in the bottleneck help message sys.stderr.write("hgLib.py hogExit\n") - errAbort("Too many HTTP requests and not enough delay between them. " + printContentType(status=429, headers={"Retry-after" : str(millis / 200)}) + print("
") + print("Too many HTTP requests and not enough delay between them." "Your IP has been blocked to keep this website responsive for other users. " - "Please contact genome-www@soe.ucsc.edu to unblock your IP address. We can also help you obtain the data you need without " - "web crawling. ", status=429, headers = {"Retry-after" : str(millis / 10)}) + "Please contact genome-www@soe.ucsc.edu to unblock your IP address or solve the captcha below. We can also help you obtain the data you need without " + "web crawling.
") + showCaptcha() + print("") + sys.exit(0) - if millis>botDelayWarn: + if millis > (botDelayWarn/fraction): time.sleep(millis/1000.0) doWarnBot = True # = show warning message later in printContentType() def parseRa(text): " Parse ra-style string and return as dict name -> value " import string lines = text.split("\n") data = dict() for l in lines: if len(l)==0: continue key, val = l.split(" ", maxsplit=1) data[key] = val return data @@ -1047,36 +1088,52 @@ hguid = getCookieUser() hgsid = cgiString("hgsid") conn = hConnectCentral() cart = {} userInfo = cartDbLoadFromId(conn, "userDb", hguid, cart) sessionInfo = cartDbLoadFromId(conn, "sessionDb", hgsid, cart) return cart def cartString(cart, default=None): " Get a string from the cart. For better readability for programmers used to the C code. " return cart.get(cart, default) -def cgiSetup(): +def showCaptcha(): + import random + captchaId = random.choice(list(captchaAnswers.keys())) + print("Please answer the following question to unblock your IP:") + print("
") + print(captchaQuestions[captchaId]) + print("
") + print('