5472f71152a9d892fc7d7081fad7e7c797d3258a max Fri Jun 27 07:28:30 2025 -0700 requiring valid user cookie to use hgGeneGraph, refs #35391 diff --git src/hg/pyLib/hgLib3.py src/hg/pyLib/hgLib3.py index 043992f3c87..7f02155ba89 100644 --- src/hg/pyLib/hgLib3.py +++ src/hg/pyLib/hgLib3.py @@ -85,53 +85,30 @@ elif line.startswith("include "): inclFname = line.split()[1] absFname = normpath(join(dirname(fname), inclFname)) if os.path.isfile(absFname): inclDict = parseConf(absFname) conf.update(inclDict) elif "=" in line: # string search for "=" key, value = line.split("=",1) conf[key] = value return conf # cache of hg.conf contents hgConf = None -captchaQuestions = { - 1 : "How many chromosomes do humans have?", - 2 : "What is the family name of the UCSC Genome Browser's original creator?", - 3 : "The genome is stored in which type of molecule? It's a three letter acronym.", - 4 : "RNA encodes for which type of molecule?", - 5 : "In eukaryotes, transcripts are composed of exons and ...", - 6 : "What do you call the specific location of a gene on a chromosome?", - 7 : "A gene is composed of multiple .... ? (hint: starts with trans- and has exons and introns)", - 8 : "Enter the name of either one of the human sex chromosomes", - 9 : "This genome browser is hosted by a University named...", - } - -captchaAnswers = { - 1 : ["46", "23"], - 2 : ["kent", "jim"], - 3 : ["dna"], - 4 : ["proteins", "amino acids", "aa", "protein"], - 5 : ["introns"], - 6 : ["locus"], - 7 : ["transcripts", "transcript", "isoforms", "proteins"], - 8 : ["x", "y", "chrx", "chry"], - 9 : ["ucsc"], -} def parseHgConf(): """ return hg.conf as dict key:value. """ global hgConf if hgConf is not None: return hgConf hgConf = dict() # python dict = hash table confDir = os.path.dirname(__file__) # look for hg.conf in parent dir fname = os.path.join(confDir, "..", "hg.conf") hgConf = parseConf(fname) if cfgOptionBoolean("JKSQL_TRACE"): global jksqlTrace jksqlTrace = True @@ -466,103 +443,108 @@ if cookie: return cookie.value return None def getCookieUser(): " port of lib/botDelay.c:getCookieUser: get hguid cookie value " user = None centralCookie = cfgOption("central.cookie", default="hguid") if centralCookie: user = findCookieData(centralCookie) return user +def showCookieError(): + " output error message if cookie not found " + print("Content-type: text/html\n\n") + print("<html><body>") + print("Sorry, the gene interactions viewer requires that you visit the genome browser first once, to defend against bots. ") + print("<a href='hgTracks'>Click here</a> to visit the genome browser, then come back to this page.") + print("</body></html>") + sys.exit(0) + def getBotCheckString(ip, fraction): " port of lib/botDelay.c:getBotCheckString: compose user.ip fraction for bot check " - user = getCookieUser() - useNew = cfgOptionBoolean("newBotDelay") - if (useNew): - hgsid = cgiString("hgsid") - if user: - botCheckString = "uid%s %f" % (user, fraction) - elif hgsid: - botCheckString = "sid%s %f" % (hgsid, fraction) - else: - botCheckString = "%s %f" % (ip, fraction) - else: - if user: - botCheckString = "%s.%s %f" % (user, ip, fraction) - else: - botCheckString = "%s %f" % (ip, fraction) + userId = getCookieUser() + + if not userId: + showCookieError() + + botCheckString = "uid%s %f" % (userId, fraction) + return botCheckString -def hgBotDelay(fraction=1.0, useBytes=None): +def hgBotDelay(fraction=1.0, useBytes=None, botCheckString=None): """ Implement bottleneck delay, get bottleneck server from hg.conf. This behaves similar to the function src/hg/lib/botDelay.c:hgBotDelay It does not use the hgsid, currently it always uses the IP address. Using the hgsid makes little sense. It is more lenient than the C version. If useBytes is set, use only the first x bytes of the IP address. This helps block bots that all use similar IP addresses, at the risk of blocking entire institutes. """ global hgConf global doWarnBot global botDelayMsecs ip = os.environ.get("REMOTE_ADDR") if not ip: # skip if not called from Apache return if useBytes is not None and ip.count(".")==3: # do not do this for ip6 addresses ip = ".".join(ip.split(".")[:useBytes]) host = cfgOption("bottleneck.host") port = cfgOption("bottleneck.port") if not host or not port or not ip: return + warnMsg = None + if botCheckString is None: botCheckString = getBotCheckString(ip, fraction) + else: + warnMsg = "Too many parallel requests for this CGI program. Please wait for a while and try this page again. If the problem persists, " + "please email us at genome@soe.ucsc.edu." + millis = botDelayTime(host, port, botCheckString) debug(1, "Bottleneck delay: %d msecs" % millis) botDelayMsecs = millis - captchaId = int(cgiString("captchaId", 0)) - if captchaId!=0: - captchaAnswer = cgiString("captchaAnswer", "").lower() - allowedAnswers = captchaAnswers.get(captchaId, []) - if captchaAnswer in allowedAnswers: - millis -= 10000 - if millis > (botDelayBlock/fraction): # retry-after time factor 10 is based on the example in the bottleneck help message sys.stderr.write("hgLib.py hogExit\n") printContentType(status=429, headers={"Retry-after" : str(millis / 200)}) print("<html><head></head><body>") + if warnMsg: + print(warnMsg) + print(millis) + print(botDelayBlock) + print(fraction) + else: print("<b>Too many HTTP requests and not enough delay between them.</b><p> " "Your IP has been blocked to keep this website responsive for other users. " "Please contact genome-www@soe.ucsc.edu to unblock your IP address, especially if you were just browsing our site and are not running a bot," - "or solve the captcha below. We can help you obtain the data you need without " - "web crawling.<p>") - showCaptcha() + "We can help you obtain the data you need without web crawling.<p>") print("</html>") sys.exit(0) if millis > (botDelayWarn/fraction): + printContentType(status=429, headers={"Retry-after" : str(millis / 1000.0)}) time.sleep(millis/1000.0) doWarnBot = True # = show warning message later in printContentType() def parseRa(text): " Parse ra-style string and return as dict name -> value " import string lines = text.split("\n") data = dict() for l in lines: if len(l)==0: continue key, val = l.split(" ", maxsplit=1) data[key] = val return data @@ -1029,32 +1011,32 @@ #============ END of javascript inline-separation routines =============== def cartDbLoadFromId(conn, table, cartId, oldCart): " Like src/hg/lib/cart.c, opens cart table and parses cart contents given a cartId of the format 123123_csctac " if cartId==None: return {} cartFields = cartId.split("_") if len(cartFields)!=2: errAbort("Could not parse identifier %s for cart table %s" % (cgi.escape(cartId), table)) idStr, secureId = cartFields query = "SELECT contents FROM "+table+" WHERE id=%(id)s and sessionKey=%(sessionKey)s" rows = sqlQuery(conn, query, {"id":idStr, "sessionKey":secureId}) if len(rows)==0: - # silently ignore invalid cart IDs for now. Future code may want to generate a new cart. - return {} + # invalid cart ID + return None cartList = urllib.parse.parse_qs(rows[0][0]) # by default, python returns a dict with key -> list of vals. We need only the first one for key, vals in cartList.items(): oldCart[key] =vals[0] return oldCart centralConn = None def hConnectCentral(): " similar to src/hg/lib/hdb.c:hConnectCentral. We use a much simpler cache, because we usually read all rows into memory. " global centralConn if centralConn: return centralConn @@ -1092,59 +1074,50 @@ """ Make the cart from the user cookie (user settings) and the hgsid CGI parameter (session settings, e.g. a browser tab). This is somewhat similar to cartAndCookie from hg/lib/cart.c Two important differences: this cart does not add all CGI arguments automatically. It also does not run cart.c:cartJustify, so track priorities are not applied. Also, if there is no hgsid parameter or no cookie, we do not create a new cart. """ # no cgiApoptosis yet - maybe needed in the future. see cart.c / cartNew hguid = getCookieUser() hgsid = cgiString("hgsid") conn = hConnectCentral() cart = {} userInfo = cartDbLoadFromId(conn, "userDb", hguid, cart) + if userInfo is None: + # invalid cookie hguid + showCookieError() + sessionInfo = cartDbLoadFromId(conn, "sessionDb", hgsid, cart) + if sessionInfo is None: + # tolerate invalid hgsid + sessionInfo = {} return cart def cartString(cart, default=None): " Get a string from the cart. For better readability for programmers used to the C code. " return cart.get(cart, default) -def showCaptcha(): - import random - captchaId = random.choice(list(captchaAnswers.keys())) - print("Please answer the following question to unblock your IP:") - print("<p><b>") - print(captchaQuestions[captchaId]) - print("</b><p>") - print('<form action="" method="get">') - print('<label for="textInput">Enter answer:</label>') - - print('<input type="text" id="textInput" name="captchaAnswer" required>') - print('<input type="hidden" name="captchaId" value="'+str(captchaId)+'">') - print('<input type="hidden" name="debug" value="'+cgiString("debug", "0")+'">') - print('<input type="hidden" name="gene" value="'+cgiString("gene", "")+'">') - print('<input type="submit" value="Submit" />') - -def cgiSetup(bottleneckFraction=1.0, useBytes=None): +def cgiSetup(bottleneckFraction=1.0, useBytes=None, botCheckString=None): """ do the usual browser CGI setup: parse the hg.conf file, parse the CGI variables, get the cart, do bottleneck delay. Returns the cart. This is not part of the C code (though it maybe should). """ parseHgConf() global cgiArgs cgiArgs = cgi.FieldStorage() # Python has built-in cgiSpoof support: sys.argv[1] is the query string if run from the command line if cgiString("debug"): global verboseLevel verboseLevel = int(cgiString("debug")) - hgBotDelay(fraction=bottleneckFraction, useBytes=useBytes) + hgBotDelay(fraction=bottleneckFraction, useBytes=useBytes, botCheckString=botCheckString) cart = cartAndCookieSimple() return cart #if __file__=="__main__": #pass