5472f71152a9d892fc7d7081fad7e7c797d3258a
max
  Fri Jun 27 07:28:30 2025 -0700
requiring valid user cookie to use hgGeneGraph, refs #35391

diff --git src/hg/pyLib/hgLib3.py src/hg/pyLib/hgLib3.py
index 043992f3c87..7f02155ba89 100644
--- src/hg/pyLib/hgLib3.py
+++ src/hg/pyLib/hgLib3.py
@@ -85,53 +85,30 @@
         elif line.startswith("include "):
             inclFname = line.split()[1]
             absFname = normpath(join(dirname(fname), inclFname))
             if os.path.isfile(absFname):
                 inclDict = parseConf(absFname)
                 conf.update(inclDict)
         elif "=" in line: # string search for "="
             key, value = line.split("=",1)
             conf[key] = value
     return conf
 
 
 # cache of hg.conf contents
 hgConf = None
 
-captchaQuestions = {
-        1 : "How many chromosomes do humans have?",
-        2 : "What is the family name of the UCSC Genome Browser's original creator?",
-        3 : "The genome is stored in which type of molecule? It's a three letter acronym.",
-        4 : "RNA encodes for which type of molecule?",
-        5 : "In eukaryotes, transcripts are composed of exons and ...",
-        6 : "What do you call the specific location of a gene on a chromosome?",
-        7 : "A gene is composed of multiple .... ? (hint: starts with trans- and has exons and introns)",
-        8 : "Enter the name of either one of the human sex chromosomes",
-        9 : "This genome browser is hosted by a University named...",
-        }
-
-captchaAnswers = {
-        1 : ["46", "23"],
-        2 : ["kent", "jim"],
-        3 : ["dna"],
-        4 : ["proteins", "amino acids", "aa", "protein"],
-        5 : ["introns"],
-        6 : ["locus"],
-        7 : ["transcripts", "transcript", "isoforms", "proteins"],
-        8 : ["x", "y", "chrx", "chry"],
-        9 : ["ucsc"],
-}
 def parseHgConf():
     """ return hg.conf as dict key:value. """
     global hgConf
     if hgConf is not None:
         return hgConf
 
     hgConf = dict() # python dict = hash table
 
     confDir = os.path.dirname(__file__) # look for hg.conf in parent dir
     fname = os.path.join(confDir, "..", "hg.conf")
     hgConf = parseConf(fname)
 
     if cfgOptionBoolean("JKSQL_TRACE"):
         global jksqlTrace
         jksqlTrace = True
@@ -466,103 +443,108 @@
     if cookie:
         return cookie.value
 
     return None
 
 def getCookieUser():
     " port of lib/botDelay.c:getCookieUser: get hguid cookie value  "
     user = None
     centralCookie = cfgOption("central.cookie", default="hguid")
 
     if centralCookie:
         user = findCookieData(centralCookie)
 
     return user
 
+def showCookieError():
+    " print an HTML page telling the user to visit hgTracks first, then end the CGI via sys.exit(0). Called when no valid user cookie was found. "
+    print("Content-type: text/html\n\n")
+    print("<html><body>")
+    print("Sorry, the gene interactions viewer requires that you visit the genome browser first once, to defend against bots. ")
+    print("<a href='hgTracks'>Click here</a> to visit the genome browser, then come back to this page.")
+    print("</body></html>")
+    sys.exit(0)
+
 def getBotCheckString(ip, fraction):
     " port of lib/botDelay.c:getBotCheckString: compose user.ip fraction for bot check  "
-    user = getCookieUser()
-    useNew = cfgOptionBoolean("newBotDelay")
-    if (useNew):
-        hgsid = cgiString("hgsid")
-        if user:
-            botCheckString = "uid%s %f" % (user, fraction)
-        elif hgsid:
-            botCheckString = "sid%s %f" % (hgsid, fraction)
-        else:
-            botCheckString = "%s %f" % (ip, fraction)
-    else:
-        if user:
-            botCheckString = "%s.%s %f" % (user, ip, fraction)
-        else:
-            botCheckString = "%s %f" % (ip, fraction)
+    userId = getCookieUser()
+    # a user cookie is now mandatory: showCookieError() prints an error page and exits, so ip is no longer used here
+    if not userId:
+        showCookieError()
+
+    botCheckString = "uid%s %f" % (userId, fraction)
+
     return botCheckString
 
-def hgBotDelay(fraction=1.0, useBytes=None):
+def hgBotDelay(fraction=1.0, useBytes=None, botCheckString=None):
     """
     Implement bottleneck delay, get bottleneck server from hg.conf.
     This behaves similar to the function src/hg/lib/botDelay.c:hgBotDelay
     It does not use the hgsid, currently it always uses the IP address.
     Using the hgsid makes little sense. It is more lenient than the C version.
 
     If useBytes is set, use only the first x bytes of the IP address. This helps
     block bots that all use similar IP addresses, at the risk of blocking
     entire institutes.
     """
     global hgConf
     global doWarnBot
     global botDelayMsecs
 
     ip = os.environ.get("REMOTE_ADDR")
     if not ip: # skip if not called from Apache
         return
     if useBytes is not None and ip.count(".")==3: # do not do this for ip6 addresses
         ip = ".".join(ip.split(".")[:useBytes])
 
     host = cfgOption("bottleneck.host")
     port = cfgOption("bottleneck.port")
 
     if not host or not port or not ip:
         return
 
+    warnMsg = None
+    if botCheckString is None:
         botCheckString = getBotCheckString(ip, fraction)
+    else:
+        warnMsg = ("Too many parallel requests for this CGI program. Please wait for a while and try this page again. If the problem persists, "
+        "please email us at genome@soe.ucsc.edu.")
+
     millis = botDelayTime(host, port, botCheckString)
     debug(1, "Bottleneck delay: %d msecs" % millis)
     botDelayMsecs = millis
 
-    captchaId = int(cgiString("captchaId", 0))
-    if captchaId!=0:
-        captchaAnswer = cgiString("captchaAnswer", "").lower()
-        allowedAnswers = captchaAnswers.get(captchaId, [])
-        if captchaAnswer in allowedAnswers:
-            millis -= 10000
-
     if millis > (botDelayBlock/fraction):
         # retry-after time factor 10 is based on the example in the bottleneck help message
         sys.stderr.write("hgLib.py hogExit\n")
         printContentType(status=429, headers={"Retry-after" : str(millis / 200)})
         print("<html><head></head><body>")
+        if warnMsg:
+            print(warnMsg)
+            print(millis)
+            print(botDelayBlock)
+            print(fraction)
+        else:
             print("<b>Too many HTTP requests and not enough delay between them.</b><p> "
             "Your IP has been blocked to keep this website responsive for other users. "
             "Please contact genome-www@soe.ucsc.edu to unblock your IP address, especially if you were just browsing our site and are not running a bot,"
-        "or solve the captcha below. We can help you obtain the data you need without "
-        "web crawling.<p>")
-        showCaptcha()
+            " We can help you obtain the data you need without web crawling.<p>")
         print("</html>")
         sys.exit(0)
 
     if millis > (botDelayWarn/fraction):
+        printContentType(status=429, headers={"Retry-after" : str((int(millis) + 999) // 1000)}) # Retry-After takes whole seconds, rounded up
         time.sleep(millis/1000.0)
         doWarnBot = True # = show warning message later in printContentType()
 
 def parseRa(text):
     " Parse ra-style string and return as dict name -> value "
     import string
     lines = text.split("\n")
     data = dict()
     for l in lines:
         if len(l)==0:
             continue
         key, val = l.split(" ", maxsplit=1)
         data[key] = val
     return data
 
@@ -1029,32 +1011,32 @@
 
 #============ END of javascript inline-separation routines ===============
 
 def cartDbLoadFromId(conn, table, cartId, oldCart):
     " Like src/hg/lib/cart.c, opens cart table and parses cart contents given a cartId of the format 123123_csctac "
     if cartId==None:
         return {}
     cartFields = cartId.split("_")
     if len(cartFields)!=2:
         errAbort("Could not parse identifier %s for cart table %s" % (cgi.escape(cartId), table))
     idStr, secureId = cartFields
 
     query = "SELECT contents FROM "+table+" WHERE id=%(id)s and sessionKey=%(sessionKey)s"
     rows = sqlQuery(conn, query, {"id":idStr, "sessionKey":secureId})
     if len(rows)==0:
-        # silently ignore invalid cart IDs for now. Future code may want to generate a new cart.
-        return {}
+        # invalid cart ID: return None, not {}, so callers can tell it apart from a missing cartId
+        return None
 
     cartList = urllib.parse.parse_qs(rows[0][0])
 
     # by default, python returns a dict with key -> list of vals. We need only the first one
     for key, vals in cartList.items():
         oldCart[key] =vals[0]
     return oldCart
 
 centralConn = None
 
 def hConnectCentral():
     " similar to src/hg/lib/hdb.c:hConnectCentral. We use a much simpler cache, because we usually read all rows into memory. "
     global centralConn
     if centralConn:
         return centralConn
@@ -1092,59 +1074,50 @@
     """ Make the cart from the user cookie (user settings) and the hgsid CGI parameter (session settings, e.g. a browser tab).
         This is somewhat similar to cartAndCookie from hg/lib/cart.c
         Two important differences: this cart does not add all CGI arguments automatically.
         It also does not run cart.c:cartJustify, so track priorities are not applied.
         Also, if there is no hgsid parameter or no cookie, we do not create a new cart.
     """
     # no cgiApoptosis yet - maybe needed in the future. see cart.c / cartNew
 
     hguid = getCookieUser()
     hgsid = cgiString("hgsid")
 
     conn = hConnectCentral()
 
     cart = {}
     userInfo = cartDbLoadFromId(conn, "userDb", hguid, cart)
+    if userInfo is None:
+        # invalid cookie hguid
+        showCookieError()
+
     sessionInfo = cartDbLoadFromId(conn, "sessionDb", hgsid, cart)
+    if sessionInfo is None:
+        # tolerate invalid hgsid
+        sessionInfo = {}
     return cart
 
 def cartString(cart, default=None):
     " Get a string from the cart. For better readability for programmers used to the C code. "
     return cart.get(cart, default)
 
-def showCaptcha():
-    import random
-    captchaId = random.choice(list(captchaAnswers.keys()))
-    print("Please answer the following question to unblock your IP:")
-    print("<p><b>")
-    print(captchaQuestions[captchaId])
-    print("</b><p>")
-    print('<form action="" method="get">')
-    print('<label for="textInput">Enter answer:</label>')
-
-    print('<input type="text" id="textInput" name="captchaAnswer" required>')
-    print('<input type="hidden" name="captchaId" value="'+str(captchaId)+'">')
-    print('<input type="hidden" name="debug" value="'+cgiString("debug", "0")+'">')
-    print('<input type="hidden" name="gene" value="'+cgiString("gene", "")+'">')
-    print('<input type="submit" value="Submit" />')
-
-def cgiSetup(bottleneckFraction=1.0, useBytes=None):
+def cgiSetup(bottleneckFraction=1.0, useBytes=None, botCheckString=None):
     """ do the usual browser CGI setup: parse the hg.conf file, parse the CGI
     variables, get the cart, do bottleneck delay. Returns the cart.
 
     This is not part of the C code (though it maybe should).
     """
     parseHgConf()
     global cgiArgs
     cgiArgs = cgi.FieldStorage() # Python has built-in cgiSpoof support: sys.argv[1] is the query string if run from the command line
 
     if cgiString("debug"):
         global verboseLevel
         verboseLevel = int(cgiString("debug"))
 
-    hgBotDelay(fraction=bottleneckFraction, useBytes=useBytes)
+    hgBotDelay(fraction=bottleneckFraction, useBytes=useBytes, botCheckString=botCheckString)
 
     cart = cartAndCookieSimple()
     return cart
 
 #if __file__=="__main__":
     #pass