b74aa56e4115f371dbe818e98412f0c9e738e142
max
  Tue Sep 30 07:45:16 2025 -0700
allow apiKey=xxx to get around the captcha; when an apiKey is given, bottleneck on the apiKey rather than on the cookie userId or hgsid, refs #36428

diff --git src/hg/lib/botDelay.c src/hg/lib/botDelay.c
index 51dce3f79e4..9e0d02f1db4 100644
--- src/hg/lib/botDelay.c
+++ src/hg/lib/botDelay.c
@@ -3,30 +3,31 @@
  * being just too demanding. */
 
 /* Copyright (C) 2014 The Regents of the University of California 
  * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 #include "common.h"
 #include "net.h"
 #include "portable.h"
 #include "hgConfig.h"
 #include "cheapcgi.h"
 #include "hui.h"
 #include "hCommon.h"
 #include "botDelay.h"
 #include "jsonWrite.h"
 #include "regexHelper.h"
+#include "hubSpaceKeys.h"
 
 #define defaultDelayFrac 1.0   /* standard penalty for most CGIs */
 #define defaultWarnMs 10000    /* warning at 10 to 20 second delay */
 #define defaultExitMs 20000    /* error 429 Too Many Requests after 20+ second delay */
 
 int botDelayWarnMs  = 0;       /* global so the previously used value can be retrieved */
 
 int botDelayTime(char *host, int port, char *botCheckString)
 /* Figure out suggested delay time for ip address in
  * milliseconds. */
 {
 int sd = netMustConnect(host, port);
 char buf[256];
 netSendString(sd, botCheckString);
 netRecieveString(sd, buf);
@@ -80,84 +81,105 @@
 hUserAbort("There is an exceedingly high volume of traffic coming from your "
        "site (IP address %s) as of %s (California time).  It looks like "
        "a web robot is launching queries quickly, and not even waiting for "
        "the results of one query to finish before launching another query. "
        "/* We cannot service requests from your IP address under */ these "
        "conditions.  (code %d)"
        "To use the genome browser functionality from a Unix command line, "
        "please read <a href='http://genome.ucsc.edu/FAQ/FAQdownloads.html#download36'>our FAQ</a> on this topic. "
        "For further help on how to access our data from a command line, "
        "or if "
        "you think this delay is being imposed unfairly, please contact genome-www@soe.ucsc.edu."
        , ip, asctime(localtime(&now)), millis);
 }
 
 static char *getCookieUser()
-/* get user from hguid cookie */
+/* Get the ID string stored in the hguid cookie; it looks like the hgsid session strings used on the URL. */
 {
 char *user = NULL;
 char *centralCookie = hUserCookie();
 
 if (centralCookie)
     user = findCookieData(centralCookie);
 
 return user;
 }
 
 
 boolean isValidHgsidForEarlyBotCheck(char *raw_hgsid)
 /* We want to use the hgsid from the CGI parameters, but sometimes requests come in with bogus strings that
  * need to be ignored.  We don't want to run this against the database just yet, but we can at least check
  * the format. */
 {
 char hgsid[1024];
 // Just in case it's egregiously large, we only need the first part to decide if it's valid.
 safencpy(hgsid, sizeof(hgsid), raw_hgsid, 50);
 if (regexMatch(hgsid, "^[0-9][0-9]*_[a-zA-Z0-9]{28}$"))
     return TRUE;
 return FALSE;
 }
 
 
 char *getBotCheckString(char *ip, double fraction)
 /* compose "user.ip fraction" string for bot check */
 {
-char *user = getCookieUser();
+char *cookieUserId = getCookieUser();
 char *botCheckString = needMem(256);
 boolean useNew = cfgOptionBooleanDefault("newBotDelay", FALSE);
 if (useNew)
     {
-        if (user)
-            safef(botCheckString, 256, "uid%s %f", user, fraction);
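+        // The new strategy: bottleneck on the apiKey if one is given, otherwise on the cookie
+        // user ID, otherwise on the hgsid, and only if none of these is available, on the IP address.
+        // The apiKey is validated by lookup (an invalid key aborts); the cookie ID is used as returned by findCookieData(); the hgsid is only checked for a well-formed string.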
+        char *apiKey = cgiOptionalString("apiKey");
+        if (apiKey)
+            {
+            // Here we do a MySQL query before the bottleneck check is complete.
+            // In theory, this can overload the MariaDB server.
+            // But there is no way around it: we must check that the apiKey is valid,
+            // and this is better than handling the request without a bottleneck.
+            char *userName = userNameForApiKey(apiKey);
+            if (userName)
+                safef(botCheckString, 256, "apiKey%s %f", apiKey, fraction);
+            else 
+                hUserAbort("Invalid apiKey provided on URL. Make sure that the apiKey is valid. Or contact us.");
+            }
+        else
+            if (cookieUserId)
+                safef(botCheckString, 256, "uid%s %f", cookieUserId, fraction);
             else
                 {
+                // The following happens very rarely on sites like our RR that use the Cloudflare captcha,
+                // as all requests (except hgLogin and hgRenderTracks) should come in with a cookie user ID.
                 char *hgsid = cgiOptionalString("hgsid");
+                // For now, we do not check the hgsid against the MariaDB table; we only check that the string looks OK.
                 if (hgsid && isValidHgsidForEarlyBotCheck(hgsid))
                     safef(botCheckString, 256, "sid%s %f", hgsid, fraction);
                 else
                     {
                     if (hgsid)
                         // We were given an invalid hgsid - penalize this source in case of abuse
                         fraction *= 5;
                     safef(botCheckString, 256, "%s %f", ip, fraction);
                     }
                 }
     }
 else
+    // Our old system, only relevant on mirrors: bottleneck on cookie user ID plus IP address, or on IP address alone.
     {
-    if (user)
-      safef(botCheckString, 256, "%s.%s %f", user, ip, fraction);
+    if (cookieUserId)
+      safef(botCheckString, 256, "%s.%s %f", cookieUserId, ip, fraction);
     else
       safef(botCheckString, 256, "%s %f", ip, fraction);
     }
 return botCheckString;
 }
 
 boolean botException()
 /* check if the remote ip address is on the exceptions list */
 {
 char *exceptIps = cfgOption("bottleneck.except");
 if (exceptIps)
     {
     char *remoteAddr = getenv("REMOTE_ADDR");
     if (remoteAddr)
 	{