0bad11bffc60ba7d250b33495d2f392d1964826b
jcasper
  Tue May 20 08:39:04 2025 -0700
Initial bottleneck change to validate hgsids without database access, refs #35763

diff --git src/hg/lib/botDelay.c src/hg/lib/botDelay.c
index d9bea151b67..51dce3f79e4 100644
--- src/hg/lib/botDelay.c
+++ src/hg/lib/botDelay.c
@@ -2,30 +2,31 @@
  * for a little bit if IP address looks like it is
  * being just too demanding. */
 
 /* Copyright (C) 2014 The Regents of the University of California 
  * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 #include "common.h"
 #include "net.h"
 #include "portable.h"
 #include "hgConfig.h"
 #include "cheapcgi.h"
 #include "hui.h"
 #include "hCommon.h"
 #include "botDelay.h"
 #include "jsonWrite.h"
+#include "regexHelper.h"
 
 #define defaultDelayFrac 1.0   /* standard penalty for most CGIs */
 #define defaultWarnMs 10000    /* warning at 10 to 20 second delay */
 #define defaultExitMs 20000    /* error 429 Too Many Requests after 20+ second delay */
 
 int botDelayWarnMs  = 0;       /* global so the previously used value can be retrieved */
 
 int botDelayTime(char *host, int port, char *botCheckString)
 /* Figure out suggested delay time for ip address in
  * milliseconds. */
 {
 int sd = netMustConnect(host, port);
 char buf[256];
 netSendString(sd, botCheckString);
 netRecieveString(sd, buf);
@@ -90,49 +91,69 @@
        , ip, asctime(localtime(&now)), millis);
 }
 
 static char *getCookieUser()
 /* get user from hguid cookie */
 {
 char *user = NULL;
 char *centralCookie = hUserCookie();
 
 if (centralCookie)
     user = findCookieData(centralCookie);
 
 return user;
 }
 
+
+boolean isValidHgsidForEarlyBotCheck(char *raw_hgsid)
+/* Return TRUE if raw_hgsid has the shape: digits, '_', then 28 alphanumerics.  Requests sometimes arrive
+ * with bogus hgsid strings; this is a format-only check that runs before any database lookup.
+ * The caller embeds the raw string in its bot check key, so the entire string must match. */
+{
+// Reject over-long input up front: matching a truncated copy would accept a valid prefix followed by junk.
+if (raw_hgsid == NULL || strlen(raw_hgsid) > 50)
+    return FALSE;
+if (regexMatch(raw_hgsid, "^[0-9][0-9]*_[a-zA-Z0-9]{28}$"))
+    return TRUE;
+return FALSE;
+}
+
+
 char *getBotCheckString(char *ip, double fraction)
 /* compose "user.ip fraction" string for bot check */
 {
 char *user = getCookieUser();
 char *botCheckString = needMem(256);
 boolean useNew = cfgOptionBooleanDefault("newBotDelay", FALSE);
 if (useNew)
     {
-        char *hgsid = cgiOptionalString("hgsid");
         if (user)
             safef(botCheckString, 256, "uid%s %f", user, fraction);
         else
             {
-            if (hgsid)
+            char *hgsid = cgiOptionalString("hgsid");
+            if (hgsid && isValidHgsidForEarlyBotCheck(hgsid))
                 safef(botCheckString, 256, "sid%s %f", hgsid, fraction);
             else
+                {
+                if (hgsid)
+                    // An hgsid was supplied but failed the format check: key on the IP below and scale the fraction 5x to penalize possible abuse
+                    fraction *= 5;
                 safef(botCheckString, 256, "%s %f", ip, fraction);
                 }
             }
+    }
 else
     {
     if (user)
       safef(botCheckString, 256, "%s.%s %f", user, ip, fraction);
     else
       safef(botCheckString, 256, "%s %f", ip, fraction);
     }
 return botCheckString;
 }
 
 boolean botException()
 /* check if the remote ip address is on the exceptions list */
 {
 char *exceptIps = cfgOption("bottleneck.except");
 if (exceptIps)