b74aa56e4115f371dbe818e98412f0c9e738e142 max Tue Sep 30 07:45:16 2025 -0700 allow apiKey=xxx to get around captcha and also bottleneck on the apiKey then, not on the cookie userId or hgsid, refs #36428 diff --git src/hg/lib/botDelay.c src/hg/lib/botDelay.c index 51dce3f79e4..9e0d02f1db4 100644 --- src/hg/lib/botDelay.c +++ src/hg/lib/botDelay.c @@ -3,30 +3,31 @@ * being just too demanding. */ /* Copyright (C) 2014 The Regents of the University of California * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */ #include "common.h" #include "net.h" #include "portable.h" #include "hgConfig.h" #include "cheapcgi.h" #include "hui.h" #include "hCommon.h" #include "botDelay.h" #include "jsonWrite.h" #include "regexHelper.h" +#include "hubSpaceKeys.h" #define defaultDelayFrac 1.0 /* standard penalty for most CGIs */ #define defaultWarnMs 10000 /* warning at 10 to 20 second delay */ #define defaultExitMs 20000 /* error 429 Too Many Requests after 20+ second delay */ int botDelayWarnMs = 0; /* global so the previously used value can be retrieved */ int botDelayTime(char *host, int port, char *botCheckString) /* Figure out suggested delay time for ip address in * milliseconds. */ { int sd = netMustConnect(host, port); char buf[256]; netSendString(sd, botCheckString); netRecieveString(sd, buf); @@ -80,84 +81,105 @@ hUserAbort("There is an exceedingly high volume of traffic coming from your " "site (IP address %s) as of %s (California time). It looks like " "a web robot is launching queries quickly, and not even waiting for " "the results of one query to finish before launching another query. " "/* We cannot service requests from your IP address under */ these " "conditions. (code %d)" "To use the genome browser functionality from a Unix command line, " "please read <a href='http://genome.ucsc.edu/FAQ/FAQdownloads.html#download36'>our FAQ</a> on this topic. 
" "For further help on how to access our data from a command line, " "or if " "you think this delay is being imposed unfairly, please contact genome-www@soe.ucsc.edu." , ip, asctime(localtime(&now)), millis); } static char *getCookieUser() -/* get user from hguid cookie */ +/* get the ID string stored in the hguid cookie, it looks like our hgsid session strings on the URL */ { char *user = NULL; char *centralCookie = hUserCookie(); if (centralCookie) user = findCookieData(centralCookie); return user; } boolean isValidHgsidForEarlyBotCheck(char *raw_hgsid) /* We want to use the hgsid from the CGI parameters, but sometimes requests come in with bogus strings that * need to be ignored. We don't want to run this against the database just yet, but we can at least check * the format. */ { char hgsid[1024]; // Just in case it's egregiously large, we only need the first part to decide if it's valid. safencpy(hgsid, sizeof(hgsid), raw_hgsid, 50); if (regexMatch(hgsid, "^[0-9][0-9]*_[a-zA-Z0-9]{28}$")) return TRUE; return FALSE; } char *getBotCheckString(char *ip, double fraction) /* compose "user.ip fraction" string for bot check */ { -char *user = getCookieUser(); +char *cookieUserId = getCookieUser(); char *botCheckString = needMem(256); boolean useNew = cfgOptionBooleanDefault("newBotDelay", FALSE); if (useNew) { - if (user) - safef(botCheckString, 256, "uid%s %f", user, fraction); + // the new strategy is: bottleneck on apiKey, then cookie-userId, then + // hgsid, and only if none of these is available, on IP address. Also, check + // apiKey and cookieId if they are valid, check hgsid if the string looks OK. + char *apiKey = cgiOptionalString("apiKey"); + if (apiKey) + { + // Here we do a mysql query before the bottleneck is complete. + // In theory, this can overload the MariaDB server. 
+ // But there is no way around it, we must check that the apiKey is valid + // And this is better than handling the request without bottleneck + char *userName = userNameForApiKey(apiKey); + if (userName) + safef(botCheckString, 256, "apiKey%s %f", apiKey, fraction); + else + hUserAbort("Invalid apiKey provided on URL. Make sure that the apiKey is valid. Or contact us."); + } + else + if (cookieUserId) + safef(botCheckString, 256, "uid%s %f", cookieUserId, fraction); else { + // The following happens very rarely on sites like our RR that use the Cloudflare captcha, + // as all requests (except hgLogin, hgRenderTracks) should come in with a cookie user ID char *hgsid = cgiOptionalString("hgsid"); + // For now, we do not check the hgsid against the MariaDB table, only check if the string looks OK if (hgsid && isValidHgsidForEarlyBotCheck(hgsid)) safef(botCheckString, 256, "sid%s %f", hgsid, fraction); else { if (hgsid) // We were given an invalid hgsid - penalize this source in case of abuse fraction *= 5; safef(botCheckString, 256, "%s %f", ip, fraction); } } } else + // our old system - only relevant on mirrors: bottleneck on cookie or IP address { - if (user) - safef(botCheckString, 256, "%s.%s %f", user, ip, fraction); + if (cookieUserId) + safef(botCheckString, 256, "%s.%s %f", cookieUserId, ip, fraction); else safef(botCheckString, 256, "%s %f", ip, fraction); } return botCheckString; } boolean botException() /* check if the remote ip address is on the exceptions list */ { char *exceptIps = cfgOption("bottleneck.except"); if (exceptIps) { char *remoteAddr = getenv("REMOTE_ADDR"); if (remoteAddr) {