43d1009151fe8dfff680d387ed6772491b17d51f
max
  Tue May 27 16:59:30 2025 -0700
adding cloudflare bot protection, without extraneous files, refs #35790

diff --git src/hg/lib/cart.c src/hg/lib/cart.c
index 1a769655b32..471adae1e1e 100644
--- src/hg/lib/cart.c
+++ src/hg/lib/cart.c
@@ -26,30 +26,32 @@
 #endif /* GBROWSE */
 #include "hgMaf.h"
 #include "hui.h"
 #include "geoMirror.h"
 #include "hubConnect.h"
 #include "trackHub.h"
 #include "cgiApoptosis.h"
 #include "customComposite.h"
 #include "regexHelper.h"
 #include "windowsToAscii.h"
 #include "jsonWrite.h"
 #include "verbose.h"
 #include "genark.h"
 #include "quickLift.h"
 
+#include <curl/curl.h>
+
 static char *sessionVar = "hgsid";	/* Name of cgi variable session is stored in. */
 static char *positionCgiName = "position";
 
 DbConnector cartDefaultConnector = hConnectCart;
 DbDisconnect cartDefaultDisconnector = hDisconnectCart;
 static boolean cartDidContentType = FALSE;
 
 struct slPair *httpHeaders = NULL; // A list of headers to output before the content-type
 
 static void hashUpdateDynamicVal(struct hash *hash, char *name, void *val)
 /* Val is a dynamically allocated (freeMem-able) entity to put
  * in hash.  Override existing hash item with that name if any.
  * Otherwise make new hash item. */
 {
 struct hashEl *hel = hashLookup(hash, name);
@@ -1438,64 +1440,195 @@
 else
     {
     char *url = genarkUrl(db);
 
     if (url != NULL)
         {
         cartSetString(cart, "genome", db);
         cartAddString(cart, "hubUrl", url);
         cartRemove(cart, "db");
         }
     else if (!hDbIsActive(db))
 	errAbort("Can not find database '%s'", db);
     }
 }
 
+// ------ libify this in the next release ----
+//
+struct curlString {  /* growable text buffer that writefunc() fills with a libcurl response */
+    char *ptr;  /* heap memory; init_string() and writefunc() keep it NUL-terminated */
+    size_t len;  /* number of bytes stored in ptr, not counting the terminating NUL */
+};
+void init_string(struct curlString *s) { /* make s an empty, NUL-terminated heap string */
+    s->len = 0;
+    s->ptr = calloc(1, 1);  /* zero-filled, so ptr[0] is already the terminating NUL */
+    if (s->ptr == NULL) errAbort("init_string: out of memory");  /* never write through a failed allocation */
+}
+
+size_t writefunc(void *ptr, size_t size, size_t nmemb, void *userData) { /* libcurl write callback */
+    struct curlString *s = userData;  /* the response collected so far; this chunk is appended to it */
+    char *grown = realloc(s->ptr, s->len + size * nmemb + 1);
+    if (grown == NULL) return 0;  /* s->ptr is still valid; a short count makes libcurl fail the transfer */
+    memcpy(grown + s->len, ptr, size * nmemb);
+    *s = (struct curlString){ .ptr = grown, .len = s->len + size * nmemb };
+    s->ptr[s->len] = '\0';  /* keep the buffer usable as a C string */
+    return size * nmemb;
+}
+
+char* curlPostUrl(char *url, char *data)
+/* Post data to URL and return the response body as a string. Must be freed.
+ * If the transfer fails, that is logged to stderr and an empty string is returned. */
+{
+CURL *curl = curl_easy_init();
+if (!curl)
+    errAbort("Cannot init curl library");
+struct curlString response;
+init_string(&response);
+curl_easy_setopt(curl, CURLOPT_URL, url);
+curl_easy_setopt(curl, CURLOPT_POSTFIELDS, data);
+curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writefunc);
+curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
+CURLcode status = curl_easy_perform(curl);
+curl_easy_cleanup(curl);
+if (status != CURLE_OK)
+    fprintf(stderr, "curlPostUrl: POST to %s failed: %s\n", url, curl_easy_strerror(status));
+char *resp = cloneString(status == CURLE_OK ? response.ptr : ""); // never pass on a partial body
+free(response.ptr);
+return resp;
+}
+
+boolean isValidToken(char *token)
+/* Send a https request to cloudflare to check that the token we got from the captcha was really made by cloudflare. */
+{
+    char *secret = cfgVal("cloudFlareSecretKey");
+    if (!secret)
+        errAbort("'cloudFlareSecretKey' must be set in hg.conf if cloudflare is activated in hg.conf");
+    char *safeToken = curl_easy_escape(NULL, token, 0); // token is user input: escaping keeps it from smuggling in extra form fields
+    if (!safeToken) return FALSE; // could not be encoded: treat the token as invalid
+    char data[3000]; // cloudflare token is at most 2000 bytes
+    safef(data, sizeof(data), "secret=%s&response=%s", secret, safeToken);
+    curl_free(safeToken);
+    char *reply = curlPostUrl("https://challenges.cloudflare.com/turnstile/v0/siteverify", data);
+    boolean res = startsWith("{\"success\":true", reply);
+    freez(&reply);
+    return res;
+}
+
+#define CLOUDFLARESITEKEY "cloudFlareSiteKey"
+
+void printCaptcha()
+/* Print a complete html page, http header included, that shows the cloudflare captcha widget, then exit.
+ * On success the page reloads its own URL with token=x added. Returns without output if no site key is set. */
+{
+    char *siteKey = cfgVal(CLOUDFLARESITEKEY);
+    if (siteKey == NULL)
+        return;
+    fputs("Content-Type:text/html\n"
+          "\n\n"
+          "<html><head>\n"
+          "<script>\n", stdout);
+    printf("function showWidget() { \n"
+           "turnstile.render('#myWidget', {\n"
+           "sitekey: '%s',\n"
+           "theme: 'light',\n"
+           "callback: function (token) {\n"
+           "     const parser = new URL(window.location);\n"
+           "     parser.searchParams.set('token', token);\n"
+           "     window.location = parser.href;\n"
+           "   },\n"
+           "});\n"
+           "}\n", siteKey);
+    fputs("</script>\n"
+          "</head><body>\n"
+          "<style>body, h1, h2, h3, h4, h5, h6  { font-family: Helvetica, Arial, sans-serif; }</style>\n\n"
+          "<h4>The Genome Browser is protecting itself from bots. This will just take a few seconds.</h4>\n"
+          "<small>If you are a bot and were made for a research project, please contact us by email.</small>\n"
+          "<script src='https://challenges.cloudflare.com/turnstile/v0/api.js?onload=showWidget' async defer></script>\n"
+          "<div id='myWidget'></div>\n"
+          "</body></html>\n", stdout);
+    sqlCleanupAll(); // close the database connections before exiting, in case any would be left hanging
+    exit(0);
+}
+
+void forceUserIdOrCaptcha(struct cart* cart, char *userId, boolean userIdFound, boolean fromCommandLine)
+/* Print the captcha page unless the request is exempt or sent a known hguid cookie or a valid cloudflare
+ * token. Exempt are: command line runs, no site key configured, rtracklayer, the okUserAgent user agent. */
+{
+if (fromCommandLine)
+    return;
+if (cfgOption(CLOUDFLARESITEKEY) == NULL) // captcha is not configured
+    return;
+// rtracklayer gets through unless blockRtracklayer is set
+if (cfgOption("blockRtracklayer") == NULL && sameOk(cgiUserAgent(), "rtracklayer"))
+    return;
+// so QA can add a user agent after release, in case someone complains
+char *allowedAgent = cfgOption("okUserAgent");
+if (allowedAgent != NULL && sameOk(cgiUserAgent(), allowedAgent))
+    return;
+if (userId != NULL && userIdFound)
+    return;
+
+char *token = cgiOptionalString("token");
+if (token == NULL || !isValidToken(token))
+    {
+    printCaptcha();
+    return;
+    }
+// NOTE(review): cartNew() calls this on a cart whose hash is still empty, so this may be a no-op -- confirm
+cartRemove(cart, "token");
+}
+
+void cartRemove(struct cart *cart, char *var);
+
 struct cart *cartNew(char *userId, char *sessionId,
                      char **exclude, struct hash *oldVars)
 /* Load up cart from user & session id's.  Exclude is a null-terminated list of
  * strings to not include */
 {
 cgiApoptosisSetup();
 if (cfgOptionBooleanDefault("showEarlyErrors", TRUE))
     errAbortSetDoContentType(TRUE);
 
 if (cfgOptionBooleanDefault("suppressVeryEarlyErrors", FALSE))
     htmlSuppressErrors();
 setUdcCacheDir();
 
 netSetTimeoutErrorMsg("A connection timeout means that either the server is offline or its firewall, the UCSC firewall or any router between the two blocks the connection.");
 
-
 struct cart *cart;
 struct sqlConnection *conn = cartDefaultConnector();
 char *ex;
 boolean userIdFound = FALSE, sessionIdFound = FALSE;
 
 AllocVar(cart);
 cart->hash = newHash(12);
 cart->exclude = newHash(7);
 cart->userId = userId;
 cart->sessionId = sessionId;
 cart->userInfo = loadDb(conn, userDbTable(), userId, &userIdFound);
+
 cart->sessionInfo = loadDb(conn, sessionDbTable(), sessionId, &sessionIdFound);
 
-boolean fromCli = cgiWasSpoofed();
+boolean fromCli = cgiWasSpoofed(); // QA runs our CGIs from the command line and we debug from there
 
-if (sessionIdFound && !userIdFound && !fromCli && cfgOptionBooleanDefault("cartTrace", FALSE))
-    fprintf(stderr, "HGSID_WITHOUT_COOKIE\n");
+forceUserIdOrCaptcha(cart, userId, userIdFound, fromCli);
 
+// we rely on the cookie being validated, so if we reset a cookie, do this after the captcha
+if ( cgiOptionalString("ignoreCookie") != NULL )
+    cart->userInfo = loadDb(conn, userDbTable(), NULL, &userIdFound);
+
+// Leaving this in the code temporarily, until June 2025 release.
 if (!fromCli && 
     ((sessionId && !sessionIdFound) || !sessionId) && 
     (!userId || !userIdFound) && 
     cfgOptionBooleanDefault("punishInvalidHgsid", FALSE))
     {
     fprintf(stderr, "HGSID_WAIT no sessionId and no cookie: 5 seconds penalty");
     sleep(5);
     if (sessionId && !sessionIdFound)
         {
         fprintf(stderr, "HGSID_WAIT2 sessionId sent but invalid: 10 seconds penalty");
         sleep(10);
         }
     }
 
 if (sessionIdFound)
@@ -2510,35 +2643,31 @@
 char *noProxy = cfgOption("noProxy");
 if (noProxy)
     setenv("no_proxy", noProxy, TRUE);
 char *logProxy = cfgOption("logProxy");
 if (logProxy)
     setenv("log_proxy", logProxy, TRUE);
 
 /* noSqlInj settings so they are accessible in src/lib too */
 char *noSqlInj_level = cfgOption("noSqlInj.level");
 if (noSqlInj_level)
     setenv("noSqlInj_level", noSqlInj_level, TRUE);
 char *noSqlInj_dumpStack = cfgOption("noSqlInj.dumpStack");
 if (noSqlInj_dumpStack)
     setenv("noSqlInj_dumpStack", noSqlInj_dumpStack, TRUE);
 
-
-// if ignoreCookie is on the URL, don't check for cookies
-char *hguid = NULL;
-if ( cgiOptionalString("ignoreCookie") == NULL )
-    hguid = getCookieId(cookieName);
+char *hguid = getCookieId(cookieName);
 
 // if _dumpToLog is on the URL, we can exit early with whatever
 // message we are trying to write to the stderr/error_log
 char *logMsg = NULL;
 if ( (logMsg = cgiOptionalString("_dumpToLog")) != NULL)
     {
     cartJsonStart();
     fprintf(stderr, "%s", logMsg);
     cartJsonEnd(NULL);
     exit(0);
     }
 char *hgsid = getSessionId();
 struct cart *cart = cartNew(hguid, hgsid, exclude, oldVars);
 cartExclude(cart, sessionVar);