a47f57d008f2db28a7b140ece87cbcfb8e8a5c0c
hiram
  Thu Sep 12 14:18:19 2019 -0700
expand earlyBotCheck to manage two types of outputs, recognize defaults, and adding hgGateway to the game refs #23217

diff --git src/hg/lib/botDelay.c src/hg/lib/botDelay.c
index 5304629..9b7d1a9 100644
--- src/hg/lib/botDelay.c
+++ src/hg/lib/botDelay.c
@@ -1,296 +1,343 @@
 /* botDelay.c - contact bottleneck server and sleep
  * for a little bit if IP address looks like it is
  * being just too demanding. */
 
 /* Copyright (C) 2014 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 
 #include "common.h"
 #include "net.h"
 #include "portable.h"
 #include "hgConfig.h"
 #include "cheapcgi.h"
 #include "hui.h"
 #include "hCommon.h"
 #include "botDelay.h"
+#include "jsonWrite.h"
 
 #define defaultDelayFrac 1.0   /* standard penalty for most CGIs */
 #define defaultWarnMs 10000    /* warning at 10 to 20 second delay */
 #define defaultExitMs 20000    /* error 429 Too Many Requests after 20+ second delay */
 
 int botDelayTime(char *host, int port, char *botCheckString)
 /* Figure out suggested delay time for ip address in
  * milliseconds. */
 {
 int sd = netMustConnect(host, port);
 char buf[256];
 netSendString(sd, botCheckString);
 netRecieveString(sd, buf);
 close(sd);
 return atoi(buf);
 }
 
 void botDelayMessage(char *ip, int millis)
 /* Print out message saying why you are stalled. */
 {
 time_t now = time(NULL);
 warn("There is a very high volume of traffic coming from your "
        "site (IP address %s) as of %s (California time).  So that other "
        "users get a fair share "
        "of our bandwidth, we are putting in a delay of %3.1f seconds "
        "before we service your request.  This delay will slowly "
        "decrease over a half hour as activity returns to normal.  This "
        "high volume of traffic is likely due to program-driven rather than "
        "interactive access, or the submission of queries on a large "
        "number of sequences.  If you are making large batch queries, "
        "please write to our genome@soe.ucsc.edu public mailing list "
        "and inquire about more efficient ways to access our data.  "
        "If you are sharing an IP address with someone who is submitting "
        "large batch queries, we apologize for the "
        "inconvenience. "
        "To use the genome browser functionality from a Unix command line, "
        "please read <a href='http://genome.ucsc.edu/FAQ/FAQdownloads.html#download36'>our FAQ</a> on this topic. "
        "For further help on how to access our data from a command line, "
        "or if "
        "you think this delay is being imposed unfairly, please contact genome-www@soe.ucsc.edu.",
 	    ip, asctime(localtime(&now)), .001*millis);
 }
 
 void botTerminateMessage(char *ip, int millis)
 /* Print out message saying why you are terminated. */
 {
 time_t now = time(NULL);
 hUserAbort("There is an exceedingly high volume of traffic coming from your "
        "site (IP address %s) as of %s (California time).  It looks like "
        "a web robot is launching queries quickly, and not even waiting for "
        "the results of one query to finish before launching another query. "
        "/* We cannot service requests from your IP address under */ these "
        "conditions.  (code %d)"
        "To use the genome browser functionality from a Unix command line, "
        "please read <a href='http://genome.ucsc.edu/FAQ/FAQdownloads.html#download36'>our FAQ</a> on this topic. "
        "For further help on how to access our data from a command line, "
        "or if "
        "you think this delay is being imposed unfairly, please contact genome-www@soe.ucsc.edu."
        , ip, asctime(localtime(&now)), millis);
 }
 
 static char *getCookieUser()
 /* get user from hguid cookie */
 {
 char *user = NULL;
 char *centralCookie = hUserCookie();
 
 if (centralCookie)
     user = findCookieData(centralCookie);
 
 return user;
 }
 
 char *getBotCheckString(char *ip, double fraction)
 /* compose "user.ip fraction" string for bot check */
 {
 char *user = getCookieUser();
 char *botCheckString = needMem(256);
 if (user)
   safef(botCheckString, 256, "%s.%s %f", user, ip, fraction);
 else
   safef(botCheckString, 256, "%s %f", ip, fraction);
 return botCheckString;
 }
 
 void botDelayCgi(char *host, int port, boolean noWarn, double fraction)
 /* Connect with bottleneck server and sleep the
  * amount it suggests for IP address calling CGI script,
  * after imposing the specified fraction of the access penalty. */
 {
 int millis;
 char *ip = getenv("REMOTE_ADDR");
 if (ip != NULL)
     {
     char *botCheckString = getBotCheckString(ip, fraction);
     millis = botDelayTime(host, port, botCheckString);
     freeMem(botCheckString);
     if (millis > 0)
 	{
 	if (millis > 10000)
 	    {
 	    if (millis > 20000)
 	        botTerminateMessage(ip, millis);
 	    else
 		{
 		if (!noWarn)
 		    botDelayMessage(ip, millis);
 		}
 	    }
 	sleep1000(millis);
 	}
     }
 }
 
 boolean botException()
 /* check if the remote ip address is on the exceptions list */
 {
 char *exceptIps = cfgOption("bottleneck.except");
 if (exceptIps)
     {
     char *remoteAddr = getenv("REMOTE_ADDR");
     if (remoteAddr)
 	{
 	char *s = exceptIps;
 	boolean found = FALSE;
 	while (s && !found)
 	    {
 	    char *e = strchr(s, ' ');
 	    if (e)
 		*e = 0;
 	    if (sameString(remoteAddr, s))
 		found = TRUE;
 	    if (e)
 		*e++ = ' ';
 	    s = e;
 	    }
 	if (found)
 	    return TRUE;
 	}
     }
 return FALSE;
 }
 
 static void hgBotDelayExt(boolean noWarn, double fraction)
 /* High level bot delay call - looks up bottleneck server
  * in hg.conf. */
 {
 if (botException())
     return;
 
 char *host = cfgOption("bottleneck.host");
 char *port = cfgOption("bottleneck.port");
 
 if (host != NULL && port != NULL)
     botDelayCgi(host, atoi(port), noWarn, fraction);
 }
 
 void hgBotDelay()
 /* High level bot delay call - for use with regular webpage output */
 {
 hgBotDelayExt(FALSE, defaultDelayFrac);
 }
 
 void hgBotDelayFrac(double fraction)
 /* Like hgBotDelay, but imposes a fraction of the standard access penalty */
 {
 hgBotDelayExt(FALSE, fraction);
 }
 
 void hgBotDelayNoWarn()
 /* High level bot delay call without warning - for use with non-webpage outputs */
 {
 hgBotDelayExt(TRUE, defaultDelayFrac);
 }
 
 void hgBotDelayNoWarnFrac(double fraction)
 /* Like hgBotDelayNoWarn, but imposes a fraction of the standard access penalty */
 {
 hgBotDelayExt(TRUE, fraction);
 }
 
 int hgBotDelayTime()
 {
 return hgBotDelayTimeFrac(defaultDelayFrac);
 }
 
 int hgBotDelayTimeFrac(double fraction)
 /* Get suggested delay time from cgi using the standard penalty. */
 {
 char *ip = getenv("REMOTE_ADDR");
 char *host = cfgOption("bottleneck.host");
 char *port = cfgOption("bottleneck.port");
 
 int delay = 0;
 if (host != NULL && port != NULL && ip != NULL)
     {
     char *botCheckString = getBotCheckString(ip, fraction);
     delay = botDelayTime(host, atoi(port), botCheckString);
     freeMem(botCheckString);
     }
 return delay;
 }
 
 #define err429  429
 #define err429Msg       "Too Many Requests"
 int botDelayMillis = 0;
 
-static void hogExit(char *cgiName, long enteredMainTime)
+static void jsonHogExit(char *cgiExitName, long enteredMainTime, char *hogHost,
+    int retryAfterSeconds)
+/* Print the err429 Too Many Requests CGI response with a JSON body; cgiExitName and enteredMainTime are not used here */
+{
+puts("Content-Type:application/json");
+printf("Status: %d %s\n", err429, err429Msg);
+if (retryAfterSeconds > 0)
+    printf("Retry-After: %d", retryAfterSeconds);
+puts("\n");	/* puts adds its own newline too: ends the last header line, then blank line between header and body */
+/* body: one JSON object holding error, statusCode, statusMessage and, when > 0, retryAfterSeconds */
+struct jsonWrite *jw = jsonWriteNew();
+jsonWriteObjectStart(jw, NULL);
+jsonWriteString(jw, "error", err429Msg);
+jsonWriteNumber(jw, "statusCode", err429);
+
+char msg[1024];
+/* hogHost comes from getenv("REMOTE_ADDR") in hogExit() and is not checked for NULL */
+safef(msg, sizeof(msg), "Your host, %s, has been sending too many requests "
+       "lately and is unfairly loading our site, impacting performance for "
+       "other users.  Please contact genome@soe.ucsc.edu to ask that your site "
+       "be reenabled.  Also, please consider downloading sequence and/or "
+       "annotations in bulk -- see http://genome.ucsc.edu/downloads.html.",
+       hogHost);
+
+jsonWriteString(jw, "statusMessage", msg);
+if (retryAfterSeconds > 0)
+    jsonWriteNumber(jw, "retryAfterSeconds", retryAfterSeconds);
+
+jsonWriteObjectEnd(jw);
+/* print the accumulated JSON text; jw is not freed here */
+puts(jw->dy->string);
+}
+
+static void hogExit(char *cgiName, long enteredMainTime, char *exitType,
+    int retryAfterSeconds)
 /* earlyBotCheck requests exit before CGI has done any output or
  * setups of any kind.  HTML output has not yet started.
  */
 {
 char *hogHost = getenv("REMOTE_ADDR");
 char cgiExitName[1024];
 safef(cgiExitName, ArraySize(cgiExitName), "%s hogExit", cgiName);
 
+if (sameOk("json", exitType))
+   jsonHogExit(cgiExitName, enteredMainTime, hogHost, retryAfterSeconds);
+else
+    {
+    /* every exitType other than "json" gets the HTML error page below */
     puts("Content-Type:text/html");
     printf("Status: %d %s\n", err429, err429Msg);
-puts("Retry-After: 30");
-puts("\n");
+    if (retryAfterSeconds > 0)
+        printf("Retry-After: %d", retryAfterSeconds);
+    puts("\n");	/* puts adds its own newline too: ends the last header line, then blank line between header and body */
 
     puts("<!DOCTYPE HTML 4.01 Transitional>\n");
     puts("<html lang='en'>");
     puts("<head>");
     puts("<meta charset=\"utf-8\">");
     printf("<title>Status %d: %s</title></head>\n", err429, err429Msg);
 
     printf("<body><h1>Status %d: %s</h1><p>\n", err429, err429Msg);
     time_t now = time(NULL);
     printf("There is an exceedingly high volume of traffic coming from your "
            "site (IP address %s) as of %s (California time).  It looks like "
            "a web robot is launching queries quickly, and not even waiting for "
            "the results of one query to finish before launching another query. "
            "<b>We cannot service requests from your IP address under</b> these "
            "conditions.  (code %d) "
            "To use the genome browser functionality from a Unix command line, "
            "please read <a href='http://genome.ucsc.edu/FAQ/FAQdownloads.html#download36'>our FAQ</a> on this topic. "
            "For further help on how to access our data from a command line, "
            "or if "
            "you think this delay is being imposed unfairly, please contact genome-www@soe.ucsc.edu."
            ,hogHost, asctime(localtime(&now)), botDelayMillis);
     puts("</body></html>");
+    }
 cgiExitTime(cgiExitName, enteredMainTime);
 exit(0);
 }       /*      static void hogExit()   */
 
-
-boolean earlyBotCheck(long enteredMainTime, char *cgiName, double delayFrac, int warnMs, int exitMs)
+boolean earlyBotCheck(long enteredMainTime, char *cgiName, double delayFrac, int warnMs, int exitMs, char *exitType)
 /* similar to botDelayCgi but for use before the CGI has started any
  * output or setup the cart of done any MySQL operations.  The boolean
  * return is used later in the CGI after it has done all its setups and
  * started output so it can issue the warning.  Pass in delayFrac 0.0
- * to use the default 1.0
+ * to use the default 1.0, pass in 0 (or less) for warnMs and exitMs to use
+ * their defaults; exitType "json" selects a JSON body for the hogExit()
+ * response, and any other value gets the HTML page.
  */
 {
 boolean issueWarning = FALSE;
 
 if (botException())	/* don't do this if caller is on the exception list */
     return issueWarning;
 
 if (delayFrac < 0.000001) /* passed in zero, use default */
     delayFrac = defaultDelayFrac;
 if (warnMs < 1)	/* passed in zero, use default */
     warnMs = defaultWarnMs;
 if (exitMs < 1)	/* passed in zero, use default */
     exitMs = defaultExitMs;
 
 botDelayMillis = hgBotDelayTimeFrac(delayFrac);
 if (botDelayMillis > 0)
     {
-    sleep1000(botDelayMillis);
+    int msAboveWarning = botDelayMillis - warnMs;
+    int retryAfterSeconds = 0;
+    if (msAboveWarning > 0)
+       retryAfterSeconds = 1 + (msAboveWarning / 10); /* NOTE(review): ms/10, not ms/1000 - presumably tied to the bottleneck server's decay rate; confirm */
     if (botDelayMillis > warnMs)
 	{
-	if (botDelayMillis > exitMs)
-	    hogExit(cgiName, enteredMainTime);
+	if (botDelayMillis > exitMs) /* hogExit() prints the 429 response and exits, so no sleep is done */
+	    hogExit(cgiName, enteredMainTime, exitType, retryAfterSeconds);
 	else
 	    issueWarning = TRUE;
 	}
+    sleep1000(botDelayMillis); /* only reached while delay is <= exitMs */
     }
-return issueWarning;
+return issueWarning;	/* caller can decide on their type of warning */
 }	/*	boolean earlyBotCheck()	*/