6fe66e36fae18d828825a6c551b5bb5ec05afab1
galt
  Thu Sep 13 11:41:17 2012 -0700
add tracking of connect-failures to hgCustom and hgTracks which may have many bigDataUrls pointing to the same failing site; also fixed a bug in the timeout handling of EINTR
diff --git src/lib/net.c src/lib/net.c
index 03fb764..e05fff6 100644
--- src/lib/net.c
+++ src/lib/net.c
@@ -9,56 +9,147 @@
 #include <string.h>
 #include <sys/time.h>
 #include <utime.h>
 #include <pthread.h>
 #include "internet.h"
 #include "errabort.h"
 #include "hash.h"
 #include "net.h"
 #include "linefile.h"
 #include "base64.h"
 #include "cheapcgi.h"
 #include "https.h"
 #include "sqlNum.h"
 #include "obscure.h"
 
-
 /* Brought errno in to get more useful error messages */
 
 extern int errno;
 
+/* When there are many cts, threads, hub tracks, etc., we need a quick way
+ * to remember connect failures so that we do not repeat them. */
+
+struct connFailure
+/* One remembered connect failure for a hostName:port pair; stored in connFailures[]. */
+    {
+    char *hostName;       /* host that failed to connect; assigned from cloneString() in addConnFailure */
+    int port;             /* port that failed to connect */
+    char *errorString;    /* error message to report next time; assigned from cloneString() */
+    };
+
+#define MAXCONNFAILURES 1024  /* capacity of connFailures[]; addConnFailure records nothing once it is reached */
+static struct connFailure connFailures[MAXCONNFAILURES];  /* entries [0, numConnFailures) are in use */
+static int numConnFailures = 0;  /* count of entries in use; read and incremented while holding cfMutex */
+static pthread_mutex_t cfMutex = PTHREAD_MUTEX_INITIALIZER;  /* held while reading numConnFailures and while appending an entry */
+static boolean connFailuresEnabled = FALSE;  /* feature is off until setConnFailuresEnabled(TRUE) is called */
+
+void setConnFailuresEnabled(boolean val)
+/* Turn on or off the connFailures feature; while off, checkConnFailure() returns FALSE and addConnFailure() records nothing. */
+{
+connFailuresEnabled = val;
+}
+
+boolean checkConnFailure(char *hostName, int port, char **pErrStr)
+/* Return TRUE if this hostName:port has already had a recorded failure, which can save time and avoid
+ *  more timeouts.  If pErrStr is non-NULL it is pointed at the stored message (not a copy; do not free). */
+{
+if (!connFailuresEnabled || hostName == NULL)  // callers may pass NULL before validating it; NULL never matches
+    return FALSE;
+pthread_mutex_lock( &cfMutex );
+int imax = numConnFailures;  // snapshot the count under the lock; the scan below runs unlocked
+pthread_mutex_unlock( &cfMutex );
+struct connFailure *cf = connFailures;
+int i;
+boolean result = FALSE;
+for(i=0;i<imax;++i)
+    {
+    if (sameString(cf->hostName, hostName) && cf->port == port)
+	{
+	if (pErrStr)
+	    {
+	    *pErrStr = cf->errorString;
+	    }
+	result = TRUE;
+	break;
+	}
+    ++cf;
+    }
+return result;
+}
+
+
+void addConnFailure(char *hostName, int port, char *format, ...)
+/* Format a printf-style message and add hostName:port to connFailures[] unless it is already
+ *  recorded or the table is full, which can save time and avoid more timeouts */
+{
+if (!connFailuresEnabled || hostName == NULL)  // a NULL host could never be looked up again, so skip it
+    return;
+char errorString[1024];
+va_list args;
+va_start(args, format);
+vsnprintf(errorString, sizeof(errorString), format, args);  // bounded: a long hostName truncates instead of overflowing
+va_end(args);
+if (!checkConnFailure(hostName,port,NULL))
+    {
+    pthread_mutex_lock( &cfMutex );
+    if (numConnFailures < MAXCONNFAILURES)
+	{
+	struct connFailure *cf = connFailures + numConnFailures;
+	cf->hostName = cloneString(hostName);
+	cf->port = port;
+	cf->errorString = cloneString(errorString);
+	numConnFailures++;
+	}
+    pthread_mutex_unlock( &cfMutex );
+    }
+}
+
+
+
 static int netStreamSocket()
 /* Create a TCP/IP streaming socket.  Complain and return something
  * negative if can't */
 {
 int sd = socket(AF_INET, SOCK_STREAM, 0);
 if (sd < 0)
     warn("Couldn't make AF_INET socket.");
 return sd;
 }
 
 static int netConnectWithTimeout(char *hostName, int port, long msTimeout)
 /* In order to avoid a very long default timeout (several minutes) for hosts that will
  * not answer the port, we are forced to connect non-blocking.
  * After the connection has been established, we return to blocking mode. */
 {
 int sd;
 struct sockaddr_in sai;		/* Some system socket info. */
 int res;
 fd_set mySet;
 struct timeval lTime;
 long fcntlFlags;
+struct timeval startTime;
+gettimeofday(&startTime, NULL);
+struct timeval remainingTime;
+remainingTime.tv_sec = (long) (msTimeout/1000);
+remainingTime.tv_usec = (long) (((msTimeout/1000)-remainingTime.tv_sec)*1000000);
+
+char *errorString = NULL;
+if (checkConnFailure(hostName, port, &errorString))
+    {
+    warn(errorString);
+    return -1;
+    }
 
 if (hostName == NULL)
     {
     warn("NULL hostName in netConnect");
     return -1;
     }
 if (!internetFillInAddress(hostName, port, &sai))
     return -1;
 if ((sd = netStreamSocket()) < 0)
     return sd;
 
 // Set non-blocking
 if ((fcntlFlags = fcntl(sd, F_GETFL, NULL)) < 0) 
     {
     warn("Error fcntl(..., F_GETFL) (%s)", strerror(errno));
@@ -69,68 +160,106 @@
 if (fcntl(sd, F_SETFL, fcntlFlags) < 0) 
     {
     warn("Error fcntl(..., F_SETFL) (%s)", strerror(errno));
     close(sd);
     return -1;
     }
 
 // Trying to connect with timeout
 res = connect(sd, (struct sockaddr*) &sai, sizeof(sai));
 if (res < 0)
     {
     if (errno == EINPROGRESS)
 	{
 	while (1) 
 	    {
-	    lTime.tv_sec = (long) (msTimeout/1000);
-	    lTime.tv_usec = (long) (((msTimeout/1000)-lTime.tv_sec)*1000000);
+	    lTime.tv_sec = remainingTime.tv_sec;
+    	    lTime.tv_usec = remainingTime.tv_usec;
 	    FD_ZERO(&mySet);
 	    FD_SET(sd, &mySet);
-	    res = select(sd+1, NULL, &mySet, &mySet, &lTime);
+	    res = select(sd+1, NULL, &mySet, &mySet, &lTime);  // some platforms may modify lTime.
 	    if (res < 0) 
 		{
-		if (errno != EINTR) 
+		if (errno == EINTR)  // ignore the interrupt but subtract the elapsed time from remainingTime since some platforms need this.
+		    {
+		    struct timeval newTime;
+		    gettimeofday(&newTime, NULL);
+		    struct timeval elapsedTime;
+		    // subtract startTime from newTime.
+		    if (newTime.tv_usec < startTime.tv_usec)
+			{
+			newTime.tv_usec += 1000000;
+			newTime.tv_sec--;
+			}
+		    elapsedTime.tv_usec = newTime.tv_usec - startTime.tv_usec;
+		    elapsedTime.tv_sec  = newTime.tv_sec  - startTime.tv_sec;
+		    // the elapsedTime should never be negative
+		    // subtract elapsedTime from remainingTime
+		    if (remainingTime.tv_usec < elapsedTime.tv_usec)
+			{
+			remainingTime.tv_usec += 1000000;
+			remainingTime.tv_sec--;
+			}
+		    remainingTime.tv_usec = remainingTime.tv_usec - elapsedTime.tv_usec;
+		    remainingTime.tv_sec  = remainingTime.tv_sec  - elapsedTime.tv_sec;
+		    // the remainingTime.tv_usec should never be negative
+		    // the remainingTime.tv_sec may be negative
+		    if (remainingTime.tv_sec < 0)  // means our timeout has more than expired
+			{
+			remainingTime.tv_sec = 0;
+			remainingTime.tv_usec = 0;
+			}
+		    // for the next cycle set start = new
+		    startTime.tv_sec = newTime.tv_sec;
+		    startTime.tv_usec = newTime.tv_usec;
+		    }
+		else
 		    {
 		    warn("Error in select() during TCP non-blocking connect %d - %s", errno, strerror(errno));
 		    close(sd);
 		    return -1;
 		    }
 		}
 	    else if (res > 0)
 		{
 		// Socket selected for write when it is ready
 		int valOpt;
 		socklen_t lon;
                 // But check the socket for any errors
                 lon = sizeof(valOpt);
                 if (getsockopt(sd, SOL_SOCKET, SO_ERROR, (void*) (&valOpt), &lon) < 0)
                     {
                     warn("Error in getsockopt() %d - %s", errno, strerror(errno));
                     close(sd);
                     return -1;
                     }
                 // Check the value returned...
                 if (valOpt)
                     {
                     warn("Error in TCP non-blocking connect() %d - %s", valOpt, strerror(valOpt));
+		    if (valOpt == 110)
+    			addConnFailure(hostName, port,
+			 "Error in TCP non-blocking connect() %d - %s", valOpt, strerror(valOpt));
                     close(sd);
                     return -1;
                     }
 		break;
 		}
 	    else
 		{
+		addConnFailure(hostName, port,
+		     "TCP non-blocking connect() to %s timed-out in select() after %ld milliseconds - Cancelling!", hostName, msTimeout);
 		warn("TCP non-blocking connect() to %s timed-out in select() after %ld milliseconds - Cancelling!", hostName, msTimeout);
 		close(sd);
 		return -1;
 		}
 	    }
 	}
     else
 	{
 	warn("TCP non-blocking connect() error %d - %s", errno, strerror(errno));
 	close(sd);
 	return -1;
 	}
     }
 
 // Set to blocking mode again