6fe66e36fae18d828825a6c551b5bb5ec05afab1 galt Thu Sep 13 11:41:17 2012 -0700 add tracking of connect-failures to hgCustom and hgTracks which may have many bigDataUrls pointing to the same failing site; also fixed a bug in the timeout handling of EINTR diff --git src/lib/net.c src/lib/net.c index 03fb764..e05fff6 100644 --- src/lib/net.c +++ src/lib/net.c @@ -9,56 +9,147 @@ #include #include #include #include #include "internet.h" #include "errabort.h" #include "hash.h" #include "net.h" #include "linefile.h" #include "base64.h" #include "cheapcgi.h" #include "https.h" #include "sqlNum.h" #include "obscure.h" - /* Brought errno in to get more useful error messages */ extern int errno; +/* when there are many cts, threads, hubtracks, etc + * need a quick way to remember failures to not repeat them */ + +struct connFailure +/* remember connect failure */ + { + char *hostName; /* hostName */ + int port; /* port */ + char *errorString; /* error message to report next time */ + }; + +#define MAXCONNFAILURES 1024 +static struct connFailure connFailures[MAXCONNFAILURES]; +static int numConnFailures = 0; +static pthread_mutex_t cfMutex = PTHREAD_MUTEX_INITIALIZER; +static boolean connFailuresEnabled = FALSE; + +void setConnFailuresEnabled(boolean val) +/* Turn on or off the connFailures feature */ +{ +connFailuresEnabled = val; +} + +boolean checkConnFailure(char *hostName, int port, char **pErrStr) +/* check if this hostName:port has already had failure + * which can save time and avoid more timeouts */ +{ +if (!connFailuresEnabled) + return FALSE; +pthread_mutex_lock( &cfMutex ); +int imax = numConnFailures; +pthread_mutex_unlock( &cfMutex ); +struct connFailure *cf = connFailures; +int i; +boolean result = FALSE; +for(i=0;ihostName, hostName) && cf->port == port) + { + if (pErrStr) + { + *pErrStr = cf->errorString; + } + result = TRUE; + break; + } + ++cf; + } +return result; +} + + +void addConnFailure(char *hostName, int port, char *format, ...) +/* add a failure to connFailures[] + * which can save time and avoid more timeouts */ +{ +if (!connFailuresEnabled) + return; +char errorString[1024]; +va_list args; +va_start(args, format); +vsprintf(errorString, format, args); +va_end(args); +if (!checkConnFailure(hostName,port,NULL)) + { + pthread_mutex_lock( &cfMutex ); + if (numConnFailures < MAXCONNFAILURES) + { + struct connFailure *cf = connFailures + numConnFailures; + cf->hostName = cloneString(hostName); + cf->port = port; + cf->errorString = cloneString(errorString); + numConnFailures++; + } + pthread_mutex_unlock( &cfMutex ); + } +} + + + static int netStreamSocket() /* Create a TCP/IP streaming socket. Complain and return something * negative if can't */ { int sd = socket(AF_INET, SOCK_STREAM, 0); if (sd < 0) warn("Couldn't make AF_INET socket."); return sd; } static int netConnectWithTimeout(char *hostName, int port, long msTimeout) /* In order to avoid a very long default timeout (several minutes) for hosts that will * not answer the port, we are forced to connect non-blocking. * After the connection has been established, we return to blocking mode. */ { int sd; struct sockaddr_in sai; /* Some system socket info. */ int res; fd_set mySet; struct timeval lTime; long fcntlFlags; +struct timeval startTime; +gettimeofday(&startTime, NULL); +struct timeval remainingTime; +remainingTime.tv_sec = (long) (msTimeout/1000); +remainingTime.tv_usec = (long) (((msTimeout/1000)-remainingTime.tv_sec)*1000000); + +char *errorString = NULL; +if (checkConnFailure(hostName, port, &errorString)) + { + warn(errorString); + return -1; + } if (hostName == NULL) { warn("NULL hostName in netConnect"); return -1; } if (!internetFillInAddress(hostName, port, &sai)) return -1; if ((sd = netStreamSocket()) < 0) return sd; // Set non-blocking if ((fcntlFlags = fcntl(sd, F_GETFL, NULL)) < 0) { warn("Error fcntl(..., F_GETFL) (%s)", strerror(errno)); @@ -69,68 +160,106 @@ if (fcntl(sd, F_SETFL, fcntlFlags) < 0) { warn("Error fcntl(..., F_SETFL) (%s)", strerror(errno)); close(sd); return -1; } // Trying to connect with timeout res = connect(sd, (struct sockaddr*) &sai, sizeof(sai)); if (res < 0) { if (errno == EINPROGRESS) { while (1) { - lTime.tv_sec = (long) (msTimeout/1000); - lTime.tv_usec = (long) (((msTimeout/1000)-lTime.tv_sec)*1000000); + lTime.tv_sec = remainingTime.tv_sec; + lTime.tv_usec = remainingTime.tv_usec; FD_ZERO(&mySet); FD_SET(sd, &mySet); - res = select(sd+1, NULL, &mySet, &mySet, &lTime); + res = select(sd+1, NULL, &mySet, &mySet, &lTime); // some platforms may modify lTime. if (res < 0) { - if (errno != EINTR) + if (errno == EINTR) // ignore the interrupt but subtract the elapsed time from remainingTime since some platforms need this. + { + struct timeval newTime; + gettimeofday(&newTime, NULL); + struct timeval elapsedTime; + // subtract startTime from newTime. + if (newTime.tv_usec < startTime.tv_usec) + { + newTime.tv_usec += 1000000; + newTime.tv_sec--; + } + elapsedTime.tv_usec = newTime.tv_usec - startTime.tv_usec; + elapsedTime.tv_sec = newTime.tv_sec - startTime.tv_sec; + // the elapsedTime should never be negative + // subtract elapsedTime from remainingTime + if (remainingTime.tv_usec < elapsedTime.tv_usec) + { + remainingTime.tv_usec += 1000000; + remainingTime.tv_sec--; + } + remainingTime.tv_usec = remainingTime.tv_usec - elapsedTime.tv_usec; + remainingTime.tv_sec = remainingTime.tv_sec - elapsedTime.tv_sec; + // the remainingTime.tv_usec should never be negative + // the remainingTime.tv_sec may be negative + if (remainingTime.tv_sec < 0) // means our timeout has more than expired + { + remainingTime.tv_sec = 0; + remainingTime.tv_usec = 0; + } + // for the next cycle set start = new + startTime.tv_sec = newTime.tv_sec; + startTime.tv_usec = newTime.tv_usec; + } + else { warn("Error in select() during TCP non-blocking connect %d - %s", errno, strerror(errno)); close(sd); return -1; } } else if (res > 0) { // Socket selected for write when it is ready int valOpt; socklen_t lon; // But check the socket for any errors lon = sizeof(valOpt); if (getsockopt(sd, SOL_SOCKET, SO_ERROR, (void*) (&valOpt), &lon) < 0) { warn("Error in getsockopt() %d - %s", errno, strerror(errno)); close(sd); return -1; } // Check the value returned... if (valOpt) { warn("Error in TCP non-blocking connect() %d - %s", valOpt, strerror(valOpt)); + if (valOpt == 110) + addConnFailure(hostName, port, + "Error in TCP non-blocking connect() %d - %s", valOpt, strerror(valOpt)); close(sd); return -1; } break; } else { + addConnFailure(hostName, port, + "TCP non-blocking connect() to %s timed-out in select() after %ld milliseconds - Cancelling!", hostName, msTimeout); warn("TCP non-blocking connect() to %s timed-out in select() after %ld milliseconds - Cancelling!", hostName, msTimeout); close(sd); return -1; } } } else { warn("TCP non-blocking connect() error %d - %s", errno, strerror(errno)); close(sd); return -1; } } // Set to blocking mode again