bebcb6453c54164564b24899c6f407310b551a93
max
  Thu Jan 5 06:37:34 2023 -0800
first go at s3:// support, refs #30445

adding cache file to UDC protocol support, refs #30445

handling expired presigned URLs in udc protocol, refs #30445

diff --git src/lib/udc.c src/lib/udc.c
index b44ade3..3a34d36 100644
--- src/lib/udc.c
+++ src/lib/udc.c
@@ -27,30 +27,31 @@
 #include <sys/mman.h>
 #include "common.h"
 #include "hash.h"
 #include "obscure.h"
 #include "bits.h"
 #include "linefile.h"
 #include "portable.h"
 #include "sig.h"
 #include "net.h"
 #include "cheapcgi.h"
 #include "htmlPage.h"
 #include "udc.h"
 #include "hex.h"
 #include <dirent.h>
 #include <openssl/sha.h>
+#include <sys/wait.h>
 
 /* The stdio stream we'll use to output statistics on file i/o.  Off by default. */
 FILE *udcLogStream = NULL;
 
 void udcSetLog(FILE *fp)
 /* Turn on logging of file i/o. 
  * For each UDC file two lines are written.  One line for the open, and one line for the close. 
  * The Open line just has the URL being opened.
  * The Close line has the the URL plus a bunch of counts of the number of seeks, reads, and writes
  *   for the following four files: the udc bitmap, the udc sparse data, the incoming calls
  *   to the UDC layer, and the network connection to the (possibly) remote file.
  *   There are two additional counts: the number of socket connects, and the 
  *   number of times a socket is reused instead of closed and reopened.
  */
 {
@@ -80,30 +81,31 @@
     };
 
 #define udcBlockSize (8*1024)
 /* All fetch requests are rounded up to block size. */
 
 #define udcMaxBytesPerRemoteFetch (udcBlockSize * 32)
 /* Very large remote reads are broken down into chunks this size. */
 
 struct connInfo
 /* Socket descriptor and associated info, for keeping net connections open. */
     {
     int socket;                 /* Socket descriptor for data connection (or 0). */
     bits64 offset;		/* Current file offset of socket. */
     int ctrlSocket;             /* (FTP only) Control socket descriptor or 0. */
     char *redirUrl;             /* (HTTP(S) only) use redirected url */
+    char *resolvedUrl;          /* resolved HTTPS URL, if url is a pseudo-URL (s3://) */
     };
 
 typedef int (*UdcDataCallback)(char *url, bits64 offset, int size, void *buffer,
 			       struct udcFile *file);
 /* Type for callback function that fetches file data. */
 
 struct udcRemoteFileInfo
 /* Information about a remote file. */
     {
     bits64 updateTime;	/* Last update in seconds since 1970 */
     bits64 size;	/* Remote file size */
     struct connInfo ci; /* Connection info for open net connection */
     };
 
 typedef boolean (*UdcInfoCallback)(char *url, struct udcRemoteFileInfo *retInfo);
@@ -120,64 +122,90 @@
 
 struct udcFile
 /* A file handle for our caching system. */
     {
     struct udcFile *next;	/* Next in list. */
     char *url;			/* Name of file - includes protocol */
     char *protocol;		/* The URL up to the first colon.  http: etc. */
     struct udcProtocol *prot;	/* Protocol specific data and methods. */
     time_t updateTime;		/* Last modified timestamp. */
     bits64 size;		/* Size of file. */
     bits64 offset;		/* Current offset in file. */
     char *cacheDir;		/* Directory for cached file parts. */
     char *bitmapFileName;	/* Name of bitmap file. */
     char *sparseFileName;	/* Name of sparse data file. */
     char *redirFileName;	/* Name of redir file. */
+    char *resolvedFileName;     /* Name of file that stores final, resolved URL */
     int fdSparse;		/* File descriptor for sparse data file. */
     boolean sparseReadAhead;    /* Read-ahead has something in the buffer */
     char *sparseReadAheadBuf;   /* Read-ahead buffer, if any */
     bits64 sparseRAOffset;      /* Read-ahead buffer offset */
     struct udcBitmap *bits;     /* udcBitMap */
     bits64 startData;		/* Start of area in file we know to have data. */
     bits64 endData;		/* End of area in file we know to have data. */
     bits32 bitmapVersion;	/* Version of associated bitmap we were opened with. */
     struct connInfo connInfo;   /* Connection info for open net connection. */
     void *mmapBase;             /* pointer to memory address if file has been mmapped, or NULL */
     struct ios ios;             /* Statistics on file access. */
     };
 
 struct udcBitmap
 /* The control structure including the bitmap of blocks that are cached. */
     {
     struct udcBitmap *next;	/* Next in list. */
     bits32 blockSize;		/* Number of bytes per block of file. */
     bits64 remoteUpdate;	/* Remote last update time. */
     bits64 fileSize;		/* File size */
     bits32 version;		/* Version - increments each time cache is stale. */
     bits64 localUpdate;		/* Time we last fetched new data into cache. */
     bits64 localAccess;		/* Time we last accessed data. */
     boolean isSwapped;		/* If true need to swap all bytes on read. */
     int fd;			/* File descriptor for file with current block. */
     };
 static char *bitmapName = "bitmap";
 static char *sparseDataName = "sparseData";
 static char *redirName = "redir";
+static char *resolvedName = "resolv";
+
 #define udcBitmapHeaderSize (64)
 static int cacheTimeout = 0;
 
 #define MAX_SKIP_TO_SAVE_RECONNECT (udcMaxBytesPerRemoteFetch / 2)
 
+/* pseudo-URLs with one of these protocols (e.g. "s3://") get run through a command that resolves them to real HTTPS URLs */
+struct slName *resolvProts = NULL;
+static char *resolvCmd = NULL;
+
+bool udcIsResolvable(char *url)
+/* TRUE if a resolver is configured and url's protocol (the text before the first ':') is one it handles */
+{
+if (resolvProts == NULL || resolvCmd == NULL)
+    return FALSE;  // resolving is switched off
+char *protEnd = strchr(url, ':');
+if (protEnd == NULL)
+    return FALSE;  // no "protocol:" prefix at all
+char *prefix = cloneStringZ(url, protEnd - url);
+bool needsResolver = FALSE;
+if (slNameFind(resolvProts, prefix) != NULL)
+    {
+    needsResolver = TRUE;
+    verbose(4, "Check: URL %s has special protocol://, will need resolving\n", url);
+    }
+freez(&prefix);
+return needsResolver;
+}
+
 static off_t ourMustLseek(struct ioStats *ioStats, int fd, off_t offset, int whence)
 {
 ioStats->numSeeks++;
 return mustLseek(fd, offset, whence);
 }
 
 
 static void ourMustWrite(struct ioStats *ioStats, int fd, void *buf, size_t size)
 {
 ioStats->numWrites++;
 ioStats->bytesWritten += size;
 mustWriteFd(fd, buf, size);
 }
 
 static size_t ourRead(struct ioStats *ioStats, int fd, void *buf, size_t size)
@@ -422,30 +450,37 @@
 sleep1000(500);
 struct stat status;
 int ret = stat(fileName, &status);
 if (ret < 0)
     return FALSE;
 retInfo->updateTime = status.st_mtime;
 retInfo->size = status.st_size;
 return TRUE;
 }
 
 /********* Section for http protocol **********/
 
 static char *defaultDir = "/tmp/udcCache";
 static bool udcInitialized = FALSE;
 
+void udcSetResolver(char *prots, char *cmd)
+/* Set comma-separated protocols (e.g. "s3") and the local wrapper program that resolves such pseudo-URLs to HTTPS */
+{
+resolvProts = slNameListFromString(prots, ',');  // list elements are separate allocations, prots itself is not kept
+resolvCmd = cmd;  // pointer is stored, not copied: cmd must stay valid for as long as udc is used
+}
+
 static void initializeUdc()
 /* Use the $TMPDIR environment variable, if set, to amend the default location
  * of the cache */
 {
 if (udcInitialized)
     return;
 char *tmpDir = getenv("TMPDIR");
 if (isNotEmpty(tmpDir))
     {
     char buffer[2048];
     safef(buffer, sizeof(buffer), "%s/udcCache", tmpDir);
     udcSetDefaultDir(buffer);
     }
 }
 
@@ -465,36 +500,94 @@
 }
 
 void udcDisableCache()
 /* Switch off caching. Re-enable with udcSetDefaultDir */
 {
 defaultDir = NULL;
 udcInitialized = TRUE;
 }
 
 static bool udcCacheEnabled()
 /* TRUE if caching is activated */
 {
 return (defaultDir != NULL);
 }
 
+static char *resolveUrl(char *url)
+/* Return the HTTPS URL for a pseudo-URL (e.g. s3://xxxx) by running "resolvCmd url tmpFile" and reading
+ * tmpFile back. errAborts if the command cannot be run, fails or writes nothing. Result must be freed. */
+{
+char filename[1024];
+safef(filename, sizeof filename, "%s/udcTmp-XXXXXX", getTempDir());
+int tmpFd = mkstemp(filename);
+if (tmpFd < 0)
+    errAbort("udc:resolveUrl: cannot create temp file %s", filename);
+close(tmpFd);  // only the unique name is needed, the resolver command writes the file itself
+
+verbose(4, "Resolving url %s using command %s\n", url, resolvCmd);
+char *args[4] = {resolvCmd, url, filename, NULL};
+
+pid_t pid = fork();
+if (pid < 0)
+    {
+    unlink(filename);
+    errAbort("udc:resolveUrl: error in fork");
+    }
+if (pid == 0)
+    {
+    // child process: execv() only returns if the program could not be started
+    execv(resolvCmd, args);
+    _exit(127);  // leave at once, without the parent's exit handlers; the parent reports the failure
+    }
+
+// parent: wait for this particular child (plain wait() could reap an unrelated one) and check how it ended
+int status = 0;
+if (waitpid(pid, &status, 0) != pid || !WIFEXITED(status) || WEXITSTATUS(status) != 0)
+    {
+    unlink(filename);
+    errAbort("udc:resolveUrl: command %s failed for url %s", resolvCmd, url);
+    }
+char *newUrl = NULL;
+size_t len = 0;
+readInGulp(filename, &newUrl, &len);
+unlink(filename);
+if (len == 0)
+    errAbort("Got empty string in output file, from %s, args %s %s", resolvCmd, url, filename);
+stripString(newUrl, "\n");
+verbose(4, "Resolved url: %s -> %s\n", url, newUrl);
+return newUrl;
+}
+
 int udcDataViaHttpOrFtp( char *url, bits64 offset, int size, void *buffer, struct udcFile *file)
 /* Fetch a block of data of given size into buffer using url's protocol,
  * which must be http, https or ftp.  Returns number of bytes actually read.
  * Does an errAbort on error.
  * Typically will be called with size in the 8k-64k range. */
 {
+if (udcIsResolvable(url)) 
+    {
+        if (file->connInfo.resolvedUrl) {
+            verbose(4, "URL %s was already resolved to %s\n", url, file->connInfo.resolvedUrl);
+            url = file->connInfo.resolvedUrl;
+        }
+        else
+            {
+            url = resolveUrl(url);
+            file->connInfo.resolvedUrl = url;
+            }
+    }
+
 if (startsWith("http://",url) || startsWith("https://",url) || startsWith("ftp://",url))
     verbose(4, "reading http/https/ftp data - %d bytes at %lld - on %s\n", size, offset, url);
 else
     errAbort("Invalid protocol in url [%s] in udcDataViaFtp, only http, https, or ftp supported",
 	     url); 
 int sd = connInfoGetSocket(file, url, offset, size);
 if (sd < 0)
     errAbort("Can't get data socket for %s", url);
 int rd = 0, total = 0, remaining = size;
 char *buf = (char *)buffer;
 while ((remaining > 0) && ((rd = ourRead(&file->ios.net, sd, buf, remaining)) > 0))
     {
     total += rd;
     buf += rd;
     remaining -= rd;
@@ -507,73 +600,106 @@
 else
     ci->offset += total;
 return total;
 }
 
 boolean udcInfoViaHttp(char *url, struct udcRemoteFileInfo *retInfo)
 /* Gets size and last modified time of URL
  * and returns status of HEAD or GET byterange 0-0. */
 {
 verbose(4, "checking http remote info on %s\n", url);
 // URLs passed into here should not have byterange clause.
 int redirectCount = 0;
 struct hash *hash;
 int status;
 char *sizeString = NULL;
+char *origUrl = url;
+
+// an unusual case, usually deactivated: URLS of the style s3:// or similar
+bool needsResolving = udcIsResolvable(url);
+if (needsResolving)
+    {
+    if (retInfo->ci.resolvedUrl) 
+        {
+        verbose(4, "udcInfoViaHttp: URL %s was already resolved to %s\n", url, retInfo->ci.resolvedUrl);
+        url = retInfo->ci.resolvedUrl;
+        }
+    else 
+        {
+        url = resolveUrl(url); // url is never freed 
+        retInfo->ci.resolvedUrl = url;
+        }
+    }
+
 /*
  For caching, sites should support byte-range and last-modified.
  However, several groups including ENCODE have made sites that use CGIs to 
  dynamically generate hub text files such as hub.txt, genome.txt, trackDb.txt.
  Byte-range and last-modified are difficult to support for this case,
  so they do without them, effectively defeat caching. Every 5 minutes (udcTimeout),
  they get re-downloaded, even when the data has not changed.  
 */
+
 while (TRUE)
     {
     hash = newHash(0);
+
+    verbose(4, "HTTP HEAD for %s\n", url);
     status = netUrlHead(url, hash);
     sizeString = hashFindValUpperCase(hash, "Content-Length:");
     if (status == 200 && sizeString)
 	break;
     /*
     Using HEAD with HIPPAA-compliant signed AmazonS3 URLs generates 403.
     The signed URL generated for GET cannot be used with HEAD.
     Instead call GET with byterange=0-0 in netUrlFakeHeadByGet().
     This supplies both size via Content-Range response header,
     as well as Last-Modified header which is important for caching.
     There are also sites which support byte-ranges 
     but they do not return Content-Length with HEAD.
     */
     if (status == 403 || (status==200 && !sizeString))
 	{ 
+        verbose(4, "Got 403 or no size from HEAD, trying netUrlFakeHeadByGet = HTTP GET with byterange 0-0 to get size, URL %s\n", url);
 	hashFree(&hash);
 	hash = newHash(0);
 	status = netUrlFakeHeadByGet(url, hash);
 	if (status == 206) 
 	    break;
 	if (status == 200)  // helps get more info to user
 	    break;
+        verbose(4, "netUrlFakeHeadByGet: got status %d for URL %s\n", status, url);
+        // presigned Amazon URLs return 403 after they are expired
+        if (status == 403 && needsResolving)
+            {
+            verbose(4, "403 = expired URL: need to resolve URL %s again\n", origUrl);
+            url = resolveUrl(origUrl); // XX url is never freed
+            retInfo->ci.resolvedUrl = url;
+            continue;
+            }
 	}
     if (status != 301 && status != 302 && status != 307 && status != 308)
 	return FALSE;
     ++redirectCount;
     if (redirectCount > 5)
 	{
 	warn("code %d redirects: exceeded limit of 5 redirects, %s", status, url);
 	return FALSE;
 	}
+
     char *newUrl = hashFindValUpperCase(hash, "Location:");
+
      if (!newUrl)
 	{
 	warn("code %d redirects: redirect location missing, %s", status, url);
 	return FALSE;
 	}
 
     // path may be relative
     if (hasProtocol(newUrl))
 	{
         newUrl = cloneString(newUrl);
 	}
     else
 	{
 	newUrl = expandUrlOnBase(url, newUrl);
 	}
@@ -836,31 +962,31 @@
 {
 struct udcProtocol *prot;
 AllocVar(prot);
 if (sameString(upToColon, "local"))
     {
     prot->fetchData = udcDataViaLocal;
     prot->fetchInfo = udcInfoViaLocal;
     prot->type = "local";
     }
 else if (sameString(upToColon, "slow"))
     {
     prot->fetchData = udcDataViaSlow;
     prot->fetchInfo = udcInfoViaSlow;
     prot->type = "slow";
     }
-else if (sameString(upToColon, "http") || sameString(upToColon, "https"))
+else if (sameString(upToColon, "http") || sameString(upToColon, "https") || (resolvProts && slNameFind(resolvProts, upToColon)))
     {
     prot->fetchData = udcDataViaHttpOrFtp;
     prot->fetchInfo = udcInfoViaHttp;
     prot->type = "http";
     }
 else if (sameString(upToColon, "ftp"))
     {
     prot->fetchData = udcDataViaHttpOrFtp;
     prot->fetchInfo = udcInfoViaFtp;
     prot->type = "ftp";
     }
 else if (sameString(upToColon, "transparent"))
     {
     prot->fetchData = udcDataViaTransparent;
     prot->fetchInfo = udcInfoViaTransparent;
@@ -1078,53 +1204,82 @@
 
 addElementToDy(dy, maxLen, name);
 
 return dyStringCannibalize(&dy);
 }
 
 void udcPathAndFileNames(struct udcFile *file, char *cacheDir, char *protocol, char *afterProtocol)
 /* Initialize udcFile path and names */
 {
 if (cacheDir==NULL)
     return;
 char *hashedAfterProtocol = longDirHash(cacheDir, afterProtocol);
 int len = strlen(cacheDir) + 1 + strlen(protocol) + 1 + strlen(hashedAfterProtocol) + 1;
 file->cacheDir = needMem(len);
 safef(file->cacheDir, len, "%s/%s/%s", cacheDir, protocol, hashedAfterProtocol);
+verbose(4, "UDC dir: %s\n", file->cacheDir);
 
 /* Create file names for bitmap and data portions. */
 file->bitmapFileName = fileNameInCacheDir(file, bitmapName);
 file->sparseFileName = fileNameInCacheDir(file, sparseDataName);
 file->redirFileName = fileNameInCacheDir(file, redirName);
+file->resolvedFileName = fileNameInCacheDir(file, resolvedName);
 }
 
 static long long int udcSizeAndModTimeFromBitmap(char *bitmapFileName, time_t *retTime)
 /* Look up the file size from the local cache bitmap file, or -1 if there
  * is no cache for url. If retTime is non-null, store the remote update time in it. */
 {
 long long int ret = -1;
 struct udcBitmap *bits = udcBitmapOpen(bitmapFileName);
 if (bits != NULL)
     {
     ret = bits->fileSize;
     if (retTime)
 	*retTime = bits->remoteUpdate;
     }
 udcBitmapClose(&bits);
 return ret;
 }
 
+void udcLoadCachedResolvedUrl(struct udcFile *file)
+/* If the cache holds a resolved URL for file, load it into file->connInfo.resolvedUrl.
+ * Otherwise, if file already has a resolved URL, save that one to the cache file. */
+{
+char *cacheFname = file->resolvedFileName;
+if (!cacheFname)
+    return; // no cache file name was set up (udcPathAndFileNames got no cacheDir): nothing to load or save
+
+if (fileExists(cacheFname))
+    {
+    // read URL from cache
+    char *newUrl = NULL;
+    readInGulp(cacheFname, &newUrl, NULL);
+    verbose(4, "Read resolved URL %s from cache\n", newUrl);
+    file->connInfo.resolvedUrl = newUrl;
+    }
+else if (file->connInfo.resolvedUrl)
+    {
+    // write under a temp name, then rename, so a concurrent reader does not pick up a partial file
+    char *temp = catTwoStrings(cacheFname, ".temp");
+    writeGulp(temp, file->connInfo.resolvedUrl, strlen(file->connInfo.resolvedUrl));
+    if (rename(temp, cacheFname) != 0)
+        warn("udc: could not rename %s to %s", temp, cacheFname);
+    freeMem(temp);
+    }
+}
+
 static void udcTestAndSetRedirect(struct udcFile *file, char *protocol, boolean useCacheInfo)
 /* update redirect info */
 {
 if (startsWith("http", protocol))
     {
     char *newUrl = NULL;
     // read redir from cache if it exists
     if (fileExists(file->redirFileName))
 	{
 	readInGulp(file->redirFileName, &newUrl, NULL);
 	}
     if (useCacheInfo)
 	{
 	file->connInfo.redirUrl = cloneString(newUrl);
 	}
@@ -1186,76 +1341,85 @@
     {
     if (udcCacheEnabled())
         useCacheInfo = (udcCacheAge(url, cacheDir) < udcCacheTimeout());
     if (!useCacheInfo)
 	{
 	if (!prot->fetchInfo(url, &info))
 	    {
 	    udcProtocolFree(&prot);
 	    freeMem(protocol);
 	    freeMem(afterProtocol);
 	    return NULL;
 	    }
 	}
     }
 
+if (useCacheInfo)
+    verbose(4, "Cache is used for %s", url);
+else
+    verbose(4, "Cache is not used for %s", url);
+
 /* Allocate file object and start filling it in. */
 struct udcFile *file;
 AllocVar(file);
 file->url = cloneString(url);
 file->protocol = protocol;
 file->prot = prot;
 if (isTransparent)
     {
     /* If transparent dummy up things so that the "sparse" file pointer is actually
      * the file itself, which appears to be completely loaded in cache. */
     if (!fileExists(url))
 	return NULL;
     int fd = file->fdSparse = mustOpenFd(url, O_RDONLY);
     struct stat status;
     fstat(fd, &status);
     file->startData = 0;
     file->endData = file->size = status.st_size;
     }
 else 
     {
     udcPathAndFileNames(file, cacheDir, protocol, afterProtocol);
+
+    file->connInfo.resolvedUrl = info.ci.resolvedUrl; // no need to resolve again if udcInfoViaHttp already did that
+    if (udcIsResolvable(file->url) && !file->connInfo.resolvedUrl)
+        udcLoadCachedResolvedUrl(file);
+
     if (!useCacheInfo)
 	{
 	file->updateTime = info.updateTime;
 	file->size = info.size;
 	memcpy(&(file->connInfo), &(info.ci), sizeof(struct connInfo));
 	// update cache file mod times, so if we're caching we won't do this again
 	// until the timeout has expired again:
     	if (udcCacheTimeout() > 0 && udcCacheEnabled() && fileExists(file->bitmapFileName))
 	    (void)maybeTouchFile(file->bitmapFileName);
 
 	}
 
     if (udcCacheEnabled())
         {
         /* Make directory. */
         makeDirsOnPath(file->cacheDir);
 
         /* Figure out a little bit about the extent of the good cached data if any. Open bits bitmap. */
         setInitialCachedDataBounds(file, useCacheInfo);
 
         file->fdSparse = mustOpenFd(file->sparseFileName, O_RDWR);
 
 	// update redir with latest redirect status	
 	udcTestAndSetRedirect(file, protocol, useCacheInfo);
-	
         }
 
     }
 freeMem(afterProtocol);
 return file;
 }
 
 struct udcFile *udcFileOpen(char *url, char *cacheDir)
 /* Open up a cached file.  cacheDir may be null in which case udcDefaultDir() will be
  * used.  Abort if file doesn't exist. */
 {
 struct udcFile *udcFile = udcFileMayOpen(url, cacheDir);
 if (udcFile == NULL)
     errAbort("Couldn't open %s", url);
 return udcFile;
@@ -2049,31 +2213,31 @@
 off_t udcFileSize(char *url)
 /* fetch file size from given URL or local path 
  * returns -1 if not found. */
 {
 if (udcIsLocal(url))
     return fileSize(url);
 
 // don't go to the network if we can avoid it
 off_t cacheSize = udcSizeFromCache(url, NULL);
 if (cacheSize!=-1)
     return cacheSize;
 
 off_t ret = -1;
 struct udcRemoteFileInfo info;
 
-if (startsWith("http://",url) || startsWith("https://",url))
+if (startsWith("http://",url) || startsWith("https://",url) || udcIsResolvable(url) )
     {
     if (udcInfoViaHttp(url, &info))
 	ret = info.size;
     }
 else if (startsWith("ftp://",url))
     {
     if (udcInfoViaFtp(url, &info))
 	ret = info.size;
     }
 else
     errAbort("udc/udcFileSize: invalid protocol for url %s, can only do http/https/ftp", url);
 
 return ret;
 }