bebcb6453c54164564b24899c6f407310b551a93 max Thu Jan 5 06:37:34 2023 -0800 first go at s3:// support, refs #30445 adding cache file to UDC protocol support, refs #30445 handling expired presigned URLs in udc protocol, refs #30445 diff --git src/lib/udc.c src/lib/udc.c index b44ade3..3a34d36 100644 --- src/lib/udc.c +++ src/lib/udc.c @@ -27,30 +27,31 @@ #include #include "common.h" #include "hash.h" #include "obscure.h" #include "bits.h" #include "linefile.h" #include "portable.h" #include "sig.h" #include "net.h" #include "cheapcgi.h" #include "htmlPage.h" #include "udc.h" #include "hex.h" #include #include +#include /* The stdio stream we'll use to output statistics on file i/o. Off by default. */ FILE *udcLogStream = NULL; void udcSetLog(FILE *fp) /* Turn on logging of file i/o. * For each UDC file two lines are written. One line for the open, and one line for the close. * The Open line just has the URL being opened. * The Close line has the the URL plus a bunch of counts of the number of seeks, reads, and writes * for the following four files: the udc bitmap, the udc sparse data, the incoming calls * to the UDC layer, and the network connection to the (possibly) remote file. * There are two additional counts: the number of socket connects, and the * number of times a socket is reused instead of closed and reopened. */ { @@ -80,30 +81,31 @@ }; #define udcBlockSize (8*1024) /* All fetch requests are rounded up to block size. */ #define udcMaxBytesPerRemoteFetch (udcBlockSize * 32) /* Very large remote reads are broken down into chunks this size. */ struct connInfo /* Socket descriptor and associated info, for keeping net connections open. */ { int socket; /* Socket descriptor for data connection (or 0). */ bits64 offset; /* Current file offset of socket. */ int ctrlSocket; /* (FTP only) Control socket descriptor or 0. 
*/ char *redirUrl; /* (HTTP(S) only) use redirected url */ + char *resolvedUrl; /* resolved HTTPS URL, if url is a pseudo-URL (s3://) */ }; typedef int (*UdcDataCallback)(char *url, bits64 offset, int size, void *buffer, struct udcFile *file); /* Type for callback function that fetches file data. */ struct udcRemoteFileInfo /* Information about a remote file. */ { bits64 updateTime; /* Last update in seconds since 1970 */ bits64 size; /* Remote file size */ struct connInfo ci; /* Connection info for open net connection */ }; typedef boolean (*UdcInfoCallback)(char *url, struct udcRemoteFileInfo *retInfo); @@ -120,64 +122,90 @@ struct udcFile /* A file handle for our caching system. */ { struct udcFile *next; /* Next in list. */ char *url; /* Name of file - includes protocol */ char *protocol; /* The URL up to the first colon. http: etc. */ struct udcProtocol *prot; /* Protocol specific data and methods. */ time_t updateTime; /* Last modified timestamp. */ bits64 size; /* Size of file. */ bits64 offset; /* Current offset in file. */ char *cacheDir; /* Directory for cached file parts. */ char *bitmapFileName; /* Name of bitmap file. */ char *sparseFileName; /* Name of sparse data file. */ char *redirFileName; /* Name of redir file. */ + char *resolvedFileName; /* Name of file that stores final, resolved URL */ int fdSparse; /* File descriptor for sparse data file. */ boolean sparseReadAhead; /* Read-ahead has something in the buffer */ char *sparseReadAheadBuf; /* Read-ahead buffer, if any */ bits64 sparseRAOffset; /* Read-ahead buffer offset */ struct udcBitmap *bits; /* udcBitMap */ bits64 startData; /* Start of area in file we know to have data. */ bits64 endData; /* End of area in file we know to have data. */ bits32 bitmapVersion; /* Version of associated bitmap we were opened with. */ struct connInfo connInfo; /* Connection info for open net connection. 
*/ void *mmapBase; /* pointer to memory address if file has been mmapped, or NULL */ struct ios ios; /* Statistics on file access. */ }; struct udcBitmap /* The control structure including the bitmap of blocks that are cached. */ { struct udcBitmap *next; /* Next in list. */ bits32 blockSize; /* Number of bytes per block of file. */ bits64 remoteUpdate; /* Remote last update time. */ bits64 fileSize; /* File size */ bits32 version; /* Version - increments each time cache is stale. */ bits64 localUpdate; /* Time we last fetched new data into cache. */ bits64 localAccess; /* Time we last accessed data. */ boolean isSwapped; /* If true need to swap all bytes on read. */ int fd; /* File descriptor for file with current block. */ }; static char *bitmapName = "bitmap"; static char *sparseDataName = "sparseData"; static char *redirName = "redir"; +static char *resolvedName = "resolv"; + #define udcBitmapHeaderSize (64) static int cacheTimeout = 0; #define MAX_SKIP_TO_SAVE_RECONNECT (udcMaxBytesPerRemoteFetch / 2) +/* pseudo-URLs with one of these protocol prefixes (e.g. "s3://") get run through a command that resolves them to real HTTPS URLs */ +struct slName *resolvProts = NULL; +static char *resolvCmd = NULL; + +bool udcIsResolvable(char *url) +/* check if third-party protocol resolving (e.g. 
for "s3://") is enabled and if the url starts with a protocol handled by the resolver */ +{ +if (!resolvProts || !resolvCmd) + return FALSE; + +char *colon = strchr(url, ':'); +if (!colon) + return FALSE; + +int colonPos = colon - url; +char *protocol = cloneStringZ(url, colonPos); +bool isFound = (slNameFind(resolvProts, protocol) != NULL); +if (isFound) + verbose(4, "Check: URL %s has special protocol://, will need resolving\n", url); +freez(&protocol); +return isFound; +} + static off_t ourMustLseek(struct ioStats *ioStats, int fd, off_t offset, int whence) { ioStats->numSeeks++; return mustLseek(fd, offset, whence); } static void ourMustWrite(struct ioStats *ioStats, int fd, void *buf, size_t size) { ioStats->numWrites++; ioStats->bytesWritten += size; mustWriteFd(fd, buf, size); } static size_t ourRead(struct ioStats *ioStats, int fd, void *buf, size_t size) @@ -422,30 +450,37 @@ sleep1000(500); struct stat status; int ret = stat(fileName, &status); if (ret < 0) return FALSE; retInfo->updateTime = status.st_mtime; retInfo->size = status.st_size; return TRUE; } /********* Section for http protocol **********/ static char *defaultDir = "/tmp/udcCache"; static bool udcInitialized = FALSE; +void udcSetResolver(char *prots, char *cmd) +/* Set protocols and local wrapper program to resolve s3:// and similar URLs to HTTPS */ +{ + resolvProts = slNameListFromString(cloneString(prots), ','); + resolvCmd = cmd; +} + static void initializeUdc() /* Use the $TMPDIR environment variable, if set, to amend the default location * of the cache */ { if (udcInitialized) return; char *tmpDir = getenv("TMPDIR"); if (isNotEmpty(tmpDir)) { char buffer[2048]; safef(buffer, sizeof(buffer), "%s/udcCache", tmpDir); udcSetDefaultDir(buffer); } } @@ -465,36 +500,94 @@ } void udcDisableCache() /* Switch off caching. 
Re-enable with udcSetDefaultDir */ { defaultDir = NULL; udcInitialized = TRUE; } static bool udcCacheEnabled() /* TRUE if caching is activated */ { return (defaultDir != NULL); } +static char* resolveUrl(char *url) +/* return the HTTPS URL given a pseudo-URL e.g. s3://xxxx. Result must be freed. */ +{ +char filename[1024]; +safef(filename, sizeof filename, "%s/udcTmp-XXXXXX", getTempDir()); +mkstemp(filename); + +verbose(4, "Resolving url %s using command %s\n", url, resolvCmd); + +char* program = resolvCmd; +char* args[4]; +args[0] = program; +args[1] = url; +args[2] = filename; +args[3] = NULL; + +pid_t pid = 0; +int status; +pid = fork(); + +if (pid < 0) + errAbort("udc:resolveUrl: error in fork"); +if (pid == 0) + { + // child process + int err = execv(program, args); + if (err!=0) + errAbort("Cannot run %s", program); + exit(0); + } + +// pid > 0 = main process +pid = wait(&status); +char* newUrl = NULL; +size_t len = 0; +readInGulp(filename, &newUrl, &len); +unlink(filename); +if (len <= 0) + errAbort("Got empty string in output file, from %s, args %s %s", program, url, filename); + +stripString(newUrl, "\n"); +verbose(4, "Resolved url: %s -> %s\n", url, newUrl); +return newUrl; +} + int udcDataViaHttpOrFtp( char *url, bits64 offset, int size, void *buffer, struct udcFile *file) /* Fetch a block of data of given size into buffer using url's protocol, * which must be http, https or ftp. Returns number of bytes actually read. * Does an errAbort on error. * Typically will be called with size in the 8k-64k range. 
*/ { +if (udcIsResolvable(url)) + { + if (file->connInfo.resolvedUrl) { + verbose(4, "URL %s was already resolved to %s\n", url, file->connInfo.resolvedUrl); + url = file->connInfo.resolvedUrl; + } + else + { + url = resolveUrl(url); + file->connInfo.resolvedUrl = url; + } + } + if (startsWith("http://",url) || startsWith("https://",url) || startsWith("ftp://",url)) verbose(4, "reading http/https/ftp data - %d bytes at %lld - on %s\n", size, offset, url); else errAbort("Invalid protocol in url [%s] in udcDataViaFtp, only http, https, or ftp supported", url); int sd = connInfoGetSocket(file, url, offset, size); if (sd < 0) errAbort("Can't get data socket for %s", url); int rd = 0, total = 0, remaining = size; char *buf = (char *)buffer; while ((remaining > 0) && ((rd = ourRead(&file->ios.net, sd, buf, remaining)) > 0)) { total += rd; buf += rd; remaining -= rd; @@ -507,73 +600,106 @@ else ci->offset += total; return total; } boolean udcInfoViaHttp(char *url, struct udcRemoteFileInfo *retInfo) /* Gets size and last modified time of URL * and returns status of HEAD or GET byterange 0-0. */ { verbose(4, "checking http remote info on %s\n", url); // URLs passed into here should not have byterange clause. int redirectCount = 0; struct hash *hash; int status; char *sizeString = NULL; +char *origUrl = url; + +// an unusual case, usually deactivated: URLS of the style s3:// or similar +bool needsResolving = udcIsResolvable(url); +if (needsResolving) + { + if (retInfo->ci.resolvedUrl) + { + verbose(4, "udcInfoViaHttp: URL %s was already resolved to %s\n", url, retInfo->ci.resolvedUrl); + url = retInfo->ci.resolvedUrl; + } + else + { + url = resolveUrl(url); // url is never freed + retInfo->ci.resolvedUrl = url; + } + } + /* For caching, sites should support byte-range and last-modified. However, several groups including ENCODE have made sites that use CGIs to dynamically generate hub text files such as hub.txt, genome.txt, trackDb.txt. 
Byte-range and last-modified are difficult to support for this case, so they do without them, effectively defeat caching. Every 5 minutes (udcTimeout), they get re-downloaded, even when the data has not changed. */ + while (TRUE) { hash = newHash(0); + + verbose(4, "HTTP HEAD for %s\n", url); status = netUrlHead(url, hash); sizeString = hashFindValUpperCase(hash, "Content-Length:"); if (status == 200 && sizeString) break; /* Using HEAD with HIPPAA-compliant signed AmazonS3 URLs generates 403. The signed URL generated for GET cannot be used with HEAD. Instead call GET with byterange=0-0 in netUrlFakeHeadByGet(). This supplies both size via Content-Range response header, as well as Last-Modified header which is important for caching. There are also sites which support byte-ranges but they do not return Content-Length with HEAD. */ if (status == 403 || (status==200 && !sizeString)) { + verbose(4, "Got 403 or no size from HEAD, trying netUrlFakeHeadByGet = HTTP GET with byterange 0-0 to get size, URL %s\n", url); hashFree(&hash); hash = newHash(0); status = netUrlFakeHeadByGet(url, hash); if (status == 206) break; if (status == 200) // helps get more info to user break; + verbose(4, "netUrlFakeHeadByGet: got status %d for URL %s\n", status, url); + // presigned Amazon URLs return 403 after they are expired + if (status == 403 && needsResolving) + { + verbose(4, "403 = expired URL: need to resolve URL %s again\n", origUrl); + url = resolveUrl(origUrl); // XX url is never freed + retInfo->ci.resolvedUrl = url; + continue; + } } if (status != 301 && status != 302 && status != 307 && status != 308) return FALSE; ++redirectCount; if (redirectCount > 5) { warn("code %d redirects: exceeded limit of 5 redirects, %s", status, url); return FALSE; } + char *newUrl = hashFindValUpperCase(hash, "Location:"); + if (!newUrl) { warn("code %d redirects: redirect location missing, %s", status, url); return FALSE; } // path may be relative if (hasProtocol(newUrl)) { newUrl = 
cloneString(newUrl); } else { newUrl = expandUrlOnBase(url, newUrl); } @@ -836,31 +962,31 @@ { struct udcProtocol *prot; AllocVar(prot); if (sameString(upToColon, "local")) { prot->fetchData = udcDataViaLocal; prot->fetchInfo = udcInfoViaLocal; prot->type = "local"; } else if (sameString(upToColon, "slow")) { prot->fetchData = udcDataViaSlow; prot->fetchInfo = udcInfoViaSlow; prot->type = "slow"; } -else if (sameString(upToColon, "http") || sameString(upToColon, "https")) +else if (sameString(upToColon, "http") || sameString(upToColon, "https") || (resolvProts && slNameFind(resolvProts, upToColon))) { prot->fetchData = udcDataViaHttpOrFtp; prot->fetchInfo = udcInfoViaHttp; prot->type = "http"; } else if (sameString(upToColon, "ftp")) { prot->fetchData = udcDataViaHttpOrFtp; prot->fetchInfo = udcInfoViaFtp; prot->type = "ftp"; } else if (sameString(upToColon, "transparent")) { prot->fetchData = udcDataViaTransparent; prot->fetchInfo = udcInfoViaTransparent; @@ -1078,53 +1204,82 @@ addElementToDy(dy, maxLen, name); return dyStringCannibalize(&dy); } void udcPathAndFileNames(struct udcFile *file, char *cacheDir, char *protocol, char *afterProtocol) /* Initialize udcFile path and names */ { if (cacheDir==NULL) return; char *hashedAfterProtocol = longDirHash(cacheDir, afterProtocol); int len = strlen(cacheDir) + 1 + strlen(protocol) + 1 + strlen(hashedAfterProtocol) + 1; file->cacheDir = needMem(len); safef(file->cacheDir, len, "%s/%s/%s", cacheDir, protocol, hashedAfterProtocol); +verbose(4, "UDC dir: %s\n", file->cacheDir); /* Create file names for bitmap and data portions. 
*/ file->bitmapFileName = fileNameInCacheDir(file, bitmapName); file->sparseFileName = fileNameInCacheDir(file, sparseDataName); file->redirFileName = fileNameInCacheDir(file, redirName); +file->resolvedFileName = fileNameInCacheDir(file, resolvedName); } static long long int udcSizeAndModTimeFromBitmap(char *bitmapFileName, time_t *retTime) /* Look up the file size from the local cache bitmap file, or -1 if there * is no cache for url. If retTime is non-null, store the remote update time in it. */ { long long int ret = -1; struct udcBitmap *bits = udcBitmapOpen(bitmapFileName); if (bits != NULL) { ret = bits->fileSize; if (retTime) *retTime = bits->remoteUpdate; } udcBitmapClose(&bits); return ret; } +void udcLoadCachedResolvedUrl(struct udcFile *file) +/* load the resolved URL from its cache file if that file exists; otherwise, if the URL has already been resolved, write it to a new cache file */ +{ +char *cacheFname = file->resolvedFileName; + +if (!cacheFname) + return; // URL does not need resolving + +if (fileExists(cacheFname)) + { + // read URL from cache + char *newUrl = NULL; + readInGulp(cacheFname, &newUrl, NULL); + verbose(4, "Read resolved URL %s from cache", newUrl); + file->connInfo.resolvedUrl = newUrl; + } +else if (file->connInfo.resolvedUrl) + { + // write URL to cache + char *newUrl = file->connInfo.resolvedUrl; + char *temp = catTwoStrings(cacheFname, ".temp"); + writeGulp(temp, newUrl, strlen(newUrl)); + rename(temp, cacheFname); + freeMem(temp); + } +} + static void udcTestAndSetRedirect(struct udcFile *file, char *protocol, boolean useCacheInfo) /* update redirect info */ { if (startsWith("http", protocol)) { char *newUrl = NULL; // read redir from cache if it exists if (fileExists(file->redirFileName)) { readInGulp(file->redirFileName, &newUrl, NULL); } if (useCacheInfo) { file->connInfo.redirUrl = cloneString(newUrl); } @@ -1186,76 +1341,85 @@ { if (udcCacheEnabled()) useCacheInfo = (udcCacheAge(url, cacheDir) < udcCacheTimeout()); if (!useCacheInfo) { if (!prot->fetchInfo(url, &info)) { udcProtocolFree(&prot); 
freeMem(protocol); freeMem(afterProtocol); return NULL; } } } +if (useCacheInfo) + verbose(4, "Cache is used for %s", url); +else + verbose(4, "Cache is not used for %s", url); + /* Allocate file object and start filling it in. */ struct udcFile *file; AllocVar(file); file->url = cloneString(url); file->protocol = protocol; file->prot = prot; if (isTransparent) { /* If transparent dummy up things so that the "sparse" file pointer is actually * the file itself, which appears to be completely loaded in cache. */ if (!fileExists(url)) return NULL; int fd = file->fdSparse = mustOpenFd(url, O_RDONLY); struct stat status; fstat(fd, &status); file->startData = 0; file->endData = file->size = status.st_size; } else { udcPathAndFileNames(file, cacheDir, protocol, afterProtocol); + + file->connInfo.resolvedUrl = info.ci.resolvedUrl; // no need to resolve again if udcInfoViaHttp already did that + if (udcIsResolvable(file->url) && !file->connInfo.resolvedUrl) + udcLoadCachedResolvedUrl(file); + if (!useCacheInfo) { file->updateTime = info.updateTime; file->size = info.size; memcpy(&(file->connInfo), &(info.ci), sizeof(struct connInfo)); // update cache file mod times, so if we're caching we won't do this again // until the timeout has expired again: if (udcCacheTimeout() > 0 && udcCacheEnabled() && fileExists(file->bitmapFileName)) (void)maybeTouchFile(file->bitmapFileName); } if (udcCacheEnabled()) { /* Make directory. */ makeDirsOnPath(file->cacheDir); /* Figure out a little bit about the extent of the good cached data if any. Open bits bitmap. */ setInitialCachedDataBounds(file, useCacheInfo); file->fdSparse = mustOpenFd(file->sparseFileName, O_RDWR); // update redir with latest redirect status udcTestAndSetRedirect(file, protocol, useCacheInfo); - } } freeMem(afterProtocol); return file; } struct udcFile *udcFileOpen(char *url, char *cacheDir) /* Open up a cached file. cacheDir may be null in which case udcDefaultDir() will be * used. Abort if file doesn't exist. 
*/ { struct udcFile *udcFile = udcFileMayOpen(url, cacheDir); if (udcFile == NULL) errAbort("Couldn't open %s", url); return udcFile; @@ -2049,31 +2213,31 @@ off_t udcFileSize(char *url) /* fetch file size from given URL or local path * returns -1 if not found. */ { if (udcIsLocal(url)) return fileSize(url); // don't go to the network if we can avoid it off_t cacheSize = udcSizeFromCache(url, NULL); if (cacheSize!=-1) return cacheSize; off_t ret = -1; struct udcRemoteFileInfo info; -if (startsWith("http://",url) || startsWith("https://",url)) +if (startsWith("http://",url) || startsWith("https://",url) || udcIsResolvable(url) ) { if (udcInfoViaHttp(url, &info)) ret = info.size; } else if (startsWith("ftp://",url)) { if (udcInfoViaFtp(url, &info)) ret = info.size; } else errAbort("udc/udcFileSize: invalid protocol for url %s, can only do http/https/ftp", url); return ret; }