be4311c07e14feb728abc6425ee606ffaa611a58 markd Fri Jan 22 06:46:58 2021 -0800 merge with master diff --git src/lib/udc.c src/lib/udc.c index 6322bbd..c1071f9 100644 --- src/lib/udc.c +++ src/lib/udc.c @@ -1,2099 +1,2104 @@ /* Copyright (C) 2014 The Regents of the University of California * See README in this or parent directory for licensing information. */ /* udc - url data cache - a caching system that keeps blocks of data fetched from URLs in * sparse local files for quick use the next time the data is needed. * * This cache is enormously simplified by there being no local _write_ to the cache, * just reads. * * The overall strategy of the implementation is to have a root cache directory * with a subdir for each file being cached. The directory for a single cached file * contains two files - "bitmap" and "sparseData" that contains information on which * parts of the URL are cached and the actual cached data respectively. The subdirectory name * associated with the file is constructed from the URL in a straightforward manner. * http://genome.ucsc.edu/cgi-bin/hgGateway * gets mapped to: * rootCacheDir/http/genome.ucsc.edu/cgi-bin/hgGateway/ * The URL protocol is the first directory under the root, and the remainder of the * URL, with some necessary escaping, is used to define the rest of the cache directory * structure, with each '/' after the protocol line translating into another directory * level. * * The bitmap file contains time stamp and size data as well as an array with one bit * for each block of the file that has been fetched. Currently the block size is 8K. */ #include #include #include "common.h" #include "hash.h" #include "obscure.h" #include "bits.h" #include "linefile.h" #include "portable.h" #include "sig.h" #include "net.h" #include "cheapcgi.h" #include "htmlPage.h" #include "udc.h" #include "hex.h" #include #include /* The stdio stream we'll use to output statistics on file i/o. Off by default. */ FILE *udcLogStream = NULL; void udcSetLog(FILE *fp) /* Turn on logging of file i/o. * For each UDC file two lines are written. One line for the open, and one line for the close. * The Open line just has the URL being opened. * The Close line has the the URL plus a bunch of counts of the number of seeks, reads, and writes * for the following four files: the udc bitmap, the udc sparse data, the incoming calls * to the UDC layer, and the network connection to the (possibly) remote file. * There are two additional counts: the number of socket connects, and the * number of times a socket is reused instead of closed and reopened. */ { udcLogStream = fp; fprintf(fp, "Begin\n"); } struct ioStats /* Statistics concerning reads and seeks. */ { bits64 numSeeks; /* The number of seeks on this file */ bits64 numReads; /* The number of reads from this file */ bits64 bytesRead; /* The number of bytes read from this file */ bits64 numWrites; /* The number of writes to this file */ bits64 bytesWritten; /* The number of bytes written to this file */ }; struct ios /* Statistics concerning reads and seeks for sparse, bitmap, url, and to us. */ { struct ioStats bit; /* Statistics on file i/o to the bitmap file. */ struct ioStats sparse; /* Statistics on file i/o to the sparse data file. */ struct ioStats udc; /* Statistics on file i/o from the application to us. */ struct ioStats net; /* Statistics on file i/o over the network. */ bits64 numConnects; /* The number of socket connections made. */ bits64 numReuse; /* The number of socket reuses. */ }; #define udcBlockSize (8*1024) /* All fetch requests are rounded up to block size. */ #define udcMaxBytesPerRemoteFetch (udcBlockSize * 32) /* Very large remote reads are broken down into chunks this size. */ struct connInfo /* Socket descriptor and associated info, for keeping net connections open. */ { int socket; /* Socket descriptor for data connection (or 0). */ bits64 offset; /* Current file offset of socket. */ int ctrlSocket; /* (FTP only) Control socket descriptor or 0. */ char *redirUrl; /* (HTTP(S) only) use redirected url */ }; typedef int (*UdcDataCallback)(char *url, bits64 offset, int size, void *buffer, struct udcFile *file); /* Type for callback function that fetches file data. */ struct udcRemoteFileInfo /* Information about a remote file. */ { bits64 updateTime; /* Last update in seconds since 1970 */ bits64 size; /* Remote file size */ struct connInfo ci; /* Connection info for open net connection */ }; typedef boolean (*UdcInfoCallback)(char *url, struct udcRemoteFileInfo *retInfo); /* Type for callback function that fetches file timestamp and size. */ struct udcProtocol /* Something to handle a communications protocol like http, https, ftp, local file i/o, etc. */ { struct udcProtocol *next; /* Next in list */ UdcDataCallback fetchData; /* Data fetcher */ UdcInfoCallback fetchInfo; /* Timestamp & size fetcher */ char *type; }; struct udcFile /* A file handle for our caching system. */ { struct udcFile *next; /* Next in list. */ char *url; /* Name of file - includes protocol */ char *protocol; /* The URL up to the first colon. http: etc. */ struct udcProtocol *prot; /* Protocol specific data and methods. */ time_t updateTime; /* Last modified timestamp. */ bits64 size; /* Size of file. */ bits64 offset; /* Current offset in file. */ char *cacheDir; /* Directory for cached file parts. */ char *bitmapFileName; /* Name of bitmap file. */ char *sparseFileName; /* Name of sparse data file. */ char *redirFileName; /* Name of redir file. */ int fdSparse; /* File descriptor for sparse data file. */ boolean sparseReadAhead; /* Read-ahead has something in the buffer */ char *sparseReadAheadBuf; /* Read-ahead buffer, if any */ bits64 sparseRAOffset; /* Read-ahead buffer offset */ struct udcBitmap *bits; /* udcBitMap */ bits64 startData; /* Start of area in file we know to have data. */ bits64 endData; /* End of area in file we know to have data. */ bits32 bitmapVersion; /* Version of associated bitmap we were opened with. */ struct connInfo connInfo; /* Connection info for open net connection. */ void *mmapBase; /* pointer to memory address if file has been mmapped, or NULL */ struct ios ios; /* Statistics on file access. */ }; struct udcBitmap /* The control structure including the bitmap of blocks that are cached. */ { struct udcBitmap *next; /* Next in list. */ bits32 blockSize; /* Number of bytes per block of file. */ bits64 remoteUpdate; /* Remote last update time. */ bits64 fileSize; /* File size */ bits32 version; /* Version - increments each time cache is stale. */ bits64 localUpdate; /* Time we last fetched new data into cache. */ bits64 localAccess; /* Time we last accessed data. */ boolean isSwapped; /* If true need to swap all bytes on read. */ int fd; /* File descriptor for file with current block. */ }; static char *bitmapName = "bitmap"; static char *sparseDataName = "sparseData"; static char *redirName = "redir"; #define udcBitmapHeaderSize (64) static int cacheTimeout = 0; #define MAX_SKIP_TO_SAVE_RECONNECT (udcMaxBytesPerRemoteFetch / 2) static off_t ourMustLseek(struct ioStats *ioStats, int fd, off_t offset, int whence) { ioStats->numSeeks++; return mustLseek(fd, offset, whence); } static void ourMustWrite(struct ioStats *ioStats, int fd, void *buf, size_t size) { ioStats->numWrites++; ioStats->bytesWritten += size; mustWriteFd(fd, buf, size); } static size_t ourRead(struct ioStats *ioStats, int fd, void *buf, size_t size) { ioStats->numReads++; size_t bytesRead = read(fd, buf, size); ioStats->bytesRead += bytesRead; return bytesRead; } static void ourMustRead(struct ioStats *ioStats, int fd, void *buf, size_t size) { ioStats->numReads++; ioStats->bytesRead += size; mustReadFd(fd, buf, size); } static size_t ourFread(struct ioStats *ioStats, void *buf, size_t size, size_t nmemb, FILE *stream) { ioStats->numReads++; ioStats->bytesRead += size * nmemb; return fread(buf, size, nmemb, stream); } static void udcReadAndIgnore(struct ioStats *ioStats, int sd, bits64 size) /* Read size bytes from sd and return. */ { static char *buf = NULL; if (buf == NULL) buf = needMem(udcBlockSize); bits64 remaining = size, total = 0; while (remaining > 0) { bits64 chunkSize = min(remaining, udcBlockSize); ssize_t rd = ourRead(ioStats, sd, buf, chunkSize); if (rd < 0) errnoAbort("udcReadAndIgnore: error reading socket after %lld bytes", total); remaining -= rd; total += rd; } if (total < size) errAbort("udcReadAndIgnore: got EOF at %lld bytes (wanted %lld)", total, size); } static int connInfoGetSocket(struct udcFile *file, char *url, bits64 offset, int size) /* If ci has an open socket and the given offset matches ci's current offset, * reuse ci->socket. Otherwise close the socket, open a new one, and update ci, * or return -1 if there is an error opening a new one. */ { /* NOTE: This doesn't use HTTP 1.1 keep alive to do multiple request on the * same socket. The only way subsequent random requests on the same socket * work is because previous request are open-ended and this can continue * reading where it left off. The HTTP requests are issued as 1.0, even * through range requests are a 1.1 feature. * * For FTP, the serial read approach is essential. FTP only supports resuming * from an offset, but doesn't not support limiting the number of bytes * transferred. All that can be done to stop the transfer is to abort the * operation, when then requires reconnecting. */ struct connInfo *ci = &file->connInfo; if (ci != NULL && ci->socket > 0 && ci->offset != offset) { bits64 skipSize = (offset - ci->offset); if (skipSize > 0 && skipSize <= MAX_SKIP_TO_SAVE_RECONNECT) { verbose(4, "skipping %lld bytes @%lld to avoid reconnect\n", skipSize, ci->offset); udcReadAndIgnore(&file->ios.net, ci->socket, skipSize); ci->offset = offset; file->ios.numReuse++; } else { verbose(4, "Offset mismatch (ci %lld != new %lld), reopening.\n", ci->offset, offset); mustCloseFd(&(ci->socket)); if (ci->ctrlSocket > 0) mustCloseFd(&(ci->ctrlSocket)); ZeroVar(ci); } } int sd; if (ci == NULL || ci->socket <= 0) { file->ios.numConnects++; if (ci->redirUrl) { url = transferParamsToRedirectedUrl(url, ci->redirUrl); } // IMPORTANT NOTE: byterange is not a real URL parameter, this is a hack to pass // the range to the net.c functions, which then parse it. char rangeUrl[2048]; if (ci == NULL) { safef(rangeUrl, sizeof(rangeUrl), "%s;byterange=%lld-%lld", url, offset, (offset + size - 1)); sd = netUrlOpen(rangeUrl); } else { safef(rangeUrl, sizeof(rangeUrl), "%s;byterange=%lld-", url, offset); sd = ci->socket = netUrlOpenSockets(rangeUrl, &(ci->ctrlSocket)); ci->offset = offset; } if (sd < 0) return -1; if (startsWith("http", url)) { char *newUrl = NULL; int newSd = 0; if (!netSkipHttpHeaderLinesHandlingRedirect(sd, rangeUrl, &newSd, &newUrl)) return -1; if (newUrl) { freeMem(newUrl); sd = newSd; if (ci != NULL) ci->socket = newSd; } } } else sd = ci->socket; return sd; } /********* Section for local file protocol **********/ static char *assertLocalUrl(char *url) /* Make sure that url is local and return bits past the protocol. */ { if (startsWith("local:", url)) url += 6; if (url[0] != '/') errAbort("Local urls must start at /"); if (stringIn("..", url) || stringIn("~", url) || stringIn("//", url) || stringIn("/./", url) || endsWith("/.", url)) { errAbort("relative paths not allowed in local urls (%s)", url); } return url; } static int udcDataViaLocal(char *url, bits64 offset, int size, void *buffer, struct udcFile *file) /* Fetch a block of data of given size into buffer using the http: protocol. * Returns number of bytes actually read. Does an errAbort on * error. Typically will be called with size in the 8k - 64k range. */ { /* Need to check time stamp here. */ verbose(4, "reading remote data - %d bytes at %lld - on %s\n", size, offset, url); url = assertLocalUrl(url); FILE *f = mustOpen(url, "rb"); fseek(f, offset, SEEK_SET); int sizeRead = ourFread(&file->ios.net, buffer, 1, size, f); if (ferror(f)) { warn("udcDataViaLocal failed to fetch %d bytes at %lld", size, offset); errnoAbort("file %s", url); } carefulClose(&f); return sizeRead; } static boolean udcInfoViaLocal(char *url, struct udcRemoteFileInfo *retInfo) /* Fill in *retTime with last modified time for file specified in url. * Return FALSE if file does not even exist. */ { verbose(4, "checking remote info on %s\n", url); url = assertLocalUrl(url); struct stat status; int ret = stat(url, &status); if (ret < 0) return FALSE; retInfo->updateTime = status.st_mtime; retInfo->size = status.st_size; return TRUE; } /********* Section for transparent file protocol **********/ static int udcDataViaTransparent(char *url, bits64 offset, int size, void *buffer, struct udcFile *file) /* Fetch a block of data of given size into buffer using the http: protocol. * Returns number of bytes actually read. Does an errAbort on * error. Typically will be called with size in the 8k - 64k range. */ { internalErr(); /* Should not get here. */ return size; } static boolean udcInfoViaTransparent(char *url, struct udcRemoteFileInfo *retInfo) /* Fill in *retInfo with last modified time for file specified in url. * Return FALSE if file does not even exist. */ { internalErr(); /* Should not get here. */ return FALSE; } /********* Section for slow local file protocol - simulates network... **********/ static int udcDataViaSlow(char *url, bits64 offset, int size, void *buffer, struct udcFile *file) /* Fetch a block of data of given size into buffer using the http: protocol. * Returns number of bytes actually read. Does an errAbort on * error. Typically will be called with size in the 8k - 64k range. */ { verbose(4, "slow reading remote data - %d bytes at %lld - on %s\n", size, offset, url); sleep1000(500); char *fileName = url + 5; /* skip over 'slow:' */ FILE *f = mustOpen(fileName, "rb"); fseek(f, offset, SEEK_SET); char *pt = buffer; int i, step=1024; int sizeRead = 0; for (i=0; i step) readChunk = step; int oneReadSize = ourFread(&file->ios.net, pt, 1, readChunk, f); verbose(4, "slowly read %d bytes\n", oneReadSize); if (ferror(f)) { warn("udcDataViaSlow failed to fetch %d bytes at %lld", size, offset); errnoAbort("file %s", fileName); } pt += step; sizeRead += oneReadSize; } carefulClose(&f); return sizeRead; } static boolean udcInfoViaSlow(char *url, struct udcRemoteFileInfo *retInfo) /* Fill in *retTime with last modified time for file specified in url. * Return FALSE if file does not even exist. */ { char *fileName = url + 5; /* skip over 'slow:' */ verbose(4, "slow checking remote info on %s\n", url); sleep1000(500); struct stat status; int ret = stat(fileName, &status); if (ret < 0) return FALSE; retInfo->updateTime = status.st_mtime; retInfo->size = status.st_size; return TRUE; } /********* Section for http protocol **********/ static char *defaultDir = "/tmp/udcCache"; char *udcDefaultDir() /* Get default directory for cache */ { return defaultDir; } void udcSetDefaultDir(char *path) /* Set default directory for cache. */ { defaultDir = cloneString(path); } void udcDisableCache() /* Switch off caching. Re-enable with udcSetDefaultDir */ { defaultDir = NULL; } static bool udcCacheEnabled() /* TRUE if caching is activated */ { return (defaultDir != NULL); } int udcDataViaHttpOrFtp( char *url, bits64 offset, int size, void *buffer, struct udcFile *file) /* Fetch a block of data of given size into buffer using url's protocol, * which must be http, https or ftp. Returns number of bytes actually read. * Does an errAbort on error. * Typically will be called with size in the 8k-64k range. */ { if (startsWith("http://",url) || startsWith("https://",url) || startsWith("ftp://",url)) verbose(4, "reading http/https/ftp data - %d bytes at %lld - on %s\n", size, offset, url); else errAbort("Invalid protocol in url [%s] in udcDataViaFtp, only http, https, or ftp supported", url); int sd = connInfoGetSocket(file, url, offset, size); if (sd < 0) errAbort("Can't get data socket for %s", url); int rd = 0, total = 0, remaining = size; char *buf = (char *)buffer; while ((remaining > 0) && ((rd = ourRead(&file->ios.net, sd, buf, remaining)) > 0)) { total += rd; buf += rd; remaining -= rd; } if (rd == -1) errnoAbort("udcDataViaHttpOrFtp: error reading socket"); struct connInfo *ci = &file->connInfo; if (ci == NULL) mustCloseFd(&sd); else ci->offset += total; return total; } boolean udcInfoViaHttp(char *url, struct udcRemoteFileInfo *retInfo) /* Gets size and last modified time of URL * and returns status of HEAD or GET byterange 0-0. */ { verbose(4, "checking http remote info on %s\n", url); // URLs passed into here should not have byterange clause. int redirectCount = 0; struct hash *hash; int status; char *sizeString = NULL; /* For caching, sites should support byte-range and last-modified. However, several groups including ENCODE have made sites that use CGIs to dynamically generate hub text files such as hub.txt, genome.txt, trackDb.txt. Byte-range and last-modified are difficult to support for this case, so they do without them, effectively defeat caching. Every 5 minutes (udcTimeout), they get re-downloaded, even when the data has not changed. */ while (TRUE) { hash = newHash(0); status = netUrlHead(url, hash); sizeString = hashFindValUpperCase(hash, "Content-Length:"); if (status == 200 && sizeString) break; /* Using HEAD with HIPPAA-compliant signed AmazonS3 URLs generates 403. The signed URL generated for GET cannot be used with HEAD. Instead call GET with byterange=0-0 in netUrlFakeHeadByGet(). This supplies both size via Content-Range response header, as well as Last-Modified header which is important for caching. There are also sites which support byte-ranges but they do not return Content-Length with HEAD. */ if (status == 403 || (status==200 && !sizeString)) { hashFree(&hash); hash = newHash(0); status = netUrlFakeHeadByGet(url, hash); if (status == 206) break; if (status == 200) // helps get more info to user break; } if (status != 301 && status != 302 && status != 307 && status != 308) return FALSE; ++redirectCount; if (redirectCount > 5) { warn("code %d redirects: exceeded limit of 5 redirects, %s", status, url); return FALSE; } char *newUrl = hashFindValUpperCase(hash, "Location:"); if (!newUrl) { warn("code %d redirects: redirect location missing, %s", status, url); return FALSE; } // path may be relative if (hasProtocol(newUrl)) { newUrl = cloneString(newUrl); } else { newUrl = expandUrlOnBase(url, newUrl); } retInfo->ci.redirUrl = newUrl; url = transferParamsToRedirectedUrl(url, newUrl); hashFree(&hash); } char *sizeHeader = NULL; if (status == 200) { sizeHeader = "Content-Length:"; // input pattern: Content-Length: 2738262 } if (status == 206) { sizeHeader = "Content-Range:"; // input pattern: Content-Range: bytes 0-99/2738262 } sizeString = hashFindValUpperCase(hash, sizeHeader); if (sizeString) { char *parseString = sizeString; if (status == 206) { parseString = strchr(sizeString, '/'); if (!parseString) { warn("Header value %s is missing '/' in %s in response for url %s", sizeString, sizeHeader, url); return FALSE; } ++parseString; // skip past slash } if (parseString) { retInfo->size = atoll(parseString); } else { warn("Header value %s is missing or invalid in %s in response for url %s", sizeString, sizeHeader, url); return FALSE; } } else { warn("Response is missing required header %s for url %s", sizeHeader, url); return FALSE; } char *lastModString = hashFindValUpperCase(hash, "Last-Modified:"); if (lastModString == NULL) { // Date is a poor substitute! It will always appear that the cache is stale. // But at least we can read files from dropbox.com. lastModString = hashFindValUpperCase(hash, "Date:"); if (lastModString == NULL) { hashFree(&hash); errAbort("No Last-Modified: or Date: returned in header for %s, can't proceed, sorry", url); } } struct tm tm; time_t t; // Last-Modified: Wed, 15 Nov 1995 04:58:08 GMT // This will always be GMT if (strptime(lastModString, "%a, %d %b %Y %H:%M:%S %Z", &tm) == NULL) { /* Handle error */; hashFree(&hash); errAbort("unable to parse last-modified string [%s]", lastModString); } t = mktimeFromUtc(&tm); if (t == -1) { /* Handle error */; hashFree(&hash); errAbort("mktimeFromUtc failed while converting last-modified string [%s] from UTC time", lastModString); } retInfo->updateTime = t; hashFree(&hash); return status; } /********* Section for ftp protocol **********/ // fetchData method: See udcDataViaHttpOrFtp above. boolean udcInfoViaFtp(char *url, struct udcRemoteFileInfo *retInfo) /* Gets size and last modified time of FTP URL */ { verbose(4, "checking ftp remote info on %s\n", url); long long size = 0; time_t t, tUtc; struct tm *tm = NULL; // TODO: would be nice to add int *retCtrlSocket to netGetFtpInfo so we can stash // in retInfo->connInfo and keep socket open. boolean ok = netGetFtpInfo(url, &size, &tUtc); if (!ok) return FALSE; // Convert UTC to localtime tm = localtime(&tUtc); t = mktimeFromUtc(tm); if (t == -1) { /* Handle error */; errAbort("mktimeFromUtc failed while converting FTP UTC last-modified time %ld to local time", (long) tUtc); } retInfo->size = size; retInfo->updateTime = t; return TRUE; } /********* Non-protocol-specific bits **********/ boolean udcFastReadString(struct udcFile *f, char buf[256]) /* Read a string into buffer, which must be long enough * to hold it. String is in 'writeString' format. */ { UBYTE bLen; int len; if (!udcReadOne(f, bLen)) return FALSE; if ((len = bLen)> 0) udcMustRead(f, buf, len); buf[len] = 0; return TRUE; } void msbFirstWriteBits64(FILE *f, bits64 x); static char *fileNameInCacheDir(struct udcFile *file, char *fileName) /* Return the name of a file in the cache dir, from the cache root directory on down. * Do a freeMem on this when done. */ { int dirLen = strlen(file->cacheDir); int nameLen = strlen(fileName); char *path = needMem(dirLen + nameLen + 2); memcpy(path, file->cacheDir, dirLen); path[dirLen] = '/'; memcpy(path+dirLen+1, fileName, nameLen); return path; } static void udcNewCreateBitmapAndSparse(struct udcFile *file, bits64 remoteUpdate, bits64 remoteSize, bits32 version) /* Create a new bitmap file around the given remoteUpdate time. */ { int fd = mustOpenFd(file->bitmapFileName, O_WRONLY | O_CREAT | O_TRUNC); bits32 sig = udcBitmapSig; bits32 blockSize = udcBlockSize; bits64 reserved64 = 0; bits32 reserved32 = 0; int blockCount = (remoteSize + udcBlockSize - 1)/udcBlockSize; int bitmapSize = bitToByteSize(blockCount); /* Write out fixed part of header. */ writeOneFd(fd, sig); writeOneFd(fd, blockSize); writeOneFd(fd, remoteUpdate); writeOneFd(fd, remoteSize); writeOneFd(fd, version); writeOneFd(fd, reserved32); writeOneFd(fd, reserved64); writeOneFd(fd, reserved64); writeOneFd(fd, reserved64); writeOneFd(fd, reserved64); long long offset = ourMustLseek(&file->ios.bit, fd, 0, SEEK_CUR); if (offset != udcBitmapHeaderSize) errAbort("offset in fd=%d, f=%s is %lld, not expected udcBitmapHeaderSize %d", fd, file->bitmapFileName, offset, udcBitmapHeaderSize); /* Write out initial all-zero bitmap, using sparse-file method: write 0 to final address. */ unsigned char zero = 0; ourMustLseek(&file->ios.bit, fd, bitmapSize-1, SEEK_CUR); ourMustWrite(&file->ios.bit, fd, &zero, 1); /* Clean up bitmap file and name. */ mustCloseFd(&fd); /* Create an empty data file. */ fd = mustOpenFd(file->sparseFileName, O_WRONLY | O_CREAT | O_TRUNC); mustCloseFd(&fd); } static struct udcBitmap *udcBitmapOpen(char *fileName) /* Open up a bitmap file and read and verify header. Return NULL if file doesn't * exist, abort on error. */ { /* Open file, returning NULL if can't. */ int fd = open(fileName, O_RDWR); if (fd < 0) { if (errno == ENOENT) return NULL; else errnoAbort("Can't open(%s, O_RDWR)", fileName); } /* Get status info from file. */ struct stat status; fstat(fd, &status); if (status.st_size < udcBitmapHeaderSize) // check for truncated invalid bitmap files. { close(fd); return NULL; // returning NULL will cause the fresh creation of bitmap and sparseData files. } /* Read signature and decide if byte-swapping is needed. */ // TODO: maybe buffer the I/O for performance? Don't read past header - // fd offset needs to point to first data block when we return. bits32 magic; boolean isSwapped = FALSE; mustReadOneFd(fd, magic); if (magic != udcBitmapSig) { magic = byteSwap32(magic); isSwapped = TRUE; if (magic != udcBitmapSig) errAbort("%s is not a udcBitmap file", fileName); } /* Allocate bitmap object, fill it in, and return it. */ struct udcBitmap *bits; AllocVar(bits); bits->blockSize = fdReadBits32(fd, isSwapped); bits->remoteUpdate = fdReadBits64(fd, isSwapped); bits->fileSize = fdReadBits64(fd, isSwapped); bits->version = fdReadBits32(fd, isSwapped); fdReadBits32(fd, isSwapped); // ignore result fdReadBits64(fd, isSwapped); // ignore result fdReadBits64(fd, isSwapped); // ignore result fdReadBits64(fd, isSwapped); // ignore result fdReadBits64(fd, isSwapped); // ignore result bits->localUpdate = status.st_mtime; bits->localAccess = status.st_atime; bits->isSwapped = isSwapped; bits->fd = fd; return bits; } static void udcBitmapClose(struct udcBitmap **pBits) /* Free up resources associated with udcBitmap. */ { struct udcBitmap *bits = *pBits; if (bits != NULL) { mustCloseFd(&(bits->fd)); freez(pBits); } } static struct udcProtocol *udcProtocolNew(char *upToColon) /* Build up a new protocol around a string such as "http" or "local" */ { struct udcProtocol *prot; AllocVar(prot); if (sameString(upToColon, "local")) { prot->fetchData = udcDataViaLocal; prot->fetchInfo = udcInfoViaLocal; prot->type = "local"; } else if (sameString(upToColon, "slow")) { prot->fetchData = udcDataViaSlow; prot->fetchInfo = udcInfoViaSlow; prot->type = "slow"; } else if (sameString(upToColon, "http") || sameString(upToColon, "https")) { prot->fetchData = udcDataViaHttpOrFtp; prot->fetchInfo = udcInfoViaHttp; prot->type = "http"; } else if (sameString(upToColon, "ftp")) { prot->fetchData = udcDataViaHttpOrFtp; prot->fetchInfo = udcInfoViaFtp; prot->type = "ftp"; } else if (sameString(upToColon, "transparent")) { prot->fetchData = udcDataViaTransparent; prot->fetchInfo = udcInfoViaTransparent; prot->type = "transparent"; } else { errAbort("Unrecognized protocol %s in udcProtNew", upToColon); } return prot; } static void udcProtocolFree(struct udcProtocol **pProt) /* Free up protocol resources. */ { freez(pProt); } static void setInitialCachedDataBounds(struct udcFile *file, boolean useCacheInfo) /* Open up bitmap file and read a little bit of it to see if cache is stale, * and if not to see if the initial part is cached. Sets the data members * startData, and endData. If the case is stale it makes fresh empty * cacheDir/sparseData and cacheDir/bitmap files. */ { bits32 version = 0; /* Get existing bitmap, and if it's stale clean up. */ struct udcBitmap *bits = udcBitmapOpen(file->bitmapFileName); if (bits != NULL) { if (useCacheInfo) { file->size = bits->fileSize; file->updateTime = bits->remoteUpdate; } version = bits->version; if (bits->remoteUpdate != file->updateTime || bits->fileSize != file->size || !fileExists(file->sparseFileName) || (fileSize(file->sparseFileName) == 0 && file->size > 0 && fileSize(file->bitmapFileName) > udcBitmapHeaderSize)) { verbose(4, "removing stale version (%lld! = %lld or %lld! = %lld or %s doesn't exist or should not be size 0), " "new version %d\n", bits->remoteUpdate, (long long)file->updateTime, bits->fileSize, file->size, file->sparseFileName, version); udcBitmapClose(&bits); remove(file->bitmapFileName); remove(file->sparseFileName); if (fileExists(file->redirFileName)) remove(file->redirFileName); ++version; } } else verbose(4, "bitmap file %s does not already exist, creating.\n", file->bitmapFileName); /* If no bitmap, then create one, and also an empty sparse data file. */ if (bits == NULL) { udcNewCreateBitmapAndSparse(file, file->updateTime, file->size, version); bits = udcBitmapOpen(file->bitmapFileName); if (bits == NULL) errAbort("Unable to open bitmap file %s", file->bitmapFileName); } file->bitmapVersion = bits->version; /* Read in a little bit from bitmap while we have it open to see if we have anything cached. */ if (file->size > 0) { Bits b; off_t wasAt = lseek(bits->fd, 0, SEEK_CUR); mustReadOneFd(bits->fd, b); int endBlock = (file->size + udcBlockSize - 1)/udcBlockSize; if (endBlock > 8) endBlock = 8; int initialCachedBlocks = bitFindClear(&b, 0, endBlock); file->endData = initialCachedBlocks * udcBlockSize; ourMustLseek(&file->ios.bit, bits->fd, wasAt, SEEK_SET); } file->bits = bits; } static boolean qEscaped(char c) /* Returns TRUE if character needs to be escaped in q-encoding. */ { if (isalnum(c)) return c == 'Q'; else return c != '_' && c != '-' && c != '/' && c != '.'; } static char *qEncode(char *input) /* Do a simple encoding to convert input string into "normal" characters. * Abnormal letters, and '!' get converted into Q followed by two hexadecimal digits. */ { /* First go through and figure out encoded size. */ int size = 0; char *s, *d, c; s = input; while ((c = *s++) != 0) { if (qEscaped(c)) size += 3; else size += 1; } /* Allocate and fill in output. */ char *output = needMem(size+1); s = input; d = output; while ((c = *s++) != 0) { if (qEscaped(c)) { sprintf(d, "Q%02X", (unsigned)c); d += 3; } else *d++ = c; } return output; } void udcParseUrlFull(char *url, char **retProtocol, char **retAfterProtocol, char **retColon, char **retAuth) /* Parse the URL into components that udc treats separately. * *retAfterProtocol is Q-encoded to keep special chars out of filenames. * Free all *ret's except *retColon when done. */ { char *protocol, *afterProtocol; char *colon = strchr(url, ':'); if (!colon) { *retColon = NULL; return; } int colonPos = colon - url; protocol = cloneStringZ(url, colonPos); afterProtocol = url + colonPos + 1; while (afterProtocol[0] == '/') afterProtocol += 1; char *userPwd = strchr(afterProtocol, '@'); if (userPwd) { if (retAuth) { char auth[1024]; safencpy(auth, sizeof(auth), afterProtocol, userPwd+1-afterProtocol); *retAuth = qEncode(auth); } char *afterHost = strchr(afterProtocol, '/'); if (!afterHost) { afterHost = afterProtocol+strlen(afterProtocol); } if (userPwd < afterHost) afterProtocol = userPwd + 1; } else if (retAuth) *retAuth = NULL; afterProtocol = qEncode(afterProtocol); *retProtocol = protocol; *retAfterProtocol = afterProtocol; *retColon = colon; } void udcParseUrl(char *url, char **retProtocol, char **retAfterProtocol, char **retColon) /* Parse the URL into components that udc treats separately. * *retAfterProtocol is Q-encoded to keep special chars out of filenames. * Free *retProtocol and *retAfterProtocol but not *retColon when done. */ { udcParseUrlFull(url, retProtocol, retAfterProtocol, retColon, NULL); } static void addElementToDy(struct dyString *dy, int maxLen, char *name) /* add one element of a path to a dyString, hashing it if it's longer * than NAME_MAX */ { if (strlen(name) > maxLen) { unsigned char hash[SHA_DIGEST_LENGTH]; char newName[(SHA_DIGEST_LENGTH + 1) * 2]; SHA1((const unsigned char *)name, strlen(name), hash); hexBinaryString(hash, SHA_DIGEST_LENGTH, newName, (SHA_DIGEST_LENGTH + 1) * 2); dyStringAppend(dy, newName); } else dyStringAppend(dy, name); } static char *longDirHash(char *cacheDir, char *name) /* take a path and hash the elements that are longer than NAME_MAX */ { int maxLen = pathconf(cacheDir, _PC_NAME_MAX); if (maxLen < 0) // if we can't get the real system max, assume it's 255 maxLen = 255; struct dyString *dy = newDyString(strlen(name)); char *ptr = strchr(name, '/'); while(ptr) { *ptr = 0; addElementToDy(dy, maxLen, name); dyStringAppend(dy, "/"); name = ptr + 1; ptr = strchr(name, '/'); } addElementToDy(dy, maxLen, name); return dyStringCannibalize(&dy); } void udcPathAndFileNames(struct udcFile *file, char *cacheDir, char *protocol, char *afterProtocol) /* Initialize udcFile path and names */ { if (cacheDir==NULL) return; char *hashedAfterProtocol = longDirHash(cacheDir, afterProtocol); int len = strlen(cacheDir) + 1 + strlen(protocol) + 1 + strlen(hashedAfterProtocol) + 1; file->cacheDir = needMem(len); safef(file->cacheDir, len, "%s/%s/%s", cacheDir, protocol, hashedAfterProtocol); /* Create file names for bitmap and data portions. */ file->bitmapFileName = fileNameInCacheDir(file, bitmapName); file->sparseFileName = fileNameInCacheDir(file, sparseDataName); file->redirFileName = fileNameInCacheDir(file, redirName); } static long long int udcSizeAndModTimeFromBitmap(char *bitmapFileName, time_t *retTime) /* Look up the file size from the local cache bitmap file, or -1 if there * is no cache for url. If retTime is non-null, store the remote update time in it. */ { long long int ret = -1; struct udcBitmap *bits = udcBitmapOpen(bitmapFileName); if (bits != NULL) { ret = bits->fileSize; if (retTime) *retTime = bits->remoteUpdate; } udcBitmapClose(&bits); return ret; } static void udcTestAndSetRedirect(struct udcFile *file, char *protocol, boolean useCacheInfo) /* update redirect info */ { if (startsWith("http", protocol)) { char *newUrl = NULL; // read redir from cache if it exists if (fileExists(file->redirFileName)) { readInGulp(file->redirFileName, &newUrl, NULL); } if (useCacheInfo) { file->connInfo.redirUrl = cloneString(newUrl); } else { if (file->connInfo.redirUrl) { if (!sameOk(file->connInfo.redirUrl, newUrl)) { // write redir to cache char *temp = addSuffix(file->redirFileName, ".temp"); writeGulp(temp, file->connInfo.redirUrl, strlen(file->connInfo.redirUrl)); rename(temp, file->redirFileName); freeMem(temp); } } else { // delete redir from cache (if it exists) if (newUrl) remove(file->redirFileName); } } freeMem(newUrl); } } struct udcFile *udcFileMayOpen(char *url, char *cacheDir) /* Open up a cached file. cacheDir may be null in which case udcDefaultDir() will be * used. Return NULL if file doesn't exist. * Caching is inactive if defaultDir is NULL or the protocol is "transparent". * */ { if (cacheDir == NULL) cacheDir = udcDefaultDir(); verbose(4, "udcfileOpen(%s, %s)\n", url, cacheDir); if (udcLogStream) fprintf(udcLogStream, "Open %s\n", url); /* Parse out protocol. Make it "transparent" if none specified. */ char *protocol = NULL, *afterProtocol = NULL, *colon; boolean isTransparent = FALSE; udcParseUrl(url, &protocol, &afterProtocol, &colon); if (!colon) { freeMem(protocol); protocol = cloneString("transparent"); freeMem(afterProtocol); afterProtocol = cloneString(url); isTransparent = TRUE; } struct udcProtocol *prot; prot = udcProtocolNew(protocol); /* Figure out if anything exists. */ boolean useCacheInfo = FALSE; struct udcRemoteFileInfo info; ZeroVar(&info); if (!isTransparent) { if (udcCacheEnabled()) useCacheInfo = (udcCacheAge(url, cacheDir) < udcCacheTimeout()); if (!useCacheInfo) { if (!prot->fetchInfo(url, &info)) { udcProtocolFree(&prot); freeMem(protocol); freeMem(afterProtocol); return NULL; } } } /* Allocate file object and start filling it in. */ struct udcFile *file; AllocVar(file); file->url = cloneString(url); file->protocol = protocol; file->prot = prot; if (isTransparent) { /* If transparent dummy up things so that the "sparse" file pointer is actually * the file itself, which appears to be completely loaded in cache. */ if (!fileExists(url)) return NULL; int fd = file->fdSparse = mustOpenFd(url, O_RDONLY); struct stat status; fstat(fd, &status); file->startData = 0; file->endData = file->size = status.st_size; } else { udcPathAndFileNames(file, cacheDir, protocol, afterProtocol); if (!useCacheInfo) { file->updateTime = info.updateTime; file->size = info.size; memcpy(&(file->connInfo), &(info.ci), sizeof(struct connInfo)); // update cache file mod times, so if we're caching we won't do this again // until the timeout has expired again: if (udcCacheTimeout() > 0 && udcCacheEnabled() && fileExists(file->bitmapFileName)) (void)maybeTouchFile(file->bitmapFileName); } if (udcCacheEnabled()) { /* Make directory. */ makeDirsOnPath(file->cacheDir); /* Figure out a little bit about the extent of the good cached data if any. Open bits bitmap. */ setInitialCachedDataBounds(file, useCacheInfo); file->fdSparse = mustOpenFd(file->sparseFileName, O_RDWR); // update redir with latest redirect status udcTestAndSetRedirect(file, protocol, useCacheInfo); } } freeMem(afterProtocol); return file; } struct udcFile *udcFileOpen(char *url, char *cacheDir) /* Open up a cached file. cacheDir may be null in which case udcDefaultDir() will be * used. Abort if file doesn't exist. */ { struct udcFile *udcFile = udcFileMayOpen(url, cacheDir); if (udcFile == NULL) errAbort("Couldn't open %s", url); return udcFile; } struct slName *udcFileCacheFiles(char *url, char *cacheDir) /* Return low-level list of files used in cache. */ { char *protocol, *afterProtocol, *colon; struct udcFile *file; udcParseUrl(url, &protocol, &afterProtocol, &colon); if (colon == NULL) return NULL; AllocVar(file); udcPathAndFileNames(file, cacheDir, protocol, afterProtocol); struct slName *list = NULL; slAddHead(&list, slNameNew(file->bitmapFileName)); slAddHead(&list, slNameNew(file->sparseFileName)); slAddHead(&list, slNameNew(file->redirFileName)); slReverse(&list); freeMem(file->cacheDir); freeMem(file->bitmapFileName); freeMem(file->sparseFileName); freeMem(file); freeMem(protocol); freeMem(afterProtocol); return list; } void udcFileClose(struct udcFile **pFile) /* Close down cached file. */ { struct udcFile *file = *pFile; if (file != NULL) { if (udcLogStream) { fprintf(udcLogStream, "Close %s %s %lld %lld bit %lld %lld %lld %lld %lld sparse %lld %lld %lld %lld %lld udc %lld %lld %lld %lld %lld net %lld %lld %lld %lld %lld \n", file->url, file->prot->type, file->ios.numConnects, file->ios.numReuse, file->ios.bit.numSeeks, file->ios.bit.numReads, file->ios.bit.bytesRead, file->ios.bit.numWrites, file->ios.bit.bytesWritten, file->ios.sparse.numSeeks, file->ios.sparse.numReads, file->ios.sparse.bytesRead, file->ios.sparse.numWrites, file->ios.sparse.bytesWritten, file->ios.udc.numSeeks, file->ios.udc.numReads, file->ios.udc.bytesRead, file->ios.udc.numWrites, file->ios.udc.bytesWritten, file->ios.net.numSeeks, file->ios.net.numReads, file->ios.net.bytesRead, file->ios.net.numWrites, file->ios.net.bytesWritten); } if (file->mmapBase != NULL) { if (munmap(file->mmapBase, file->size) < 0) errnoAbort("munmap() failed on %s", file->url); } if (file->connInfo.socket != 0) mustCloseFd(&(file->connInfo.socket)); if (file->connInfo.ctrlSocket != 0) mustCloseFd(&(file->connInfo.ctrlSocket)); freeMem(file->url); freeMem(file->protocol); udcProtocolFree(&file->prot); freeMem(file->cacheDir); freeMem(file->bitmapFileName); freeMem(file->sparseFileName); freeMem(file->sparseReadAheadBuf); if (file->fdSparse != 0) mustCloseFd(&(file->fdSparse)); udcBitmapClose(&file->bits); } freez(pFile); } static void qDecode(const char *input, char *buf, size_t size) /* Reverse the qEncode performed on afterProcotol above into buf or abort. */ { safecpy(buf, size, input); char c, *r = buf, *w = buf; while ((c = *r++) != '\0') { if (c == 'Q') { int q; if (sscanf(r, "%02X", &q)) { *w++ = (char)q; r += 2; } else errAbort("qDecode: input \"%s\" does not appear to be properly formatted " "starting at \"%s\"", input, r); } else *w++ = c; } *w = '\0'; } char *udcPathToUrl(const char *path, char *buf, size_t size, char *cacheDir) /* Translate path into an URL, store in buf, return pointer to buf if successful * and NULL if not. */ { if (cacheDir == NULL) cacheDir = udcDefaultDir(); int offset = 0; if (startsWith(cacheDir, (char *)path)) offset = strlen(cacheDir); if (path[offset] == '/') offset++; char protocol[16]; strncpy(protocol, path+offset, sizeof(protocol)); protocol[ArraySize(protocol)-1] = '\0'; char *p = strchr(protocol, '/'); if (p == NULL) { errAbort("unable to parse protocol (first non-'%s' directory) out of path '%s'\n", cacheDir, path); return NULL; } *p++ = '\0'; char afterProtocol[4096]; qDecode(path+1+strlen(protocol)+1, afterProtocol, sizeof(afterProtocol)); safef(buf, size, "%s://%s", protocol, afterProtocol); return buf; } long long int udcSizeFromCache(char *url, char *cacheDir) /* Look up the file size from the local cache bitmap file, or -1 if there * is no cache for url. */ { long long int ret = -1; if (cacheDir == NULL) cacheDir = udcDefaultDir(); struct slName *sl, *slList = udcFileCacheFiles(url, cacheDir); for (sl = slList; sl != NULL; sl = sl->next) if (endsWith(sl->name, bitmapName)) { ret = udcSizeAndModTimeFromBitmap(sl->name, NULL); break; } slNameFreeList(&slList); return ret; } time_t udcTimeFromCache(char *url, char *cacheDir) /* Look up the file datetime from the local cache bitmap file, or 0 if there * is no cache for url. */ { time_t t = 0; long long int ret = -1; if (cacheDir == NULL) cacheDir = udcDefaultDir(); struct slName *sl, *slList = udcFileCacheFiles(url, cacheDir); for (sl = slList; sl != NULL; sl = sl->next) if (endsWith(sl->name, bitmapName)) { ret = udcSizeAndModTimeFromBitmap(sl->name, &t); if (ret == -1) t = 0; break; } slNameFreeList(&slList); return t; } unsigned long udcCacheAge(char *url, char *cacheDir) /* Return the age in seconds of the oldest cache file. If a cache file is * missing, return the current time (seconds since the epoch). */ { unsigned long now = clock1(), oldestTime = now; if (cacheDir == NULL) cacheDir = udcDefaultDir(); struct slName *sl, *slList = udcFileCacheFiles(url, cacheDir); if (slList == NULL) return now; for (sl = slList; sl != NULL; sl = sl->next) if (endsWith(sl->name, bitmapName)) { if (fileExists(sl->name)) oldestTime = min(fileModTime(sl->name), oldestTime); else return now; } return (now - oldestTime); } static void readBitsIntoBuf(struct udcFile *file, int fd, int headerSize, int bitStart, int bitEnd, Bits **retBits, int *retPartOffset) /* Do some bit-to-byte offset conversions and read in all the bytes that * have information in the bits we're interested in. */ { int byteStart = bitStart/8; int byteEnd = bitToByteSize(bitEnd); int byteSize = byteEnd - byteStart; Bits *bits = needLargeMem(byteSize); ourMustLseek(&file->ios.bit,fd, headerSize + byteStart, SEEK_SET); ourMustRead(&file->ios.bit, fd, bits, byteSize); *retBits = bits; *retPartOffset = byteStart*8; } static boolean allBitsSetInFile(int bitStart, int bitEnd, int partOffset, Bits *bits) /* Return TRUE if all bits in file between start and end are set. */ { int partBitStart = bitStart - partOffset; int partBitEnd = bitEnd - partOffset; int nextClearBit = bitFindClear(bits, partBitStart, partBitEnd); boolean allSet = (nextClearBit >= partBitEnd); return allSet; } // For tests/udcTest.c debugging: not declared in udc.h, but not static either: boolean udcCheckCacheBits(struct udcFile *file, int startBlock, int endBlock) /* Warn and return TRUE if any bit in (startBlock,endBlock] is not set. */ { boolean gotUnset = FALSE; struct udcBitmap *bitmap = udcBitmapOpen(file->bitmapFileName); int partOffset; Bits *bits; readBitsIntoBuf(file, bitmap->fd, udcBitmapHeaderSize, startBlock, endBlock, &bits, &partOffset); int partBitStart = startBlock - partOffset; int partBitEnd = endBlock - partOffset; int nextClearBit = bitFindClear(bits, partBitStart, partBitEnd); while (nextClearBit < partBitEnd) { int clearBlock = nextClearBit + partOffset; warn("... udcFile 0x%04lx: bit for block %d (%lld..%lld] is not set", (unsigned long)file, clearBlock, ((long long)clearBlock * udcBlockSize), (((long long)clearBlock+1) * udcBlockSize)); gotUnset = TRUE; int nextSetBit = bitFindSet(bits, nextClearBit, partBitEnd); nextClearBit = bitFindClear(bits, nextSetBit, partBitEnd); } return gotUnset; } static void fetchMissingBlocks(struct udcFile *file, struct udcBitmap *bits, int startBlock, int blockCount, int blockSize) /* Fetch missing blocks from remote and put them into file. errAbort if trouble. */ { bits64 startPos = (bits64)startBlock * blockSize; bits64 endPos = startPos + (bits64)blockCount * blockSize; if (endPos > file->size) endPos = file->size; if (endPos > startPos) { bits64 readSize = endPos - startPos; void *buf = needLargeMem(readSize); int actualSize = file->prot->fetchData(file->url, startPos, readSize, buf, file); if (actualSize != readSize) errAbort("unable to fetch %lld bytes from %s @%lld (got %d bytes)", readSize, file->url, startPos, actualSize); ourMustLseek(&file->ios.sparse, file->fdSparse, startPos, SEEK_SET); ourMustWrite(&file->ios.sparse, file->fdSparse, buf, readSize); freez(&buf); } } static boolean fetchMissingBits(struct udcFile *file, struct udcBitmap *bits, bits64 start, bits64 end, bits64 *retFetchedStart, bits64 *retFetchedEnd) /* Scan through relevant parts of bitmap, fetching blocks we don't already have. */ { /* Fetch relevant part of bitmap into memory */ int partOffset; Bits *b; int startBlock = start / bits->blockSize; int endBlock = (end + bits->blockSize - 1) / bits->blockSize; readBitsIntoBuf(file, bits->fd, udcBitmapHeaderSize, startBlock, endBlock, &b, &partOffset); if (allBitsSetInFile(startBlock, endBlock, partOffset, b)) { // it is already in the cache freeMem(b); return TRUE; } /* Loop around first skipping set bits, then fetching clear bits. */ boolean dirty = FALSE; int s = startBlock - partOffset; int e = endBlock - partOffset; for (;;) { int nextClearBit = bitFindClear(b, s, e); if (nextClearBit >= e) break; int nextSetBit = bitFindSet(b, nextClearBit, e); int clearSize = nextSetBit - nextClearBit; fetchMissingBlocks(file, bits, nextClearBit + partOffset, clearSize, bits->blockSize); bitSetRange(b, nextClearBit, clearSize); dirty = TRUE; if (nextSetBit >= e) break; s = nextSetBit; } if (dirty) { /* Update bitmap on disk.... */ int byteStart = startBlock/8; int byteEnd = bitToByteSize(endBlock); int byteSize = byteEnd - byteStart; ourMustLseek(&file->ios.bit, bits->fd, byteStart + udcBitmapHeaderSize, SEEK_SET); ourMustWrite(&file->ios.bit, bits->fd, b, byteSize); } freeMem(b); *retFetchedStart = startBlock * bits->blockSize; *retFetchedEnd = endBlock * bits->blockSize; return FALSE; } static boolean rangeIntersectOrTouch64(bits64 start1, bits64 end1, bits64 start2, bits64 end2) /* Return true if two 64-bit ranges intersect or touch. */ { // cannot use the version of this function that is in common.c since it only handles integers. bits64 s = max(start1,start2); bits64 e = min(end1,end2); return e >= s; } static void udcFetchMissing(struct udcFile *file, struct udcBitmap *bits, bits64 start, bits64 end) /* Fetch missing pieces of data from file */ { /* Call lower level routine fetch remote data that is not already here. */ bits64 fetchedStart, fetchedEnd; if (fetchMissingBits(file, bits, start, end, &fetchedStart, &fetchedEnd)) return; /* Update file startData/endData members to include new data (and old as well if * the new data overlaps the old). */ if (rangeIntersectOrTouch64(file->startData, file->endData, fetchedStart, fetchedEnd)) { if (fetchedStart > file->startData) fetchedStart = file->startData; if (fetchedEnd < file->endData) fetchedEnd = file->endData; } file->startData = fetchedStart; file->endData = fetchedEnd; } static boolean udcCachePreload(struct udcFile *file, bits64 offset, bits64 size) /* Make sure that given data is in cache - fetching it remotely if need be. * Return TRUE on success. */ { if (!udcCacheEnabled()) return TRUE; boolean ok = TRUE; /* Original comment said: * "We'll break this operation into blocks of a reasonable size to allow * other processes to get cache access, since we have to lock the cache files." * However there is no locking done, so this whole splitting might be unnecessary * complexity. */ bits64 s,e, endPos=offset+size; for (s = offset; s < endPos; s = e) { /* Figure out bounds of this section. */ e = s + udcMaxBytesPerRemoteFetch; if (e > endPos) e = endPos; struct udcBitmap *bits = file->bits; if (bits->version == file->bitmapVersion) { udcFetchMissing(file, bits, s, e); } else { ok = FALSE; verbose(4, "udcCachePreload version check failed %d vs %d", bits->version, file->bitmapVersion); } if (!ok) break; } return ok; } #define READAHEADBUFSIZE 4096 bits64 udcRead(struct udcFile *file, void *buf, bits64 size) /* Read a block from file. Return amount actually read. */ { file->ios.udc.numReads++; // if not caching, just fetch the data if (!udcCacheEnabled() && !sameString(file->protocol, "transparent")) { int actualSize = file->prot->fetchData(file->url, file->offset, size, buf, file); file->offset += actualSize; file->ios.udc.bytesRead += actualSize; return actualSize; } file->ios.udc.bytesRead += size; /* Figure out region of file we're going to read, and clip it against file size. */ bits64 start = file->offset; if (start > file->size) return 0; bits64 end = start + size; if (end > file->size) end = file->size; size = end - start; char *cbuf = buf; /* use read-ahead buffer if present */ bits64 bytesRead = 0; bits64 raStart; bits64 raEnd; while(TRUE) { if (file->sparseReadAhead) { raStart = file->sparseRAOffset; raEnd = raStart+READAHEADBUFSIZE; if (start >= raStart && start < raEnd) { // copy bytes out of rabuf bits64 endInBuf = min(raEnd, end); bits64 sizeInBuf = endInBuf - start; memcpy(cbuf, file->sparseReadAheadBuf + (start-raStart), sizeInBuf); cbuf += sizeInBuf; bytesRead += sizeInBuf; start = raEnd; size -= sizeInBuf; file->offset += sizeInBuf; if (size == 0) break; } file->sparseReadAhead = FALSE; ourMustLseek(&file->ios.sparse,file->fdSparse, start, SEEK_SET); } bits64 saveEnd = end; if (size < READAHEADBUFSIZE) { file->sparseReadAhead = TRUE; if (!file->sparseReadAheadBuf) file->sparseReadAheadBuf = needMem(READAHEADBUFSIZE); file->sparseRAOffset = start; size = READAHEADBUFSIZE; end = start + size; if (end > file->size) { end = file->size; size = end - start; } } /* If we're outside of the window of file we already know is good, then have to * consult cache on disk, and maybe even fetch data remotely! */ if (start < file->startData || end > file->endData) { if (!udcCachePreload(file, start, size)) { verbose(4, "udcCachePreload failed"); bytesRead = 0; break; } /* Currently only need fseek here. Would be safer, but possibly * slower to move fseek so it is always executed in front of read, in * case other code is moving around file pointer. */ ourMustLseek(&file->ios.sparse,file->fdSparse, start, SEEK_SET); } if (file->sparseReadAhead) { ourMustRead(&file->ios.sparse,file->fdSparse, file->sparseReadAheadBuf, size); end = saveEnd; size = end - start; } else { ourMustRead(&file->ios.sparse,file->fdSparse, cbuf, size); file->offset += size; bytesRead += size; break; } } return bytesRead; } void udcMustRead(struct udcFile *file, void *buf, bits64 size) /* Read a block from file. Abort if any problem, including EOF before size is read. */ { bits64 sizeRead = udcRead(file, buf, size); if (sizeRead < size) errAbort("udc couldn't read %llu bytes from %s, did read %llu", size, file->url, sizeRead); } int udcGetChar(struct udcFile *file) /* Get next character from file or die trying. */ { UBYTE b; udcMustRead(file, &b, 1); return b; } bits64 udcReadBits64(struct udcFile *file, boolean isSwapped) /* Read and optionally byte-swap 64 bit entity. */ { bits64 val; udcMustRead(file, &val, sizeof(val)); if (isSwapped) val = byteSwap64(val); return val; } bits32 udcReadBits32(struct udcFile *file, boolean isSwapped) /* Read and optionally byte-swap 32 bit entity. */ { bits32 val; udcMustRead(file, &val, sizeof(val)); if (isSwapped) val = byteSwap32(val); return val; } bits16 udcReadBits16(struct udcFile *file, boolean isSwapped) /* Read and optionally byte-swap 16 bit entity. */ { bits16 val; udcMustRead(file, &val, sizeof(val)); if (isSwapped) val = byteSwap16(val); return val; } float udcReadFloat(struct udcFile *file, boolean isSwapped) /* Read and optionally byte-swap floating point number. */ { float val; udcMustRead(file, &val, sizeof(val)); if (isSwapped) val = byteSwapFloat(val); return val; } double udcReadDouble(struct udcFile *file, boolean isSwapped) /* Read and optionally byte-swap double-precision floating point number. */ { double val; udcMustRead(file, &val, sizeof(val)); if (isSwapped) val = byteSwapDouble(val); return val; } char *udcReadLine(struct udcFile *file) /* Fetch next line from udc cache or NULL. */ { char shortBuf[2], *longBuf = NULL, *buf = shortBuf; int i, bufSize = sizeof(shortBuf); for (i=0; ; ++i) { /* See if need to expand buffer, which is initially on stack, but if it gets big goes into * heap. */ if (i >= bufSize) { int newBufSize = bufSize*2; char *newBuf = needLargeMem(newBufSize); memcpy(newBuf, buf, bufSize); freeMem(longBuf); buf = longBuf = newBuf; bufSize = newBufSize; } char c; bits64 sizeRead = udcRead(file, &c, 1); if (sizeRead == 0) + { + // EOF before newline: return NULL for empty string + if (i == 0) return NULL; + break; + } buf[i] = c; if (c == '\n') { buf[i] = 0; break; } } char *retString = cloneString(buf); freeMem(longBuf); return retString; } char *udcReadStringAndZero(struct udcFile *file) /* Read in zero terminated string from file. Do a freeMem of result when done. */ { char shortBuf[2], *longBuf = NULL, *buf = shortBuf; int i, bufSize = sizeof(shortBuf); for (i=0; ; ++i) { /* See if need to expand buffer, which is initially on stack, but if it gets big goes into * heap. */ if (i >= bufSize) { int newBufSize = bufSize*2; char *newBuf = needLargeMem(newBufSize); memcpy(newBuf, buf, bufSize); freeMem(longBuf); buf = longBuf = newBuf; bufSize = newBufSize; } char c = udcGetChar(file); buf[i] = c; if (c == 0) break; } char *retString = cloneString(buf); freeMem(longBuf); return retString; } char *udcFileReadAll(char *url, char *cacheDir, size_t maxSize, size_t *retSize) /* Read a complete file via UDC. The cacheDir may be null in which case udcDefaultDir() * will be used. If maxSize is non-zero, check size against maxSize * and abort if it's bigger. Returns file data (with an extra terminal for the * common case where it's treated as a C string). If retSize is non-NULL then * returns size of file in *retSize. Do a freeMem or freez of the returned buffer * when done. */ { struct udcFile *file = udcFileOpen(url, cacheDir); size_t size = file->size; if (maxSize != 0 && size > maxSize) errAbort("%s is %lld bytes, but maxSize to udcFileReadAll is %lld", url, (long long)size, (long long)maxSize); char *buf = needLargeMem(size+1); udcMustRead(file, buf, size); buf[size] = 0; // add trailing zero for string processing udcFileClose(&file); if (retSize != NULL) *retSize = size; return buf; } struct lineFile *udcWrapShortLineFile(char *url, char *cacheDir, size_t maxSize) /* Read in entire short (up to maxSize) url into memory and wrap a line file around it. * The cacheDir may be null in which case udcDefaultDir() will be used. If maxSize * is zero then a default value (currently 64 meg) will be used. */ { if (maxSize == 0) maxSize = 64 * 1024 * 1024; char *buf = udcFileReadAll(url, cacheDir, maxSize, NULL); return lineFileOnString(url, TRUE, buf); } void udcSeekCur(struct udcFile *file, bits64 offset) /* Seek to a particular position in file. */ { file->ios.udc.numSeeks++; file->offset += offset; if (udcCacheEnabled()) ourMustLseek(&file->ios.sparse,file->fdSparse, offset, SEEK_CUR); } void udcSeek(struct udcFile *file, bits64 offset) /* Seek to a particular position in file. */ { file->ios.udc.numSeeks++; file->offset = offset; if (udcCacheEnabled()) ourMustLseek(&file->ios.sparse,file->fdSparse, offset, SEEK_SET); } bits64 udcTell(struct udcFile *file) /* Return current file position. */ { return file->offset; } static long bitRealDataSize(char *fileName) /* Return number of real bytes indicated by bitmaps */ { struct udcBitmap *bits = udcBitmapOpen(fileName); int blockSize = bits->blockSize; long byteSize = 0; int blockCount = (bits->fileSize + blockSize - 1)/blockSize; if (blockCount > 0) { int bitmapSize = bitToByteSize(blockCount); Bits *b = needLargeMem(bitmapSize); mustReadFd( bits->fd, b, bitmapSize); int bitsSet = bitCountRange(b, 0, blockCount); byteSize = (long)bitsSet*blockSize; freez(&b); } udcBitmapClose(&bits); return byteSize; } static bits64 rCleanup(time_t deleteTime, boolean testOnly) /* Delete any bitmap or sparseData files last accessed before deleteTime */ { struct fileInfo *file, *fileList = listDirX(".", "*", FALSE); bits64 results = 0; for (file = fileList; file != NULL; file = file->next) { if (file->isDir) { setCurrentDir(file->name); bits64 oneResult = rCleanup(deleteTime, testOnly); setCurrentDir(".."); if (oneResult > 0) { if (!testOnly) remove(file->name); results += oneResult; results += file->size; } } else if (sameString(file->name, bitmapName)) { if (verboseLevel() >= 4) { if (file->size > udcBitmapHeaderSize) /* prevent failure on bitmap files of size 0 or less than header size */ verbose(4, "%ld (%ld) %s/%s\n", bitRealDataSize(file->name), (long)file->size, getCurrentDir(), file->name); } if (file->lastAccess < deleteTime) { /* Remove all files when get bitmap, so that can ensure they are deleted in * right order. */ results += file->size; if (!testOnly) { remove(bitmapName); remove(sparseDataName); if (fileExists(redirName)) remove(redirName); } } } else if (sameString(file->name, sparseDataName)) { if (results > 0) results += file->size; } } return results; } bits64 udcCleanup(char *cacheDir, double maxDays, boolean testOnly) /* Remove cached files older than maxDays old. If testOnly is set * no clean up is done, but the size of the files that would be * cleaned up is still. */ { time_t maxSeconds = maxDays * 24 * 60 * 60; char *curPath = cloneString(getCurrentDir()); setCurrentDir(cacheDir); time_t deleteTime = time(NULL) - maxSeconds; bits64 result = rCleanup(deleteTime, testOnly); setCurrentDir(curPath); return result; } int udcCacheTimeout() /* Get cache timeout (if local cache files are newer than this many seconds, * we won't ping the remote server to check the file size and update time). */ { return cacheTimeout; } void udcSetCacheTimeout(int timeout) /* Set cache timeout (if local cache files are newer than this many seconds, * we won't ping the remote server to check the file size and update time). */ { cacheTimeout = timeout; } time_t udcUpdateTime(struct udcFile *udc) /* return udc->updateTime */ { if (sameString("transparent", udc->protocol)) { struct stat status; int ret = stat(udc->url, &status); if (ret < 0) return 0; else return status.st_mtime; } return udc->updateTime; } off_t udcFileSize(char *url) /* fetch file size from given URL or local path * returns -1 if not found. */ { if (udcIsLocal(url)) return fileSize(url); // don't go to the network if we can avoid it off_t cacheSize = udcSizeFromCache(url, NULL); if (cacheSize!=-1) return cacheSize; off_t ret = -1; struct udcRemoteFileInfo info; if (startsWith("http://",url) || startsWith("https://",url)) { if (udcInfoViaHttp(url, &info)) ret = info.size; } else if (startsWith("ftp://",url)) { if (udcInfoViaFtp(url, &info)) ret = info.size; } else errAbort("udc/udcFileSize: invalid protocol for url %s, can only do http/https/ftp", url); return ret; } boolean udcIsLocal(char *url) /* return true if file is not a http or ftp file, just a local file */ { // copied from above char *protocol = NULL, *afterProtocol = NULL, *colon; udcParseUrl(url, &protocol, &afterProtocol, &colon); freez(&protocol); freez(&afterProtocol); return colon==NULL; } boolean udcExists(char *url) /* return true if a local or remote file exists */ { return udcFileSize(url)!=-1; } void udcMMap(struct udcFile *file) /* Enable access to underlying file as memory using mmap. udcMMapFetch * must be called to actually access regions of the file. */ { if (file->mmapBase != NULL) errAbort("File is already mmaped: %s", file->url); file->mmapBase = mmap(NULL, file->size, PROT_READ, MAP_SHARED, file->fdSparse, 0); if (file->mmapBase == MAP_FAILED) errnoAbort("mmap() failed for %s", file->url); } void *udcMMapFetch(struct udcFile *file, bits64 offset, bits64 size) /* Return pointer to a region of the file in memory, ensuring that regions is * cached. udcMMap must have been called to enable access. This must be * called for first access to a range of the file or erroneous (zeros) data * maybe returned. Maybe called multiple times on a range or overlapping * returns. */ { if (file->mmapBase == NULL) errAbort("udcMMap() has not been called for: %s", file->url); if ((offset + size) > file->size) errAbort("udcMMapFetch on offset %lld for %lld bytes exceeds length of file %lld on %s", offset, size, file->size, file->url); if (udcCacheEnabled() && !sameString(file->protocol, "transparent")) udcCachePreload(file, offset, size); return ((char*)file->mmapBase) + offset; }