25970e38be62db2e2fe313fd84b0d2337035de97 galt Thu Apr 13 22:54:22 2017 -0700 Adding some explanatory notes to udc.

diff --git src/lib/udc.c src/lib/udc.c
index cb1dbfc..01ea80b4 100644
--- src/lib/udc.c
+++ src/lib/udc.c
@@ -464,49 +464,63 @@
     buf += rd;
     remaining -= rd;
     }
 if (rd == -1)
     errnoAbort("udcDataViaHttpOrFtp: error reading socket");
 struct connInfo *ci = &file->connInfo;
 if (ci == NULL)
     mustCloseFd(&sd);
 else
     ci->offset += total;
 return total;
 }
 
 boolean udcInfoViaHttp(char *url, struct udcRemoteFileInfo *retInfo)
 /* Gets size and last modified time of URL
- * and returns status of HEAD GET. */
+ * and returns status of HEAD or GET byterange 0-0. */
 {
 verbose(4, "checking http remote info on %s\n", url);
 int redirectCount = 0;
 struct hash *hash;
 int status;
 char *sizeString = NULL;
+/*
+ For caching, sites should support byte-range and last-modified.
+ However, several groups including ENCODE have made sites that use CGIs to
+ dynamically generate hub text files such as hub.txt, genome.txt, trackDb.txt.
+ Byte-range and last-modified are difficult to support in this case,
+ so they do without them, effectively defeating caching. Every 5 minutes (udcTimeout),
+ the files get re-downloaded, even when the data has not changed.
+*/
 while (TRUE)
     {
     hash = newHash(0);
     status = netUrlHead(url, hash);
     sizeString = hashFindValUpperCase(hash, "Content-Length:");
     if (status == 200 && sizeString)
         break;
+    /*
+    Using HEAD with HIPAA-compliant signed AmazonS3 URLs generates a 403.
+    The signed URL generated for GET cannot be used with HEAD.
+    Instead, call GET with byterange=0-0 in netUrlFakeHeadByGet().
+    This supplies both the size, via the Content-Range response header,
+    and the Last-Modified header, which is important for caching.
+    There are also sites which support byte-ranges
+    but do not return Content-Length with HEAD.
+    */
     if (status == 403 || (status==200 && !sizeString))
         {
-        // Avoiding HEAD makes it work with HIPPAA compliant signed AmazonS3 URLs.
-        // The signed URL generated for GET cannot be used with HEAD.
-        // There are also a few sites which support byte-ranges but do not return Content-Length with HEAD.
         hashFree(&hash);
         hash = newHash(0);
         status = netUrlFakeHeadByGet(url, hash);
         if (status == 206)
             break;
         }
     if (status != 301 && status != 302)
         return FALSE;
     ++redirectCount;
     if (redirectCount > 5)
         {
         warn("code %d redirects: exceeded limit of 5 redirects, %s", status, url);
         return FALSE;
         }
     char *newUrl = hashFindValUpperCase(hash, "Location:");
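
Note on the first added comment block: it describes why a server that sends neither byte-range support nor Last-Modified defeats caching. The standalone sketch below illustrates that consequence under stated assumptions; UDC_TIMEOUT, cachedCopyStillGood, and the decision rule are illustrative only and are not udc.c's actual cache code.

#include <stdbool.h>
#include <time.h>

#define UDC_TIMEOUT 300  /* seconds; illustrative stand-in for udcTimeout */

bool cachedCopyStillGood(time_t cachedAt, time_t remoteLastModified)
/* Decide whether a cached copy can be reused without re-downloading.
 * remoteLastModified == 0 stands for "server sent no Last-Modified". */
{
time_t age = time(NULL) - cachedAt;
if (age < UDC_TIMEOUT)
    return true;    /* still inside the timeout window: trust the cache */
if (remoteLastModified == 0)
    return false;   /* nothing to compare against: must re-fetch every time */
return remoteLastModified <= cachedAt;  /* unchanged upstream: keep cache */
}

Without a Last-Modified header the middle branch always fires once the timeout window expires, which is the "re-downloaded every 5 minutes even when the data has not changed" behavior the note describes.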
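
Note on the second added comment block: the body of netUrlFakeHeadByGet() is not part of this hunk. The key step it describes is recovering the total file size from the Content-Range header of a byterange=0-0 GET, in place of the Content-Length a HEAD would have supplied. The sketch below shows only that parsing step, assuming the common "bytes <start>-<end>/<total>" form; the function name and code are illustrative, not the kent-source implementation.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

long long contentRangeTotalSize(char *contentRange)
/* Return the total size from a Content-Range value such as
 * "bytes 0-0/52428800", or -1 if the total is absent or
 * unparseable (servers may send "bytes 0-0/*" for unknown size). */
{
char *slash = strrchr(contentRange, '/');
if (slash == NULL || slash[1] == '\0' || slash[1] == '*')
    return -1;
char *end = NULL;
long long total = strtoll(slash + 1, &end, 10);
return (end == slash + 1) ? -1 : total;
}

int main(void)
{
printf("%lld\n", contentRangeTotalSize("bytes 0-0/52428800"));  /* 52428800 */
printf("%lld\n", contentRangeTotalSize("bytes 0-0/*"));         /* -1 */
return 0;
}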