a9fde73d32daf74780765442de44324061b01d66 markd Sun Jan 22 22:13:52 2023 -0800 Add URL resolver plugin functionality to allow an external program to convert cloud URLs (s3:, gs:, drs:, or really any non-HTTP URL) to http/https URLs. This can include signed URLs. The cloud URL is used to index the UDC cache rather than the resolved URL. This allows for re-resolving signed URLs if they time out. Joint work by Max and Markd diff --git src/lib/udc.c src/lib/udc.c index 3a34d36..4f64d59 100644 --- src/lib/udc.c +++ src/lib/udc.c @@ -454,31 +454,31 @@ return FALSE; retInfo->updateTime = status.st_mtime; retInfo->size = status.st_size; return TRUE; } /********* Section for http protocol **********/ static char *defaultDir = "/tmp/udcCache"; static bool udcInitialized = FALSE; void udcSetResolver(char *prots, char *cmd) /* Set protocols and local wrapper program to resolve s3:// and similar URLs to HTTPS */ { resolvProts = slNameListFromString(cloneString(prots), ','); - resolvCmd = cmd; + resolvCmd = trimSpaces(cloneString(cmd)); } static void initializeUdc() /* Use the $TMPDIR environment variable, if set, to amend the default location * of the cache */ { if (udcInitialized) return; char *tmpDir = getenv("TMPDIR"); if (isNotEmpty(tmpDir)) { char buffer[2048]; safef(buffer, sizeof(buffer), "%s/udcCache", tmpDir); udcSetDefaultDir(buffer); } @@ -500,71 +500,107 @@ } void udcDisableCache() /* Switch off caching. 
Re-enable with udcSetDefaultDir */ { defaultDir = NULL; udcInitialized = TRUE; } static bool udcCacheEnabled() /* TRUE if caching is activated */ { return (defaultDir != NULL); } +static void makeUdcTmp(char tmpPath[PATH_LEN]) +/* create a URL temporary file */ +{ +safef(tmpPath, PATH_LEN, "%s/udcTmp-XXXXXX", getTempDir()); +int fd = mkstemp(tmpPath); +if (fd < 0) + errnoAbort("udc:makeUdcTmp: creating temporary file failed: %s", tmpPath); +close(fd); +} + +static void resolveUrlExec(char *url, char *stdoutTmp, char *stderrTmp) +/* exec child process to resolve URL */ +{ +if ((dup2(mustOpenFd("/dev/null", O_RDONLY), STDIN_FILENO) < 0) || + (dup2(mustOpenFd(stdoutTmp, O_WRONLY), STDOUT_FILENO) < 0) || + (dup2(mustOpenFd(stderrTmp, O_WRONLY), STDERR_FILENO) < 0)) + errnoAbort("udc:resolveUrlExec: dup2 failed"); + +// parse into words to get any arguments encoded in string +int numWords = chopByWhite(cloneString(resolvCmd), NULL, 0); +char *words[numWords + 1]; +chopByWhite(resolvCmd, words, numWords); + +char* args[numWords + 2]; +CopyArray(words, args, numWords); +args[numWords] = url; +args[numWords + 1] = NULL; + +execv(resolvCmd, args); +errnoAbort("udc:resolveUrlExec failed: %s", resolvCmd); +exit(1); // should never make it here +} + static char* resolveUrl(char *url) /* return the HTTPS URL given a pseudo-URL e.g. s3://xxxx. Result must be freed. 
*/ { -char filename[1024]; -safef(filename, sizeof filename, "%s/udcTmp-XXXXXX", getTempDir()); -mkstemp(filename); +// Max tried to do this with pipeline but there is some hard to find bug with it and threads +char stdoutTmp[PATH_LEN], stderrTmp[PATH_LEN]; +makeUdcTmp(stdoutTmp); +makeUdcTmp(stderrTmp); verbose(4, "Resolving url %s using command %s\n", url, resolvCmd); -char* program = resolvCmd; -char* args[4]; -args[0] = program; -args[1] = url; -args[2] = filename; -args[3] = NULL; - -pid_t pid = 0; -int status; -pid = fork(); - +pid_t pid = fork(); if (pid < 0) - errAbort("udc:resolveUrl: error in fork"); + errnoAbort("udc:resolveUrl: error in fork"); if (pid == 0) { // child process - int err = execv(program, args); - if (err!=0) - errAbort("Cannot run %s", program); - exit(0); + resolveUrlExec(url, stdoutTmp, stderrTmp); } // pid > 0 = main process -pid = wait(&status); +int status; +if (waitpid(pid, &status, 0) < 0) + errnoAbort("udc:resolveUrl: waitpid failed"); +if (WIFSIGNALED(status)) + errAbort("udc:resolveUrl: resolver signaled (%d)", WTERMSIG(status)); +if (WIFSTOPPED(status) || WIFCONTINUED(status)) + errAbort("udc:resolveUrl: resolver unexpectedly stop or continued"); +if (WIFEXITED(status) && (WEXITSTATUS(status) != 0)) + { + char* errMsg; + readInGulp(stderrTmp, &errMsg, NULL); + errAbort("udc:resolveUrl: resolve program failed %s: %s", resolvCmd, errMsg); + } + +// sucesss; got URL char* newUrl = NULL; -size_t len = 0; -readInGulp(filename, &newUrl, &len); -unlink(filename); -if (len <= 0) - errAbort("Got empty string in output file, from %s, args %s %s", program, url, filename); +readInGulp(stdoutTmp, &newUrl, NULL); +trimSpaces(newUrl); +if (strlen(newUrl) == 0) + errAbort("Got empty URL from URL resolve program: %s %s", resolvCmd, url); + +unlink(stdoutTmp); +unlink(stderrTmp); -stripString(newUrl, "\n"); verbose(4, "Resolved url: %s -> %s\n", url, newUrl); return newUrl; } int udcDataViaHttpOrFtp( char *url, bits64 offset, int size, void 
*buffer, struct udcFile *file) /* Fetch a block of data of given size into buffer using url's protocol, * which must be http, https or ftp. Returns number of bytes actually read. * Does an errAbort on error. * Typically will be called with size in the 8k-64k range. */ { if (udcIsResolvable(url)) { if (file->connInfo.resolvedUrl) { verbose(4, "URL %s was already resolved to %s\n", url, file->connInfo.resolvedUrl); url = file->connInfo.resolvedUrl; @@ -2027,52 +2063,68 @@ memcpy(newBuf, buf, bufSize); freeMem(longBuf); buf = longBuf = newBuf; bufSize = newBufSize; } char c = udcGetChar(file); buf[i] = c; if (c == 0) break; } char *retString = cloneString(buf); freeMem(longBuf); return retString; } -char *udcFileReadAll(char *url, char *cacheDir, size_t maxSize, size_t *retSize) -/* Read a complete file via UDC. The cacheDir may be null in which case udcDefaultDir() - * will be used. If maxSize is non-zero, check size against maxSize - * and abort if it's bigger. Returns file data (with an extra terminal for the - * common case where it's treated as a C string). If retSize is non-NULL then - * returns size of file in *retSize. Do a freeMem or freez of the returned buffer - * when done. */ +char *udcFileReadAllIfExists(char *url, char *cacheDir, size_t maxSize, size_t *retSize) +/* Read a complete file via UDC. Return NULL if the file doesn't exist. The + * cacheDir may be null in which case udcDefaultDir() will be used. If + * maxSize is non-zero, check size against maxSize and abort if it's bigger. + * Returns file data (with an extra terminal for the common case where it's + * treated as a C string). If retSize is non-NULL then returns size of file + * in *retSize. Do a freeMem or freez of the returned buffer when done. 
*/ { -struct udcFile *file = udcFileOpen(url, cacheDir); +struct udcFile *file = udcFileMayOpen(url, cacheDir); +if (file == NULL) + return NULL; size_t size = file->size; if (maxSize != 0 && size > maxSize) errAbort("%s is %lld bytes, but maxSize to udcFileReadAll is %lld", url, (long long)size, (long long)maxSize); char *buf = needLargeMem(size+1); udcMustRead(file, buf, size); buf[size] = 0; // add trailing zero for string processing udcFileClose(&file); if (retSize != NULL) *retSize = size; return buf; } +char *udcFileReadAll(char *url, char *cacheDir, size_t maxSize, size_t *retSize) +/* Read a complete file via UDC. The cacheDir may be null in which case udcDefaultDir() + * will be used. If maxSize is non-zero, check size against maxSize + * and abort if it's bigger. Returns file data (with an extra terminal for the + * common case where it's treated as a C string). If retSize is non-NULL then + * returns size of file in *retSize. Do a freeMem or freez of the returned buffer + * when done. */ +{ +char *buf = udcFileReadAllIfExists(url, cacheDir, maxSize, retSize); +if (buf == NULL) + errAbort("Couldn't open %s", url); +return buf; +} + struct lineFile *udcWrapShortLineFile(char *url, char *cacheDir, size_t maxSize) /* Read in entire short (up to maxSize) url into memory and wrap a line file around it. * The cacheDir may be null in which case udcDefaultDir() will be used. If maxSize * is zero then a default value (currently 256 meg) will be used. */ { if (maxSize == 0) maxSize = 256 * 1024 * 1024; char *buf = udcFileReadAll(url, cacheDir, maxSize, NULL); return lineFileOnString(url, TRUE, buf); } void udcSeekCur(struct udcFile *file, bits64 offset) /* Seek to a particular position in file. */ { file->ios.udc.numSeeks++; file->offset += offset;