a9fde73d32daf74780765442de44324061b01d66
markd
  Sun Jan 22 22:13:52 2023 -0800
Add URL resolver plugin functionality to allow an external program to convert
cloud URLs (s3:, gs:, drs:, or really any non-HTTP URL) to http/https URLs.
This can include signed URLs.  The cloud URL is used to index the UDC cache
rather than the resolved URL.  This allows for re-resolving signed URLs
if they time out.
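
As a minimal sketch (the protocol list and resolver path below are
illustrative, not part of this change), a client could enable resolution
with something like:

    udcSetResolver("s3,gs,drs", "/usr/local/bin/cloudUrlResolver");

The resolver command is run with the cloud URL appended as its last
argument; it must print the resolved http/https URL on stdout, and a
non-zero exit aborts with whatever the program wrote to stderr.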

Joint work by Max and Markd

diff --git src/lib/udc.c src/lib/udc.c
index 3a34d36..4f64d59 100644
--- src/lib/udc.c
+++ src/lib/udc.c
@@ -454,31 +454,31 @@
     return FALSE;
 retInfo->updateTime = status.st_mtime;
 retInfo->size = status.st_size;
 return TRUE;
 }
 
 /********* Section for http protocol **********/
 
 static char *defaultDir = "/tmp/udcCache";
 static bool udcInitialized = FALSE;
 
 void udcSetResolver(char *prots, char *cmd)
 /* Set protocols and local wrapper program to resolve s3:// and similar URLs to HTTPS */
 {
     resolvProts = slNameListFromString(cloneString(prots), ',');
-    resolvCmd = cmd;
+    resolvCmd = trimSpaces(cloneString(cmd));
 }
 
 static void initializeUdc()
 /* Use the $TMPDIR environment variable, if set, to amend the default location
  * of the cache */
 {
 if (udcInitialized)
     return;
 char *tmpDir = getenv("TMPDIR");
 if (isNotEmpty(tmpDir))
     {
     char buffer[2048];
     safef(buffer, sizeof(buffer), "%s/udcCache", tmpDir);
     udcSetDefaultDir(buffer);
     }
@@ -500,71 +500,107 @@
 }
 
 void udcDisableCache()
 /* Switch off caching. Re-enable with udcSetDefaultDir */
 {
 defaultDir = NULL;
 udcInitialized = TRUE;
 }
 
 static bool udcCacheEnabled()
 /* TRUE if caching is activated */
 {
 return (defaultDir != NULL);
 }
 
+static void makeUdcTmp(char tmpPath[PATH_LEN])
+/* create a temporary file, returning its path in tmpPath */
+{
+safef(tmpPath, PATH_LEN, "%s/udcTmp-XXXXXX", getTempDir());
+int fd = mkstemp(tmpPath);
+if (fd < 0)
+    errnoAbort("udc:makeUdcTmp: creating temporary file failed: %s", tmpPath);
+close(fd);
+}
+
+static void resolveUrlExec(char *url, char *stdoutTmp, char *stderrTmp)
+/* exec child process to resolve URL */
+{
+if ((dup2(mustOpenFd("/dev/null", O_RDONLY), STDIN_FILENO) < 0) ||
+    (dup2(mustOpenFd(stdoutTmp, O_WRONLY), STDOUT_FILENO) < 0) ||
+    (dup2(mustOpenFd(stderrTmp, O_WRONLY), STDERR_FILENO) < 0))
+    errnoAbort("udc:resolveUrlExec: dup2 failed");
+
+// parse into words to get any arguments encoded in string
+int numWords = chopByWhite(cloneString(resolvCmd), NULL, 0);
+char *words[numWords + 1];
+chopByWhite(resolvCmd, words, numWords);
+
+char* args[numWords + 2];
+CopyArray(words, args, numWords);
+args[numWords] = url;
+args[numWords + 1] = NULL;
+
+execv(resolvCmd, args);  // chopByWhite null-terminated resolvCmd at the first word, so it is just the program path
+errnoAbort("udc:resolveUrlExec failed: %s", resolvCmd);
+exit(1); // should never make it here
+}
+
 static char* resolveUrl(char *url) 
 /* return the HTTPS URL given a pseudo-URL e.g. s3://xxxx. Result must be freed. */
 {
-char filename[1024];
-safef(filename, sizeof filename, "%s/udcTmp-XXXXXX", getTempDir());
-mkstemp(filename);
+// Max tried to do this with pipeline, but there is a hard-to-find bug involving it and threads
+char stdoutTmp[PATH_LEN], stderrTmp[PATH_LEN];
+makeUdcTmp(stdoutTmp);
+makeUdcTmp(stderrTmp);
     
 verbose(4, "Resolving url %s using command %s\n", url, resolvCmd);
 
-char* program = resolvCmd;
-char* args[4];
-args[0] = program;
-args[1] = url;
-args[2] = filename;
-args[3] = NULL;
-
-pid_t pid = 0;
-int status;
-pid = fork();
-
+pid_t pid = fork();
 if (pid < 0)
-    errAbort("udc:resolveUrl: error in fork");
+    errnoAbort("udc:resolveUrl: error in fork");
 if (pid == 0)
     {
     // child process
-    int err = execv(program, args);
-    if (err!=0)
-        errAbort("Cannot run %s", program);
-    exit(0);
+    resolveUrlExec(url, stdoutTmp, stderrTmp);
     }
 
 // pid > 0 = main process
-pid = wait(&status);
+int status;
+if (waitpid(pid, &status, 0) < 0)
+    errnoAbort("udc:resolveUrl: waitpid failed");
+if (WIFSIGNALED(status))
+    errAbort("udc:resolveUrl: resolver signaled (%d)", WTERMSIG(status));
+if (WIFSTOPPED(status) || WIFCONTINUED(status))
+    errAbort("udc:resolveUrl: resolver unexpectedly stop or continued");
+if (WIFEXITED(status) && (WEXITSTATUS(status) != 0))
+    {
+    char* errMsg;
+    readInGulp(stderrTmp, &errMsg, NULL);
+    errAbort("udc:resolveUrl: resolve program failed %s: %s", resolvCmd, errMsg);
+    }
+
+// success; got URL
 char* newUrl = NULL;
-size_t len = 0;
-readInGulp(filename, &newUrl, &len);
-unlink(filename);
-if (len <= 0)
-    errAbort("Got empty string in output file, from %s, args %s %s", program, url, filename);
+readInGulp(stdoutTmp, &newUrl, NULL);
+trimSpaces(newUrl);
+if (strlen(newUrl) == 0)
+    errAbort("Got empty URL from URL resolve program: %s %s", resolvCmd, url);
+
+unlink(stdoutTmp);
+unlink(stderrTmp);
     
-stripString(newUrl, "\n");
 verbose(4, "Resolved url: %s -> %s\n", url, newUrl);
 return newUrl;
 }
 
 int udcDataViaHttpOrFtp( char *url, bits64 offset, int size, void *buffer, struct udcFile *file)
 /* Fetch a block of data of given size into buffer using url's protocol,
  * which must be http, https or ftp.  Returns number of bytes actually read.
  * Does an errAbort on error.
  * Typically will be called with size in the 8k-64k range. */
 {
 if (udcIsResolvable(url)) 
     {
         if (file->connInfo.resolvedUrl) {
             verbose(4, "URL %s was already resolved to %s\n", url, file->connInfo.resolvedUrl);
             url = file->connInfo.resolvedUrl;
@@ -2027,52 +2063,68 @@
 	memcpy(newBuf, buf, bufSize);
 	freeMem(longBuf);
 	buf = longBuf = newBuf;
 	bufSize = newBufSize;
 	}
     char c = udcGetChar(file);
     buf[i] = c;
     if (c == 0)
         break;
     }
 char *retString = cloneString(buf);
 freeMem(longBuf);
 return retString;
 }
 
-char *udcFileReadAll(char *url, char *cacheDir, size_t maxSize, size_t *retSize)
-/* Read a complete file via UDC. The cacheDir may be null in which case udcDefaultDir()
- * will be used.  If maxSize is non-zero, check size against maxSize
- * and abort if it's bigger.  Returns file data (with an extra terminal for the
- * common case where it's treated as a C string).  If retSize is non-NULL then
- * returns size of file in *retSize. Do a freeMem or freez of the returned buffer
- * when done. */
+char *udcFileReadAllIfExists(char *url, char *cacheDir, size_t maxSize, size_t *retSize)
+/* Read a complete file via UDC. Return NULL if the file doesn't exist.  The
+ * cacheDir may be null in which case udcDefaultDir() will be used.  If
+ * maxSize is non-zero, check size against maxSize and abort if it's bigger.
+ * Returns file data (with an extra terminal for the common case where it's
+ * treated as a C string).  If retSize is non-NULL then returns size of file
+ * in *retSize. Do a freeMem or freez of the returned buffer when done. */
 {
-struct udcFile  *file = udcFileOpen(url, cacheDir);
+struct udcFile  *file = udcFileMayOpen(url, cacheDir);
+if (file == NULL)
+    return NULL;
 size_t size = file->size;
 if (maxSize != 0 && size > maxSize)
     errAbort("%s is %lld bytes, but maxSize to udcFileReadAll is %lld",
     	url, (long long)size, (long long)maxSize);
 char *buf = needLargeMem(size+1);
 udcMustRead(file, buf, size);
 buf[size] = 0;	// add trailing zero for string processing
 udcFileClose(&file);
 if (retSize != NULL)
     *retSize = size;
 return buf;
 }
 
+char *udcFileReadAll(char *url, char *cacheDir, size_t maxSize, size_t *retSize)
+/* Read a complete file via UDC. The cacheDir may be null in which case udcDefaultDir()
+ * will be used.  If maxSize is non-zero, check size against maxSize
+ * and abort if it's bigger.  Returns file data (with an extra terminal for the
+ * common case where it's treated as a C string).  If retSize is non-NULL then
+ * returns size of file in *retSize. Do a freeMem or freez of the returned buffer
+ * when done. */
+{
+char *buf = udcFileReadAllIfExists(url, cacheDir, maxSize, retSize);
+if (buf == NULL)
+    errAbort("Couldn't open %s", url);
+return buf;
+}
+
 struct lineFile *udcWrapShortLineFile(char *url, char *cacheDir, size_t maxSize)
 /* Read in entire short (up to maxSize) url into memory and wrap a line file around it.
  * The cacheDir may be null in which case udcDefaultDir() will be used.  If maxSize
  * is zero then a default value (currently 256 meg) will be used. */
 {
 if (maxSize == 0) maxSize = 256 * 1024 * 1024;
 char *buf = udcFileReadAll(url, cacheDir, maxSize, NULL);
 return lineFileOnString(url, TRUE, buf);
 }
 
 void udcSeekCur(struct udcFile *file, bits64 offset)
 /* Seek to a particular position in file. */
 {
 file->ios.udc.numSeeks++;
 file->offset += offset;