32514d2022bc8c424480258e3a138c70eff3a48b
chmalee
  Fri Jan 24 15:48:55 2020 -0800
Adding command line option to download track hub data files to hubClone, refs #24835

diff --git src/hg/utils/hubClone/hubClone.c src/hg/utils/hubClone/hubClone.c
index 706cd18..209b235 100644
--- src/hg/utils/hubClone/hubClone.c
+++ src/hg/utils/hubClone/hubClone.c
@@ -1,297 +1,364 @@
 /* hubClone - Clone the hub text files to a local copy, fixing up bigDataUrls
  * to remote location if necessary. */
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "trackDb.h"
 #include "cart.h" // can't include trackHub.h without this?
 #include "trackHub.h"
 #include "errCatch.h"
 #include "ra.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "hubClone - Clone the remote hub text files to a local copy in newDirectoryName, fixing up bigDataUrls to remote location if necessary\n"
   "usage:\n"
   "   hubClone http://url/to/hub.txt\n"
   "options:\n"
   "   -udcDir=/dir/to/udcCache   Path to udc directory\n"
+  "   -download                  Download data files in addition to the hub configuration files\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"udcDir", OPTION_STRING},
+   {"download", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
 void polishHubName(char *name)
 /* Helper function for making somewhat safe directory names. Changes non-alpha to '_' */
 {
 if (name == NULL)
     return;
 
 char *in = name;
 char c;
 
 for(; (c = *in) != 0; in++)
     {
     if (!(isalnum(c) || c == '-' || c == '_'))
         *in = '_';
     }
 }
 
 void printHubStanza(struct hash *stanza, FILE *out, char *baseUrl)
 /* print hub.txt stanza to out */
 {
 struct hashEl *hel, *helList = hashElListHash(stanza);
 fprintf(out, "%s %s\n", "hub", (char *)hashFindVal(stanza, "hub"));
 for (hel = helList; hel != NULL; hel = hel->next)
     {
     if (!sameString(hel->name, "hub"))
         {
         if (sameString(hel->name, "descriptionUrl"))
             fprintf(out, "%s %s\n", hel->name, trackHubRelativeUrl(baseUrl, hel->val));
         else
             fprintf(out, "%s %s\n", hel->name, (char *)hel->val);
         }
     }
 fprintf(out, "\n");
 hashElFreeList(&helList);
 }
 
 void printGenomeStanza(struct hash *stanza, FILE *out, char *baseUrl, boolean oneFile)
 /* print genomes.txt stanza to out */
 {
 struct hashEl *hel, *helList = hashElListHash(stanza);
 char *genome = (char *)hashFindVal(stanza, "genome");
 
 fprintf(out, "%s %s\n", "genome", genome);
 for (hel = helList; hel != NULL; hel = hel->next)
     {
     if (!sameString(hel->name, "genome"))
         {
         if (sameString(hel->name, "groups") ||
             sameString(hel->name, "twoBitPath") ||
             sameString(hel->name, "htmlPath")
             )
             fprintf(out, "%s %s\n", hel->name, trackHubRelativeUrl(baseUrl, hel->val));
         else if (sameString(hel->name, "trackDb"))
             {
             if (oneFile)
                 {
                 fprintf(out, "%s %s\n", hel->name, (char *)hel->val);
                 }
             else
                 {
                 // some assembly hubs use different directory names than the typical
                 // genomeName/trackDb.txt setup, hardcode this so assembly hub will
                 // still load locally
                 char *tdbFileName = NULL;
                 if ((tdbFileName = strrchr((char *)hel->val, '/')) != NULL)
                     tdbFileName += 1;
                 else
                     tdbFileName = (char *)hel->val;
                 fprintf(out, "%s %s/%s\n", hel->name, genome, tdbFileName);
                 }
             }
         else
             fprintf(out, "%s %s\n", hel->name, (char *)hel->val);
         }
     }
 fprintf(out, "\n");
 hashElFreeList(&helList);
 }
 
-void printTrackDbStanza(struct hash *stanza, FILE *out, char *baseUrl)
+#define READ_SIZE (1024 * 1024 * 64)
+int downloadFile(FILE *f, char *url)
+/* Copy url to f in READ_SIZE chunks. Return 0 on success, -1 on error. Wrapped in
+ * errCatch so one bad file doesn't stop the rest of the hub from downloading. */
+{
+int ret = 0;
+struct errCatch *errCatch = errCatchNew();
+if (errCatchStart(errCatch))
+    {
+    struct udcFile *file = udcFileOpen(url, udcDefaultDir());
+    off_t fileSize = udcFileSize(url), counter = 0;
+    char *buf = needLargeMem(READ_SIZE);
+    bits64 sizeRead = 0;
+    while (counter < fileSize && (sizeRead = udcRead(file, buf, READ_SIZE)) > 0)
+        {
+        mustWrite(f, buf, sizeRead);
+        counter += sizeRead;
+        }
+    freeMem(buf);
+    udcFileClose(&file);
+    if (counter != fileSize) // size lookup failed (negative) or udcRead returned 0 early
+        errAbort("incomplete download of %s", url);
+    }
+errCatchEnd(errCatch);
+if (errCatch->gotError)
+    ret = -1;
+errCatchFree(&errCatch);
+return ret;
+}
+
+void printTrackDbStanza(struct hash *stanza, FILE *out, char *baseUrl, char *downloadDir)
 /* print a trackDb stanza but with relative references replaced by remote links */
+/* When downloadDir is non-empty it is used as a path prefix (so it needs its trailing
+ * '/'): each data file is also downloaded there and its setting points at the copy. */
 {
 struct hashEl *hel, *helList = hashElListHash(stanza);
+struct dyString *fname = dyStringNew(0);
 fprintf(out, "%s %s\n", "track", (char *)hashFindVal(stanza, "track"));
 for (hel = helList; hel != NULL; hel = hel->next)
     {
     if (!sameString(hel->name, "track"))
         {
         if (sameString(hel->name, "bigDataUrl") ||
             sameString(hel->name, "bigDataIndex") ||
             sameString(hel->name, "barChartMatrixUrl") ||
             sameString(hel->name, "barChartSampleUrl") ||
             sameString(hel->name, "linkDataUrl") ||
             sameString(hel->name, "frames") ||
             sameString(hel->name, "summary") ||
             sameString(hel->name, "searchTrix") ||
             sameString(hel->name, "html")
             )
-            fprintf(out, "%s %s\n", hel->name, trackHubRelativeUrl(baseUrl, hel->val));
+            {
+            char *urlToData = trackHubRelativeUrl(baseUrl, hel->val);
+            if (isNotEmpty(downloadDir))
+                {
+                // Only the file name part of the setting is kept: the copy is written to
+                // downloadDir + name and the stanza is rewritten to that bare name.
+                // NOTE: same-named files in different remote directories will collide.
+                char *relName = strrchr(hel->val, '/');
+                relName = (relName != NULL) ? relName + 1 : (char *)hel->val;
+                dyStringClear(fname);
+                dyStringPrintf(fname, "%s%s", downloadDir, relName);
+                FILE *f = mustOpen(dyStringContents(fname), "wb");
+                // download file, in chunks if necessary
+                if (downloadFile(f, urlToData) == -1)
+                    fprintf(stderr, "Error downloading file. Try again with wget or curl: %s\n", urlToData);
+                // close each data file as soon as it is written so handles don't pile up
+                carefulClose(&f);
+                fprintf(out, "%s %s\n", hel->name, relName);
+                }
+            else
+                fprintf(out, "%s %s\n", hel->name, urlToData);
+            }
         else
             fprintf(out, "%s %s\n", hel->name, (char *)hel->val);
         }
     }
 fprintf(out, "\n");
+dyStringFree(&fname);
 hashElFreeList(&helList);
 }
 
 void printGenericStanza(struct hash *stanza, FILE *out, char *baseUrl)
 /* print a hash to out */
 {
 struct hashEl *hel, *helList = hashElListHash(stanza);
 for (hel = helList; hel != NULL; hel = hel->next)
     {
     fprintf(out, "%s %s\n", hel->name, (char *)hel->val);
     }
 fprintf(out,"\n");
 }
 
-void printOneFile(char *url, FILE *f, boolean oneFile)
+void printOneFile(char *url, FILE *f, boolean oneFile, char *downloadDir)
 /* printOneFile: pass a stanza to appropriate printer */
 {
 struct lineFile *lf;
 struct hash *stanza;
 struct hashEl *includeFile;
 
 lf = udcWrapShortLineFile(url, NULL, MAX_HUB_TRACKDB_FILE_SIZE);
 while ((stanza = raNextRecord(lf)) != NULL)
     {
     if (hashLookup(stanza, "hub"))
         {
         printHubStanza(stanza, f, url);
         }
     else if (hashLookup(stanza, "genome"))
         {
         printGenomeStanza(stanza, f, url, oneFile);
         }
     else if (hashLookup(stanza, "track"))
         {
-        printTrackDbStanza(stanza, f, url);
+        printTrackDbStanza(stanza, f, url, downloadDir);
         }
     else
         {
         // if there's an include file then open and print the include file
         includeFile = hashLookup(stanza, "include");
         if (includeFile != NULL)
             {
             char *newUrl = trackHubRelativeUrl(url, includeFile->val);
-            printOneFile(newUrl, f, oneFile);
+            printOneFile(newUrl, f, oneFile, downloadDir);
             }
         else
             printGenericStanza(stanza, f, url);
         }
     }
 lineFileClose(&lf);
 freeHash(&stanza);
 }
 
 struct trackHub *readHubFromUrl(char *hubUrl)
 /* readHubUrl: errCatch around trackHubOpen */
 {
 struct trackHub *hub = NULL;
 struct errCatch *errCatch = errCatchNew();
 
 if (errCatchStart(errCatch))
     hub = trackHubOpen(hubUrl, "");
 errCatchEnd(errCatch);
 if (errCatch->gotError)
     errAbort("aborting: %s\n", errCatch->message->string);
 return hub;
 }
 
 FILE *createPathAndFile(char *path)
 /* if path contains parent directories that don't exist, create them first before opening file */
 {
 char *copy = cloneString(path);
 if (stringIn("/", copy))
     {
     chopSuffixAt(copy, '/');
     makeDirs(copy);
     // now make the real file
     return mustOpen(path, "w");
     }
 return mustOpen(path, "w");
 }
 
-void createWriteAndCloseFile(char *fileName, char *url, boolean useOneFile)
+void createWriteAndCloseFile(char *fileName, char *url, boolean useOneFile, char *downloadDir)
 /* Wrapper around a couple lines */
 {
 FILE *f;
 f = createPathAndFile(fileName);
-printOneFile(url, f, useOneFile);
+printOneFile(url, f, useOneFile, downloadDir);
 carefulClose(&f);
 }
 
-void hubClone(char *hubUrl)
+void hubClone(char *hubUrl, boolean download)
 /* hubClone - Clone the hub text files to a local copy, fixing up bigDataUrls 
  * to remote locations if necessary. */
 {
 struct trackHub *hub;
 struct trackHubGenome *genome;
 char *hubBasePath, *hubName, *hubFileName;
 char *genomesUrl, *genomesDir, *genomesFileName;
 char *tdbFileName, *tdbFilePath;
 char *path; 
 FILE *f;
+struct dyString *downloadDir = dyStringNew(0);
 boolean oneFile = FALSE;
 
 hubBasePath = cloneString(hubUrl);
 chopSuffixAt(hubBasePath, '/'); // don't forget to add a "/" back on!
 hubFileName = strrchr(hubUrl, '/'); 
 hubFileName += 1;
 
 hub = readHubFromUrl(hubUrl);
 if (hub == NULL)
     errAbort("error opening %s", hubUrl);
 
 hubName = cloneString((char *)hashFindVal(hub->settings, "hub"));
 polishHubName(hubName);
 
 if (trackHubSetting(hub, "useOneFile"))
     {
     oneFile = TRUE;
     makeDirs(hubName);
     path = catTwoStrings(hubName, catTwoStrings("/", hubFileName));
     f = mustOpen(path, "w");
-    printOneFile(hubUrl, f, oneFile);
+    if (download)
+        {
+        dyStringPrintf(downloadDir, "%s/", hubName);
+        }
+    printOneFile(hubUrl, f, oneFile, dyStringContents(downloadDir));
     carefulClose(&f);
     }
 else
     {
     genome = hub->genomeList;
     if (genome == NULL)
         errAbort("error opening %s file", hub->genomesFile);
 
     path = catTwoStrings(hubName, catTwoStrings("/", hubFileName));
-    createWriteAndCloseFile(path, hubUrl, oneFile);
+    createWriteAndCloseFile(path, hubUrl, oneFile, dyStringContents(downloadDir));
 
     genomesUrl = trackHubRelativeUrl(hub->url, hub->genomesFile);
     genomesFileName = catTwoStrings(hubName, catTwoStrings("/", hub->genomesFile));
     char *genomePath = cloneString(genomesFileName);
     chopSuffixAt(genomePath, '/'); // used later for making the right directory structure
 
-    createWriteAndCloseFile(genomesFileName, genomesUrl, oneFile);
+    createWriteAndCloseFile(genomesFileName, genomesUrl, oneFile, dyStringContents(downloadDir));
 
     for (; genome != NULL; genome = genome->next)
         {
         if (startsWith("_", genome->name)) // assembly hubs have a leading '_'
             genome->name += 1;
 
         // make correct directory strucutre
         genomesDir = catTwoStrings(genomePath, catTwoStrings("/", genome->name));
+        if (download)
+            {
+            dyStringClear(downloadDir);
+            dyStringPrintf(downloadDir, "%s/", genomesDir); // same directory the trackDb file is written to
+            }
         tdbFileName = strrchr(genome->trackDbFile, '/') + 1;
         tdbFilePath = catTwoStrings(genomesDir, catTwoStrings("/", tdbFileName));
-        createWriteAndCloseFile(tdbFilePath, genome->trackDbFile, oneFile);
+        createWriteAndCloseFile(tdbFilePath, genome->trackDbFile, oneFile, dyStringContents(downloadDir));
         }
     }
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 2)
     usage();
 udcSetDefaultDir(optionVal("udcDir", udcDefaultDir()));
-hubClone(argv[1]);
+hubClone(argv[1], optionExists("download"));
 return 0;
 }