cf3b8d0618e5ea5763b9968a68601055973e4a2f
lrnassar
  Tue Feb 3 15:12:57 2026 -0800
Add a checker to see if a file URL can be opened (otherwise hubClone just fails on it), and an optional flag that forces it to continue regardless of missing trackDb.txt files. Refs #36264

diff --git src/hg/utils/hubClone/hubClone.c src/hg/utils/hubClone/hubClone.c
index e8f7b5c874e..b69b07614ae 100644
--- src/hg/utils/hubClone/hubClone.c
+++ src/hg/utils/hubClone/hubClone.c
@@ -10,40 +10,50 @@
 #include "errCatch.h"
 #include "ra.h"
 #include "hui.h"
 #include "pipeline.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "hubClone - Clone the remote hub text files to a local copy in newDirectoryName, fixing up bigDataUrls to remote location if necessary\n"
   "usage:\n"
   "   hubClone http://url/to/hub.txt\n"
   "options:\n"
   "   -udcDir=/dir/to/udcCache   Path to udc directory\n"
   "   -download                  Download data files in addition to the hub configuration files\n"
+  "   -skipMissingAssemblies     Skip assemblies whose trackDb.txt files are missing instead of aborting\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
    {"udcDir", OPTION_STRING},
    {"download", OPTION_BOOLEAN},
+   {"skipMissingAssemblies", OPTION_BOOLEAN},
    {NULL, 0},
 };
 
+/* One genomes.txt stanza, as collected by parseGenomesTxt() for the -skipMissingAssemblies path */
+struct simpleGenome
+    {
+    struct simpleGenome *next;   /* next element in the singly-linked list */
+    char *name;           /* value of the stanza's "genome" field (cloned) */
+    char *trackDbPath;    /* value of the stanza's "trackDb" field (cloned); resolved against the genomes.txt URL by the caller */
+    };
+
 void polishHubName(char *name)
 /* Helper function for making somewhat safe directory names. Changes non-alpha to '_' */
 {
 if (name == NULL)
     return;
 
 char *in = name;
 char c;
 
 for(; (c = *in) != 0; in++)
     {
     if (!(isalnum(c) || c == '-' || c == '_'))
         *in = '_';
     }
 }
@@ -248,49 +258,264 @@
     // now make the real file
     return mustOpen(path, "w");
     }
 return mustOpen(path, "w");
 }
 
 void createWriteAndCloseFile(char *fileName, char *url, boolean useOneFile, char *downloadDir)
 /* Wrapper around a couple lines */
 {
 FILE *f;
 f = createPathAndFile(fileName);
 printOneFile(url, f, useOneFile, downloadDir);
 carefulClose(&f);
 }
 
-void hubClone(char *hubUrl, boolean download)
+boolean canAccessUrl(char *url)
+/* Return TRUE if url can be opened with udcWrapShortLineFile(), FALSE if that attempt
+ * errAborts or yields no file.  The error is caught here and not reported to the caller. */
+{
+boolean ok = FALSE;
+struct errCatch *catcher = errCatchNew();
+
+if (errCatchStart(catcher))
+    {
+    // If the open errAborts, control never reaches the assignment and ok stays FALSE.
+    struct lineFile *probe = udcWrapShortLineFile(url, NULL, MAX_HUB_TRACKDB_FILE_SIZE);
+    ok = (probe != NULL);
+    if (ok)
+        lineFileClose(&probe);
+    }
+errCatchEnd(catcher);
+errCatchFree(&catcher);
+return ok;
+}
+
+char *parseHubTxtForGenomesFile(char *hubUrl, char **retHubName, boolean *retUseOneFile)
+/* Scan the ra stanzas of the hub.txt at hubUrl.  From each stanza that has a "hub" field,
+ * clone the hub name for *retHubName, note for *retUseOneFile whether "useOneFile" is present,
+ * and return a clone of its "genomesFile" value (NULL when absent).  errAborts if reading fails. */
+{
+char *foundGenomesFile = NULL, *foundHubName = NULL;
+boolean foundOneFile = FALSE;
+struct errCatch *catcher = errCatchNew();
+
+if (errCatchStart(catcher))
+    {
+    struct hash *ra;
+    struct lineFile *in = udcWrapShortLineFile(hubUrl, NULL, MAX_HUB_TRACKDB_FILE_SIZE);
+    for (ra = raNextRecord(in); ra != NULL; ra = raNextRecord(in))
+        {
+        if (hashLookup(ra, "hub") != NULL)
+            {
+            char *genomesVal = hashFindVal(ra, "genomesFile");
+            foundHubName = cloneString(hashFindVal(ra, "hub"));
+            if (genomesVal != NULL)
+                foundGenomesFile = cloneString(genomesVal);
+            if (hashFindVal(ra, "useOneFile") != NULL)
+                foundOneFile = TRUE;
+            }
+        freeHash(&ra);
+        }
+    lineFileClose(&in);
+    }
+errCatchEnd(catcher);
+if (catcher->gotError)
+    errAbort("Error reading hub.txt: %s", catcher->message->string);
+errCatchFree(&catcher);
+
+if (retHubName != NULL)
+    *retHubName = foundHubName;
+if (retUseOneFile != NULL)
+    *retUseOneFile = foundOneFile;
+return foundGenomesFile;
+}
+
+struct simpleGenome *parseGenomesTxt(char *genomesUrl)
+/* Return a list, in file order, of the stanzas in the genomes.txt at genomesUrl that have both
+ * a "genome" and a "trackDb" field; other stanzas are ignored.  errAborts if reading fails. */
+{
+struct simpleGenome *head = NULL;
+struct errCatch *catcher = errCatchNew();
+
+if (errCatchStart(catcher))
+    {
+    struct hash *ra;
+    struct lineFile *in = udcWrapShortLineFile(genomesUrl, NULL, MAX_HUB_TRACKDB_FILE_SIZE);
+    for (ra = raNextRecord(in); ra != NULL; ra = raNextRecord(in))
+        {
+        char *db = hashFindVal(ra, "genome");
+        char *tdb = hashFindVal(ra, "trackDb");
+        if (db != NULL && tdb != NULL)
+            {
+            struct simpleGenome *el;
+            AllocVar(el);
+            el->name = cloneString(db);
+            el->trackDbPath = cloneString(tdb);
+            slAddHead(&head, el);
+            }
+        freeHash(&ra);
+        }
+    lineFileClose(&in);
+    }
+errCatchEnd(catcher);
+if (catcher->gotError)
+    errAbort("Error reading genomes.txt: %s", catcher->message->string);
+errCatchFree(&catcher);
+
+slReverse(&head);   // elements were pushed on the front, so restore file order
+return head;
+}
+
+void printGenomesTxtFiltered(char *genomesUrl, char *outputPath, struct hash *skipGenomes)
+/* Copy the genomes.txt at genomesUrl to a new file at outputPath, leaving out every stanza
+ * whose "genome" value is a key in skipGenomes.
+ *   genomesUrl  - location the stanzas are read from; also handed to the stanza printers
+ *   outputPath  - file to create via createPathAndFile()
+ *   skipGenomes - hash keyed by genome name; only key presence is checked
+ * Stanzas lacking a "genome" field are written with printGenericStanza(). */
+{
+FILE *dest = createPathAndFile(outputPath);
+struct lineFile *in = udcWrapShortLineFile(genomesUrl, NULL, MAX_HUB_TRACKDB_FILE_SIZE);
+struct hash *ra;
+
+for (ra = raNextRecord(in); ra != NULL; ra = raNextRecord(in))
+    {
+    char *db = hashFindVal(ra, "genome");
+    if (db == NULL)
+        {
+        // No "genome" field: not expected in a valid genomes.txt, so pass it through generically.
+        printGenericStanza(ra, dest, genomesUrl);
+        }
+    else if (hashLookup(skipGenomes, db) == NULL)
+        {
+        // Genome was not marked for skipping, so keep its stanza.
+        printGenomeStanza(ra, dest, genomesUrl, FALSE);
+        }
+    // else: genome is listed in skipGenomes, so its stanza is dropped
+    freeHash(&ra);
+    }
+
+lineFileClose(&in);
+carefulClose(&dest);
+}
+
+void hubClone(char *hubUrl, boolean download, boolean skipMissingAssemblies)
 /* hubClone - Clone the hub text files to a local copy, fixing up bigDataUrls
  * to remote locations if necessary. */
 {
 struct trackHub *hub;
 struct trackHubGenome *genome;
 char *hubBasePath, *hubName, *hubFileName;
 char *genomesUrl, *genomesDir, *genomesFileName;
 char *tdbFileName, *tdbFilePath;
 char *path;
 FILE *f;
 struct dyString *downloadDir = dyStringNew(0);
 boolean oneFile = FALSE;
 
 hubBasePath = cloneString(hubUrl);
 chopSuffixAt(hubBasePath, '/'); // don't forget to add a "/" back on!
 hubFileName = strrchr(hubUrl, '/');
 hubFileName += 1;
 
+if (skipMissingAssemblies)
+    {
+    // Manual parsing mode: don't use trackHubOpen which validates all files upfront
+    char *genomesFile = parseHubTxtForGenomesFile(hubUrl, &hubName, &oneFile);
+    polishHubName(hubName);
+
+    if (oneFile)
+        {
+        // For useOneFile hubs everything is in one file, so there is no per-assembly trackDb to skip:
+        // open the hub with readHubFromUrl() and copy that single file here, then return.
+        hub = readHubFromUrl(hubUrl);
+        if (hub == NULL)
+            errAbort("error opening %s", hubUrl);
+        makeDirs(hubName);
+        path = catTwoStrings(hubName, catTwoStrings("/", hubFileName));
+        f = mustOpen(path, "w");
+        if (download)
+            dyStringPrintf(downloadDir, "%s/", hubName);
+        printOneFile(hubUrl, f, oneFile, dyStringContents(downloadDir));
+        carefulClose(&f);
+        return;
+        }
+
+    if (genomesFile == NULL)
+        errAbort("No genomesFile found in hub.txt");
+
+    genomesUrl = trackHubRelativeUrl(hubUrl, genomesFile);
+    struct simpleGenome *genomeList = parseGenomesTxt(genomesUrl);
+
+    if (genomeList == NULL)
+        errAbort("No genomes found in %s", genomesUrl);
+
+    // Write hub.txt
+    path = catTwoStrings(hubName, catTwoStrings("/", hubFileName));
+    createWriteAndCloseFile(path, hubUrl, FALSE, dyStringContents(downloadDir));
+
+    // Track which genomes to skip
+    struct hash *skipGenomes = hashNew(0);
+
+    // Process each genome, checking accessibility
+    genomesFileName = catTwoStrings(hubName, catTwoStrings("/", genomesFile));
+    char *genomePath = cloneString(genomesFileName);
+    chopSuffixAt(genomePath, '/');
+
+    struct simpleGenome *sg;
+    for (sg = genomeList; sg != NULL; sg = sg->next)
+        {
+        char *trackDbUrl = trackHubRelativeUrl(genomesUrl, sg->trackDbPath);
+        char *genomeName = sg->name;
+
+        // Strip leading underscore for assembly hubs
+        if (startsWith("_", genomeName))
+            genomeName = genomeName + 1;
+
+        if (!canAccessUrl(trackDbUrl))
+            {
+            warn("Skipping assembly '%s': trackDb file not accessible: %s",
+                 genomeName, trackDbUrl);
+            hashAdd(skipGenomes, sg->name, NULL);
+            continue;
+            }
+
+        // Make correct directory structure and write trackDb
+        genomesDir = catTwoStrings(genomePath, catTwoStrings("/", genomeName));
+        if (download)
+            {
+            dyStringClear(downloadDir);
+            dyStringPrintf(downloadDir, "%s/%s/", hubName, genomeName);
+            }
+        tdbFileName = strrchr(sg->trackDbPath, '/');
+        if (tdbFileName != NULL)
+            tdbFileName += 1;
+        else
+            tdbFileName = sg->trackDbPath;
+        tdbFilePath = catTwoStrings(genomesDir, catTwoStrings("/", tdbFileName));
+        createWriteAndCloseFile(tdbFilePath, trackDbUrl, FALSE, dyStringContents(downloadDir));
+        }
+
+    // Write genomes.txt, filtering out skipped genomes
+    printGenomesTxtFiltered(genomesUrl, genomesFileName, skipGenomes);
+
+    hashFree(&skipGenomes);
+    return;
+    }
+
+// Standard mode: use trackHubOpen
 hub = readHubFromUrl(hubUrl);
 if (hub == NULL)
     errAbort("error opening %s", hubUrl);
 
 hubName = cloneString((char *)hashFindVal(hub->settings, "hub"));
 polishHubName(hubName);
 
 if (trackHubSetting(hub, "useOneFile"))
     {
     oneFile = TRUE;
     makeDirs(hubName);
     path = catTwoStrings(hubName, catTwoStrings("/", hubFileName));
     f = mustOpen(path, "w");
     if (download)
         {
@@ -330,18 +555,18 @@
         tdbFileName = strrchr(genome->trackDbFile, '/') + 1;
         tdbFilePath = catTwoStrings(genomesDir, catTwoStrings("/", tdbFileName));
         createWriteAndCloseFile(tdbFilePath, genome->trackDbFile, oneFile, dyStringContents(downloadDir));
         }
     }
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc < 2)
     usage();
 setUdcCacheDir();
 udcSetDefaultDir(optionVal("udcDir", udcDefaultDir()));
-hubClone(argv[1], optionExists("download"));
+hubClone(argv[1], optionExists("download"), optionExists("skipMissingAssemblies"));
 return 0;
 }