cc99fd8cda2d90b8f6fd95b2f926fc19704d0a90
chmalee
  Fri May 22 13:33:25 2026 -0700
hubtools: fix upload to match hubspace hooks, add to user-apps, refs #36790

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

diff --git src/utils/hubtools/hubtools src/utils/hubtools/hubtools
index 0bc7a88cbbc..94ba7f322d6 100755
--- src/utils/hubtools/hubtools
+++ src/utils/hubtools/hubtools
@@ -14,49 +14,57 @@
     import requests # if this fails, install the library with 'python -m pip install requests --user'
 except:
     # most code now can work around the absence of the requests library
     requestsLoaded = False
 
 #import pyyaml   # not loaded here, so it's not a hard requirement, is lazy loaded in parseMetaYaml()
 
 # ==== functions =====
 
 # debugging: when activated with -d, output more information and do not remove any temp files
 debugMode = False
 # full extra verbose output of all HTTP requests sent, there is no command line option for this
 doVerbose = False
 
 # allowed file types by hubtools up
-# copied from the javascript file hgHubConnect.js
+# mirrors extensionMap in hg/js/hgMyData.js
+# hub.txt is listed before text; note getFileType() matches hub.txt and *.hub.txt itself and skips this entry
 fileTypeExtensions = {
+    "hub.txt": [ "hub.txt" ],
     "bigBed": [ ".bb", ".bigbed" ],
     "bam": [ ".bam" ],
     "vcf": [ ".vcf" ],
     "vcfTabix": [ ".vcf.gz", "vcf.bgz" ],
     "bigWig": [ ".bw", ".bigwig" ],
     "hic": [ ".hic" ],
     "cram": [ ".cram" ],
     "bigBarChart": [ ".bigbarchart" ],
     "bigGenePred": [ ".bgp", ".biggenepred" ],
     "bigMaf": [ ".bigmaf" ],
     "bigInteract": [ ".biginteract" ],
     "bigPsl": [ ".bigpsl" ],
     "bigChain": [ ".bigchain" ],
     "bamIndex": [ ".bam.bai", ".bai" ],
-    "tabixIndex": [ ".vcf.gz.tbi", "vcf.bgz.tbi" ]
+    "tabixIndex": [ ".vcf.gz.tbi", "vcf.bgz.tbi" ],
+    "2bit": [ ".2bit" ],
+    "text": [ ".txt", ".text" ],
 }
 
+# regex for a single hub-name segment, used to validate the <hubName> CLI arg;
+# Python counterpart of parentDirSegmentRegex in hg/js/hgMyData.js
+hubNameSegmentRegex = re.compile(r"^[0-9a-zA-Z._]+$")
+
 asHead = """table bed
 "Browser extensible data (<=12 fields) "
     (
 """
 
 asLines = """
     string chrom;      "Chromosome (or contig, scaffold, etc.)"
     uint   chromStart; "Start position in chromosome"
     uint   chromEnd;   "End position in chromosome"
     string name;       "Name of item"
     uint   score;      "Score from 0-1000"
     char[1] strand;    "+ or -"
     uint thickStart;   "Start of where display should be thick (start codon)"
     uint thickEnd;     "End of where display should be thick (stop codon)"
     uint reserved;     "Used as itemRgb as of 2004-11-22"
@@ -786,132 +794,166 @@
             elif dispMode=="compact":
                 tdb["visibility"] = "dense"
             else:
                 tdb["visibility"] = "pack"
         else:
             tdb["visibility"] = "pack"
     
         writeStanza(ofh, 0, tdb)
 
 def installModule(package):
     " install a package "
     logging.info("Could not find Python module '%s', trying to install with pip" % package)
     subprocess.check_call([sys.executable, "-m", "pip", "install", package])
 
 def cacheLoad(fname):
-    " load file cache from json file "
+    " load file cache from json file, keyed as { hubName: { relPath: {mtime, size} } } "
     if not isfile(fname):
         logging.debug("No upload cache present")
         return {}
 
     logging.debug("Loading "+fname)
-    return json.load(open(fname))
+    with open(fname) as fh:
+        data = json.load(fh)
+    # Old cache shape was { localPath: {mtime, size} }; detect and discard.
+    # Cache is purely local state, so no migration code.
+    for v in data.values():
+        if not isinstance(v, dict) or "mtime" in v:
+            logging.info("upload cache %s is in the old flat shape, discarding" % fname)
+            return {}
+        break
+    return data
 
 def cacheWrite(uploadCache, fname):
     logging.debug("Writing "+fname)
-    json.dump(uploadCache, open(fname, "w"), indent=4)
+    with open(fname, "w") as fh:
+        json.dump(uploadCache, fh, indent=4)
+
+def validateHubName(hubName):
+    " errAbort unless hubName is one path segment of letters, digits, '.', '_' that does not start with '.' "
+    # fullmatch: Python's '$' would accept a trailing newline, the JS regex does not; trailing dots ("hub.") stay allowed
+    if not hubName:
+        errAbort("hub name is empty")
+    if "/" in hubName or hubName.startswith("."):
+        errAbort("hub name '%s' must be a single path segment, no '/' and no leading '.'" % hubName)
+    if not hubNameSegmentRegex.fullmatch(hubName):
+        errAbort("hub name '%s' has invalid characters; allowed: letters, digits, '.', '_'" % hubName)
 
 def getFileType(fbase):
     " return the file type defined in the hubspace system, given a base file name "
+    # hub.txt and <hubname>.hub.txt are both fileType=hub.txt; the server uses
+    # the literal filename (not fileType) to tell them apart
+    if fbase == "hub.txt" or fbase.endswith(".hub.txt"):
+        logging.debug("file type for %s is hub.txt" % fbase)
+        return "hub.txt"
+
     ret = "NA"
     for fileType, fileExts in fileTypeExtensions.items():
+        if fileType == "hub.txt":
+            continue
         for fileExt in fileExts:
             if fbase.endswith(fileExt):
                 ret = fileType
                 break
         if ret!="NA":
             break
 
     logging.debug("file type for %s is %s" % (fbase, ret))
     return ret
 
 def uploadFiles(tdbDir, hubName):
     "upload files to hubspace. Server name and token can come from ~/.hubtools.conf "
+    validateHubName(hubName)
+
     try:
         from tusclient import client
     except ModuleNotFoundError:
         installModule("tuspy")
         from tusclient import client
 
     serverUrl = cfgOption("tusUrl", "https://hubspace.gi.ucsc.edu/files")
 
     cookies = {}
     cookieNameUser = cfgOption("wiki.userNameCookie", "wikidb_mw1_UserName")
     cookieNameId = cfgOption("wiki.loggedInCookie", "wikidb_mw1_UserID")
 
     apiKey = cfgOption("apiKey")
     if apiKey is None:
         errAbort("To upload files, the file ~/.hubtools.conf must contain a line like apiKey='xxx').\n"
                 "Go to https://genome.ucsc.edu/cgi-bin/hgHubConnect#dev to create a new  apiKey. Then run \n"
                 "   echo 'apiKey=\"xxxx\"' >> ~/.hubtools.conf \n"
                 "and run the 'hubtools up' command again.")
 
     logging.info(f"TUS server URL: {serverUrl}")
     my_client = client.TusClient(serverUrl)
 
     cacheFname = join(tdbDir, ".hubtools.files.json")
     uploadCache = cacheLoad(cacheFname)
+    hubCache = uploadCache.setdefault(hubName, {})
 
     logging.debug("trackDb directory is %s" % tdbDir)
-    for rootDir, _, files in os.walk(tdbDir):
+    for rootDir, dirs, files in os.walk(tdbDir):
+        # don't descend into dot-dirs (.git, .cache, ...); their contents
+        # would otherwise upload with a parentDir segment the server rejects
+        dirs[:] = [d for d in dirs if not d.startswith(".")]
         for fbase in files:
             if fbase.startswith("."):
                 continue
             localPath = normpath(join(rootDir, fbase))
             logging.debug("localPath: %s" % localPath)
             localMtime = os.stat(localPath).st_mtime
 
-            # skip files that have not changed their mtime since last upload
-            if localPath in uploadCache:
-                cacheMtime = uploadCache[localPath]["mtime"]
+            fileAbsPath = abspath(localPath)
+            # POSIX-style relative path inside the hub, with hubName as the root
+            remoteRelPath = relpath(fileAbsPath, tdbDir).replace(os.sep, "/")
+            subDir = dirname(remoteRelPath)
+            parentDir = hubName + "/" + subDir if subDir else hubName
+
+            # skip files that have not changed their mtime since last upload to this hub
+            if remoteRelPath in hubCache:
+                cacheMtime = hubCache[remoteRelPath]["mtime"]
                 if localMtime == cacheMtime:
                     logging.info("%s: file mtime unchanged, not uploading again" % localPath)
                     continue
                 else:
                     logging.debug("file %s: mtime is %f, cache mtime is %f, need to re-upload" %
                             (localPath, localMtime, cacheMtime))
             else:
-                logging.debug("file %s not in upload cache" % localPath)
+                logging.debug("file %s not in upload cache for hub %s" % (localPath, hubName))
 
             fileType = getFileType(fbase)
 
-            fileAbsPath = abspath(localPath)
-            remoteRelPath = relpath(fileAbsPath, tdbDir)
-            remoteDir = dirname(remoteRelPath)
-
             meta = {
                     "apiKey" : apiKey,
-                    "parentDir" : remoteDir,
-                    "genome":"NA",
+                    "parentDir" : parentDir,
+                    "genome" : "",
                     "fileName" : fbase,
-                    "hubName" : hubName,
                     "hubtools" : "true",
                     "fileType": fileType,
                     "lastModified" : str(int(localMtime)*1000),
             }
 
             logging.info(f"Uploading {localPath}, meta {meta}")
             uploader = my_client.uploader(localPath, metadata=meta)
             uploader.upload()
 
-            # note that this file was uploaded
-            cache = {}
-            mtime = os.stat(localPath).st_mtime
-            cache["mtime"] = mtime
-            cache["size"] = os.stat(localPath).st_size
-            uploadCache[localPath] = cache
-
+            # record the mtime read before the upload (not a fresh stat) so a file modified
+            # mid-upload is sent again next run; persist after each upload to survive interrupts
+            hubCache[remoteRelPath] = {
+                    "mtime": localMtime,
+                    "size": os.stat(localPath).st_size,
+            }
             cacheWrite(uploadCache, cacheFname)
 
 def iterRaStanzas(fname):
     " parse an ra-style (trackDb) file and yield dictionaries "
     data = dict()
     logging.debug("Parsing %s in trackDb format" % fname)
     with open(fname, "rt") as ifh:
         for l in ifh:
             l = l.lstrip(" ").rstrip("\r\n")
             if len(l)==0:
                 yield data
                 data = dict()
             else:
                 if " " not in l:
                     continue