src/utils/hubtools/hubtools 0159c1e515b598e2050935f4f12a2c5a5023a433

0159c1e515b598e2050935f4f12a2c5a5023a433
max
  Tue May 26 09:22:03 2026 -0700
hubtools: fix critical runtime crashes and logic errors, refs #36790

- stanzaAddVal/stanzaNew had no body or were undefined; add both
- tdbCommentsAppendOneList iterated wrong variable name (tdb vs pairs)
- fetchChromSizes: wrap raw bytes in io.BytesIO for gzip.GzipFile
- getChromSizesFname: remove reference to undefined variable dataDir
- errAbort: bare raise outside except replaced with raise Exception(msg)
- httpReqOld: path=url -> path=parsed_url.path in no-params branch
- broadPeak conversion: correct type to bigBed 6+3, bedType to bed6+3
- getAsFname: add bigBroadPeak URL to prevent KeyError
- httpReq: add raise_for_status() to surface 4xx/5xx errors
- Add missing imports: urllib.parse, gzip, csv, io
- except: -> except ImportError: for requests import guard
- tarfile.extractall: use filter='data' where supported (Python 3.12+ and security backports) to prevent path traversal; fall back to unfiltered extraction elsewhere
- ThreadPoolExecutor: use with-block to ensure shutdown
- downloadUrl: HTTPError now re-raised after logging at error level
- logging.warn() -> logging.warning() (warn() has been a deprecated alias since Python 3.3)
- Fix docstring typos: dadta, apend, stray z in comment

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

diff --git src/utils/hubtools/hubtools src/utils/hubtools/hubtools
index 94ba7f322d6..7eb6c45a43f 100755
--- src/utils/hubtools/hubtools
+++ src/utils/hubtools/hubtools
@@ -1,30 +1,30 @@
 #!/usr/bin/env python3
 
 import logging, sys, optparse, os, json, subprocess, shutil, string, glob, tempfile, re
-import shlex, urllib, time, tarfile, hashlib
+import shlex, urllib, urllib.parse, time, tarfile, hashlib, gzip, csv, io
 
 from pathlib import Path
 from collections import defaultdict, OrderedDict
 from os.path import join, basename, dirname, isfile, relpath, abspath, splitext, isdir, normpath
 from urllib.parse import unquote
 import concurrent.futures
 
 requestsLoaded = True
 try:
     import requests # if this fails, install the library with 'python -m pip install requests --user'
-except:
+except ImportError:
     # most code now can work around the absence of the requests library
     requestsLoaded = False
 
 #import pyyaml   # not loaded here, so it's not a hard requirement, is lazy loaded in parseMetaYaml()
 
 # ==== functions =====
 
 # debugging: when activated with -d, output more information and do not remove any temp files
 debugMode = False
 # full extra verbose output of all HTTP requests sent, there is no command line option for this
 doVerbose = False
 
 # allowed file types by hubtools up
 # mirrors extensionMap in hg/js/hgMyData.js
 # hub.txt comes before text so hub.txt and *.hub.txt don't fall through to text
@@ -157,31 +157,31 @@
         parser.print_help()
         exit(1)
 
     global debugMode
     if options.debug:
         debugMode = True
         logging.basicConfig(level=logging.DEBUG)
     else:
         logging.basicConfig(level=logging.INFO)
     return args, options
 
 def errAbort(msg):
     " print and abort) "
     logging.error(msg)
     if debugMode:
-        raise # show stacktrace
+        raise Exception(msg)
     else:
         sys.exit(1)
 
 def makedirs(d):
     if not isdir(d):
         os.makedirs(d)
 
 def parseConf(fname):
     " parse a hg.conf style file, return as dict key -> value (all strings) "
     logging.debug("Parsing "+fname)
     conf = {}
     for line in open(fname):
         line = line.strip()
         if line.startswith("#"):
             continue
@@ -679,31 +679,31 @@
 
 def httpReqOld(url, asBytes=False, asJson=False, params=None, method="get", max_redirects=10):
     " get data with HTTP and return it. Does not use the requests library. "
     # Parse URL
     import http.client
     import urllib.parse
     parsed_url = urllib.parse.urlparse(url)
     connection = http.client.HTTPSConnection(parsed_url.netloc) if parsed_url.scheme == "https" else http.client.HTTPConnection(parsed_url.netloc)
 
     if params is not None and len(params)!=0:
         query_string = urllib.parse.urlencode(params or {})
         path = parsed_url.path or "/"
         if method.lower() == "get" and query_string:
             path += "?" + query_string
     else:
-        path = url
+        path = (parsed_url.path or "/") + ("?" + parsed_url.query if parsed_url.query else "")
     
     # Set headers and body
     headers = {}
     body = None
     if method.lower() == "post":
         headers["Content-Type"] = "application/x-www-form-urlencoded"
         body = query_string
 
     logging.debug(f"HTTP {method.upper()} {url}")
 
     try:
         # Send the request
         connection.request(method.upper(), path, body=body, headers=headers)
         response = connection.getresponse()
 
@@ -735,30 +735,31 @@
     if not requestsLoaded:
         return httpReqOld(url, asBytes, asJson, params)
 
     if doVerbose:
         import http.client
         http.client.HTTPConnection.debuglevel=1
         logging.basicConfig()
         logging.getLogger().setLevel(logging.DEBUG)
         req_log = logging.getLogger('requests.packages.urllib3')
         req_log.setLevel(logging.DEBUG)
         req_log.propagate = True
 
     logging.debug("HTTP GET %s" % url)
     try:
         resp = requests.get(url, params=params, allow_redirects=True)
+        resp.raise_for_status()
     except requests.RequestException as e:
         errAbort(f"Error fetching the URL: {e}")
 
     if asBytes:
         logging.debug("HTTP GET with raw bytes")
         return resp.content
     elif asJson:
         logging.debug("HTTP GET with JSON")
         return resp.json()
     else:
         logging.debug("HTTP GET with text")
         return resp.text
 
 def importJbrowse(baseUrl, db, outDir):
     " import an IGV trackList.json hierarchy "
@@ -1117,80 +1118,80 @@
     for fieldIdx, val in enumerate(row):
         if not fieldIdx in skipFields:
             bedRow.append( row[fieldIdx] )
 
     return bedRow
 
 def fetchChromSizes(db, outputFileName):
     " find on local disk or download a <db>.sizes text file "
     # Construct the URL based on the database name
     url = f"https://hgdownload.cse.ucsc.edu/goldenPath/{db}/database/chromInfo.txt.gz"
     # Send a request to download the file
     chromSizesData = httpReq(url, asBytes=True)
     # Open the output gzip file for writing
     with gzip.open(outputFileName, 'wt') as outFile:
         # Open the response content as a gzip file in text mode
-        with gzip.GzipFile(fileobj=chromSizesData, mode='r') as inFile:
+        with gzip.open(io.BytesIO(chromSizesData), mode='rt') as inFile:
             # Read the content using csv reader to handle tab-separated values
             reader = csv.reader(inFile, delimiter='\t')
             writer = csv.writer(outFile, delimiter='\t', lineterminator='\n')
             # Iterate through each row, and retain only the first two fields
             for row in reader:
                 writer.writerow(row[:2])  # Write only the first two fields
     logging.info("Downloaded %s to %s" % (db, outputFileName))
 
 def getLocalDataPath(fname):
     " return local filename in directory for hubtool files "
     localDataDir = os.path.expanduser("~/.local/hubtools")
     if not isdir(localDataDir):
         logging.info("Creating directory "+localDataDir)
         os.makedirs(localDataDir)
     fname = join(localDataDir, fname)
     return fname
 
 def getAsFname(fileType):
-    " download an .as file into the local dadta directory "
+    " download an .as file into the local data directory "
     fname = getLocalDataPath(fileType+".as")
     urls = {
-            "bigNarrowPeak" : "https://genome.ucsc.edu/goldenpath/help/examples/bigNarrowPeak.as"
+            "bigNarrowPeak" : "https://genome.ucsc.edu/goldenpath/help/examples/bigNarrowPeak.as",
+            "bigBroadPeak"  : "https://genome.ucsc.edu/goldenpath/help/examples/bigBroadPeak.as",
             }
     if not isfile(fname):
         url = urls[fileType]
         downloadUrl(url, fname)
     return fname
 
 def getChromSizesFname(db):
     " return fname of chrom sizes, download into ~/.local/hubtools/ if not found "
     fname = "/hive/data/genomes/%s/chrom.sizes" % db
     if isfile(fname):
         return fname
 
     fname = getLocalDataPath("%s.sizes" % db)
     if not isfile(fname):
-        makedirs(dataDir)
         fetchChromSizes(db, fname)
     return fname
 
 def convTsv(db, tsvFname, outBedFname, outAsFname, outBbFname):
     " convert tsv files in inDir to outDir, assume that they all have one column for chrom, start and end. Try to guess these or fail. "
     # join and output merged bed
     bigCols = set() # col names of columns with > 255 chars
 
     unsortedBedFh = tempfile.NamedTemporaryFile(suffix=".bed", dir=dirname(outBedFname), mode="wt")
     fieldNames = None
 
-    isOneBased = True # useful in the future maybe, name=0,start=1z,...
+    isOneBased = True # useful in the future maybe, name=0,start=1,...
     bedFieldsDesc = None # in the future, the user may want to input a string like name=0,start=1, but switch this off for now
 
     for line in open(tsvFname):
         row = line.rstrip("\r\n").split("\t")
         if fieldNames is None:
             fieldNames = row
             fieldDesc, notExtraFields = parseFieldDesc(bedFieldsDesc, fieldNames)
             continue
 
         # note fields with data > 255 chars.
         for colName, colData in zip(fieldNames, row):
             if len(colData)>255:
                 bigCols.add(colName)
 
         bedRow = makeBedRow(row, fieldDesc, notExtraFields, isOneBased)
@@ -1361,45 +1362,45 @@
 
     import requests
     try:
         # Send a GET request to the URL
         response = requests.get(url, stream=True)
         response.raise_for_status()  # Raise an exception for HTTP errors
         # Write the content to the specified local file
         tmpFname = local_file_name+".tmp"
         with open(tmpFname, 'wb') as file:
             for chunk in response.iter_content(chunk_size=65536):  # Download in chunks
                 file.write(chunk)
         logging.info(f"Downloaded '{url}' to '{local_file_name}'.")
         os.rename(tmpFname, local_file_name)
 
     except requests.exceptions.HTTPError as e:
-        logging.info(f"Unable to download %s: HTTP error {e}")
+        logging.error(f"Unable to download {url}: HTTP error {e}")
+        raise
 
     except requests.exceptions.RequestException as e:
         logging.error(f"An error occurred while downloading the URL: {e}")
         raise
 
 def downloadUrlsParallel(url_filename_list, max_threads=12):
     """ given a list of [url, localFname], download the files with 12 parallel threads """
     logging.info("Downloading %s files with %d parallel threads" % (len(url_filename_list), max_threads))
-    executor = concurrent.futures.ThreadPoolExecutor(max_threads)
     func = downloadUrl
     if not requestsLoaded:
         func = downloadUrlOld
+    with concurrent.futures.ThreadPoolExecutor(max_threads) as executor:
         futures = [executor.submit(func, url, local_filename) for url, local_filename in url_filename_list]
-
         # wait for all futures to complete (this will handle exceptions)
         for future in concurrent.futures.as_completed(futures):
             try:
                 future.result()  # Block until this particular future is done
             except Exception as e:
                 logging.error(f"Error in thread: {e}")
 
 def makeLegalTrackName(s):
     " remove characters that are not allowed for track names "
     s = s.replace(" ", "_")
     # the only problem of this is that you can run into duplicated track names, e.g. "MyTrack!!" and "MyTrack!" are both "MyTrack"
     return re.sub('[^A-Za-z_0-9]+', '', s)
 
 def mustBeLegalTrackName(s):
     " error abort if s is not a legal track name "
@@ -1448,32 +1449,32 @@
 
     if trackType.startswith("bed"):
         bedToBigBed(textFname, db, outFname)
         tdb["type"] = "bigBed"
     elif trackType=="narrowPeak":
         asFname = getAsFname("bigNarrowPeak")
         tmpFh = makeTempFile(dir=outDir, suffix=".bed")
         narrowPeakToBigNarrowPeak(textFname, tmpFh)
         bedToBigBed(tmpFh.name, db, outFname, asFname=asFname, bedType="bed6+4")
         tdb["type"] = "bigBed 6+"
         tdb["spectrum"] = "on"
     elif trackType=="broadPeak":
         asFname = getAsFname("bigBroadPeak")
         tmpFh = makeTempFile(dir=outDir, suffix=".bed")
         broadPeakToBed(textFname, tmpFh)
-        bedToBigBed(tmpFh.name, db, outFname, asFname=asFname, bedType="bed6+4")
-        tdb["type"] = "bigNarrowPeak"
+        bedToBigBed(tmpFh.name, db, outFname, asFname=asFname, bedType="bed6+3")
+        tdb["type"] = "bigBed 6+3"
         tdb["spectrum"] = "on"
 
     else:
         errAbort("No support yet for track type '%s'. Please contact us." % trackType)
 
     tdb["bigDataUrl"] = basename(outFname)
     return tdb
 
 def convCtDb(hubInDir, db, inDir, outDir, doDownload):
     " convert one db part of a track archive to an output directory "
     meta = parseMeta([hubInDir])
 
     findGlob = join(inDir, "*.ct")
     inFnames = glob.glob(findGlob)
     if len(inFnames)==0:
@@ -1636,31 +1637,31 @@
             val = val.lstrip("'").rstrip("'")
 
             if key=="description":
                 key = "longLabel"
             if key=="name":
                 key = "track"
 
             if key in ["inputType", "tdbType"]:
                 continue
 
             tdb[key] = val
             if key=="track":
                 tdb["shortLabel"] = val
 
         if "dbTableName" in tdb or "bigDataUrl" not in tdb:
-            logging.warn("Cannot import custom tracks without bigDataUrls, skipping %s" % tdb["track"])
+            logging.warning("Cannot import custom tracks without bigDataUrls, skipping %s" % tdb["track"])
         else:
             tdbs.append(tdb)
 
     return tdbs
 
 def downloadTrackArchive(serverUrl, hgsid, ofh):
     " download the track archive for a given hgsid and save it under tgzFname "
 
     # https://hgwdev-max.gi.ucsc.edu/cgi-bin/hgSession?hgsid=425203018_AsoR0syMagMP0brh6a2Y2F7R2RNi&hgS_makeDownload_=Submit
     logging.info("Getting track archive from server %s, hgsid %s" % (serverUrl, hgsid))
     cgiUrl = serverUrl+f"/cgi-bin/hgSession"
     params = {"hgsid":hgsid, "hgS_makeDownload_":"Submit"}
 
     page = httpReq(cgiUrl, params=params)
     statusToken = re.search(r"backgroundStatus=([^&]*)", page).group(1)
@@ -1715,53 +1716,65 @@
     if url.startswith("http"):
         if "hgsid=" in url:
             serverUrl, hgsid = hgsidFromUrl(url)
         else:
             serverUrl, hgsid = hgsidFromPage(url)
 
         tgzFh = makeTempFile(dir=downDir, suffix=".tar.gz", mode="wb")
         downloadTrackArchive(serverUrl, hgsid, tgzFh)
         tgzFname = tgzFh.name
     else:
         tgzFh = None
         tgzFname = url
 
     logging.info("Extracting %s to %s" % (tgzFname, downDir))
     with tarfile.open(tgzFname, 'r:gz') as tar:
+        if hasattr(tarfile, "data_filter"):
+            tar.extractall(path=downDir, filter='data')
+        else:
             tar.extractall(path=downDir)
 
     convArchDir(inDir, downDir, outDir, doDownload)
 
     if not debugMode:
         if tgzFh:
             tgzFh.close() # = deletes temp file
         logging.info("Removing %s" % downDir)
         shutil.rmtree(downDir)
 
 def stanzaKey(stanza):
     " return key of stanza, so track name or .hub or .genome "
     if "track" in stanza:
         return stanza["track"][2]
     else:
         for tdbType in ["hub", "genome"]:
             if tdbType in stanza:
                 return "."+tdbType
 
     errAbort("Got hub.txt file with a stanza that has neither a 'track', nor a 'hub', nor a 'genome' key: %s" % repr(stanza))
 
-def stanzaNew(pairs):
+def stanzaNew(pairs, indent=0):
     " given key,val pairs, make a new stanza "
+    newTdb = OrderedDict()
+    for key, val in pairs:
+        newTdb[key] = ([], indent, val)
+    return newTdb
+
+def stanzaAddVal(tdb, tag, val):
+    " add or update a key/val in a stanza, inheriting indent from existing entries "
+    indent = next(iter(tdb.values()))[1] if tdb else 0
+    tdb[tag] = ([], indent, val)
 
 def stanzaMatchesRe(tdb, tags, pat):
     " try to match pat (a compiled regex) against values of all tags listed in 'tags'. Never match the special stanzas .hub and .genome . "
     for tag in tags:
         if tag in tdb:
             comments, indent, value = tdb[tag]
             if pat.search(value):
                 return True
     return False
 
 def stanzaMatchesTrack(tdb, searchName):
     " True if track name is same as searchName. False for non-track stanzas."
     tag = "track"
     if not tag in tdb:
         return False
@@ -1839,31 +1852,31 @@
                 comments = []
             # preserve empty lines at the file beginning or within comments
             else:
                 comments.append("")
 
         # a normal line can either be a comment or a "key<space>value" lines
         else:
             if line.lstrip(" ").startswith("#"):
                 comments.append(line)
                 continue
             else:
                 nonWhite = line.lstrip()
                 indent = len(line) - len(nonWhite)
                 key, val = nonWhite.split(" ", 1) # preserve trailing whitespace
                 if key in stanza:
-                    logging.warn(f"TrackDb .ra format error: duplicated key {key} in stanza {stanza}")
+                    logging.warning(f"TrackDb .ra format error: duplicated key {key} in stanza {stanza}")
                 stanza[key] = (comments, indent, val)
                 comments = []
 
     # handle last stanza, most files don't end with a newline
     if len(stanza)!=0:
         tdbKey = stanzaKey(stanza)
         tdbs[tdbKey] = stanza
 
     tdbCount = len(tdbs)
     logging.info(f"Read {fname}, {tdbCount} stanzas")
     return tdbs
 
 def tdbCommentsWrite(tdbs, fname):
     " write back the structure returned by tdbCommentsParse into fname "
     ofh = open(fname, "wt")
@@ -1879,49 +1892,49 @@
     ofh.close()
 
     tdbCount = len(tdbs)
     logging.info(f"Wrote {fname}, {tdbCount} stanzas")
 
 def tdbCommentsAppendStanza(tdbs, tdb, indent=0):
     " add a single OrderedDict() as a new track stanza to the end of the data structure from tdbCommentsParse "
     tdbKey = stanzaKey(tdb)
     tdbs[tdbKey] = tdb
     return tdbs
 
 def tdbCommentsAppendOneList(tdbs, pairs, indent=0):
     " add a single list of pairs as a new track stanza to the end of the data structure from tdbCommentsParse "
     newTdb = OrderedDict()
     trackName = None
-    for key, val in tdb:
+    for key, val in pairs:
         newTdb[key] = ([], indent, val)
         if key=="track":
             trackName = val
     assert(trackName is not None)
     tdbs[trackName] = newTdb
     return tdbs
 
 def tdbCommentsEdit(tdbs, indent=None, newTags=None, delTags=None):
     " indent stanzas of a tdbCommentsParse data structure or add some keyVals "
     newTdbs = OrderedDict()
     for trackName, tdb in tdbs.items():
         newTdb = stanzaEdit(tdb, indent=indent, newTags=newTags, delTags=delTags)
         newTdbs[trackName] = newTdb
 
     return newTdbs
 
 def tdbCommentsAppendAll(tdbs1, tdbs2):
-    " apend the second tdbCommentsParse data structure to the first "
+    " append the second tdbCommentsParse data structure to the first "
     for key, val in tdbs2.items():
         tdbs1[key] = val
     return tdbs1
 
 def tdbCommentsInsertAfter(tdbs, parentName, insertTdbs):
     " search in tdbs for a track parentName, insert newTdbs, and return the result "
     newTdbs = OrderedDict()
     for name, tdb in tdbs.items():
         tdbCommentsAppendStanza(newTdbs, tdb)
         if stanzaMatchesTrack(tdb, parentName):
             tdbCommentsAppendAll(newTdbs, insertTdbs)
     return newTdbs
 
 def addView(hubFname, contType, contName, contLabel):
     " add a view under a container "