0159c1e515b598e2050935f4f12a2c5a5023a433 max Tue May 26 09:22:03 2026 -0700 hubtools: fix critical runtime crashes and logic errors, refs #36790 - stanzaAddVal/stanzaNew had no body or were undefined; add both - tdbCommentsAppendOneList iterated wrong variable name (tdb vs pairs) - fetchChromSizes: wrap raw bytes in io.BytesIO for gzip.GzipFile - getChromSizesFname: remove reference to undefined variable dataDir - errAbort: bare raise outside except replaced with raise Exception(msg) - httpReqOld: path=url -> path=parsed_url.path in no-params branch - broadPeak conversion: correct type to bigBed 6+3, bedType to bed6+3 - getAsFname: add bigBroadPeak URL to prevent KeyError - httpReq: add raise_for_status() to surface 4xx/5xx errors - Add missing imports: urllib.parse, gzip, csv, io - except: -> except ImportError: for requests import guard - tarfile.extractall: use filter='data' on Python 3.12+ to prevent path traversal - ThreadPoolExecutor: use with-block to ensure shutdown - downloadUrl: HTTPError now re-raised after logging at error level - logging.warn() -> logging.warning() (removed in Python 3.12) - Fix docstring typos: dadta, apend, stray z in comment Co-Authored-By: Claude Sonnet 4.6 diff --git src/utils/hubtools/hubtools src/utils/hubtools/hubtools index 94ba7f322d6..7eb6c45a43f 100755 --- src/utils/hubtools/hubtools +++ src/utils/hubtools/hubtools @@ -1,30 +1,30 @@ #!/usr/bin/env python3 import logging, sys, optparse, os, json, subprocess, shutil, string, glob, tempfile, re -import shlex, urllib, time, tarfile, hashlib +import shlex, urllib, urllib.parse, time, tarfile, hashlib, gzip, csv, io from pathlib import Path from collections import defaultdict, OrderedDict from os.path import join, basename, dirname, isfile, relpath, abspath, splitext, isdir, normpath from urllib.parse import unquote import concurrent.futures requestsLoaded = True try: import requests # if this fails, install the library with 'python -m pip install requests --user' -except: +except ImportError: # most code now can work around the absence of the requests library requestsLoaded = False #import pyyaml # not loaded here, so it's not a hard requirement, is lazy loaded in parseMetaYaml() # ==== functions ===== # debugging: when activated with -d, output more information and do not remove any temp files debugMode = False # full extra verbose output of all HTTP requests sent, there is no command line option for this doVerbose = False # allowed file types by hubtools up # mirrors extensionMap in hg/js/hgMyData.js # hub.txt comes before text so hub.txt and *.hub.txt don't fall through to text @@ -157,31 +157,31 @@ parser.print_help() exit(1) global debugMode if options.debug: debugMode = True logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) return args, options def errAbort(msg): " print and abort) " logging.error(msg) if debugMode: - raise # show stacktrace + raise Exception(msg) else: sys.exit(1) def makedirs(d): if not isdir(d): os.makedirs(d) def parseConf(fname): " parse a hg.conf style file, return as dict key -> value (all strings) " logging.debug("Parsing "+fname) conf = {} for line in open(fname): line = line.strip() if line.startswith("#"): continue @@ -679,31 +679,31 @@ def httpReqOld(url, asBytes=False, asJson=False, params=None, method="get", max_redirects=10): " get data with HTTP and return it. Does not use the requests library. " # Parse URL import http.client import urllib.parse parsed_url = urllib.parse.urlparse(url) connection = http.client.HTTPSConnection(parsed_url.netloc) if parsed_url.scheme == "https" else http.client.HTTPConnection(parsed_url.netloc) if params is not None and len(params)!=0: query_string = urllib.parse.urlencode(params or {}) path = parsed_url.path or "/" if method.lower() == "get" and query_string: path += "?" + query_string else: - path = url + path = parsed_url.path or "/" # Set headers and body headers = {} body = None if method.lower() == "post": headers["Content-Type"] = "application/x-www-form-urlencoded" body = query_string logging.debug(f"HTTP {method.upper()} {url}") try: # Send the request connection.request(method.upper(), path, body=body, headers=headers) response = connection.getresponse() @@ -735,30 +735,31 @@ if not requestsLoaded: return httpReqOld(url, asBytes, asJson, params) if doVerbose: import http.client http.client.HTTPConnection.debuglevel=1 logging.basicConfig() logging.getLogger().setLevel(logging.DEBUG) req_log = logging.getLogger('requests.packages.urllib3') req_log.setLevel(logging.DEBUG) req_log.propagate = True logging.debug("HTTP GET %s" % url) try: resp = requests.get(url, params=params, allow_redirects=True) + resp.raise_for_status() except requests.RequestException as e: errAbort(f"Error fetching the URL: {e}") if asBytes: logging.debug("HTTP GET with raw bytes") return resp.content elif asJson: logging.debug("HTTP GET with JSON") return resp.json() else: logging.debug("HTTP GET with text") return resp.text def importJbrowse(baseUrl, db, outDir): " import an IGV trackList.json hierarchy " @@ -1117,80 +1118,80 @@ for fieldIdx, val in enumerate(row): if not fieldIdx in skipFields: bedRow.append( row[fieldIdx] ) return bedRow def fetchChromSizes(db, outputFileName): " find on local disk or download a .sizes text file " # Construct the URL based on the database name url = f"https://hgdownload.cse.ucsc.edu/goldenPath/{db}/database/chromInfo.txt.gz" # Send a request to download the file chromSizesData = httpReq(url, asBytes=True) # Open the output gzip file for writing with gzip.open(outputFileName, 'wt') as outFile: # Open the response content as a gzip file in text mode - with gzip.GzipFile(fileobj=chromSizesData, mode='r') as inFile: + with gzip.GzipFile(fileobj=io.BytesIO(chromSizesData), mode='r') as inFile: # Read the content using csv reader to handle tab-separated values reader = csv.reader(inFile, delimiter='\t') writer = csv.writer(outFile, delimiter='\t', lineterminator='\n') # Iterate through each row, and retain only the first two fields for row in reader: writer.writerow(row[:2]) # Write only the first two fields logging.info("Downloaded %s to %s" % (db, outputFileName)) def getLocalDataPath(fname): " return local filename in directory for hubtool files " localDataDir = os.path.expanduser("~/.local/hubtools") if not isdir(localDataDir): logging.info("Creating directory "+localDataDir) os.makedirs(localDataDir) fname = join(localDataDir, fname) return fname def getAsFname(fileType): - " download an .as file into the local dadta directory " + " download an .as file into the local data directory " fname = getLocalDataPath(fileType+".as") urls = { - "bigNarrowPeak" : "https://genome.ucsc.edu/goldenpath/help/examples/bigNarrowPeak.as" + "bigNarrowPeak" : "https://genome.ucsc.edu/goldenpath/help/examples/bigNarrowPeak.as", + "bigBroadPeak" : "https://genome.ucsc.edu/goldenpath/help/examples/bigBroadPeak.as", } if not isfile(fname): url = urls[fileType] downloadUrl(url, fname) return fname def getChromSizesFname(db): " return fname of chrom sizes, download into ~/.local/hubtools/ if not found " fname = "/hive/data/genomes/%s/chrom.sizes" % db if isfile(fname): return fname fname = getLocalDataPath("%s.sizes" % db) if not isfile(fname): - makedirs(dataDir) fetchChromSizes(db, fname) return fname def convTsv(db, tsvFname, outBedFname, outAsFname, outBbFname): " convert tsv files in inDir to outDir, assume that they all have one column for chrom, start and end. Try to guess these or fail. " # join and output merged bed bigCols = set() # col names of columns with > 255 chars unsortedBedFh = tempfile.NamedTemporaryFile(suffix=".bed", dir=dirname(outBedFname), mode="wt") fieldNames = None - isOneBased = True # useful in the future maybe, name=0,start=1z,... + isOneBased = True # useful in the future maybe, name=0,start=1,... bedFieldsDesc = None # in the future, the user may want to input a string like name=0,start=1, but switch this off for now for line in open(tsvFname): row = line.rstrip("\r\n").split("\t") if fieldNames is None: fieldNames = row fieldDesc, notExtraFields = parseFieldDesc(bedFieldsDesc, fieldNames) continue # note fields with data > 255 chars. for colName, colData in zip(fieldNames, row): if len(colData)>255: bigCols.add(colName) bedRow = makeBedRow(row, fieldDesc, notExtraFields, isOneBased) @@ -1361,45 +1362,45 @@ import requests try: # Send a GET request to the URL response = requests.get(url, stream=True) response.raise_for_status() # Raise an exception for HTTP errors # Write the content to the specified local file tmpFname = local_file_name+".tmp" with open(tmpFname, 'wb') as file: for chunk in response.iter_content(chunk_size=65536): # Download in chunks file.write(chunk) logging.info(f"Downloaded '{url}' to '{local_file_name}'.") os.rename(tmpFname, local_file_name) except requests.exceptions.HTTPError as e: - logging.info(f"Unable to download %s: HTTP error {e}") + logging.error(f"Unable to download %s: HTTP error {e}") + raise except requests.exceptions.RequestException as e: logging.error(f"An error occurred while downloading the URL: {e}") raise def downloadUrlsParallel(url_filename_list, max_threads=12): """ given a list of [url, localFname], download the files with 12 parallel threads """ logging.info("Downloading %s files with %d parallel threads" % (len(url_filename_list), max_threads)) - executor = concurrent.futures.ThreadPoolExecutor(max_threads) func = downloadUrl if not requestsLoaded: func = downloadUrlOld + with concurrent.futures.ThreadPoolExecutor(max_threads) as executor: futures = [executor.submit(func, url, local_filename) for url, local_filename in url_filename_list] - # wait for all futures to complete (this will handle exceptions) for future in concurrent.futures.as_completed(futures): try: future.result() # Block until this particular future is done except Exception as e: logging.error(f"Error in thread: {e}") def makeLegalTrackName(s): " remove characters that are not allowed for track names " s = s.replace(" ", "_") # the only problem of this is that you can run into duplicated track names, e.g. "MyTrack!!" and "MyTrack!" are both "MyTrack" return re.sub('[^A-Za-z_0-9]+', '', s) def mustBeLegalTrackName(s): " error abort if s is not a legal track name " @@ -1448,32 +1449,32 @@ if trackType.startswith("bed"): bedToBigBed(textFname, db, outFname) tdb["type"] = "bigBed" elif trackType=="narrowPeak": asFname = getAsFname("bigNarrowPeak") tmpFh = makeTempFile(dir=outDir, suffix=".bed") narrowPeakToBigNarrowPeak(textFname, tmpFh) bedToBigBed(tmpFh.name, db, outFname, asFname=asFname, bedType="bed6+4") tdb["type"] = "bigBed 6+" tdb["spectrum"] = "on" elif trackType=="broadPeak": asFname = getAsFname("bigBroadPeak") tmpFh = makeTempFile(dir=outDir, suffix=".bed") broadPeakToBed(textFname, tmpFh) - bedToBigBed(tmpFh.name, db, outFname, asFname=asFname, bedType="bed6+4") - tdb["type"] = "bigNarrowPeak" + bedToBigBed(tmpFh.name, db, outFname, asFname=asFname, bedType="bed6+3") + tdb["type"] = "bigBed 6+3" tdb["spectrum"] = "on" else: errAbort("No support yet for track type '%s'. Please contact us." % trackType) tdb["bigDataUrl"] = basename(outFname) return tdb def convCtDb(hubInDir, db, inDir, outDir, doDownload): " convert one db part of a track archive to an output directory " meta = parseMeta([hubInDir]) findGlob = join(inDir, "*.ct") inFnames = glob.glob(findGlob) if len(inFnames)==0: @@ -1636,31 +1637,31 @@ val = val.lstrip("'").rstrip("'") if key=="description": key = "longLabel" if key=="name": key = "track" if key in ["inputType", "tdbType"]: continue tdb[key] = val if key=="track": tdb["shortLabel"] = val if "dbTableName" in tdb or "bigDataUrl" not in tdb: - logging.warn("Cannot import custom tracks without bigDataUrls, skipping %s" % tdb["track"]) + logging.warning("Cannot import custom tracks without bigDataUrls, skipping %s" % tdb["track"]) else: tdbs.append(tdb) return tdbs def downloadTrackArchive(serverUrl, hgsid, ofh): " download the track archive for a given hgsid and save it under tgzFname " # https://hgwdev-max.gi.ucsc.edu/cgi-bin/hgSession?hgsid=425203018_AsoR0syMagMP0brh6a2Y2F7R2RNi&hgS_makeDownload_=Submit logging.info("Getting track archive from server %s, hgsid %s" % (serverUrl, hgsid)) cgiUrl = serverUrl+f"/cgi-bin/hgSession" params = {"hgsid":hgsid, "hgS_makeDownload_":"Submit"} page = httpReq(cgiUrl, params=params) statusToken = re.search(r"backgroundStatus=([^&]*)", page).group(1) @@ -1715,53 +1716,65 @@ if url.startswith("http"): if "hgsid=" in url: serverUrl, hgsid = hgsidFromUrl(url) else: serverUrl, hgsid = hgsidFromPage(url) tgzFh = makeTempFile(dir=downDir, suffix=".tar.gz", mode="wb") downloadTrackArchive(serverUrl, hgsid, tgzFh) tgzFname = tgzFh.name else: tgzFh = None tgzFname = url logging.info("Extracting %s to %s" % (tgzFname, downDir)) with tarfile.open(tgzFname, 'r:gz') as tar: + try: + tar.extractall(path=downDir, filter='data') + except TypeError: tar.extractall(path=downDir) convArchDir(inDir, downDir, outDir, doDownload) if not debugMode: if tgzFh: tgzFh.close() # = deletes temp file logging.info("Removing %s" % downDir) shutil.rmtree(downDir) def stanzaKey(stanza): " return key of stanza, so track name or .hub or .genome " if "track" in stanza: return stanza["track"][2] else: for tdbType in ["hub", "genome"]: if tdbType in stanza: return "."+tdbType errAbort("Got hub.txt file with a stanza that has neither a 'track', nor a 'hub', nor a 'genome' key: %s" % repr(stanza)) -def stanzaNew(pairs): +def stanzaNew(pairs, indent=0): " given key,val pairs, make a new stanza " + newTdb = OrderedDict() + for key, val in pairs: + newTdb[key] = ([], indent, val) + return newTdb + +def stanzaAddVal(tdb, tag, val): + " add or update a key/val in a stanza, inheriting indent from existing entries " + indent = next(iter(tdb.values()))[1] if tdb else 0 + tdb[tag] = ([], indent, val) def stanzaMatchesRe(tdb, tags, pat): " try to match pat (a compiled regex) against values of all tags listed in 'tags'. Never match the special stanzas .hub and .genome . " for tag in tags: if tag in tdb: comments, indent, value = tdb[tag] if pat.search(value): return True return False def stanzaMatchesTrack(tdb, searchName): " True if track name is same as searchName. False for non-track stanzas." tag = "track" if not tag in tdb: return False @@ -1839,31 +1852,31 @@ comments = [] # preserve empty lines at the file beginning or within comments else: comments.append("") # a normal line can either be a comment or a "keyvalue" lines else: if line.lstrip(" ").startswith("#"): comments.append(line) continue else: nonWhite = line.lstrip() indent = len(line) - len(nonWhite) key, val = nonWhite.split(" ", 1) # preserve trailing whitespace if key in stanza: - logging.warn(f"TrackDb .ra format error: duplicated key {key} in stanza {stanza}") + logging.warning(f"TrackDb .ra format error: duplicated key {key} in stanza {stanza}") stanza[key] = (comments, indent, val) comments = [] # handle last stanza, most files don't end with a newline if len(stanza)!=0: tdbKey = stanzaKey(stanza) tdbs[tdbKey] = stanza tdbCount = len(tdbs) logging.info(f"Read {fname}, {tdbCount} stanzas") return tdbs def tdbCommentsWrite(tdbs, fname): " write back the structure returned by tdbCommentsParse into fname " ofh = open(fname, "wt") @@ -1879,49 +1892,49 @@ ofh.close() tdbCount = len(tdbs) logging.info(f"Wrote {fname}, {tdbCount} stanzas") def tdbCommentsAppendStanza(tdbs, tdb, indent=0): " add a single OrderedDict() as a new track stanza to the end of the data structure from tdbCommentsParse " tdbKey = stanzaKey(tdb) tdbs[tdbKey] = tdb return tdbs def tdbCommentsAppendOneList(tdbs, pairs, indent=0): " add a single list of pairs as a new track stanza to the end of the data structure from tdbCommentsParse " newTdb = OrderedDict() trackName = None - for key, val in tdb: + for key, val in pairs: newTdb[key] = ([], indent, val) if key=="track": trackName = val assert(trackName is not None) tdbs[trackName] = newTdb return tdbs def tdbCommentsEdit(tdbs, indent=None, newTags=None, delTags=None): " indent stanzas of a tdbCommentsParse data structure or add some keyVals " newTdbs = OrderedDict() for trackName, tdb in tdbs.items(): newTdb = stanzaEdit(tdb, indent=indent, newTags=newTags, delTags=delTags) newTdbs[trackName] = newTdb return newTdbs def tdbCommentsAppendAll(tdbs1, tdbs2): - " apend the second tdbCommentsParse data structure to the first " + " append the second tdbCommentsParse data structure to the first " for key, val in tdbs2.items(): tdbs1[key] = val return tdbs1 def tdbCommentsInsertAfter(tdbs, parentName, insertTdbs): " search in tdbs for a track parentName, insert newTdbs, and return the result " newTdbs = OrderedDict() for name, tdb in tdbs.items(): tdbCommentsAppendStanza(newTdbs, tdb) if stanzaMatchesTrack(tdb, parentName): tdbCommentsAppendAll(newTdbs, insertTdbs) return newTdbs def addView(hubFname, contType, contName, contLabel): " add a view under a container "