5d2cc0dd88a130e3e1c3eb3d9f1ad5d7d1f67e04 max Wed Feb 26 08:10:53 2025 -0800 typo in previous commit diff --git src/utils/hubtools/hubtools src/utils/hubtools/hubtools index 7ace50d5b2e..437751cd712 100755 --- src/utils/hubtools/hubtools +++ src/utils/hubtools/hubtools @@ -1,1276 +1,1276 @@ #!/usr/bin/env python3 import logging, sys, optparse, os, json, subprocess, shutil, string, glob, tempfile, re import shlex from pathlib import Path from collections import defaultdict, OrderedDict from os.path import join, basename, dirname, isfile, relpath, abspath, splitext, isdir, normpath #import pyyaml # not loaded here, so it's not a hard requirement, is lazy loaded in parseMetaYaml() # ==== functions ===== # allowed file types by hubtools up # copied from the javascript file hgHubConnect.js fileTypeExtensions = { "bigBed": [ ".bb", ".bigbed" ], "bam": [ ".bam" ], "vcf": [ ".vcf" ], "vcfTabix": [ ".vcf.gz", "vcf.bgz" ], "bigWig": [ ".bw", ".bigwig" ], "hic": [ ".hic" ], "cram": [ ".cram" ], "bigBarChart": [ ".bigbarchart" ], "bigGenePred": [ ".bgp", ".biggenepred" ], "bigMaf": [ ".bigmaf" ], "bigInteract": [ ".biginteract" ], "bigPsl": [ ".bigpsl" ], "bigChain": [ ".bigchain" ], "bamIndex": [ ".bam.bai", ".bai" ], "tabixIndex": [ ".vcf.gz.tbi", "vcf.bgz.tbi" ] } asHead = """table bed "Browser extensible data (<=12 fields) " ( """ asLines = """ string chrom; "Chromosome (or contig, scaffold, etc.)" uint chromStart; "Start position in chromosome" uint chromEnd; "End position in chromosome" string name; "Name of item" uint score; "Score from 0-1000" char[1] strand; "+ or -" uint thickStart; "Start of where display should be thick (start codon)" uint thickEnd; "End of where display should be thick (stop codon)" uint reserved; "Used as itemRgb as of 2004-11-22" int blockCount; "Number of blocks" int[blockCount] blockSizes; "Comma separated list of block sizes" int[blockCount] chromStarts; "Start positions relative to chromStart" """.split("\n") def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("""usage: %prog [options] <cmd> - create and edit UCSC track hubs hubtools make <assemblyCode e.g. hg38>: create a track hub for all bigBed/bigWig files under a directory. Creates single-file hub.txt, and tries to guess reasonable settings from the file names: - bigBed/bigWig files in the current directory will be top level tracks - big* files in subdirectories become composites - for every filename, the part before the first dot becomes the track base name - if a directory has more than 80% of track base names with both a bigBed and bigWig file, views are activated for this composite - track attributes can be changed using tracks.ra, tracks.tsv, tracks.json or tracks.yaml files, in each top or subdirectory - The first column of tracks.tsv must be 'track'. - tracks.json and tracks.yaml must have objects at the top level, the attributes are <trackName> or the special attribute "hub". hubtools up <hubName>: upload files to hubSpace - needs ~/.hubtools.conf with a line apiKey="xxxx". Create a key by going to My Data > Track Hubs > Track Development on https://genome.ucsc.edu - uploads all files from the -i directory or the current dir if not specified. hubtools jbrowse <url> <db> : convert Jbrowse trackList.json files to hub.txt. - <url> is the URL to the Jbrowse2 installation, e.g. 
http://furlonglab.embl.de/FurlongBrowser/ - <db> is assembly identifier hubtools tab <fname>: convert a hub.txt or trackDb.txt to tab-sep format, easier to bulk-edit with sed/cut/etc. - <fname> is the input filename. "hub" and "genome" stanzas are skipped. - output goes to stdout hubtools conv -i myTsvDir - convert .tsv files in inDir to .bigBed files in current directory hubtools archive -i xxx/archive -o outDir - Create a hub from an extracted xxx.tar.gz file "archive" directory. You can download a .tar.gz from in the Genome Browser under My Data > My Session. The archive contains all your custom tracks. It is usually easier to use the 'ct' command, as this will automatically download an archive an convert it, given a full genome browser URL with an "hgsid" parameter on the URL. hubtools ct <hgTracksUrl> - Download all custom tracks from a Genome Browser URL, either as a /s/ stable shortlink or the temporary hgsid=xxxx URL Examples: hubtools conv -i myTsvs/ hubtools make hg38 hubtools jbrowse http://furlonglab.embl.de/FurlongBrowser/ dm3 hubtools tab hub.txt > tracks.tsv hubtools session tar xvfz SC_20230723_backup.tar.gz hubtools archive -i archive -o hub hubtools ct 'https://genome-euro.ucsc.edu/cgi-bin/hgTracks?db=hg38&position=chr7%3A155798978%2D155810671&hgsid=345826202_8d3Mpumjaw9IXYI2RDaSt5xMoUhH' For the "hubtools make" step, you can specify additional options for your tracks via tracks.json/.yaml or tracks.tsv: tracks.json can look like this, can have more keys, one per track, or the special key "hub": { "hub" : { "hub": "mouse_motor_atac", "shortLabel":"scATAC-seq Developing Cranial Motor Neurons" }, "myTrack" : { "shortLabel" : "My nice track" } } tracks.tsv should look like this, other columns can be added: #track<tab>shortLabel myTrack<tab>My nice track For a list of all possible fields/columns see https://genome.ucsc.edu/goldenpath/help/trackDb/trackDbHub.html: """) parser.add_option("-i", "--inDir", dest="inDir", action="store", help="Input directory where files are stored. Default is current directory") parser.add_option("-o", "--outDir", dest="outDir", action="store", help="Input directory where hub.txt file is created. Default is same as input directory.") #parser.add_option("", "--igv", dest="igv", action="store", help="import an igv.js trackList.json file hierarchy") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show verbose debug messages") parser.add_option("-u", "--upload", dest="upload", action="store_true", help="upload all files from outDir to hubSpace") (options, args) = parser.parse_args() if len(args)==0: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) return args, options def errAbort(msg): " print and abort) " logging.error(msg) sys.exit(1) def makedirs(d): if not isdir(d): os.makedirs(d) def parseConf(fname): " parse a hg.conf style file, return as dict key -> value (all strings) " logging.debug("Parsing "+fname) conf = {} for line in open(fname): line = line.strip() if line.startswith("#"): continue elif line.startswith("include "): inclFname = line.split()[1] absFname = normpath(join(dirname(fname), inclFname)) if os.path.isfile(absFname): inclDict = parseConf(absFname) conf.update(inclDict) elif "=" in line: # string search for "=" key, value = line.split("=",1) conf[key] = value return conf # cache of hg.conf contents hgConf = None def parseHgConf(): """ return hg.conf as dict key:value. 
""" global hgConf if hgConf is not None: return hgConf hgConf = dict() # python dict = hash table fname = os.path.expanduser("~/.hubtools.conf") if isfile(fname): hgConf = parseConf(fname) else: fname = os.path.expanduser("~/.hg.conf") if isfile(fname): hgConf = parseConf(fname) def cfgOption(name, default=None): " return hg.conf option or default " global hgConf if not hgConf: parseHgConf() return hgConf.get(name, default) def parseMetaRa(fname): """parse tracks.ra or tracks.txt and return as a dict of trackName -> dict of key->val """ logging.debug("Reading %s as .ra" % fname) trackName = None stanzaData = {} ret = {} for line in open(fname): line = line.strip() if line.startswith("#"): continue if line=="": if len(stanzaData)==0: # double newline continue if trackName is None: errAbort("File %s has a stanza without a track name" % fname) if trackName in ret: errAbort("File %s contains two stanzas with the same track name '%s' " % trackName) ret[trackName] = stanzaData stanzaData = {} trackName = None continue key, val = line.split(" ", maxsplit=1) if key == "track": trackName = val continue else: stanzaData[key] = val if len(stanzaData)!=0: ret[trackName] = stanzaData logging.debug("Got %s from .ra" % str(ret)) return ret def parseMetaTsv(fname): " parse a tracks.tsv file and return as a dict of trackName -> dict of key->val " headers = None meta = {} logging.debug("Parsing track meta data from %s in tsv format" % fname) for line in open(fname): row = line.rstrip("\r\n").split("\t") if headers is None: assert(line.startswith("track\t") or line.startswith("#track")) row[0] = row[0].lstrip("#") headers = row continue assert(len(row)==len(headers)) key = row[0] rowDict = {} for header, val in zip(headers[1:], row[1:]): rowDict[header] = val #row = {k:v for k,v in zip(headers, fs)} meta[key] = rowDict return meta def parseMetaJson(fname): " parse a json file and merge it into meta and return " logging.debug("Reading %s as json" % fname) newMeta = json.load(open(fname)) return newMeta def parseMetaYaml(fname): " parse yaml file " import yaml # if this doesn't work, run 'pip install pyyaml' with open(fname) as stream: try: return yaml.safe_load(stream) except yaml.YAMLError as exc: logging.error(exc) def parseMeta(inDir): " parse a tab-sep file with headers and return an ordered dict firstField -> dictionary " fname = join(inDir, "tracks.tsv") meta = OrderedDict() if isfile(fname): tsvMeta = parseMetaTsv(fname) meta = allMetaOverride(meta, tsvMeta) fname = join(inDir, "tracks.json") if isfile(fname): jsonMeta = parseMetaJson(fname) meta = allMetaOverride(meta, jsonMeta) fname = join(inDir, "tracks.ra") if isfile(fname): raMeta = parseMetaRa(fname) meta = allMetaOverride(meta, raMeta) fname = join(inDir, "tracks.yaml") if isfile(fname): yamlMeta = parseMetaYaml(fname) meta = allMetaOverride(meta, yamlMeta) logging.debug("Got meta from %s: %s" % (inDir, str(meta))) return meta def writeHubGenome(ofh, db, inMeta): " create a hub.txt and genomes.txt file, hub.txt is just a template " meta = inMeta.get("hub", {}) ofh.write("hub autoHub\n") ofh.write("shortLabel %s\n" % meta.get("shortLabel", "Auto-generated hub")) ofh.write("longLabel %s\n" % meta.get("longLabel", "Auto-generated hub")) #ofh.write("genomesFile genomes.txt\n") if "descriptionUrl" in meta: ofh.write("descriptionUrl %s\n" % meta["descriptionUrl"]) ofh.write("email %s\n" % meta.get("email", "yourEmail@example.com")) ofh.write("useOneFile on\n\n") ofh.write("genome %s\n\n" % db) return ofh def readSubdirs(inDir, subDirs): " given a 
list of dirs, find those that are composite dirs (not supporting supertracks for now) " compDicts, superDicts = {}, {} for subDir in subDirs: subPath = join(inDir, subDir) subSubDirs, subDict = readFnames(subPath) if len(subDict)==0: # no files in this dir continue if len(subSubDirs)==0: compDicts[subDir] = subDict #else: #superDicts[subDir] = subDict return compDicts, superDicts def reorderDirs(compDirs, meta): " order the directories in compDirs in the order that they appear in the meta data -> will mean that composites have the right order " if len(meta)==0: logging.debug("Not reordering these subdirectories: %s" % compDirs.keys()) return compDirs # first use the names in the meta data, and put in the right order newCompDirs = OrderedDict() for dirName in meta.keys(): if dirName in compDirs: newCompDirs[dirName] = compDirs[dirName] # then add everything else at the end for dirName in compDirs: if dirName not in newCompDirs: newCompDirs[dirName] = compDirs[dirName] logging.debug("Reordered input directories based on meta data. New order is: %s" % newCompDirs.keys()) return newCompDirs def readDirs(inDir, meta): " recurse down into directories and return containerType -> parentName -> fileBase -> type -> list of absPath " ret = {} subDirs, topFiles = readFnames(inDir) ret["top"] = { None : topFiles } # top-level track files have None as the parent compDirs, superDirs = readSubdirs(inDir, subDirs) compDirs = reorderDirs(compDirs, meta) # superDirs not used yet, no time ret["comps"] = compDirs ret["supers"] = superDirs return ret def readFnames(inDir): " return dict with basename -> fileType -> filePath " fnameDict = defaultdict(dict) #tdbDir = abspath(dirname(trackDbPath)) subDirs = [] for fname in os.listdir(inDir): filePath = join(inDir, fname) if isdir(filePath): subDirs.append(fname) continue baseName, ext = splitext(basename(fname)) ext = ext.lower() # actually, use the part before the first dot, not the one before the extension, as the track name # this means that a.scaled.bigBed and a.raw.bigWig get paired correctly fileBase = basename(fname).split(".")[0] if ext==".bw" or ext==".bigwig": fileType = "bigWig" elif ext==".bb" or ext==".bigbed": fileType = "bigBed" else: logging.debug("file %s is not bigBed nor bigWig, skipping" % fname) continue absFname = abspath(filePath) #relFname = relFname(absFname, tdbDir) fnameDict[fileBase].setdefault(fileType, []) #fnameDict[baseName][fileType].setdefault([]) fnameDict[fileBase][fileType].append(absFname) return subDirs, fnameDict def mostFilesArePaired(fnameDict): " check if 80% of the tracks have a pair bigBed+bigWig" pairCount = 0 for baseName, typeDict in fnameDict.items(): if "bigBed" in typeDict and "bigWig" in typeDict: pairCount += 1 pairShare = pairCount / len(fnameDict) return ( pairShare > 0.8 ) def writeLn(ofh, spaceCount, line): "write line to ofh, with spaceCount before it " ofh.write("".join([" "]*spaceCount)) ofh.write(line) ofh.write("\n") def writeStanza(ofh, indent, tdb): " write a stanza given a tdb key-val dict " track = tdb["track"] shortLabel = tdb.get("shortLabel", track.replace("_", " ")) visibility = tdb.get("visibility", "pack") longLabel = tdb.get("longLabel", shortLabel) if not "type" in tdb: errAbort("Track info has no type attribute: %s" % tdb) trackType = tdb["type"] writeLn(ofh, indent, "track %s" % track) writeLn(ofh, indent, "shortLabel %s" % shortLabel) if longLabel: writeLn(ofh, indent, "longLabel %s" % longLabel) if "parent" in tdb: writeLn(ofh, indent, "parent %s" % tdb["parent"]) writeLn(ofh, indent, 
"type %s" % trackType) writeLn(ofh, indent, "visibility %s" % visibility) if "bigDataUrl" in tdb: writeLn(ofh, indent, "bigDataUrl %s" % tdb["bigDataUrl"]) for key, val in tdb.items(): if key in ["track", "shortLabel", "longLabel", "type", "bigDataUrl", "visibility", "parent"]: continue writeLn(ofh, indent, "%s %s" % (key, val)) ofh.write("\n") def metaOverride(tdb, meta): " override track info for one single track, from meta into tdb " trackName = tdb["track"] if trackName not in meta and "__" in trackName: logging.debug("Using only basename of track %s" % trackName) trackName = trackName.split("__")[1] if trackName not in meta: logging.debug("No meta info for track %s" % tdb["track"]) return trackMeta = meta[trackName] for key, val in trackMeta.items(): if val!="": tdb[key] = trackMeta[key] def allMetaOverride(tdb, meta): " override track info for all tracks, from meta into tdb " if meta is None: return tdb for trackName in meta: trackMeta = meta[trackName] if trackName not in tdb: tdb[trackName] = {} trackTdb = tdb[trackName] for key, val in trackMeta.items(): trackTdb[key] = val return tdb def reorderTracks(fileDict, meta): " given an unsorted dictionary of files and ordered metadata, try to sort the files according to the metadata" if len(meta)==0: return fileDict # no meta data -> no ordering necessary trackOrder = [] # meta is an OrderedDict, so the keys are also ordered for trackName in meta.keys(): if "__" in trackName: # in composite mode, the tracknames contain the parent and the track type trackName = trackName.split("__")[1] trackOrder.append( trackName ) trackOrder = list(meta.keys()) newFiles = OrderedDict() doneTracks = set() # first add the tracks in the order of the meta data for trackBase in trackOrder: # the tsv file can have the track names either as basenames or as full tracknames if trackBase not in fileDict and "__" in trackBase: trackBase = trackBase.split("__")[1] if trackBase in fileDict: newFiles[trackBase] = fileDict[trackBase] doneTracks.add(trackBase) logging.debug("Track ordering from meta data used: %s" % newFiles.keys()) # then add all other tracks for trackBase, fileData in fileDict.items(): if trackBase not in doneTracks: newFiles[trackBase] = fileDict[trackBase] logging.debug("Not specified in meta, so adding at the end: %s" % trackBase) logging.debug("Final track order is: %s" % newFiles.keys()) assert(len(newFiles)==len(fileDict)) return newFiles def writeTdb(inDir, dirDict, dirType, tdbDir, ofh): " given a dict with basename -> type -> filenames, write track entries to ofh " # this code is getting increasingly complex because it supports composites/views and pairing of bigBed/bigWig files # either this needs better comments or maybe a separate code path for this rare use case global compCount fnameDict = dirDict[dirType] for parentName, typeDict in fnameDict.items(): if parentName is None: # top level tracks: use top tracks.tsv subDir = inDir else: # container tracks -> use tracks.tsv in the subdirectory subDir = join(inDir, parentName) parentMeta = parseMeta(subDir) indent = 0 parentHasViews = False groupMeta = {} if dirType=="comps": tdb = { "track" : parentName, "shortLabel": parentName, "visibility" : "dense", "compositeTrack" : "on", "autoScale" : "group", "type" : "bed 4" } metaOverride(tdb, parentMeta) groupMeta = parentMeta parentHasViews = mostFilesArePaired(typeDict) if parentHasViews: tdb["subGroup1"] = "view Views PK=Peaks SIG=Signals" logging.info("Container track %s has >80%% of paired files, activating views" % parentName) 
writeStanza(ofh, indent, tdb) indent = 4 if parentHasViews: # we have composites with paired files? -> write the track stanzas for the two views groupMeta = parseMeta(subDir) tdbViewPeaks = { "track" : parentName+"ViewPeaks", "shortLabel" : parentName+" Peaks", "parent" : parentName, "view" : "PK", "visibility" : "dense", "type" : "bigBed", "scoreFilter" : "off", "viewUi" : "on" } metaOverride(tdbViewPeaks, parentMeta) writeStanza(ofh, indent, tdbViewPeaks) tdbViewSig = { "track" : parentName+"ViewSignal", "shortLabel" : parentName+" Signal", "parent" : parentName, "view" : "SIG", "visibility" : "dense", "type" : "bigWig", "viewUi" : "on" } metaOverride(tdbViewSig, parentMeta) writeStanza(ofh, indent, tdbViewSig) else: # no composites groupMeta = parseMeta(subDir) typeDict = reorderTracks(typeDict, groupMeta) for trackBase, typeFnames in typeDict.items(): for fileType, absFnames in typeFnames.items(): assert(len(absFnames)==1) # for now, not sure what to do when we get multiple basenames of the same file type absFname = absFnames[0] fileBase = basename(absFname) relFname = relpath(absFname, tdbDir) labelSuff = "" if parentHasViews: if fileType=="bigWig": labelSuff = " Signal" elif fileType=="bigBed": labelSuff = " Peaks" else: assert(False) # views and non-bigWig/Bed are not supported yet? if parentName is not None: parentPrefix = parentName+"__" else: parentPrefix = "" trackName = parentPrefix+trackBase+"__"+fileType tdb = { "track" : trackName, "shortLabel" : trackBase+labelSuff, "longLabel" : trackBase+labelSuff, "visibility" : "dense", "type" : fileType, "bigDataUrl" : relFname, } if parentName: tdb["parent"] = parentName if parentHasViews: onOff = "on" if trackName in groupMeta and "visibility" in groupMeta[trackName]: vis = groupMeta[trackName]["visibility"] if vis=="hide": onOff = "off" del tdb["visibility"] if fileType=="bigBed": tdb["parent"] = parentName+"ViewPeaks"+" "+onOff tdb["subGroups"] = "view=PK" else: tdb["parent"] = parentName+"ViewSignal"+" "+onOff tdb["subGroups"] = "view=SIG" metaOverride(tdb, groupMeta) if trackName in groupMeta and "visibility" in groupMeta[trackName]: del tdb["visibility"] writeStanza(ofh, indent, tdb) def importJbrowse(baseUrl, db, outDir): " import an IGV trackList.json hierarchy " import requests outFn = join(outDir, "hub.txt") ofh = open(outFn, "w") writeHubGenome(ofh, db, {}) trackListUrl = baseUrl+"/data/trackList.json" logging.info("Loading %s" % trackListUrl) trackList = requests.get(trackListUrl).json() tdbs = [] for tl in trackList["tracks"]: if "type" in tl and tl["type"]=="SequenceTrack": logging.info("Genome is: "+tl["label"]) continue if tl["storeClass"]=="JBrowse/Store/SeqFeature/NCList": logging.info("NCList found: ",tl) continue tdb = {} tdb["track"] = tl["label"] tdb["shortLabel"] = tl["key"] if "data_file" in tl: tdb["bigDataUrl"] = baseUrl+"/data/"+tl["data_file"] else: tdb["bigDataUrl"] = baseUrl+"/data/"+tl["urlTemplate"] if tl["storeClass"] == "JBrowse/Store/SeqFeature/BigWig": tdb["type"] = "bigWig" dispMode = tl.get("display_mode") if dispMode: if dispMode=="normal": tdb["visibility"] = "pack" elif dispMode=="compact": tdb["visibility"] = "dense" else: tdb["visibility"] = "pack" else: tdb["visibility"] = "pack" writeStanza(ofh, 0, tdb) def installModule(package): " install a package " logging.info("Could not find Python module '%s', trying to install with pip" % package) subprocess.check_call([sys.executable, "-m", "pip", "install", package]) def cacheLoad(fname): " load file cache from json file " if not isfile(fname): 
logging.debug("No upload cache present") return {} logging.debug("Loading "+fname) return json.load(open(fname)) def cacheWrite(uploadCache, fname): logging.debug("Writing "+fname) json.dump(uploadCache, open(fname, "w"), indent=4) def getFileType(fbase): " return the file type defined in the hubspace system, given a base file name " ret = "NA" for fileType, fileExts in fileTypeExtensions.items(): for fileExt in fileExts: if fbase.endswith(fileExt): ret = fileType break if ret!="NA": break logging.debug("file type for %s is %s" % (fbase, ret)) return ret def uploadFiles(tdbDir, hubName): "upload files to hubspace. Server name and token can come from ~/.hubtools.conf " try: from tusclient import client except ModuleNotFoundError: installModule("tuspy") from tusclient import client serverUrl = cfgOption("tusUrl", "https://hubspace.gi.ucsc.edu/files") cookies = {} cookieNameUser = cfgOption("wiki.userNameCookie", "wikidb_mw1_UserName") cookieNameId = cfgOption("wiki.loggedInCookie", "wikidb_mw1_UserID") apiKey = cfgOption("apiKey") if apiKey is None: errAbort("To upload files, the file ~/.hubtools.conf must contain a line like apiKey='xxx').\n" "Go to https://genome.ucsc.edu/cgi-bin/hgHubConnect#dev to create a new apiKey. Then run \n" " echo 'apiKey=\"xxxx\"' >> ~/.hubtools.conf \n" "and run the 'hubtools up' command again.") logging.info(f"TUS server URL: {serverUrl}") my_client = client.TusClient(serverUrl) cacheFname = join(tdbDir, ".hubtools.files.json") uploadCache = cacheLoad(cacheFname) logging.debug("trackDb directory is %s" % tdbDir) for rootDir, _, files in os.walk(tdbDir): for fbase in files: if fbase.startswith("."): continue localPath = normpath(join(rootDir, fbase)) logging.debug("localPath: %s" % localPath) localMtime = os.stat(localPath).st_mtime # skip files that have not changed their mtime since last upload if localPath in uploadCache: cacheMtime = uploadCache[fpath]["mtime"] if localMtime == cacheMtime: logging.info("%s: file mtime unchanged, not uploading again" % localPath) continue else: logging.debug("file %s: mtime is %f, cache mtime is %f, need to re-upload" % (localPath, localMtime, cacheMtime)) else: logging.debug("file %s not in upload cache" % localPath) fileType = getFileType(fbase) fileAbsPath = abspath(localPath) remoteRelPath = relpath(fileAbsPath, tdbDir) remoteDir = dirname(remoteRelPath) meta = { "apiKey" : apiKey, "parentDir" : remoteDir, "genome":"NA", "fileName" : fbase, "hubName" : hubName, "hubtools" : "true", "fileType": fileType, "lastModified" : str(int(localMtime)), } logging.info(f"Uploading {localPath}, meta {meta}") uploader = my_client.uploader(localPath, metadata=meta) uploader.upload() # note that this file was uploaded cache = {} mtime = os.stat(localPath).st_mtime cache["mtime"] = mtime cache["size"] = os.stat(localPath).st_size - uploadCache[fpath] = cache + uploadCache[localPath] = cache cacheWrite(uploadCache, cacheFname) def iterRaStanzas(fname): " parse an ra-style (trackDb) file and yield dictionaries " data = dict() logging.debug("Parsing %s in trackDb format" % fname) with open(fname, "rt") as ifh: for l in ifh: l = l.lstrip(" ").rstrip("\r\n") if len(l)==0: yield data data = dict() else: if " " not in l: continue key, val = l.split(" ", maxsplit=1) data[key] = val if len(data)!=0: yield data def raToTab(fname): " convert .ra file to .tsv " stanzas = [] allFields = set() for stanza in iterRaStanzas(fname): if "hub" in stanza or "genome" in stanza: continue allFields.update(stanza.keys()) stanzas.append(stanza) if "track" in 
allFields: allFields.remove("track") if "shortLabel" in allFields: allFields.remove("shortLabel") hasLongLabel = False if "longLabel" in allFields: allFields.remove("longLabel") hasLongLabel = True sortedFields = sorted(list(allFields)) # make sure that track shortLabel and longLabel come first and always there, handy for manual edits if hasLongLabel: sortedFields.insert(0, "longLabel") sortedFields.insert(0, "shortLabel") sortedFields.insert(0, "track") ofh = sys.stdout ofh.write("#") ofh.write("\t".join(sortedFields)) ofh.write("\n") for s in stanzas: row = [] for fieldName in sortedFields: row.append(s.get(fieldName, "")) ofh.write("\t".join(row)) ofh.write("\n") def guessFieldDesc(fieldNames): "given a list of field names, try to guess to which fields in bed12 they correspond " logging.info("No field description specified, guessing fields from TSV headers: %s" % fieldNames) fieldDesc = {} skipFields = set() for fieldIdx, fieldName in enumerate(fieldNames): caseField = fieldName.lower() if caseField in ["chrom", "chromosome"]: name = "chrom" elif caseField.endswith(" id") or caseField.endswith("accession"): name = "name" elif caseField in ["start", "chromstart", "position"]: name = "start" elif caseField in ["Reference"]: name = "refAllele" elif caseField in ["strand"]: name = "strand" elif caseField in ["score"]: name = "score" else: continue fieldDesc[name] = fieldIdx skipFields.add(fieldIdx) logging.info("TSV <-> BED correspondance: %s" % fieldDesc) return fieldDesc, skipFields def parseFieldDesc(fieldDescStr, fieldNames): " given a string chrom=1,start=2,end=3,... return dict {chrom:1,...} " if not fieldDescStr: return guessFieldDesc(fieldNames) fieldDesc, skipFields = guessFieldDesc(fieldNames) if fieldDescStr: for part in fieldDescStr.split(","): fieldName, fieldIdx = part.split("=") fieldIdx = int(fieldIdx) fieldDesc[fieldName] = fieldIdx skipFields.add(fieldIdx) return fieldDesc, skipFields def makeBedRow(row, fieldDesc, skipFields, isOneBased): " given a row of a tsv file and a fieldDesc with BedFieldName -> field-index, return a bed12+ row with extra fields " bedRow = [] # first construct the bed 12 fields for fieldName in ["chrom", "start", "end", "name", "score", "strand", "thickStart", "thickEnd", "itemRgb", "blockCount", "blockSizes", "chromStarts"]: fieldIdx = fieldDesc.get(fieldName) if fieldIdx is not None: if fieldName=="start": chromStart = int(row[fieldDesc["start"]]) if isOneBased: chromStart = chromStart-1 val = str(chromStart) elif fieldName=="end": chromEnd = int(val) else: val = row[fieldIdx] else: if fieldName=="end": chromEnd = chromStart+1 val = str(chromEnd) elif fieldName=="score": val = "0" elif fieldName=="strand": val = "." 
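            # none of the remaining BED fields is present in the input: fill them with
            # defaults that turn the row into a single full-length block (thick region
            # spans the whole item, black itemRgb, one block of size chromEnd-chromStart)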
elif fieldName=="thickStart": val = str(chromStart) elif fieldName=="thickEnd": val = str(chromEnd) elif fieldName=="itemRgb": val = "0,0,0" elif fieldName=="blockCount": val = "1" elif fieldName=="blockSizes": val = str(chromEnd-chromStart) elif fieldName=="chromStarts": #val = str(chromStart) val = "0" else: logging.error("Cannot find a field for %s" % fieldName) sys.exit(1) bedRow.append(val) # now handle all other fields for fieldIdx, val in enumerate(row): if not fieldIdx in skipFields: bedRow.append( row[fieldIdx] ) return bedRow def fetchChromSizes(db, outputFileName): " find on local disk or download a <db>.sizes text file " # Construct the URL based on the database name url = f"https://hgdownload.cse.ucsc.edu/goldenPath/{db}/database/chromInfo.txt.gz" # Send a request to download the file response = requests.get(url, stream=True) # Check if the request was successful if response.status_code == 200: # Open the output gzip file for writing with gzip.open(outputFileName, 'wt') as outFile: # Open the response content as a gzip file in text mode with gzip.GzipFile(fileobj=response.raw, mode='r') as inFile: # Read the content using csv reader to handle tab-separated values reader = csv.reader(inFile, delimiter='\t') writer = csv.writer(outFile, delimiter='\t', lineterminator='\n') # Iterate through each row, and retain only the first two fields for row in reader: writer.writerow(row[:2]) # Write only the first two fields else: raise Exception(f"Failed to download file from {url}, status code: {response.status_code}") logging.info("Downloaded %s to %s" % (db, outputFileName)) def getChromSizesFname(db): " return fname of chrom sizes, download into ~/.local/ucscData/ if not found " fname = "/hive/data/genomes/%s/chrom.sizes" % db if isfile(fname): return fname dataDir = "~/.local/hubtools" fname = join(dataDir, "%s.sizes" % db) fname = os.path.expanduser(fname) if not isfile(fname): makedirs(dataDir) fetchChromSizes(db, fname) return fname def convTsv(db, tsvFname, outBedFname, outAsFname, outBbFname): " convert tsv files in inDir to outDir, assume that they all have one column for chrom, start and end. Try to guess these or fail. " #tsvFname, outBedFname, outAsFname = args # join and output merged bed bigCols = set() # col names of columns with > 255 chars #bedFh = open(bedFname, "w") unsortedBedFh = tempfile.NamedTemporaryFile(suffix=".bed", dir=dirname(outBedFname), mode="wt") fieldNames = None #isOneBased = options.oneBased isOneBased = True bedFieldsDesc = None # in the future, the user may want to input a string like name=0,start=1, but switch this off for now for line in open(tsvFname): row = line.rstrip("\r\n").split("\t") if fieldNames is None: fieldNames = row fieldDesc, notExtraFields = parseFieldDesc(bedFieldsDesc, fieldNames) continue # note fields with data > 255 chars. 
for colName, colData in zip(fieldNames, row): if len(colData)>255: bigCols.add(colName) bedRow = makeBedRow(row, fieldDesc, notExtraFields, isOneBased) chrom = bedRow[0] if chrom.isdigit() or chrom in ["X", "Y"]: bedRow[0] = "chr"+bedRow[0] unsortedBedFh.write( ("\t".join(bedRow))) unsortedBedFh.write("\n") unsortedBedFh.flush() cmd = "sort -k1,1 -k2,2n %s > %s" % (unsortedBedFh.name, outBedFname) assert(os.system(cmd)==0) unsortedBedFh.close() # removes the temp file # generate autosql # BED fields #bedColCount = int(options.type.replace("bed", "").replace("+" , "")) bedColCount = 12 asFh = open(outAsFname, "w") asFh.write(asHead) asFh.write("\n".join(asLines[:bedColCount+1])) asFh.write("\n") # extra fields #fieldNames = fieldNames[bedColCount:] for fieldIdx, field in enumerate(fieldNames): if fieldIdx in notExtraFields: continue name = field.replace(" ","") name = field.replace("%","perc_") name = re.sub("[^a-zA-Z0-9]", "", name) name = name[0].lower()+name[1:] fType = "string" if field in bigCols: fType = "lstring" asFh.write(' %s %s; "%s" \n' % (fType, name, field)) asFh.write(")\n") asFh.close() chromSizesFname = getChromSizesFname(db) cmd = ["bedToBigBed", outBedFname, chromSizesFname, outBbFname, "-tab", "-type=bed%d+" % bedColCount, "-as=%s" % outAsFname] subprocess.check_call(cmd) def convTsvDir(inDir, db, outDir): " find tsv files under inDir and convert them all to .bb " ext = "tsv" pat = join(inDir, '**/*.'+ext) logging.debug("Finding files under %s (%s), writing output to %s" % (inDir, pat, outDir)) for fname in glob.glob(pat, recursive=True): logging.debug("Found %s" % fname) absFname = abspath(fname) relFname = relpath(absFname, inDir) outPath = Path(join(outDir, relFname)) if not outPath.parents[0].is_dir(): logging.debug("mkdir -p %s" % outPath.parents[0]) makedirs(outPath.parents[0]) bedFname = outPath.with_suffix(".bed") asFname = outPath.with_suffix(".as") bbFname = outPath.with_suffix(".bb") convTsv(db, fname, bedFname, asFname, bbFname) def parseTrackLine(s): " Use shlex to split the string respecting quotes, written by chatGPT " lexer = shlex.shlex(s, posix=True) lexer.whitespace_split = True lexer.wordchars += '=' # Allow '=' as part of words tokens = list(lexer) # Convert the tokens into a dictionary it = iter(tokens) result = {} for token in it: if '=' in token: key, value = token.split('=', 1) result[key] = value.strip('"') # Remove surrounding quotes if present else: result[token] = next(it).strip('"') # Handle cases like name="...". return result def readTrackLines(fnames): " read the first line and convert to a dict of all fnames. " logging.debug("Reading track lines from %s" % fnames) ret = {} for fn in fnames: line1 = open(fn).readline().rstrip("\n") notTrack = line1.replace("track ", "", 1) tdb = parseTrackLine(notTrack) ret[fn] = tdb return ret def stripFirstLine(inputFilename, outputFilename): " written by chatGpt: copies all lines to output, except the first line " with open(inputFilename, 'r') as infile, open(outputFilename, 'w') as outfile: # Skip the first line next(infile) # Write the rest of the lines to the output file for line in infile: outfile.write(line) def bedToBigBed(inFname, db, outFname): " convert bed to bigbed file, handles chrom.sizes download " chromSizesFname = getChromSizesFname(db) cmd = ["bedToBigBed", inFname, chromSizesFname, outFname] logging.debug("Running %s" % cmd) logging.info(f'Converting {inFname} to {outFname}. 
(chromSizes: {chromSizesFname}') subprocess.check_call(cmd) def convCtDb(db, inDir, outDir): " convert one db part of a track archive to an output directory " findGlob = join(inDir, "*.ct") inFnames = glob.glob(findGlob) if len(inFnames)==0: logging.info("No *.ct files found in %s" % findGlob) return tdbData = readTrackLines(inFnames) makedirs(outDir) hubTxtFname = join(outDir, "hub.txt") ofh = open(hubTxtFname, "wt") meta = parseMeta(inDir) writeHubGenome(ofh, db, meta) tdbIdx = 0 for fname, tdb in tdbData.items(): tdbIdx += 1 # custom track names can include spaces, spec characters, etc. Strip all those # append a number to make sure that the result is unique track = tdb["name"] track = re.sub('[^A-Za-z0-9]+', '', track)+"_"+str(tdbIdx) tdb["track"] = track tdb["shortLabel"] = tdb["name"] del tdb["name"] tdb["longLabel"] = tdb["description"] del tdb["description"] if "bigDataUrl" not in tdb: bedFname = join(outDir, tdb["track"]+".bed") bbFname = join(outDir, tdb["track"]+".bb") stripFirstLine(fname, bedFname) bedToBigBed(bedFname, db, bbFname) os.remove(bedFname) tdb["bigDataUrl"] = tdb["track"]+".bb" tdb["type"] = "bigBed" writeStanza(ofh, 0, tdb) logging.info("Wrote %s" % hubTxtFname) ofh.close() def convArchDir(inDir, outDir): " convert a directory created from the .tar.gz file downloaded via our track archive feature " dbContent = os.listdir(inDir) dbDirs = [] for db in dbContent: subDir = join(inDir, db) if isdir(subDir): dbDirs.append((db, subDir)) if len(dbDirs)==0: errAbort("No directories found under %s. Extract the tarfile and point this program at the 'archive' directory." % inDir) for db, inSubDir in dbDirs: logging.debug("Processing %s, db=%s" % (inSubDir, db)) outSubDir = join(outDir, db) convCtDb(db, inSubDir, outSubDir) def hgsidFromUrl(url): " return hgsid given a URL. written by chatGPT " # Parse the URL parsed_url = urllib.parse.urlparse(url) server_name = f"{parsed_url.scheme}://{parsed_url.netloc}" query_params = urllib.parse.parse_qs(parsed_url.query) hgsid = query_params.get('hgsid') # Return the hgsid value if it exists if hgsid: return server_name, hgsid[0] # Extracting the first value from the list errAbort("hgsid parameter not found in "+url) def convCtUrl(url, outDir): " given an hgTracks URL with an hgsid on it, get all custom track lines and create a hub file for it. Does not download custom tracks. " serverUrl, hgsid = hgsidFromUrl(url) cartDumpUrl = serverUrl+"/cgi-bin/cartDump?hgsid="+hgsid def hubtools(args, options): """ find files under dirName and create a trackDb.txt for them""" cmd = args[0] inDir = "." if options.inDir: inDir = options.inDir outDir = "." if options.outDir: outDir = options.outDir if cmd=="up": if len(args)<2: errAbort("The 'up' command requires one argument, the name of the hub. You can specify any name, " "ideally a short, meaningful string, e.g. atacseq, muscle-rna or yamamoto2022. 
Avoid special characters.") hubName = args[1] uploadFiles(inDir, hubName) return tdbDir = inDir if options.outDir: tdbDir = options.outDir if cmd=="jbrowse": importJbrowse(args[1], args[2], tdbDir) elif cmd == "tab": raToTab(args[1]) elif cmd == "make": db = args[1] meta = parseMeta(inDir) dirFiles = readDirs(inDir, meta) hubFname = join(tdbDir, "hub.txt") logging.info("Writing %s" % hubFname) ofh = open(hubFname, "w") meta = parseMeta(inDir) writeHubGenome(ofh, db, meta) writeTdb(inDir, dirFiles, "top", tdbDir, ofh) writeTdb(inDir, dirFiles, "comps", tdbDir, ofh) ofh.close() elif cmd=="conv": db = args[1] convTsvDir(inDir, db, outDir) elif cmd=="archive": convArchDir(inDir, outDir) elif cmd=="ct": url = args[1] convCtUrl(url, outDir) else: logging.error("Unknown command: '%s'" % cmd) # ----------- main -------------- def main(): args, options = parseArgs() hubtools(args, options) main()
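
For reference, a minimal self-contained sketch of the upload-cache check that this one-line fix touches (the cache written by cacheWrite() and read by cacheLoad() in uploadFiles() above, keyed by the local file path); the temporary directory and file name below are hypothetical, illustrative only:

# Sketch of the ".hubtools.files.json" mtime cache used to skip unchanged files,
# assuming a hypothetical hub directory containing one track file.
import json, os, tempfile

with tempfile.TemporaryDirectory() as hubDir:
    localPath = os.path.join(hubDir, "signal.bw")        # hypothetical track file
    with open(localPath, "w") as fh:
        fh.write("dummy data")

    cacheFname = os.path.join(hubDir, ".hubtools.files.json")
    uploadCache = json.load(open(cacheFname)) if os.path.isfile(cacheFname) else {}

    st = os.stat(localPath)
    entry = uploadCache.get(localPath)                   # cache is keyed by localPath, not fpath
    if entry and entry["mtime"] == st.st_mtime:
        print("mtime unchanged, skipping upload")
    else:
        print("uploading", localPath)                    # the real script calls the tus uploader here
        uploadCache[localPath] = {"mtime": st.st_mtime, "size": st.st_size}
        json.dump(uploadCache, open(cacheFname, "w"), indent=4)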