d1e2aafa5f6b2e2acb4069f4ed51e9db37934ffe max Wed Feb 26 06:59:11 2025 -0800 trying to fix hubtools behavior when specifying -i, email from Chris, no redmine diff --git src/utils/hubtools/hubtools src/utils/hubtools/hubtools index de74fcdd06c..7ace50d5b2e 100755 --- src/utils/hubtools/hubtools +++ src/utils/hubtools/hubtools @@ -1,1272 +1,1276 @@ #!/usr/bin/env python3 import logging, sys, optparse, os, json, subprocess, shutil, string, glob, tempfile, re import shlex from pathlib import Path from collections import defaultdict, OrderedDict -from os.path import join, basename, dirname, isfile, relpath, abspath, splitext, isdir +from os.path import join, basename, dirname, isfile, relpath, abspath, splitext, isdir, normpath #import pyyaml # not loaded here, so it's not a hard requirement, is lazy loaded in parseMetaYaml() # ==== functions ===== # allowed file types by hubtools up # copied from the javascript file hgHubConnect.js fileTypeExtensions = { "bigBed": [ ".bb", ".bigbed" ], "bam": [ ".bam" ], "vcf": [ ".vcf" ], "vcfTabix": [ ".vcf.gz", "vcf.bgz" ], "bigWig": [ ".bw", ".bigwig" ], "hic": [ ".hic" ], "cram": [ ".cram" ], "bigBarChart": [ ".bigbarchart" ], "bigGenePred": [ ".bgp", ".biggenepred" ], "bigMaf": [ ".bigmaf" ], "bigInteract": [ ".biginteract" ], "bigPsl": [ ".bigpsl" ], "bigChain": [ ".bigchain" ], "bamIndex": [ ".bam.bai", ".bai" ], "tabixIndex": [ ".vcf.gz.tbi", "vcf.bgz.tbi" ] } asHead = """table bed "Browser extensible data (<=12 fields) " ( """ asLines = """ string chrom; "Chromosome (or contig, scaffold, etc.)" uint chromStart; "Start position in chromosome" uint chromEnd; "End position in chromosome" string name; "Name of item" uint score; "Score from 0-1000" char[1] strand; "+ or -" uint thickStart; "Start of where display should be thick (start codon)" uint thickEnd; "End of where display should be thick (stop codon)" uint reserved; "Used as itemRgb as of 2004-11-22" int blockCount; "Number of blocks" int[blockCount] blockSizes; "Comma separated list of block sizes" int[blockCount] chromStarts; "Start positions relative to chromStart" """.split("\n") def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("""usage: %prog [options] <cmd> - create and edit UCSC track hubs hubtools make <assemblyCode e.g. hg38>: create a track hub for all bigBed/bigWig files under a directory. Creates single-file hub.txt, and tries to guess reasonable settings from the file names: - bigBed/bigWig files in the current directory will be top level tracks - big* files in subdirectories become composites - for every filename, the part before the first dot becomes the track base name - if a directory has more than 80% of track base names with both a bigBed and bigWig file, views are activated for this composite - track attributes can be changed using tracks.ra, tracks.tsv, tracks.json or tracks.yaml files, in each top or subdirectory - The first column of tracks.tsv must be 'track'. - tracks.json and tracks.yaml must have objects at the top level, the attributes are <trackName> or the special attribute "hub". hubtools up <hubName>: upload files to hubSpace - needs ~/.hubtools.conf with a line apiKey="xxxx". Create a key by going to My Data > Track Hubs > Track Development on https://genome.ucsc.edu - uploads all files from the -i directory or the current dir if not specified. hubtools jbrowse <url> <db> : convert Jbrowse trackList.json files to hub.txt. 
- <url> is the URL to the Jbrowse2 installation, e.g. http://furlonglab.embl.de/FurlongBrowser/ - <db> is the assembly identifier hubtools tab <fname>: convert a hub.txt or trackDb.txt to tab-sep format, easier to bulk-edit with sed/cut/etc. - <fname> is the input filename. "hub" and "genome" stanzas are skipped. - output goes to stdout hubtools conv -i myTsvDir - convert .tsv files in inDir to .bigBed files in current directory hubtools archive -i xxx/archive -o outDir - Create a hub from the "archive" directory of an extracted xxx.tar.gz file. You can download a .tar.gz in the Genome Browser under My Data > My Session. The archive contains all your custom tracks. It is usually easier to use the 'ct' command, as this will automatically download an archive and convert it, given a full genome browser URL with an "hgsid" parameter on the URL. hubtools ct <hgTracksUrl> - Download all custom tracks from a Genome Browser URL, either as a /s/ stable shortlink or the temporary hgsid=xxxx URL Examples: hubtools conv -i myTsvs/ hubtools make hg38 hubtools jbrowse http://furlonglab.embl.de/FurlongBrowser/ dm3 hubtools tab hub.txt > tracks.tsv hubtools session tar xvfz SC_20230723_backup.tar.gz hubtools archive -i archive -o hub hubtools ct 'https://genome-euro.ucsc.edu/cgi-bin/hgTracks?db=hg38&position=chr7%3A155798978%2D155810671&hgsid=345826202_8d3Mpumjaw9IXYI2RDaSt5xMoUhH' For the "hubtools make" step, you can specify additional options for your tracks via tracks.json/.yaml or tracks.tsv: tracks.json can look like this; it can have more keys, one per track, or the special key "hub": { "hub" : { "hub": "mouse_motor_atac", "shortLabel":"scATAC-seq Developing Cranial Motor Neurons" }, "myTrack" : { "shortLabel" : "My nice track" } } tracks.tsv should look like this, other columns can be added: #track<tab>shortLabel myTrack<tab>My nice track For a list of all possible fields/columns see https://genome.ucsc.edu/goldenpath/help/trackDb/trackDbHub.html: """) parser.add_option("-i", "--inDir", dest="inDir", action="store", help="Input directory where files are stored. Default is current directory") parser.add_option("-o", "--outDir", dest="outDir", action="store", help="Output directory where the hub.txt file is created.
Default is same as input directory.") #parser.add_option("", "--igv", dest="igv", action="store", help="import an igv.js trackList.json file hierarchy") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show verbose debug messages") parser.add_option("-u", "--upload", dest="upload", action="store_true", help="upload all files from outDir to hubSpace") (options, args) = parser.parse_args() if len(args)==0: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) return args, options def errAbort(msg): " print and abort) " logging.error(msg) sys.exit(1) def makedirs(d): if not isdir(d): os.makedirs(d) def parseConf(fname): " parse a hg.conf style file, return as dict key -> value (all strings) " logging.debug("Parsing "+fname) conf = {} for line in open(fname): line = line.strip() if line.startswith("#"): continue elif line.startswith("include "): inclFname = line.split()[1] absFname = normpath(join(dirname(fname), inclFname)) if os.path.isfile(absFname): inclDict = parseConf(absFname) conf.update(inclDict) elif "=" in line: # string search for "=" key, value = line.split("=",1) conf[key] = value return conf # cache of hg.conf contents hgConf = None def parseHgConf(): """ return hg.conf as dict key:value. """ global hgConf if hgConf is not None: return hgConf hgConf = dict() # python dict = hash table fname = os.path.expanduser("~/.hubtools.conf") if isfile(fname): hgConf = parseConf(fname) else: fname = os.path.expanduser("~/.hg.conf") if isfile(fname): hgConf = parseConf(fname) def cfgOption(name, default=None): " return hg.conf option or default " global hgConf if not hgConf: parseHgConf() return hgConf.get(name, default) def parseMetaRa(fname): """parse tracks.ra or tracks.txt and return as a dict of trackName -> dict of key->val """ logging.debug("Reading %s as .ra" % fname) trackName = None stanzaData = {} ret = {} for line in open(fname): line = line.strip() if line.startswith("#"): continue if line=="": if len(stanzaData)==0: # double newline continue if trackName is None: errAbort("File %s has a stanza without a track name" % fname) if trackName in ret: errAbort("File %s contains two stanzas with the same track name '%s' " % trackName) ret[trackName] = stanzaData stanzaData = {} trackName = None continue key, val = line.split(" ", maxsplit=1) if key == "track": trackName = val continue else: stanzaData[key] = val if len(stanzaData)!=0: ret[trackName] = stanzaData logging.debug("Got %s from .ra" % str(ret)) return ret def parseMetaTsv(fname): " parse a tracks.tsv file and return as a dict of trackName -> dict of key->val " headers = None meta = {} logging.debug("Parsing track meta data from %s in tsv format" % fname) for line in open(fname): row = line.rstrip("\r\n").split("\t") if headers is None: assert(line.startswith("track\t") or line.startswith("#track")) row[0] = row[0].lstrip("#") headers = row continue assert(len(row)==len(headers)) key = row[0] rowDict = {} for header, val in zip(headers[1:], row[1:]): rowDict[header] = val #row = {k:v for k,v in zip(headers, fs)} meta[key] = rowDict return meta def parseMetaJson(fname): " parse a json file and merge it into meta and return " logging.debug("Reading %s as json" % fname) newMeta = json.load(open(fname)) return newMeta def parseMetaYaml(fname): " parse yaml file " import yaml # if this doesn't work, run 'pip install pyyaml' with open(fname) as stream: try: return yaml.safe_load(stream) except yaml.YAMLError as exc: 
logging.error(exc) def parseMeta(inDir): " parse a tab-sep file with headers and return an ordered dict firstField -> dictionary " fname = join(inDir, "tracks.tsv") meta = OrderedDict() if isfile(fname): tsvMeta = parseMetaTsv(fname) meta = allMetaOverride(meta, tsvMeta) fname = join(inDir, "tracks.json") if isfile(fname): jsonMeta = parseMetaJson(fname) meta = allMetaOverride(meta, jsonMeta) fname = join(inDir, "tracks.ra") if isfile(fname): raMeta = parseMetaRa(fname) meta = allMetaOverride(meta, raMeta) fname = join(inDir, "tracks.yaml") if isfile(fname): yamlMeta = parseMetaYaml(fname) meta = allMetaOverride(meta, yamlMeta) logging.debug("Got meta from %s: %s" % (inDir, str(meta))) return meta def writeHubGenome(ofh, db, inMeta): " create a hub.txt and genomes.txt file, hub.txt is just a template " meta = inMeta.get("hub", {}) ofh.write("hub autoHub\n") ofh.write("shortLabel %s\n" % meta.get("shortLabel", "Auto-generated hub")) ofh.write("longLabel %s\n" % meta.get("longLabel", "Auto-generated hub")) #ofh.write("genomesFile genomes.txt\n") if "descriptionUrl" in meta: ofh.write("descriptionUrl %s\n" % meta["descriptionUrl"]) ofh.write("email %s\n" % meta.get("email", "yourEmail@example.com")) ofh.write("useOneFile on\n\n") ofh.write("genome %s\n\n" % db) return ofh def readSubdirs(inDir, subDirs): " given a list of dirs, find those that are composite dirs (not supporting supertracks for now) " compDicts, superDicts = {}, {} for subDir in subDirs: subPath = join(inDir, subDir) subSubDirs, subDict = readFnames(subPath) if len(subDict)==0: # no files in this dir continue if len(subSubDirs)==0: compDicts[subDir] = subDict #else: #superDicts[subDir] = subDict return compDicts, superDicts def reorderDirs(compDirs, meta): " order the directories in compDirs in the order that they appear in the meta data -> will mean that composites have the right order " if len(meta)==0: logging.debug("Not reordering these subdirectories: %s" % compDirs.keys()) return compDirs # first use the names in the meta data, and put in the right order newCompDirs = OrderedDict() for dirName in meta.keys(): if dirName in compDirs: newCompDirs[dirName] = compDirs[dirName] # then add everything else at the end for dirName in compDirs: if dirName not in newCompDirs: newCompDirs[dirName] = compDirs[dirName] logging.debug("Reordered input directories based on meta data. 
New order is: %s" % newCompDirs.keys()) return newCompDirs def readDirs(inDir, meta): " recurse down into directories and return containerType -> parentName -> fileBase -> type -> list of absPath " ret = {} subDirs, topFiles = readFnames(inDir) ret["top"] = { None : topFiles } # top-level track files have None as the parent compDirs, superDirs = readSubdirs(inDir, subDirs) compDirs = reorderDirs(compDirs, meta) # superDirs not used yet, no time ret["comps"] = compDirs ret["supers"] = superDirs return ret def readFnames(inDir): " return dict with basename -> fileType -> filePath " fnameDict = defaultdict(dict) #tdbDir = abspath(dirname(trackDbPath)) subDirs = [] for fname in os.listdir(inDir): filePath = join(inDir, fname) if isdir(filePath): subDirs.append(fname) continue baseName, ext = splitext(basename(fname)) ext = ext.lower() # actually, use the part before the first dot, not the one before the extension, as the track name # this means that a.scaled.bigBed and a.raw.bigWig get paired correctly fileBase = basename(fname).split(".")[0] if ext==".bw" or ext==".bigwig": fileType = "bigWig" elif ext==".bb" or ext==".bigbed": fileType = "bigBed" else: logging.debug("file %s is not bigBed nor bigWig, skipping" % fname) continue absFname = abspath(filePath) #relFname = relFname(absFname, tdbDir) fnameDict[fileBase].setdefault(fileType, []) #fnameDict[baseName][fileType].setdefault([]) fnameDict[fileBase][fileType].append(absFname) return subDirs, fnameDict def mostFilesArePaired(fnameDict): " check if 80% of the tracks have a pair bigBed+bigWig" pairCount = 0 for baseName, typeDict in fnameDict.items(): if "bigBed" in typeDict and "bigWig" in typeDict: pairCount += 1 pairShare = pairCount / len(fnameDict) return ( pairShare > 0.8 ) def writeLn(ofh, spaceCount, line): "write line to ofh, with spaceCount before it " ofh.write("".join([" "]*spaceCount)) ofh.write(line) ofh.write("\n") def writeStanza(ofh, indent, tdb): " write a stanza given a tdb key-val dict " track = tdb["track"] shortLabel = tdb.get("shortLabel", track.replace("_", " ")) visibility = tdb.get("visibility", "pack") longLabel = tdb.get("longLabel", shortLabel) if not "type" in tdb: errAbort("Track info has no type attribute: %s" % tdb) trackType = tdb["type"] writeLn(ofh, indent, "track %s" % track) writeLn(ofh, indent, "shortLabel %s" % shortLabel) if longLabel: writeLn(ofh, indent, "longLabel %s" % longLabel) if "parent" in tdb: writeLn(ofh, indent, "parent %s" % tdb["parent"]) writeLn(ofh, indent, "type %s" % trackType) writeLn(ofh, indent, "visibility %s" % visibility) if "bigDataUrl" in tdb: writeLn(ofh, indent, "bigDataUrl %s" % tdb["bigDataUrl"]) for key, val in tdb.items(): if key in ["track", "shortLabel", "longLabel", "type", "bigDataUrl", "visibility", "parent"]: continue writeLn(ofh, indent, "%s %s" % (key, val)) ofh.write("\n") def metaOverride(tdb, meta): " override track info for one single track, from meta into tdb " trackName = tdb["track"] if trackName not in meta and "__" in trackName: logging.debug("Using only basename of track %s" % trackName) trackName = trackName.split("__")[1] if trackName not in meta: logging.debug("No meta info for track %s" % tdb["track"]) return trackMeta = meta[trackName] for key, val in trackMeta.items(): if val!="": tdb[key] = trackMeta[key] def allMetaOverride(tdb, meta): " override track info for all tracks, from meta into tdb " if meta is None: return tdb for trackName in meta: trackMeta = meta[trackName] if trackName not in tdb: tdb[trackName] = {} trackTdb = tdb[trackName] 
for key, val in trackMeta.items(): trackTdb[key] = val return tdb def reorderTracks(fileDict, meta): " given an unsorted dictionary of files and ordered metadata, try to sort the files according to the metadata" if len(meta)==0: return fileDict # no meta data -> no ordering necessary trackOrder = [] # meta is an OrderedDict, so the keys are also ordered for trackName in meta.keys(): if "__" in trackName: # in composite mode, the tracknames contain the parent and the track type trackName = trackName.split("__")[1] trackOrder.append( trackName ) trackOrder = list(meta.keys()) newFiles = OrderedDict() doneTracks = set() # first add the tracks in the order of the meta data for trackBase in trackOrder: # the tsv file can have the track names either as basenames or as full tracknames if trackBase not in fileDict and "__" in trackBase: trackBase = trackBase.split("__")[1] if trackBase in fileDict: newFiles[trackBase] = fileDict[trackBase] doneTracks.add(trackBase) logging.debug("Track ordering from meta data used: %s" % newFiles.keys()) # then add all other tracks for trackBase, fileData in fileDict.items(): if trackBase not in doneTracks: newFiles[trackBase] = fileDict[trackBase] logging.debug("Not specified in meta, so adding at the end: %s" % trackBase) logging.debug("Final track order is: %s" % newFiles.keys()) assert(len(newFiles)==len(fileDict)) return newFiles def writeTdb(inDir, dirDict, dirType, tdbDir, ofh): " given a dict with basename -> type -> filenames, write track entries to ofh " # this code is getting increasingly complex because it supports composites/views and pairing of bigBed/bigWig files # either this needs better comments or maybe a separate code path for this rare use case global compCount fnameDict = dirDict[dirType] for parentName, typeDict in fnameDict.items(): if parentName is None: # top level tracks: use top tracks.tsv subDir = inDir else: # container tracks -> use tracks.tsv in the subdirectory subDir = join(inDir, parentName) parentMeta = parseMeta(subDir) indent = 0 parentHasViews = False groupMeta = {} if dirType=="comps": tdb = { "track" : parentName, "shortLabel": parentName, "visibility" : "dense", "compositeTrack" : "on", "autoScale" : "group", "type" : "bed 4" } metaOverride(tdb, parentMeta) groupMeta = parentMeta parentHasViews = mostFilesArePaired(typeDict) if parentHasViews: tdb["subGroup1"] = "view Views PK=Peaks SIG=Signals" logging.info("Container track %s has >80%% of paired files, activating views" % parentName) writeStanza(ofh, indent, tdb) indent = 4 if parentHasViews: # we have composites with paired files? 
-> write the track stanzas for the two views groupMeta = parseMeta(subDir) tdbViewPeaks = { "track" : parentName+"ViewPeaks", "shortLabel" : parentName+" Peaks", "parent" : parentName, "view" : "PK", "visibility" : "dense", "type" : "bigBed", "scoreFilter" : "off", "viewUi" : "on" } metaOverride(tdbViewPeaks, parentMeta) writeStanza(ofh, indent, tdbViewPeaks) tdbViewSig = { "track" : parentName+"ViewSignal", "shortLabel" : parentName+" Signal", "parent" : parentName, "view" : "SIG", "visibility" : "dense", "type" : "bigWig", "viewUi" : "on" } metaOverride(tdbViewSig, parentMeta) writeStanza(ofh, indent, tdbViewSig) else: # no composites groupMeta = parseMeta(subDir) typeDict = reorderTracks(typeDict, groupMeta) for trackBase, typeFnames in typeDict.items(): for fileType, absFnames in typeFnames.items(): assert(len(absFnames)==1) # for now, not sure what to do when we get multiple basenames of the same file type absFname = absFnames[0] fileBase = basename(absFname) relFname = relpath(absFname, tdbDir) labelSuff = "" if parentHasViews: if fileType=="bigWig": labelSuff = " Signal" elif fileType=="bigBed": labelSuff = " Peaks" else: assert(False) # views and non-bigWig/Bed are not supported yet? if parentName is not None: parentPrefix = parentName+"__" else: parentPrefix = "" trackName = parentPrefix+trackBase+"__"+fileType tdb = { "track" : trackName, "shortLabel" : trackBase+labelSuff, "longLabel" : trackBase+labelSuff, "visibility" : "dense", "type" : fileType, "bigDataUrl" : relFname, } if parentName: tdb["parent"] = parentName if parentHasViews: onOff = "on" if trackName in groupMeta and "visibility" in groupMeta[trackName]: vis = groupMeta[trackName]["visibility"] if vis=="hide": onOff = "off" del tdb["visibility"] if fileType=="bigBed": tdb["parent"] = parentName+"ViewPeaks"+" "+onOff tdb["subGroups"] = "view=PK" else: tdb["parent"] = parentName+"ViewSignal"+" "+onOff tdb["subGroups"] = "view=SIG" metaOverride(tdb, groupMeta) if trackName in groupMeta and "visibility" in groupMeta[trackName]: del tdb["visibility"] writeStanza(ofh, indent, tdb) def importJbrowse(baseUrl, db, outDir): " import an IGV trackList.json hierarchy " import requests outFn = join(outDir, "hub.txt") ofh = open(outFn, "w") writeHubGenome(ofh, db, {}) trackListUrl = baseUrl+"/data/trackList.json" logging.info("Loading %s" % trackListUrl) trackList = requests.get(trackListUrl).json() tdbs = [] for tl in trackList["tracks"]: if "type" in tl and tl["type"]=="SequenceTrack": logging.info("Genome is: "+tl["label"]) continue if tl["storeClass"]=="JBrowse/Store/SeqFeature/NCList": logging.info("NCList found: ",tl) continue tdb = {} tdb["track"] = tl["label"] tdb["shortLabel"] = tl["key"] if "data_file" in tl: tdb["bigDataUrl"] = baseUrl+"/data/"+tl["data_file"] else: tdb["bigDataUrl"] = baseUrl+"/data/"+tl["urlTemplate"] if tl["storeClass"] == "JBrowse/Store/SeqFeature/BigWig": tdb["type"] = "bigWig" dispMode = tl.get("display_mode") if dispMode: if dispMode=="normal": tdb["visibility"] = "pack" elif dispMode=="compact": tdb["visibility"] = "dense" else: tdb["visibility"] = "pack" else: tdb["visibility"] = "pack" writeStanza(ofh, 0, tdb) def installModule(package): " install a package " logging.info("Could not find Python module '%s', trying to install with pip" % package) subprocess.check_call([sys.executable, "-m", "pip", "install", package]) def cacheLoad(fname): " load file cache from json file " if not isfile(fname): logging.debug("No upload cache present") return {} logging.debug("Loading "+fname) return 
json.load(open(fname)) def cacheWrite(uploadCache, fname): logging.debug("Writing "+fname) json.dump(uploadCache, open(fname, "w"), indent=4) def getFileType(fbase): " return the file type defined in the hubspace system, given a base file name " ret = "NA" for fileType, fileExts in fileTypeExtensions.items(): for fileExt in fileExts: if fbase.endswith(fileExt): ret = fileType break if ret!="NA": break - logging.debug("file type for %s is %s" % (fbase, fileType)) + logging.debug("file type for %s is %s" % (fbase, ret)) return ret def uploadFiles(tdbDir, hubName): "upload files to hubspace. Server name and token can come from ~/.hubtools.conf " try: from tusclient import client except ModuleNotFoundError: installModule("tuspy") from tusclient import client serverUrl = cfgOption("tusUrl", "https://hubspace.gi.ucsc.edu/files") cookies = {} cookieNameUser = cfgOption("wiki.userNameCookie", "wikidb_mw1_UserName") cookieNameId = cfgOption("wiki.loggedInCookie", "wikidb_mw1_UserID") apiKey = cfgOption("apiKey") if apiKey is None: errAbort("To upload files, the file ~/.hubtools.conf must contain a line like apiKey='xxx').\n" "Go to https://genome.ucsc.edu/cgi-bin/hgHubConnect#dev to create a new apiKey. Then run \n" " echo 'apiKey=\"xxxx\"' >> ~/.hubtools.conf \n" "and run the 'hubtools up' command again.") logging.info(f"TUS server URL: {serverUrl}") my_client = client.TusClient(serverUrl) cacheFname = join(tdbDir, ".hubtools.files.json") uploadCache = cacheLoad(cacheFname) - for rootDir, dirs, files in os.walk(tdbDir): + logging.debug("trackDb directory is %s" % tdbDir) + for rootDir, _, files in os.walk(tdbDir): for fbase in files: if fbase.startswith("."): continue - fpath = join(rootDir, fbase) - localPath = join(tdbDir, fpath) + localPath = normpath(join(rootDir, fbase)) + logging.debug("localPath: %s" % localPath) localMtime = os.stat(localPath).st_mtime # skip files that have not changed their mtime since last upload - if fpath in uploadCache: + if localPath in uploadCache: - cacheMtime = uploadCache[fpath]["mtime"] + cacheMtime = uploadCache[localPath]["mtime"] if localMtime == cacheMtime: logging.info("%s: file mtime unchanged, not uploading again" % localPath) continue else: logging.debug("file %s: mtime is %f, cache mtime is %f, need to re-upload" % - (fpath, localMtime, cacheMtime)) + (localPath, localMtime, cacheMtime)) else: - logging.debug("file %s not in upload cache" % fpath) + logging.debug("file %s not in upload cache" % localPath) fileType = getFileType(fbase) - parentDir = join(hubName, rootDir) + fileAbsPath = abspath(localPath) + remoteRelPath = relpath(fileAbsPath, tdbDir) + remoteDir = dirname(remoteRelPath) + meta = { "apiKey" : apiKey, - "parentDir" : parentDir, + "parentDir" : remoteDir, "genome":"NA", "fileName" : fbase, "hubName" : hubName, "hubtools" : "true", "fileType": fileType, "lastModified" : str(int(localMtime)), } - logging.info(f"Uploading {fpath}, meta {meta}") + logging.info(f"Uploading {localPath}, meta {meta}") uploader = my_client.uploader(localPath, metadata=meta) uploader.upload() # note that this file was uploaded cache = {} mtime = os.stat(localPath).st_mtime cache["mtime"] = mtime cache["size"] = os.stat(localPath).st_size - uploadCache[fpath] = cache + uploadCache[localPath] = cache cacheWrite(uploadCache, cacheFname) def iterRaStanzas(fname): " parse an ra-style (trackDb) file and yield dictionaries " data = dict() logging.debug("Parsing %s in trackDb format" % fname) with open(fname, "rt") as ifh: for l in ifh: l = l.lstrip(" ").rstrip("\r\n") if len(l)==0: yield data data = dict() else: if " " not in l: continue key, val =
l.split(" ", maxsplit=1) data[key] = val if len(data)!=0: yield data def raToTab(fname): " convert .ra file to .tsv " stanzas = [] allFields = set() for stanza in iterRaStanzas(fname): if "hub" in stanza or "genome" in stanza: continue allFields.update(stanza.keys()) stanzas.append(stanza) if "track" in allFields: allFields.remove("track") if "shortLabel" in allFields: allFields.remove("shortLabel") hasLongLabel = False if "longLabel" in allFields: allFields.remove("longLabel") hasLongLabel = True sortedFields = sorted(list(allFields)) # make sure that track shortLabel and longLabel come first and always there, handy for manual edits if hasLongLabel: sortedFields.insert(0, "longLabel") sortedFields.insert(0, "shortLabel") sortedFields.insert(0, "track") ofh = sys.stdout ofh.write("#") ofh.write("\t".join(sortedFields)) ofh.write("\n") for s in stanzas: row = [] for fieldName in sortedFields: row.append(s.get(fieldName, "")) ofh.write("\t".join(row)) ofh.write("\n") def guessFieldDesc(fieldNames): "given a list of field names, try to guess to which fields in bed12 they correspond " logging.info("No field description specified, guessing fields from TSV headers: %s" % fieldNames) fieldDesc = {} skipFields = set() for fieldIdx, fieldName in enumerate(fieldNames): caseField = fieldName.lower() if caseField in ["chrom", "chromosome"]: name = "chrom" elif caseField.endswith(" id") or caseField.endswith("accession"): name = "name" elif caseField in ["start", "chromstart", "position"]: name = "start" elif caseField in ["Reference"]: name = "refAllele" elif caseField in ["strand"]: name = "strand" elif caseField in ["score"]: name = "score" else: continue fieldDesc[name] = fieldIdx skipFields.add(fieldIdx) logging.info("TSV <-> BED correspondance: %s" % fieldDesc) return fieldDesc, skipFields def parseFieldDesc(fieldDescStr, fieldNames): " given a string chrom=1,start=2,end=3,... return dict {chrom:1,...} " if not fieldDescStr: return guessFieldDesc(fieldNames) fieldDesc, skipFields = guessFieldDesc(fieldNames) if fieldDescStr: for part in fieldDescStr.split(","): fieldName, fieldIdx = part.split("=") fieldIdx = int(fieldIdx) fieldDesc[fieldName] = fieldIdx skipFields.add(fieldIdx) return fieldDesc, skipFields def makeBedRow(row, fieldDesc, skipFields, isOneBased): " given a row of a tsv file and a fieldDesc with BedFieldName -> field-index, return a bed12+ row with extra fields " bedRow = [] # first construct the bed 12 fields for fieldName in ["chrom", "start", "end", "name", "score", "strand", "thickStart", "thickEnd", "itemRgb", "blockCount", "blockSizes", "chromStarts"]: fieldIdx = fieldDesc.get(fieldName) if fieldIdx is not None: if fieldName=="start": chromStart = int(row[fieldDesc["start"]]) if isOneBased: chromStart = chromStart-1 val = str(chromStart) elif fieldName=="end": chromEnd = int(val) else: val = row[fieldIdx] else: if fieldName=="end": chromEnd = chromStart+1 val = str(chromEnd) elif fieldName=="score": val = "0" elif fieldName=="strand": val = "." 
elif fieldName=="thickStart": val = str(chromStart) elif fieldName=="thickEnd": val = str(chromEnd) elif fieldName=="itemRgb": val = "0,0,0" elif fieldName=="blockCount": val = "1" elif fieldName=="blockSizes": val = str(chromEnd-chromStart) elif fieldName=="chromStarts": #val = str(chromStart) val = "0" else: logging.error("Cannot find a field for %s" % fieldName) sys.exit(1) bedRow.append(val) # now handle all other fields for fieldIdx, val in enumerate(row): if not fieldIdx in skipFields: bedRow.append( row[fieldIdx] ) return bedRow def fetchChromSizes(db, outputFileName): " find on local disk or download a <db>.sizes text file " # Construct the URL based on the database name url = f"https://hgdownload.cse.ucsc.edu/goldenPath/{db}/database/chromInfo.txt.gz" # Send a request to download the file response = requests.get(url, stream=True) # Check if the request was successful if response.status_code == 200: # Open the output gzip file for writing with gzip.open(outputFileName, 'wt') as outFile: # Open the response content as a gzip file in text mode with gzip.GzipFile(fileobj=response.raw, mode='r') as inFile: # Read the content using csv reader to handle tab-separated values reader = csv.reader(inFile, delimiter='\t') writer = csv.writer(outFile, delimiter='\t', lineterminator='\n') # Iterate through each row, and retain only the first two fields for row in reader: writer.writerow(row[:2]) # Write only the first two fields else: raise Exception(f"Failed to download file from {url}, status code: {response.status_code}") logging.info("Downloaded %s to %s" % (db, outputFileName)) def getChromSizesFname(db): " return fname of chrom sizes, download into ~/.local/ucscData/ if not found " fname = "/hive/data/genomes/%s/chrom.sizes" % db if isfile(fname): return fname dataDir = "~/.local/hubtools" fname = join(dataDir, "%s.sizes" % db) fname = os.path.expanduser(fname) if not isfile(fname): makedirs(dataDir) fetchChromSizes(db, fname) return fname def convTsv(db, tsvFname, outBedFname, outAsFname, outBbFname): " convert tsv files in inDir to outDir, assume that they all have one column for chrom, start and end. Try to guess these or fail. " #tsvFname, outBedFname, outAsFname = args # join and output merged bed bigCols = set() # col names of columns with > 255 chars #bedFh = open(bedFname, "w") unsortedBedFh = tempfile.NamedTemporaryFile(suffix=".bed", dir=dirname(outBedFname), mode="wt") fieldNames = None #isOneBased = options.oneBased isOneBased = True bedFieldsDesc = None # in the future, the user may want to input a string like name=0,start=1, but switch this off for now for line in open(tsvFname): row = line.rstrip("\r\n").split("\t") if fieldNames is None: fieldNames = row fieldDesc, notExtraFields = parseFieldDesc(bedFieldsDesc, fieldNames) continue # note fields with data > 255 chars. 
for colName, colData in zip(fieldNames, row): if len(colData)>255: bigCols.add(colName) bedRow = makeBedRow(row, fieldDesc, notExtraFields, isOneBased) chrom = bedRow[0] if chrom.isdigit() or chrom in ["X", "Y"]: bedRow[0] = "chr"+bedRow[0] unsortedBedFh.write( ("\t".join(bedRow))) unsortedBedFh.write("\n") unsortedBedFh.flush() cmd = "sort -k1,1 -k2,2n %s > %s" % (unsortedBedFh.name, outBedFname) assert(os.system(cmd)==0) unsortedBedFh.close() # removes the temp file # generate autosql # BED fields #bedColCount = int(options.type.replace("bed", "").replace("+" , "")) bedColCount = 12 asFh = open(outAsFname, "w") asFh.write(asHead) asFh.write("\n".join(asLines[:bedColCount+1])) asFh.write("\n") # extra fields #fieldNames = fieldNames[bedColCount:] for fieldIdx, field in enumerate(fieldNames): if fieldIdx in notExtraFields: continue name = field.replace(" ","") name = field.replace("%","perc_") name = re.sub("[^a-zA-Z0-9]", "", name) name = name[0].lower()+name[1:] fType = "string" if field in bigCols: fType = "lstring" asFh.write(' %s %s; "%s" \n' % (fType, name, field)) asFh.write(")\n") asFh.close() chromSizesFname = getChromSizesFname(db) cmd = ["bedToBigBed", outBedFname, chromSizesFname, outBbFname, "-tab", "-type=bed%d+" % bedColCount, "-as=%s" % outAsFname] subprocess.check_call(cmd) def convTsvDir(inDir, db, outDir): " find tsv files under inDir and convert them all to .bb " ext = "tsv" pat = join(inDir, '**/*.'+ext) logging.debug("Finding files under %s (%s), writing output to %s" % (inDir, pat, outDir)) for fname in glob.glob(pat, recursive=True): logging.debug("Found %s" % fname) absFname = abspath(fname) relFname = relpath(absFname, inDir) outPath = Path(join(outDir, relFname)) if not outPath.parents[0].is_dir(): logging.debug("mkdir -p %s" % outPath.parents[0]) makedirs(outPath.parents[0]) bedFname = outPath.with_suffix(".bed") asFname = outPath.with_suffix(".as") bbFname = outPath.with_suffix(".bb") convTsv(db, fname, bedFname, asFname, bbFname) def parseTrackLine(s): " Use shlex to split the string respecting quotes, written by chatGPT " lexer = shlex.shlex(s, posix=True) lexer.whitespace_split = True lexer.wordchars += '=' # Allow '=' as part of words tokens = list(lexer) # Convert the tokens into a dictionary it = iter(tokens) result = {} for token in it: if '=' in token: key, value = token.split('=', 1) result[key] = value.strip('"') # Remove surrounding quotes if present else: result[token] = next(it).strip('"') # Handle cases like name="...". return result def readTrackLines(fnames): " read the first line and convert to a dict of all fnames. " logging.debug("Reading track lines from %s" % fnames) ret = {} for fn in fnames: line1 = open(fn).readline().rstrip("\n") notTrack = line1.replace("track ", "", 1) tdb = parseTrackLine(notTrack) ret[fn] = tdb return ret def stripFirstLine(inputFilename, outputFilename): " written by chatGpt: copies all lines to output, except the first line " with open(inputFilename, 'r') as infile, open(outputFilename, 'w') as outfile: # Skip the first line next(infile) # Write the rest of the lines to the output file for line in infile: outfile.write(line) def bedToBigBed(inFname, db, outFname): " convert bed to bigbed file, handles chrom.sizes download " chromSizesFname = getChromSizesFname(db) cmd = ["bedToBigBed", inFname, chromSizesFname, outFname] logging.debug("Running %s" % cmd) logging.info(f'Converting {inFname} to {outFname}. 
(chromSizes: {chromSizesFname}') subprocess.check_call(cmd) def convCtDb(db, inDir, outDir): " convert one db part of a track archive to an output directory " findGlob = join(inDir, "*.ct") inFnames = glob.glob(findGlob) if len(inFnames)==0: logging.info("No *.ct files found in %s" % findGlob) return tdbData = readTrackLines(inFnames) makedirs(outDir) hubTxtFname = join(outDir, "hub.txt") ofh = open(hubTxtFname, "wt") meta = parseMeta(inDir) writeHubGenome(ofh, db, meta) tdbIdx = 0 for fname, tdb in tdbData.items(): tdbIdx += 1 # custom track names can include spaces, spec characters, etc. Strip all those # append a number to make sure that the result is unique track = tdb["name"] track = re.sub('[^A-Za-z0-9]+', '', track)+"_"+str(tdbIdx) tdb["track"] = track tdb["shortLabel"] = tdb["name"] del tdb["name"] tdb["longLabel"] = tdb["description"] del tdb["description"] if "bigDataUrl" not in tdb: bedFname = join(outDir, tdb["track"]+".bed") bbFname = join(outDir, tdb["track"]+".bb") stripFirstLine(fname, bedFname) bedToBigBed(bedFname, db, bbFname) os.remove(bedFname) tdb["bigDataUrl"] = tdb["track"]+".bb" tdb["type"] = "bigBed" writeStanza(ofh, 0, tdb) logging.info("Wrote %s" % hubTxtFname) ofh.close() def convArchDir(inDir, outDir): " convert a directory created from the .tar.gz file downloaded via our track archive feature " dbContent = os.listdir(inDir) dbDirs = [] for db in dbContent: subDir = join(inDir, db) if isdir(subDir): dbDirs.append((db, subDir)) if len(dbDirs)==0: errAbort("No directories found under %s. Extract the tarfile and point this program at the 'archive' directory." % inDir) for db, inSubDir in dbDirs: logging.debug("Processing %s, db=%s" % (inSubDir, db)) outSubDir = join(outDir, db) convCtDb(db, inSubDir, outSubDir) def hgsidFromUrl(url): " return hgsid given a URL. written by chatGPT " # Parse the URL parsed_url = urllib.parse.urlparse(url) server_name = f"{parsed_url.scheme}://{parsed_url.netloc}" query_params = urllib.parse.parse_qs(parsed_url.query) hgsid = query_params.get('hgsid') # Return the hgsid value if it exists if hgsid: return server_name, hgsid[0] # Extracting the first value from the list errAbort("hgsid parameter not found in "+url) def convCtUrl(url, outDir): " given an hgTracks URL with an hgsid on it, get all custom track lines and create a hub file for it. Does not download custom tracks. " serverUrl, hgsid = hgsidFromUrl(url) cartDumpUrl = serverUrl+"/cgi-bin/cartDump?hgsid="+hgsid def hubtools(args, options): """ find files under dirName and create a trackDb.txt for them""" cmd = args[0] inDir = "." if options.inDir: inDir = options.inDir outDir = "." if options.outDir: outDir = options.outDir if cmd=="up": if len(args)<2: errAbort("The 'up' command requires one argument, the name of the hub. You can specify any name, " - "ideally a short, meaningful string, e.g. atacseq, muscle-rna or yamamoto2022.") + "ideally a short, meaningful string, e.g. atacseq, muscle-rna or yamamoto2022. 
Avoid special characters.") hubName = args[1] uploadFiles(inDir, hubName) return tdbDir = inDir if options.outDir: tdbDir = options.outDir if cmd=="jbrowse": importJbrowse(args[1], args[2], tdbDir) elif cmd == "tab": raToTab(args[1]) elif cmd == "make": db = args[1] meta = parseMeta(inDir) dirFiles = readDirs(inDir, meta) hubFname = join(tdbDir, "hub.txt") logging.info("Writing %s" % hubFname) ofh = open(hubFname, "w") meta = parseMeta(inDir) writeHubGenome(ofh, db, meta) writeTdb(inDir, dirFiles, "top", tdbDir, ofh) writeTdb(inDir, dirFiles, "comps", tdbDir, ofh) ofh.close() elif cmd=="conv": db = args[1] convTsvDir(inDir, db, outDir) elif cmd=="archive": convArchDir(inDir, outDir) elif cmd=="ct": url = args[1] convCtUrl(url, outDir) else: - logging.error("Unknown command: '%s'" % args[1]) + logging.error("Unknown command: '%s'" % cmd) # ----------- main -------------- def main(): args, options = parseArgs() hubtools(args, options) main()
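
Note on the path handling this commit changes in uploadFiles(): the removed lines joined the os.walk() path onto the -i directory a second time, which broke uploads when -i was given; the new code normalizes the walk path and derives the hubSpace parentDir relative to the -i directory. A minimal sketch of that mapping, using only os.path; mapUploadPath is an illustrative name, not a hubtools function:

from os.path import join, normpath, abspath, relpath, dirname

def mapUploadPath(tdbDir, rootDir, fbase):
    " illustrative sketch of the localPath/parentDir computation in uploadFiles() "
    localPath = normpath(join(rootDir, fbase))           # e.g. myHub/signals/a.bw, even if -i ends in '/'
    remoteRelPath = relpath(abspath(localPath), tdbDir)  # path relative to the -i directory
    remoteDir = dirname(remoteRelPath)                   # sent as 'parentDir' in the TUS upload metadata
    return localPath, remoteDir

# mapUploadPath("myHub", "myHub/signals", "a.bw") -> ("myHub/signals/a.bw", "signals")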
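
How 'hubtools make' groups files into composites and views, as implemented by readFnames(), mostFilesArePaired() and writeTdb(); the directory layout below is invented for illustration:

# Invented example layout for 'hubtools make hg38':
#   ./top1.bigWig              -> top-level track 'top1__bigWig'
#   ./atac/s1.peaks.bigBed     -> composite 'atac', track base name 's1' (part before the first dot)
#   ./atac/s1.raw.bigWig       -> pairs with the bigBed above because it shares the base name 's1'
#   ./atac/s2.peaks.bigBed
#   ./atac/s2.raw.bigWig
# Every base name in atac/ has both a bigBed and a bigWig (100% > 80%), so writeTdb()
# adds 'subGroup1 view Views PK=Peaks SIG=Signals' to the composite and parents the
# tracks 'atac__s1__bigBed', 'atac__s1__bigWig', ... to the 'atacViewPeaks' and
# 'atacViewSignal' view stanzas.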
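
For the archive/ct path, readTrackLines() strips the leading 'track ' from the first line of each .ct file and hands the rest to parseTrackLine(), which splits with shlex so quoted values keep their spaces; a usage sketch with a made-up custom-track line:

# Made-up custom-track line, for illustration only:
#   track type=bigBed name="My peaks" description="ATAC peaks, rep 1" visibility=pack
# parseTrackLine() on the part after 'track ' should return roughly:
#   {'type': 'bigBed', 'name': 'My peaks', 'description': 'ATAC peaks, rep 1', 'visibility': 'pack'}
# convCtDb() then uses 'name' for the track name and shortLabel, and 'description' for longLabel.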