5d2cc0dd88a130e3e1c3eb3d9f1ad5d7d1f67e04 max Wed Feb 26 08:10:53 2025 -0800 typo in previous commit diff --git src/utils/hubtools/hubtools src/utils/hubtools/hubtools index 7ace50d5b2e..437751cd712 100755 --- src/utils/hubtools/hubtools +++ src/utils/hubtools/hubtools @@ -1,1276 +1,1276 @@ #!/usr/bin/env python3 import logging, sys, optparse, os, json, subprocess, shutil, string, glob, tempfile, re import shlex from pathlib import Path from collections import defaultdict, OrderedDict from os.path import join, basename, dirname, isfile, relpath, abspath, splitext, isdir, normpath #import pyyaml # not loaded here, so it's not a hard requirement, is lazy loaded in parseMetaYaml() # ==== functions ===== # allowed file types by hubtools up # copied from the javascript file hgHubConnect.js fileTypeExtensions = { "bigBed": [ ".bb", ".bigbed" ], "bam": [ ".bam" ], "vcf": [ ".vcf" ], "vcfTabix": [ ".vcf.gz", "vcf.bgz" ], "bigWig": [ ".bw", ".bigwig" ], "hic": [ ".hic" ], "cram": [ ".cram" ], "bigBarChart": [ ".bigbarchart" ], "bigGenePred": [ ".bgp", ".biggenepred" ], "bigMaf": [ ".bigmaf" ], "bigInteract": [ ".biginteract" ], "bigPsl": [ ".bigpsl" ], "bigChain": [ ".bigchain" ], "bamIndex": [ ".bam.bai", ".bai" ], "tabixIndex": [ ".vcf.gz.tbi", "vcf.bgz.tbi" ] } asHead = """table bed "Browser extensible data (<=12 fields) " ( """ asLines = """ string chrom; "Chromosome (or contig, scaffold, etc.)" uint chromStart; "Start position in chromosome" uint chromEnd; "End position in chromosome" string name; "Name of item" uint score; "Score from 0-1000" char[1] strand; "+ or -" uint thickStart; "Start of where display should be thick (start codon)" uint thickEnd; "End of where display should be thick (stop codon)" uint reserved; "Used as itemRgb as of 2004-11-22" int blockCount; "Number of blocks" int[blockCount] blockSizes; "Comma separated list of block sizes" int[blockCount] chromStarts; "Start positions relative to chromStart" """.split("\n") def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("""usage: %prog [options] <cmd> - create and edit UCSC track hubs hubtools make <assemblyCode e.g. hg38>: create a track hub for all bigBed/bigWig files under a directory. Creates single-file hub.txt, and tries to guess reasonable settings from the file names: - bigBed/bigWig files in the current directory will be top level tracks - big* files in subdirectories become composites - for every filename, the part before the first dot becomes the track base name - if a directory has more than 80% of track base names with both a bigBed and bigWig file, views are activated for this composite - track attributes can be changed using tracks.ra, tracks.tsv, tracks.json or tracks.yaml files, in each top or subdirectory - The first column of tracks.tsv must be 'track'. - tracks.json and tracks.yaml must have objects at the top level, the attributes are <trackName> or the special attribute "hub". hubtools up <hubName>: upload files to hubSpace - needs ~/.hubtools.conf with a line apiKey="xxxx". Create a key by going to My Data > Track Hubs > Track Development on https://genome.ucsc.edu - uploads all files from the -i directory or the current dir if not specified. hubtools jbrowse <url> <db> : convert Jbrowse trackList.json files to hub.txt. - <url> is the URL to the Jbrowse2 installation, e.g. 
http://furlonglab.embl.de/FurlongBrowser/ - <db> is assembly identifier hubtools tab <fname>: convert a hub.txt or trackDb.txt to tab-sep format, easier to bulk-edit with sed/cut/etc. - <fname> is the input filename. "hub" and "genome" stanzas are skipped. - output goes to stdout hubtools conv -i myTsvDir - convert .tsv files in inDir to .bigBed files in current directory hubtools archive -i xxx/archive -o outDir - Create a hub from an extracted xxx.tar.gz file "archive" directory. You can download a .tar.gz from in the Genome Browser under My Data > My Session. The archive contains all your custom tracks. It is usually easier to use the 'ct' command, as this will automatically download an archive an convert it, given a full genome browser URL with an "hgsid" parameter on the URL. hubtools ct <hgTracksUrl> - Download all custom tracks from a Genome Browser URL, either as a /s/ stable shortlink or the temporary hgsid=xxxx URL Examples: hubtools conv -i myTsvs/ hubtools make hg38 hubtools jbrowse http://furlonglab.embl.de/FurlongBrowser/ dm3 hubtools tab hub.txt > tracks.tsv hubtools session tar xvfz SC_20230723_backup.tar.gz hubtools archive -i archive -o hub hubtools ct 'https://genome-euro.ucsc.edu/cgi-bin/hgTracks?db=hg38&position=chr7%3A155798978%2D155810671&hgsid=345826202_8d3Mpumjaw9IXYI2RDaSt5xMoUhH' For the "hubtools make" step, you can specify additional options for your tracks via tracks.json/.yaml or tracks.tsv: tracks.json can look like this, can have more keys, one per track, or the special key "hub": { "hub" : { "hub": "mouse_motor_atac", "shortLabel":"scATAC-seq Developing Cranial Motor Neurons" }, "myTrack" : { "shortLabel" : "My nice track" } } tracks.tsv should look like this, other columns can be added: #track<tab>shortLabel myTrack<tab>My nice track For a list of all possible fields/columns see https://genome.ucsc.edu/goldenpath/help/trackDb/trackDbHub.html: """) parser.add_option("-i", "--inDir", dest="inDir", action="store", help="Input directory where files are stored. Default is current directory") parser.add_option("-o", "--outDir", dest="outDir", action="store", help="Input directory where hub.txt file is created. Default is same as input directory.") #parser.add_option("", "--igv", dest="igv", action="store", help="import an igv.js trackList.json file hierarchy") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show verbose debug messages") parser.add_option("-u", "--upload", dest="upload", action="store_true", help="upload all files from outDir to hubSpace") (options, args) = parser.parse_args() if len(args)==0: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) return args, options def errAbort(msg): " print and abort) " logging.error(msg) sys.exit(1) def makedirs(d): if not isdir(d): os.makedirs(d) def parseConf(fname): " parse a hg.conf style file, return as dict key -> value (all strings) " logging.debug("Parsing "+fname) conf = {} for line in open(fname): line = line.strip() if line.startswith("#"): continue elif line.startswith("include "): inclFname = line.split()[1] absFname = normpath(join(dirname(fname), inclFname)) if os.path.isfile(absFname): inclDict = parseConf(absFname) conf.update(inclDict) elif "=" in line: # string search for "=" key, value = line.split("=",1) conf[key] = value return conf # cache of hg.conf contents hgConf = None def parseHgConf(): """ return hg.conf as dict key:value. 
""" global hgConf if hgConf is not None: return hgConf hgConf = dict() # python dict = hash table fname = os.path.expanduser("~/.hubtools.conf") if isfile(fname): hgConf = parseConf(fname) else: fname = os.path.expanduser("~/.hg.conf") if isfile(fname): hgConf = parseConf(fname) def cfgOption(name, default=None): " return hg.conf option or default " global hgConf if not hgConf: parseHgConf() return hgConf.get(name, default) def parseMetaRa(fname): """parse tracks.ra or tracks.txt and return as a dict of trackName -> dict of key->val """ logging.debug("Reading %s as .ra" % fname) trackName = None stanzaData = {} ret = {} for line in open(fname): line = line.strip() if line.startswith("#"): continue if line=="": if len(stanzaData)==0: # double newline continue if trackName is None: errAbort("File %s has a stanza without a track name" % fname) if trackName in ret: errAbort("File %s contains two stanzas with the same track name '%s' " % trackName) ret[trackName] = stanzaData stanzaData = {} trackName = None continue key, val = line.split(" ", maxsplit=1) if key == "track": trackName = val continue else: stanzaData[key] = val if len(stanzaData)!=0: ret[trackName] = stanzaData logging.debug("Got %s from .ra" % str(ret)) return ret def parseMetaTsv(fname): " parse a tracks.tsv file and return as a dict of trackName -> dict of key->val " headers = None meta = {} logging.debug("Parsing track meta data from %s in tsv format" % fname) for line in open(fname): row = line.rstrip("\r\n").split("\t") if headers is None: assert(line.startswith("track\t") or line.startswith("#track")) row[0] = row[0].lstrip("#") headers = row continue assert(len(row)==len(headers)) key = row[0] rowDict = {} for header, val in zip(headers[1:], row[1:]): rowDict[header] = val #row = {k:v for k,v in zip(headers, fs)} meta[key] = rowDict return meta def parseMetaJson(fname): " parse a json file and merge it into meta and return " logging.debug("Reading %s as json" % fname) newMeta = json.load(open(fname)) return newMeta def parseMetaYaml(fname): " parse yaml file " import yaml # if this doesn't work, run 'pip install pyyaml' with open(fname) as stream: try: return yaml.safe_load(stream) except yaml.YAMLError as exc: logging.error(exc) def parseMeta(inDir): " parse a tab-sep file with headers and return an ordered dict firstField -> dictionary " fname = join(inDir, "tracks.tsv") meta = OrderedDict() if isfile(fname): tsvMeta = parseMetaTsv(fname) meta = allMetaOverride(meta, tsvMeta) fname = join(inDir, "tracks.json") if isfile(fname): jsonMeta = parseMetaJson(fname) meta = allMetaOverride(meta, jsonMeta) fname = join(inDir, "tracks.ra") if isfile(fname): raMeta = parseMetaRa(fname) meta = allMetaOverride(meta, raMeta) fname = join(inDir, "tracks.yaml") if isfile(fname): yamlMeta = parseMetaYaml(fname) meta = allMetaOverride(meta, yamlMeta) logging.debug("Got meta from %s: %s" % (inDir, str(meta))) return meta def writeHubGenome(ofh, db, inMeta): " create a hub.txt and genomes.txt file, hub.txt is just a template " meta = inMeta.get("hub", {}) ofh.write("hub autoHub\n") ofh.write("shortLabel %s\n" % meta.get("shortLabel", "Auto-generated hub")) ofh.write("longLabel %s\n" % meta.get("longLabel", "Auto-generated hub")) #ofh.write("genomesFile genomes.txt\n") if "descriptionUrl" in meta: ofh.write("descriptionUrl %s\n" % meta["descriptionUrl"]) ofh.write("email %s\n" % meta.get("email", "yourEmail@example.com")) ofh.write("useOneFile on\n\n") ofh.write("genome %s\n\n" % db) return ofh def readSubdirs(inDir, subDirs): " given a 
list of dirs, find those that are composite dirs (not supporting supertracks for now) " compDicts, superDicts = {}, {} for subDir in subDirs: subPath = join(inDir, subDir) subSubDirs, subDict = readFnames(subPath) if len(subDict)==0: # no files in this dir continue if len(subSubDirs)==0: compDicts[subDir] = subDict #else: #superDicts[subDir] = subDict return compDicts, superDicts def reorderDirs(compDirs, meta): " order the directories in compDirs in the order that they appear in the meta data -> will mean that composites have the right order " if len(meta)==0: logging.debug("Not reordering these subdirectories: %s" % compDirs.keys()) return compDirs # first use the names in the meta data, and put in the right order newCompDirs = OrderedDict() for dirName in meta.keys(): if dirName in compDirs: newCompDirs[dirName] = compDirs[dirName] # then add everything else at the end for dirName in compDirs: if dirName not in newCompDirs: newCompDirs[dirName] = compDirs[dirName] logging.debug("Reordered input directories based on meta data. New order is: %s" % newCompDirs.keys()) return newCompDirs def readDirs(inDir, meta): " recurse down into directories and return containerType -> parentName -> fileBase -> type -> list of absPath " ret = {} subDirs, topFiles = readFnames(inDir) ret["top"] = { None : topFiles } # top-level track files have None as the parent compDirs, superDirs = readSubdirs(inDir, subDirs) compDirs = reorderDirs(compDirs, meta) # superDirs not used yet, no time ret["comps"] = compDirs ret["supers"] = superDirs return ret def readFnames(inDir): " return dict with basename -> fileType -> filePath " fnameDict = defaultdict(dict) #tdbDir = abspath(dirname(trackDbPath)) subDirs = [] for fname in os.listdir(inDir): filePath = join(inDir, fname) if isdir(filePath): subDirs.append(fname) continue baseName, ext = splitext(basename(fname)) ext = ext.lower() # actually, use the part before the first dot, not the one before the extension, as the track name # this means that a.scaled.bigBed and a.raw.bigWig get paired correctly fileBase = basename(fname).split(".")[0] if ext==".bw" or ext==".bigwig": fileType = "bigWig" elif ext==".bb" or ext==".bigbed": fileType = "bigBed" else: logging.debug("file %s is not bigBed nor bigWig, skipping" % fname) continue absFname = abspath(filePath) #relFname = relFname(absFname, tdbDir) fnameDict[fileBase].setdefault(fileType, []) #fnameDict[baseName][fileType].setdefault([]) fnameDict[fileBase][fileType].append(absFname) return subDirs, fnameDict def mostFilesArePaired(fnameDict): " check if 80% of the tracks have a pair bigBed+bigWig" pairCount = 0 for baseName, typeDict in fnameDict.items(): if "bigBed" in typeDict and "bigWig" in typeDict: pairCount += 1 pairShare = pairCount / len(fnameDict) return ( pairShare > 0.8 ) def writeLn(ofh, spaceCount, line): "write line to ofh, with spaceCount before it " ofh.write("".join([" "]*spaceCount)) ofh.write(line) ofh.write("\n") def writeStanza(ofh, indent, tdb): " write a stanza given a tdb key-val dict " track = tdb["track"] shortLabel = tdb.get("shortLabel", track.replace("_", " ")) visibility = tdb.get("visibility", "pack") longLabel = tdb.get("longLabel", shortLabel) if not "type" in tdb: errAbort("Track info has no type attribute: %s" % tdb) trackType = tdb["type"] writeLn(ofh, indent, "track %s" % track) writeLn(ofh, indent, "shortLabel %s" % shortLabel) if longLabel: writeLn(ofh, indent, "longLabel %s" % longLabel) if "parent" in tdb: writeLn(ofh, indent, "parent %s" % tdb["parent"]) writeLn(ofh, indent, 
"type %s" % trackType) writeLn(ofh, indent, "visibility %s" % visibility) if "bigDataUrl" in tdb: writeLn(ofh, indent, "bigDataUrl %s" % tdb["bigDataUrl"]) for key, val in tdb.items(): if key in ["track", "shortLabel", "longLabel", "type", "bigDataUrl", "visibility", "parent"]: continue writeLn(ofh, indent, "%s %s" % (key, val)) ofh.write("\n") def metaOverride(tdb, meta): " override track info for one single track, from meta into tdb " trackName = tdb["track"] if trackName not in meta and "__" in trackName: logging.debug("Using only basename of track %s" % trackName) trackName = trackName.split("__")[1] if trackName not in meta: logging.debug("No meta info for track %s" % tdb["track"]) return trackMeta = meta[trackName] for key, val in trackMeta.items(): if val!="": tdb[key] = trackMeta[key] def allMetaOverride(tdb, meta): " override track info for all tracks, from meta into tdb " if meta is None: return tdb for trackName in meta: trackMeta = meta[trackName] if trackName not in tdb: tdb[trackName] = {} trackTdb = tdb[trackName] for key, val in trackMeta.items(): trackTdb[key] = val return tdb def reorderTracks(fileDict, meta): " given an unsorted dictionary of files and ordered metadata, try to sort the files according to the metadata" if len(meta)==0: return fileDict # no meta data -> no ordering necessary trackOrder = [] # meta is an OrderedDict, so the keys are also ordered for trackName in meta.keys(): if "__" in trackName: # in composite mode, the tracknames contain the parent and the track type trackName = trackName.split("__")[1] trackOrder.append( trackName ) trackOrder = list(meta.keys()) newFiles = OrderedDict() doneTracks = set() # first add the tracks in the order of the meta data for trackBase in trackOrder: # the tsv file can have the track names either as basenames or as full tracknames if trackBase not in fileDict and "__" in trackBase: trackBase = trackBase.split("__")[1] if trackBase in fileDict: newFiles[trackBase] = fileDict[trackBase] doneTracks.add(trackBase) logging.debug("Track ordering from meta data used: %s" % newFiles.keys()) # then add all other tracks for trackBase, fileData in fileDict.items(): if trackBase not in doneTracks: newFiles[trackBase] = fileDict[trackBase] logging.debug("Not specified in meta, so adding at the end: %s" % trackBase) logging.debug("Final track order is: %s" % newFiles.keys()) assert(len(newFiles)==len(fileDict)) return newFiles def writeTdb(inDir, dirDict, dirType, tdbDir, ofh): " given a dict with basename -> type -> filenames, write track entries to ofh " # this code is getting increasingly complex because it supports composites/views and pairing of bigBed/bigWig files # either this needs better comments or maybe a separate code path for this rare use case global compCount fnameDict = dirDict[dirType] for parentName, typeDict in fnameDict.items(): if parentName is None: # top level tracks: use top tracks.tsv subDir = inDir else: # container tracks -> use tracks.tsv in the subdirectory subDir = join(inDir, parentName) parentMeta = parseMeta(subDir) indent = 0 parentHasViews = False groupMeta = {} if dirType=="comps": tdb = { "track" : parentName, "shortLabel": parentName, "visibility" : "dense", "compositeTrack" : "on", "autoScale" : "group", "type" : "bed 4" } metaOverride(tdb, parentMeta) groupMeta = parentMeta parentHasViews = mostFilesArePaired(typeDict) if parentHasViews: tdb["subGroup1"] = "view Views PK=Peaks SIG=Signals" logging.info("Container track %s has >80%% of paired files, activating views" % parentName) 
writeStanza(ofh, indent, tdb) indent = 4 if parentHasViews: # we have composites with paired files? -> write the track stanzas for the two views groupMeta = parseMeta(subDir) tdbViewPeaks = { "track" : parentName+"ViewPeaks", "shortLabel" : parentName+" Peaks", "parent" : parentName, "view" : "PK", "visibility" : "dense", "type" : "bigBed", "scoreFilter" : "off", "viewUi" : "on" } metaOverride(tdbViewPeaks, parentMeta) writeStanza(ofh, indent, tdbViewPeaks) tdbViewSig = { "track" : parentName+"ViewSignal", "shortLabel" : parentName+" Signal", "parent" : parentName, "view" : "SIG", "visibility" : "dense", "type" : "bigWig", "viewUi" : "on" } metaOverride(tdbViewSig, parentMeta) writeStanza(ofh, indent, tdbViewSig) else: # no composites groupMeta = parseMeta(subDir) typeDict = reorderTracks(typeDict, groupMeta) for trackBase, typeFnames in typeDict.items(): for fileType, absFnames in typeFnames.items(): assert(len(absFnames)==1) # for now, not sure what to do when we get multiple basenames of the same file type absFname = absFnames[0] fileBase = basename(absFname) relFname = relpath(absFname, tdbDir) labelSuff = "" if parentHasViews: if fileType=="bigWig": labelSuff = " Signal" elif fileType=="bigBed": labelSuff = " Peaks" else: assert(False) # views and non-bigWig/Bed are not supported yet? if parentName is not None: parentPrefix = parentName+"__" else: parentPrefix = "" trackName = parentPrefix+trackBase+"__"+fileType tdb = { "track" : trackName, "shortLabel" : trackBase+labelSuff, "longLabel" : trackBase+labelSuff, "visibility" : "dense", "type" : fileType, "bigDataUrl" : relFname, } if parentName: tdb["parent"] = parentName if parentHasViews: onOff = "on" if trackName in groupMeta and "visibility" in groupMeta[trackName]: vis = groupMeta[trackName]["visibility"] if vis=="hide": onOff = "off" del tdb["visibility"] if fileType=="bigBed": tdb["parent"] = parentName+"ViewPeaks"+" "+onOff tdb["subGroups"] = "view=PK" else: tdb["parent"] = parentName+"ViewSignal"+" "+onOff tdb["subGroups"] = "view=SIG" metaOverride(tdb, groupMeta) if trackName in groupMeta and "visibility" in groupMeta[trackName]: del tdb["visibility"] writeStanza(ofh, indent, tdb) def importJbrowse(baseUrl, db, outDir): " import an IGV trackList.json hierarchy " import requests outFn = join(outDir, "hub.txt") ofh = open(outFn, "w") writeHubGenome(ofh, db, {}) trackListUrl = baseUrl+"/data/trackList.json" logging.info("Loading %s" % trackListUrl) trackList = requests.get(trackListUrl).json() tdbs = [] for tl in trackList["tracks"]: if "type" in tl and tl["type"]=="SequenceTrack": logging.info("Genome is: "+tl["label"]) continue if tl["storeClass"]=="JBrowse/Store/SeqFeature/NCList": logging.info("NCList found: ",tl) continue tdb = {} tdb["track"] = tl["label"] tdb["shortLabel"] = tl["key"] if "data_file" in tl: tdb["bigDataUrl"] = baseUrl+"/data/"+tl["data_file"] else: tdb["bigDataUrl"] = baseUrl+"/data/"+tl["urlTemplate"] if tl["storeClass"] == "JBrowse/Store/SeqFeature/BigWig": tdb["type"] = "bigWig" dispMode = tl.get("display_mode") if dispMode: if dispMode=="normal": tdb["visibility"] = "pack" elif dispMode=="compact": tdb["visibility"] = "dense" else: tdb["visibility"] = "pack" else: tdb["visibility"] = "pack" writeStanza(ofh, 0, tdb) def installModule(package): " install a package " logging.info("Could not find Python module '%s', trying to install with pip" % package) subprocess.check_call([sys.executable, "-m", "pip", "install", package]) def cacheLoad(fname): " load file cache from json file " if not isfile(fname): 
logging.debug("No upload cache present") return {} logging.debug("Loading "+fname) return json.load(open(fname)) def cacheWrite(uploadCache, fname): logging.debug("Writing "+fname) json.dump(uploadCache, open(fname, "w"), indent=4) def getFileType(fbase): " return the file type defined in the hubspace system, given a base file name " ret = "NA" for fileType, fileExts in fileTypeExtensions.items(): for fileExt in fileExts: if fbase.endswith(fileExt): ret = fileType break if ret!="NA": break logging.debug("file type for %s is %s" % (fbase, ret)) return ret def uploadFiles(tdbDir, hubName): "upload files to hubspace. Server name and token can come from ~/.hubtools.conf " try: from tusclient import client except ModuleNotFoundError: installModule("tuspy") from tusclient import client serverUrl = cfgOption("tusUrl", "https://hubspace.gi.ucsc.edu/files") cookies = {} cookieNameUser = cfgOption("wiki.userNameCookie", "wikidb_mw1_UserName") cookieNameId = cfgOption("wiki.loggedInCookie", "wikidb_mw1_UserID") apiKey = cfgOption("apiKey") if apiKey is None: errAbort("To upload files, the file ~/.hubtools.conf must contain a line like apiKey='xxx').\n" "Go to https://genome.ucsc.edu/cgi-bin/hgHubConnect#dev to create a new apiKey. Then run \n" " echo 'apiKey=\"xxxx\"' >> ~/.hubtools.conf \n" "and run the 'hubtools up' command again.") logging.info(f"TUS server URL: {serverUrl}") my_client = client.TusClient(serverUrl) cacheFname = join(tdbDir, ".hubtools.files.json") uploadCache = cacheLoad(cacheFname) logging.debug("trackDb directory is %s" % tdbDir) for rootDir, _, files in os.walk(tdbDir): for fbase in files: if fbase.startswith("."): continue localPath = normpath(join(rootDir, fbase)) logging.debug("localPath: %s" % localPath) localMtime = os.stat(localPath).st_mtime # skip files that have not changed their mtime since last upload if localPath in uploadCache: cacheMtime = uploadCache[fpath]["mtime"] if localMtime == cacheMtime: logging.info("%s: file mtime unchanged, not uploading again" % localPath) continue else: logging.debug("file %s: mtime is %f, cache mtime is %f, need to re-upload" % (localPath, localMtime, cacheMtime)) else: logging.debug("file %s not in upload cache" % localPath) fileType = getFileType(fbase) fileAbsPath = abspath(localPath) remoteRelPath = relpath(fileAbsPath, tdbDir) remoteDir = dirname(remoteRelPath) meta = { "apiKey" : apiKey, "parentDir" : remoteDir, "genome":"NA", "fileName" : fbase, "hubName" : hubName, "hubtools" : "true", "fileType": fileType, "lastModified" : str(int(localMtime)), } logging.info(f"Uploading {localPath}, meta {meta}") uploader = my_client.uploader(localPath, metadata=meta) uploader.upload() # note that this file was uploaded cache = {} mtime = os.stat(localPath).st_mtime cache["mtime"] = mtime cache["size"] = os.stat(localPath).st_size - uploadCache[fpath] = cache + uploadCache[localPath] = cache cacheWrite(uploadCache, cacheFname) def iterRaStanzas(fname): " parse an ra-style (trackDb) file and yield dictionaries " data = dict() logging.debug("Parsing %s in trackDb format" % fname) with open(fname, "rt") as ifh: for l in ifh: l = l.lstrip(" ").rstrip("\r\n") if len(l)==0: yield data data = dict() else: if " " not in l: continue key, val = l.split(" ", maxsplit=1) data[key] = val if len(data)!=0: yield data def raToTab(fname): " convert .ra file to .tsv " stanzas = [] allFields = set() for stanza in iterRaStanzas(fname): if "hub" in stanza or "genome" in stanza: continue allFields.update(stanza.keys()) stanzas.append(stanza) if "track" in 
allFields: allFields.remove("track") if "shortLabel" in allFields: allFields.remove("shortLabel") hasLongLabel = False if "longLabel" in allFields: allFields.remove("longLabel") hasLongLabel = True sortedFields = sorted(list(allFields)) # make sure that track shortLabel and longLabel come first and always there, handy for manual edits if hasLongLabel: sortedFields.insert(0, "longLabel") sortedFields.insert(0, "shortLabel") sortedFields.insert(0, "track") ofh = sys.stdout ofh.write("#") ofh.write("\t".join(sortedFields)) ofh.write("\n") for s in stanzas: row = [] for fieldName in sortedFields: row.append(s.get(fieldName, "")) ofh.write("\t".join(row)) ofh.write("\n") def guessFieldDesc(fieldNames): "given a list of field names, try to guess to which fields in bed12 they correspond " logging.info("No field description specified, guessing fields from TSV headers: %s" % fieldNames) fieldDesc = {} skipFields = set() for fieldIdx, fieldName in enumerate(fieldNames): caseField = fieldName.lower() if caseField in ["chrom", "chromosome"]: name = "chrom" elif caseField.endswith(" id") or caseField.endswith("accession"): name = "name" elif caseField in ["start", "chromstart", "position"]: name = "start" elif caseField in ["Reference"]: name = "refAllele" elif caseField in ["strand"]: name = "strand" elif caseField in ["score"]: name = "score" else: continue fieldDesc[name] = fieldIdx skipFields.add(fieldIdx) logging.info("TSV <-> BED correspondance: %s" % fieldDesc) return fieldDesc, skipFields def parseFieldDesc(fieldDescStr, fieldNames): " given a string chrom=1,start=2,end=3,... return dict {chrom:1,...} " if not fieldDescStr: return guessFieldDesc(fieldNames) fieldDesc, skipFields = guessFieldDesc(fieldNames) if fieldDescStr: for part in fieldDescStr.split(","): fieldName, fieldIdx = part.split("=") fieldIdx = int(fieldIdx) fieldDesc[fieldName] = fieldIdx skipFields.add(fieldIdx) return fieldDesc, skipFields def makeBedRow(row, fieldDesc, skipFields, isOneBased): " given a row of a tsv file and a fieldDesc with BedFieldName -> field-index, return a bed12+ row with extra fields " bedRow = [] # first construct the bed 12 fields for fieldName in ["chrom", "start", "end", "name", "score", "strand", "thickStart", "thickEnd", "itemRgb", "blockCount", "blockSizes", "chromStarts"]: fieldIdx = fieldDesc.get(fieldName) if fieldIdx is not None: if fieldName=="start": chromStart = int(row[fieldDesc["start"]]) if isOneBased: chromStart = chromStart-1 val = str(chromStart) elif fieldName=="end": chromEnd = int(val) else: val = row[fieldIdx] else: if fieldName=="end": chromEnd = chromStart+1 val = str(chromEnd) elif fieldName=="score": val = "0" elif fieldName=="strand": val = "." 
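            # none of the remaining BED fields is present in the input: fill them with
            # defaults that turn the row into a single full-length block (thick region
            # spans the whole item, black itemRgb, one block of size chromEnd-chromStart)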
elif fieldName=="thickStart": val = str(chromStart) elif fieldName=="thickEnd": val = str(chromEnd) elif fieldName=="itemRgb": val = "0,0,0" elif fieldName=="blockCount": val = "1" elif fieldName=="blockSizes": val = str(chromEnd-chromStart) elif fieldName=="chromStarts": #val = str(chromStart) val = "0" else: logging.error("Cannot find a field for %s" % fieldName) sys.exit(1) bedRow.append(val) # now handle all other fields for fieldIdx, val in enumerate(row): if not fieldIdx in skipFields: bedRow.append( row[fieldIdx] ) return bedRow def fetchChromSizes(db, outputFileName): " find on local disk or download a <db>.sizes text file " # Construct the URL based on the database name url = f"https://hgdownload.cse.ucsc.edu/goldenPath/{db}/database/chromInfo.txt.gz" # Send a request to download the file response = requests.get(url, stream=True) # Check if the request was successful if response.status_code == 200: # Open the output gzip file for writing with gzip.open(outputFileName, 'wt') as outFile: # Open the response content as a gzip file in text mode with gzip.GzipFile(fileobj=response.raw, mode='r') as inFile: # Read the content using csv reader to handle tab-separated values reader = csv.reader(inFile, delimiter='\t') writer = csv.writer(outFile, delimiter='\t', lineterminator='\n') # Iterate through each row, and retain only the first two fields for row in reader: writer.writerow(row[:2]) # Write only the first two fields else: raise Exception(f"Failed to download file from {url}, status code: {response.status_code}") logging.info("Downloaded %s to %s" % (db, outputFileName)) def getChromSizesFname(db): " return fname of chrom sizes, download into ~/.local/ucscData/ if not found " fname = "/hive/data/genomes/%s/chrom.sizes" % db if isfile(fname): return fname dataDir = "~/.local/hubtools" fname = join(dataDir, "%s.sizes" % db) fname = os.path.expanduser(fname) if not isfile(fname): makedirs(dataDir) fetchChromSizes(db, fname) return fname def convTsv(db, tsvFname, outBedFname, outAsFname, outBbFname): " convert tsv files in inDir to outDir, assume that they all have one column for chrom, start and end. Try to guess these or fail. " #tsvFname, outBedFname, outAsFname = args # join and output merged bed bigCols = set() # col names of columns with > 255 chars #bedFh = open(bedFname, "w") unsortedBedFh = tempfile.NamedTemporaryFile(suffix=".bed", dir=dirname(outBedFname), mode="wt") fieldNames = None #isOneBased = options.oneBased isOneBased = True bedFieldsDesc = None # in the future, the user may want to input a string like name=0,start=1, but switch this off for now for line in open(tsvFname): row = line.rstrip("\r\n").split("\t") if fieldNames is None: fieldNames = row fieldDesc, notExtraFields = parseFieldDesc(bedFieldsDesc, fieldNames) continue # note fields with data > 255 chars. 
for colName, colData in zip(fieldNames, row): if len(colData)>255: bigCols.add(colName) bedRow = makeBedRow(row, fieldDesc, notExtraFields, isOneBased) chrom = bedRow[0] if chrom.isdigit() or chrom in ["X", "Y"]: bedRow[0] = "chr"+bedRow[0] unsortedBedFh.write( ("\t".join(bedRow))) unsortedBedFh.write("\n") unsortedBedFh.flush() cmd = "sort -k1,1 -k2,2n %s > %s" % (unsortedBedFh.name, outBedFname) assert(os.system(cmd)==0) unsortedBedFh.close() # removes the temp file # generate autosql # BED fields #bedColCount = int(options.type.replace("bed", "").replace("+" , "")) bedColCount = 12 asFh = open(outAsFname, "w") asFh.write(asHead) asFh.write("\n".join(asLines[:bedColCount+1])) asFh.write("\n") # extra fields #fieldNames = fieldNames[bedColCount:] for fieldIdx, field in enumerate(fieldNames): if fieldIdx in notExtraFields: continue name = field.replace(" ","") name = field.replace("%","perc_") name = re.sub("[^a-zA-Z0-9]", "", name) name = name[0].lower()+name[1:] fType = "string" if field in bigCols: fType = "lstring" asFh.write(' %s %s; "%s" \n' % (fType, name, field)) asFh.write(")\n") asFh.close() chromSizesFname = getChromSizesFname(db) cmd = ["bedToBigBed", outBedFname, chromSizesFname, outBbFname, "-tab", "-type=bed%d+" % bedColCount, "-as=%s" % outAsFname] subprocess.check_call(cmd) def convTsvDir(inDir, db, outDir): " find tsv files under inDir and convert them all to .bb " ext = "tsv" pat = join(inDir, '**/*.'+ext) logging.debug("Finding files under %s (%s), writing output to %s" % (inDir, pat, outDir)) for fname in glob.glob(pat, recursive=True): logging.debug("Found %s" % fname) absFname = abspath(fname) relFname = relpath(absFname, inDir) outPath = Path(join(outDir, relFname)) if not outPath.parents[0].is_dir(): logging.debug("mkdir -p %s" % outPath.parents[0]) makedirs(outPath.parents[0]) bedFname = outPath.with_suffix(".bed") asFname = outPath.with_suffix(".as") bbFname = outPath.with_suffix(".bb") convTsv(db, fname, bedFname, asFname, bbFname) def parseTrackLine(s): " Use shlex to split the string respecting quotes, written by chatGPT " lexer = shlex.shlex(s, posix=True) lexer.whitespace_split = True lexer.wordchars += '=' # Allow '=' as part of words tokens = list(lexer) # Convert the tokens into a dictionary it = iter(tokens) result = {} for token in it: if '=' in token: key, value = token.split('=', 1) result[key] = value.strip('"') # Remove surrounding quotes if present else: result[token] = next(it).strip('"') # Handle cases like name="...". return result def readTrackLines(fnames): " read the first line and convert to a dict of all fnames. " logging.debug("Reading track lines from %s" % fnames) ret = {} for fn in fnames: line1 = open(fn).readline().rstrip("\n") notTrack = line1.replace("track ", "", 1) tdb = parseTrackLine(notTrack) ret[fn] = tdb return ret def stripFirstLine(inputFilename, outputFilename): " written by chatGpt: copies all lines to output, except the first line " with open(inputFilename, 'r') as infile, open(outputFilename, 'w') as outfile: # Skip the first line next(infile) # Write the rest of the lines to the output file for line in infile: outfile.write(line) def bedToBigBed(inFname, db, outFname): " convert bed to bigbed file, handles chrom.sizes download " chromSizesFname = getChromSizesFname(db) cmd = ["bedToBigBed", inFname, chromSizesFname, outFname] logging.debug("Running %s" % cmd) logging.info(f'Converting {inFname} to {outFname}. 
(chromSizes: {chromSizesFname}') subprocess.check_call(cmd) def convCtDb(db, inDir, outDir): " convert one db part of a track archive to an output directory " findGlob = join(inDir, "*.ct") inFnames = glob.glob(findGlob) if len(inFnames)==0: logging.info("No *.ct files found in %s" % findGlob) return tdbData = readTrackLines(inFnames) makedirs(outDir) hubTxtFname = join(outDir, "hub.txt") ofh = open(hubTxtFname, "wt") meta = parseMeta(inDir) writeHubGenome(ofh, db, meta) tdbIdx = 0 for fname, tdb in tdbData.items(): tdbIdx += 1 # custom track names can include spaces, spec characters, etc. Strip all those # append a number to make sure that the result is unique track = tdb["name"] track = re.sub('[^A-Za-z0-9]+', '', track)+"_"+str(tdbIdx) tdb["track"] = track tdb["shortLabel"] = tdb["name"] del tdb["name"] tdb["longLabel"] = tdb["description"] del tdb["description"] if "bigDataUrl" not in tdb: bedFname = join(outDir, tdb["track"]+".bed") bbFname = join(outDir, tdb["track"]+".bb") stripFirstLine(fname, bedFname) bedToBigBed(bedFname, db, bbFname) os.remove(bedFname) tdb["bigDataUrl"] = tdb["track"]+".bb" tdb["type"] = "bigBed" writeStanza(ofh, 0, tdb) logging.info("Wrote %s" % hubTxtFname) ofh.close() def convArchDir(inDir, outDir): " convert a directory created from the .tar.gz file downloaded via our track archive feature " dbContent = os.listdir(inDir) dbDirs = [] for db in dbContent: subDir = join(inDir, db) if isdir(subDir): dbDirs.append((db, subDir)) if len(dbDirs)==0: errAbort("No directories found under %s. Extract the tarfile and point this program at the 'archive' directory." % inDir) for db, inSubDir in dbDirs: logging.debug("Processing %s, db=%s" % (inSubDir, db)) outSubDir = join(outDir, db) convCtDb(db, inSubDir, outSubDir) def hgsidFromUrl(url): " return hgsid given a URL. written by chatGPT " # Parse the URL parsed_url = urllib.parse.urlparse(url) server_name = f"{parsed_url.scheme}://{parsed_url.netloc}" query_params = urllib.parse.parse_qs(parsed_url.query) hgsid = query_params.get('hgsid') # Return the hgsid value if it exists if hgsid: return server_name, hgsid[0] # Extracting the first value from the list errAbort("hgsid parameter not found in "+url) def convCtUrl(url, outDir): " given an hgTracks URL with an hgsid on it, get all custom track lines and create a hub file for it. Does not download custom tracks. " serverUrl, hgsid = hgsidFromUrl(url) cartDumpUrl = serverUrl+"/cgi-bin/cartDump?hgsid="+hgsid def hubtools(args, options): """ find files under dirName and create a trackDb.txt for them""" cmd = args[0] inDir = "." if options.inDir: inDir = options.inDir outDir = "." if options.outDir: outDir = options.outDir if cmd=="up": if len(args)<2: errAbort("The 'up' command requires one argument, the name of the hub. You can specify any name, " "ideally a short, meaningful string, e.g. atacseq, muscle-rna or yamamoto2022. 
Avoid special characters.") hubName = args[1] uploadFiles(inDir, hubName) return tdbDir = inDir if options.outDir: tdbDir = options.outDir if cmd=="jbrowse": importJbrowse(args[1], args[2], tdbDir) elif cmd == "tab": raToTab(args[1]) elif cmd == "make": db = args[1] meta = parseMeta(inDir) dirFiles = readDirs(inDir, meta) hubFname = join(tdbDir, "hub.txt") logging.info("Writing %s" % hubFname) ofh = open(hubFname, "w") meta = parseMeta(inDir) writeHubGenome(ofh, db, meta) writeTdb(inDir, dirFiles, "top", tdbDir, ofh) writeTdb(inDir, dirFiles, "comps", tdbDir, ofh) ofh.close() elif cmd=="conv": db = args[1] convTsvDir(inDir, db, outDir) elif cmd=="archive": convArchDir(inDir, outDir) elif cmd=="ct": url = args[1] convCtUrl(url, outDir) else: logging.error("Unknown command: '%s'" % cmd) # ----------- main -------------- def main(): args, options = parseArgs() hubtools(args, options) main()
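
For reference, a minimal self-contained sketch of the upload-cache check that this one-line fix touches (the cache written by cacheWrite() and read by cacheLoad() in uploadFiles() above, keyed by the local file path); the temporary directory and file name below are hypothetical, illustrative only:

# Sketch of the ".hubtools.files.json" mtime cache used to skip unchanged files,
# assuming a hypothetical hub directory containing one track file.
import json, os, tempfile

with tempfile.TemporaryDirectory() as hubDir:
    localPath = os.path.join(hubDir, "signal.bw")        # hypothetical track file
    with open(localPath, "w") as fh:
        fh.write("dummy data")

    cacheFname = os.path.join(hubDir, ".hubtools.files.json")
    uploadCache = json.load(open(cacheFname)) if os.path.isfile(cacheFname) else {}

    st = os.stat(localPath)
    entry = uploadCache.get(localPath)                   # cache is keyed by localPath, not fpath
    if entry and entry["mtime"] == st.st_mtime:
        print("mtime unchanged, skipping upload")
    else:
        print("uploading", localPath)                    # the real script calls the tus uploader here
        uploadCache[localPath] = {"mtime": st.st_mtime, "size": st.st_size}
        json.dump(uploadCache, open(cacheFname, "w"), indent=4)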