0f3ca3eaf5792df01b7c600a5428d2d0b2809fcd max Fri Sep 20 13:18:01 2024 -0700 Revert "more features to hubtools: search in both parent and subdirs, better docs" This reverts commit 05e67c59a20a5d00b810a981aef3b00c5bef82e1. diff --git src/utils/hubtools/hubtools src/utils/hubtools/hubtools index d11fed5..a9c06a7 100755 --- src/utils/hubtools/hubtools +++ src/utils/hubtools/hubtools @@ -5,59 +5,50 @@ from os.path import join, basename, dirname, isfile, relpath, abspath, splitext, isdir #import pyyaml # not loaded here, so it's not a hard requirement, is lazy loaded in parseMetaYaml() # ==== functions ===== def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("""usage: %prog [options] <cmd> - create and edit UCSC track hubs hubtools make <assemblyCode>: create a track hub for all bigBed/bigWig files under a directory. Creates single-file hub.txt - bigBed/bigWig files in the current directory will be top level tracks - big* files in subdirectories become composites - for every filename, the part before the first dot becomes the track base name - if a directory has more than 80% of track base names with both a bigBed and bigWig file, views are activated for this composite - - track attributes can be changed using tracks.tsv or json/ra/yaml files, in - each top or subdirectory. Both subdir and top directory are searched, subdir - values override any values specified in a parent directory. - - tracks.tsv (or tracks.json/ra/yaml) must have a first column named 'track'. For json and yaml, - the key is the track name. "hub" is a special key to provide email/shortLabel of the hub. - - the track name in this column can be either the full one <parent>__fileBasename__type or just - the fileBasename, this makes it easier. In practice, filenames are unique enough and don't - usually clash between subdirs or filetype, in practice. - - the track order is the one on disk, so relatively random. To order tracks, create a hub, - use the "tab" command below, reorder tracks in tsv file, then re-run the "make" command. + - track attributes can be changed using tracks.tsv or tracks.json files, in + each top or subdirectory + - tracks.tsv must have a first column named 'track'. hubtools up: upload files to hubSpace - needs ~/.hubt.conf with username and password. Create one with 'hubt conf' - uploads all files from the -i directory or the current dir if not specified. hubtools jbrowse <url> <db> : convert Jbrowse trackList.json files to hub.txt. - <url> is the URL to the Jbrowse2 installation, e.g. http://furlonglab.embl.de/FurlongBrowser/ - <db> is assembly identifier hubtools tab <fname>: convert a hub.txt or trackDb.txt to tab-sep format, easier to bulk-edit with sed/cut/etc. - The resulting file if named tracks.tsv can be used as inputs for future runs. - <fname> is the input filename. "hub" and "genome" stanzas are skipped. - output goes to stdout Examples: - hubtools jbrowse http://furlonglab.embl.de/FurlongBrowser/ dm3 hubtools make hg38 - hubtools tab hub.txt | cut -f1-3 > tracks.tsv # then edit tracks.tsv - hubtools make hg38 # will pick up tracks.tsv edits to create a new hub.txt + hubtools jbrowse http://furlonglab.embl.de/FurlongBrowser/ dm3 + hubtools tab hub.txt > tracks.tsv tracks.json can look like this: { "hub" : {"hub": "mouse_motor_atac", "shortLabel":"scATAC-seq Developing Cranial Motor Neurons"} } """) parser.add_option("-i", "--inDir", dest="inDir", action="store", help="Input directory where files are stored. Default is current directory") parser.add_option("-o", "--outDir", dest="outDir", action="store", help="Input directory where hub.txt file is created. Default is same as input directory.") #parser.add_option("", "--igv", dest="igv", action="store", help="import an igv.js trackList.json file hierarchy") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show verbose debug messages") parser.add_option("-u", "--upload", dest="upload", action="store_true", help="upload all files from outDir to hubSpace") (options, args) = parser.parse_args() if len(args)==0: @@ -137,37 +128,34 @@ def parseMetaJson(fname): " parse a json file and merge it into meta and return " logging.debug("Reading %s as json" % fname) newMeta = json.load(open(fname)) return newMeta def parseMetaYaml(fname): " parse yaml file " import yaml # if this doesn't work, run 'pip install pyyaml' with open(fname) as stream: try: return yaml.safe_load(stream) except yaml.YAMLError as exc: logging.error(exc) -def parseMeta(inDirs): - """ parse a tab-sep file with headers and return an ordered dict firstField -> dictionary. - Takes a list of input directories and the last directory overrides values in parent directories. - """ - meta = OrderedDict() - for inDir in inDirs: +def parseMeta(inDir): + " parse a tab-sep file with headers and return an ordered dict firstField -> dictionary " fname = join(inDir, "tracks.tsv") + meta = OrderedDict() if isfile(fname): tsvMeta = parseMetaTsv(fname) meta = allMetaOverride(meta, tsvMeta) fname = join(inDir, "tracks.json") if isfile(fname): jsonMeta = parseMetaJson(fname) meta = allMetaOverride(meta, jsonMeta) fname = join(inDir, "tracks.ra") if isfile(fname): raMeta = parseMetaRa(fname) meta = allMetaOverride(meta, raMeta) fname = join(inDir, "tracks.yaml") @@ -383,102 +371,102 @@ doneTracks.add(trackBase) logging.debug("Track ordering from meta data used: %s" % newFiles.keys()) # then add all other tracks for trackBase, fileData in fileDict.items(): if trackBase not in doneTracks: newFiles[trackBase] = fileDict[trackBase] logging.debug("Not specified in meta, so adding at the end: %s" % trackBase) logging.debug("Final track order is: %s" % newFiles.keys()) assert(len(newFiles)==len(fileDict)) return newFiles -def makeTrackDbEntries(inDir, dirDict, dirType, tdbDir, ofh): +def writeTdb(inDir, dirDict, dirType, tdbDir, ofh): " given a dict with basename -> type -> filenames, write track entries to ofh " # this code is getting increasingly complex because it supports composites/views and pairing of bigBed/bigWig files # either this needs better comments or maybe a separate code path for this rare use case global compCount fnameDict = dirDict[dirType] for parentName, typeDict in fnameDict.items(): if parentName is None: # top level tracks: use top tracks.tsv - subDirs = [inDir] - else: # container tracks -> use tracks.tsv in the parent and subdirectory - subDirs = [inDir, join(inDir, parentName)] + subDir = inDir + else: # container tracks -> use tracks.tsv in the subdirectory + subDir = join(inDir, parentName) - parentMeta = parseMeta(subDirs) + parentMeta = parseMeta(subDir) indent = 0 parentHasViews = False groupMeta = {} if dirType=="comps": tdb = { "track" : parentName, "shortLabel": parentName, "visibility" : "dense", "compositeTrack" : "on", "autoScale" : "group", "type" : "bed 4" } metaOverride(tdb, parentMeta) groupMeta = parentMeta parentHasViews = mostFilesArePaired(typeDict) if parentHasViews: tdb["subGroup1"] = "view Views PK=Peaks SIG=Signals" logging.info("Container track %s has >80%% of paired files, activating views" % parentName) writeStanza(ofh, indent, tdb) indent = 4 if parentHasViews: # we have composites with paired files? -> write the track stanzas for the two views - groupMeta = parseMeta(subDirs) + groupMeta = parseMeta(subDir) tdbViewPeaks = { "track" : parentName+"ViewPeaks", "shortLabel" : parentName+" Peaks", "parent" : parentName, "view" : "PK", "visibility" : "dense", "type" : "bigBed", "scoreFilter" : "off", "viewUi" : "on" } metaOverride(tdbViewPeaks, parentMeta) writeStanza(ofh, indent, tdbViewPeaks) tdbViewSig = { "track" : parentName+"ViewSignal", "shortLabel" : parentName+" Signal", "parent" : parentName, "view" : "SIG", "visibility" : "dense", "type" : "bigWig", "viewUi" : "on" } metaOverride(tdbViewSig, parentMeta) writeStanza(ofh, indent, tdbViewSig) else: # no composites - groupMeta = parseMeta(subDirs) + groupMeta = parseMeta(subDir) typeDict = reorderTracks(typeDict, groupMeta) for trackBase, typeFnames in typeDict.items(): for fileType, absFnames in typeFnames.items(): assert(len(absFnames)==1) # for now, not sure what to do when we get multiple basenames of the same file type absFname = absFnames[0] fileBase = basename(absFname) relFname = relpath(absFname, tdbDir) labelSuff = "" if parentHasViews: if fileType=="bigWig": labelSuff = " Signal" elif fileType=="bigBed": @@ -663,41 +651,42 @@ if cmd=="up": uploadFiles(inDir) return tdbDir = inDir if options.outDir: tdbDir = options.outDir if cmd=="jbrowse": importJbrowse(args[1], args[2], tdbDir) elif cmd == "tab": raToTab(args[1]) elif cmd == "make": db = args[1] - meta = parseMeta([inDir]) + meta = parseMeta(inDir) dirFiles = readDirs(inDir, meta) hubFname = join(tdbDir, "hub.txt") logging.info("Writing %s" % hubFname) ofh = open(hubFname, "w") + meta = parseMeta(inDir) writeHubGenome(ofh, db, meta) - makeTrackDbEntries(inDir, dirFiles, "top", tdbDir, ofh) - makeTrackDbEntries(inDir, dirFiles, "comps", tdbDir, ofh) + writeTdb(inDir, dirFiles, "top", tdbDir, ofh) + writeTdb(inDir, dirFiles, "comps", tdbDir, ofh) ofh.close() else: logging.error("Unknown command: '%s'" % args[1]) # ----------- main -------------- def main(): args, options = parseArgs() hubt(args, options) #if options.test: #logging.debug("test is set") #f = open(options.file, "r")