f6c44a86607233aaf1755321277340fade45f2b2 max Wed Sep 18 07:55:41 2024 -0700 adding a .ra to .tsv converter to hubtools, motivated by MarcP diff --git src/utils/hubtools/hubtools src/utils/hubtools/hubtools index 93a8af3..c6169c3 100755 --- src/utils/hubtools/hubtools +++ src/utils/hubtools/hubtools @@ -1,45 +1,49 @@ #!/usr/bin/env python3 -import logging, sys, optparse, os, json, subprocess, shutil +import logging, sys, optparse, os, json, subprocess, shutil, string from collections import defaultdict from os.path import join, basename, dirname, isfile, relpath, abspath, splitext, isdir # ==== functions ===== def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("""usage: %prog [options] - create and edit UCSC track hubs hubtools make : create a track hub for all bigBed/bigWig files under a directory. Creates single-file hub.txt - bigBed/bigWig files in the current directory will be top level tracks - big* files in subdirectories become composites - for every filename, the part before the first dot becomes the track base name - if a directory has more than 80% of track base names with both a bigBed and bigWig file, views are activated for this composite - track attributes can be changed using tracks.tsv or tracks.json files, in each top or subdirectory - tracks.tsv must have a first column named 'track'. hubtools up: upload files to hubSpace - needs ~/.hubt.conf with username and password. Create one with 'hubt conf' - uploads all files from the -i directory or the current dir if not specified. hubtools jbrowse : convert Jbrowse trackList.json files to hub.txt. - is the URL to the Jbrowse2 installation, e.g. http://furlonglab.embl.de/FurlongBrowser/ - is assembly identifier + hubtools tab : convert a hub.txt or trackDb.txt to tab-sep format, easier to bulk-edit with sed/cut/etc. + - is the input filename. "hub" and "genome" stanzas are skipped. 
+ - output goes to stdout + Examples: hubtools make hg38 hubtools jbrowse http://furlonglab.embl.de/FurlongBrowser/ dm3 """) parser.add_option("-i", "--inDir", dest="inDir", action="store", help="Input directory where files are stored. Default is current directory") parser.add_option("-o", "--outDir", dest="outDir", action="store", help="Input directory where hub.txt file is created. Default is same as input directory.") #parser.add_option("", "--igv", dest="igv", action="store", help="import an igv.js trackList.json file hierarchy") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show verbose debug messages") parser.add_option("-u", "--upload", dest="upload", action="store_true", help="upload all files from outDir to hubSpace") (options, args) = parser.parse_args() if len(args)==0: @@ -294,30 +298,32 @@ " given a dict with basename -> type -> filenames, write track entries to ofh " global compCount fnameDict = dirDict[dirType] for parentName, typeDict in fnameDict.items(): if parentName is None: # top level tracks subDir = inDir else: # container tracks subDir = join(inDir, parentName) parentMeta = parseMeta(inDir) indent = 0 parentHasViews = False + groupMeta = {} + if dirType=="comps": tdb = { "track" : parentName, "shortLabel": parentName, "visibility" : "dense", "compositeTrack" : "on", "autoScale" : "group", "type" : "bed 4" } metaOverride(tdb, parentMeta) parentHasViews = mostFilesArePaired(typeDict) if parentHasViews: tdb["subGroup1"] = "view Views PK=Peaks SIG=Signals" @@ -341,30 +347,31 @@ metaOverride(tdbViewPeaks, parentMeta) writeStanza(ofh, indent, tdbViewPeaks) tdbViewSig = { "track" : parentName+"ViewSignal", "shortLabel" : parentName+" Signal", "parent" : parentName, "view" : "SIG", "visibility" : "dense", "type" : "bigWig", "viewUi" : "on" } metaOverride(tdbViewSig, parentMeta) writeStanza(ofh, indent, tdbViewSig) else: + # no composites groupMeta = parseMeta(subDir) for trackBase, typeFnames in typeDict.items(): for 
def installModule(package):
    """Install a missing Python package into the running interpreter with pip.

    Raises subprocess.CalledProcessError if pip exits with a non-zero status.
    """
    logging.info("Could not find Python module '%s', trying to install with pip" % package)
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])


def uploadFiles(tdbDir):
    """Upload every regular file directly under tdbDir to hubSpace.

    Subdirectories are skipped. The tus client comes from the 'tuspy' package;
    if it cannot be imported, it is installed with pip and imported again.
    """
    try:
        from tusclient import client
    except ModuleNotFoundError:
        installModule("tuspy")
        # The failed import left 'client' unbound, so it has to be imported
        # again; the import caches are refreshed so that the freshly installed
        # package is found by the running interpreter.
        import importlib
        importlib.invalidate_caches()
        from tusclient import client

    serverUrl = "https://hgwdev-hubspace.gi.ucsc.edu/files"

    my_client = client.TusClient(serverUrl, headers={})
    logging.info(f"Target server is {serverUrl}")

    for fname in os.listdir(tdbDir):
        fpath = join(tdbDir, fname)
        if isdir(fpath):
            continue
        logging.info(f"Uploading {fpath}")
        # NOTE(review): the db is hard-coded to hg19 for every upload - confirm
        # whether it should come from the command line instead
        meta = {"db": "hg19"}
        uploader = my_client.uploader(fpath, metadata=meta)
        uploader.upload()


def iterRaStanzas(fname):
    """Parse an ra-style (trackDb) file and yield one dict per stanza.

    Each stanza is a run of "key value" lines; stanzas are separated by one or
    more blank (or whitespace-only) lines. Leading indentation is ignored, the
    key is separated from the value by any run of spaces or tabs, and lines
    starting with '#' are treated as comments. Lines that consist of a key
    without a value are skipped. Empty stanzas are never yielded.
    """
    data = {}
    with open(fname, "rt") as ifh:
        for line in ifh:
            line = line.strip()
            if not line:
                # a blank line ends the current stanza; repeated blank lines
                # must not produce empty stanzas
                if data:
                    yield data
                    data = {}
                continue
            if line.startswith("#"):
                continue
            parts = line.split(None, 1)
            if len(parts) != 2:
                continue
            key, val = parts
            data[key] = val

    # the last stanza is usually not followed by a blank line
    if data:
        yield data


def raToTab(fname, ofh=None):
    """Convert an ra-style file to tab-separated format.

    Stanzas that contain a "hub" or "genome" key are skipped. The header line
    starts with '#'; 'track' and 'shortLabel' are always the first two columns
    (handy for manual edits), all other fields follow in sorted order. Fields
    that a stanza does not define are written as empty strings.

    fname: name of the input file
    ofh: file object to write to, defaults to sys.stdout
    """
    if ofh is None:
        ofh = sys.stdout

    stanzas = []
    allFields = set()
    for stanza in iterRaStanzas(fname):
        if "hub" in stanza or "genome" in stanza:
            continue
        allFields.update(stanza)
        stanzas.append(stanza)

    # track and shortLabel are placed first by hand, so take them out of the sorted part
    allFields.discard("track")
    allFields.discard("shortLabel")
    sortedFields = ["track", "shortLabel"] + sorted(allFields)

    ofh.write("#")
    ofh.write("\t".join(sortedFields))
    ofh.write("\n")

    for s in stanzas:
        # a tab inside a value would shift all following columns
        row = [s.get(fieldName, "").replace("\t", " ") for fieldName in sortedFields]
        ofh.write("\t".join(row))
        ofh.write("\n")


def hubt(args, options):
    """Run the hubtools sub-command given in args[0].

    'up' uploads the files of the input directory, 'jbrowse' imports a Jbrowse
    configuration (args[1] and args[2] are passed on to importJbrowse), 'tab'
    converts the ra-style file args[1] to tab-separated output on stdout, and
    'make' writes a hub.txt for assembly args[1] into the output directory.

    options.inDir defaults to the current directory; options.outDir defaults
    to the input directory.
    """
    cmd = args[0]

    inDir = "."
    if options.inDir:
        inDir = options.inDir

    if cmd == "up":
        uploadFiles(inDir)
        return

    tdbDir = inDir
    if options.outDir:
        tdbDir = options.outDir

    if cmd == "jbrowse":
        importJbrowse(args[1], args[2], tdbDir)
    elif cmd == "tab":
        raToTab(args[1])
    elif cmd == "make":
        db = args[1]

        dirFiles = readDirs(inDir)

        hubFname = join(tdbDir, "hub.txt")
        logging.info("Writing %s" % hubFname)
        # the context manager closes hub.txt even if one of the writers raises
        with open(hubFname, "w") as ofh:
            meta = parseMeta(inDir)
            writeHubGenome(ofh, db, meta)

            writeTdb(inDir, dirFiles, "top", tdbDir, ofh)
            writeTdb(inDir, dirFiles, "comps", tdbDir, ofh)