fea4e31015340a94c10cec9261bfaaefc733b52f max Thu Sep 19 09:34:17 2024 -0700 adding features to hubtools: yaml support, reorder composites by metadata, order tracks by metadata, allow either baseNames or full trackNames in the metadata, for this and all other metadata lookups

diff --git src/utils/hubtools/hubtools src/utils/hubtools/hubtools
index c6169c3..a9c06a7 100755
--- src/utils/hubtools/hubtools
+++ src/utils/hubtools/hubtools
@@ -1,20 +1,21 @@
 #!/usr/bin/env python3

 import logging, sys, optparse, os, json, subprocess, shutil, string
-from collections import defaultdict
+from collections import defaultdict, OrderedDict
 from os.path import join, basename, dirname, isfile, relpath, abspath, splitext, isdir
+#import yaml # not imported here, so pyyaml is not a hard requirement; it is lazy-loaded in parseMetaYaml()

 # ==== functions =====
 def parseArgs():
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("""usage: %prog [options] <cmd> - create and edit UCSC track hubs

 hubtools make <db>: create a track hub for all bigBed/bigWig files under a directory. Creates single-file hub.txt
 - bigBed/bigWig files in the current directory will be top level tracks
 - big* files in subdirectories become composites
 - for every filename, the part before the first dot becomes the track base name
 - if a directory has more than 80% of track base names with both a bigBed and bigWig file, views are activated for this composite
 - track attributes can be changed using tracks.tsv or tracks.json files, in each top or subdirectory
@@ -23,30 +24,34 @@
 hubtools up: upload files to hubSpace
 - needs ~/.hubt.conf with username and password. Create one with 'hubt conf'
 - uploads all files from the -i directory or the current dir if not specified.

 hubtools jbrowse <url> <db>: convert Jbrowse trackList.json files to hub.txt.
 - <url> is the URL to the Jbrowse2 installation, e.g. http://furlonglab.embl.de/FurlongBrowser/
 - <db> is the assembly identifier

 hubtools tab <fname>: convert a hub.txt or trackDb.txt to tab-sep format, easier to bulk-edit with sed/cut/etc.
 - <fname> is the input filename. "hub" and "genome" stanzas are skipped.
 - output goes to stdout

 Examples:
 hubtools make hg38
 hubtools jbrowse http://furlonglab.embl.de/FurlongBrowser/ dm3
+hubtools tab hub.txt > tracks.tsv
+
+tracks.json can look like this:
+{ "hub" : {"hub": "mouse_motor_atac", "shortLabel":"scATAC-seq Developing Cranial Motor Neurons"} }
 """)

     parser.add_option("-i", "--inDir", dest="inDir", action="store", help="Input directory where files are stored. Default is current directory")
     parser.add_option("-o", "--outDir", dest="outDir", action="store", help="Output directory where the hub.txt file is created. Default is same as input directory.")
     #parser.add_option("", "--igv", dest="igv", action="store", help="import an igv.js trackList.json file hierarchy")
     parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show verbose debug messages")
     parser.add_option("-u", "--upload", dest="upload", action="store_true", help="upload all files from outDir to hubSpace")

     (options, args) = parser.parse_args()

     if len(args)==0:
         parser.print_help()
         exit(1)
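To make the 'make' rules above concrete, a hypothetical input directory like

    myInputDir/
        cells.bigWig                top-level track
        neurons/                    becomes the composite "neurons"
            sample1.peaks.bigBed    track base name "sample1"
            sample1.cov.bigWig      track base name "sample1"
            tracks.tsv              optional metadata for this composite

would produce one top-level track and one composite; since every base name in neurons/ has both a bigBed and a bigWig file, the composite would get Peaks/Signals views.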
@@ -89,113 +94,150 @@
         if key == "track":
             trackName = val
             continue
         else:
             stanzaData[key] = val

     if len(stanzaData)!=0:
         ret[trackName] = stanzaData

     logging.debug("Got %s from .ra" % str(ret))
     return ret

 def parseMetaTsv(fname):
     " parse a tracks.tsv file and return as a dict of trackName -> dict of key->val "
     headers = None
     meta = {}
-    logging.debug("Parsing %s as tab-sep" % fname)
+    logging.debug("Parsing track metadata from %s in tsv format" % fname)
     for line in open(fname):
         row = line.rstrip("\r\n").split("\t")
         if headers is None:
-            assert(line.startswith("track\t"))
+            assert(line.startswith("track\t") or line.startswith("#track"))
+            row[0] = row[0].lstrip("#")
             headers = row
             continue

         assert(len(row)==len(headers))

         key = row[0]
         rowDict = {}
         for header, val in zip(headers[1:], row[1:]):
             rowDict[header] = val
         #row = {k:v for k,v in zip(headers, fs)}
         meta[key] = rowDict
     return meta

 def parseMetaJson(fname):
     " parse a json file and merge it into meta and return "
     logging.debug("Reading %s as json" % fname)
     newMeta = json.load(open(fname))
     return newMeta

+def parseMetaYaml(fname):
+    " parse yaml file "
+    import yaml # if this doesn't work, run 'pip install pyyaml'
+    with open(fname) as stream:
+        try:
+            return yaml.safe_load(stream)
+        except yaml.YAMLError as exc:
+            logging.error(exc)
+
 def parseMeta(inDir):
-    " parse a tab-sep file with headers and return a dict firstField -> dictionary "
+    " parse tracks.tsv / tracks.json / tracks.ra / tracks.yaml in inDir and return an ordered dict trackName -> dict of key->val "
     fname = join(inDir, "tracks.tsv")
-    meta = {}
+    meta = OrderedDict()
     if isfile(fname):
         tsvMeta = parseMetaTsv(fname)
         meta = allMetaOverride(meta, tsvMeta)

     fname = join(inDir, "tracks.json")
     if isfile(fname):
         jsonMeta = parseMetaJson(fname)
         meta = allMetaOverride(meta, jsonMeta)

     fname = join(inDir, "tracks.ra")
     if isfile(fname):
         raMeta = parseMetaRa(fname)
         meta = allMetaOverride(meta, raMeta)

-    logging.debug("Got overrides from %s: %s" % (inDir, str(meta)))
+    fname = join(inDir, "tracks.yaml")
+    if isfile(fname):
+        yamlMeta = parseMetaYaml(fname)
+        meta = allMetaOverride(meta, yamlMeta)
+
+    logging.debug("Got meta from %s: %s" % (inDir, str(meta)))
     return meta

 def writeHubGenome(ofh, db, inMeta):
     " create a hub.txt and genomes.txt file, hub.txt is just a template "
     meta = inMeta.get("hub", {})
     ofh.write("hub autoHub\n")
     ofh.write("shortLabel %s\n" % meta.get("shortLabel", "Auto-generated hub"))
     ofh.write("longLabel %s\n" % meta.get("longLabel", "Auto-generated hub"))
     #ofh.write("genomesFile genomes.txt\n")
     if "descriptionUrl" in meta:
         ofh.write("descriptionUrl %s\n" % meta["descriptionUrl"])
     ofh.write("email %s\n" % meta.get("email", "yourEmail@example.com"))
     ofh.write("useOneFile on\n\n")
     ofh.write("genome %s\n\n" % db)
     return ofh

 def readSubdirs(inDir, subDirs):
-    " given a list of dirs, find those that are composite dirs and those that are supertrack dirs "
+    " given a list of dirs, find those that are composite dirs (not supporting supertracks for now) "
     compDicts, superDicts = {}, {}

     for subDir in subDirs:
         subPath = join(inDir, subDir)
         subSubDirs, subDict = readFnames(subPath)
         if len(subDict)==0: # no files in this dir
             continue
         if len(subSubDirs)==0:
             compDicts[subDir] = subDict
         #else:
             #superDicts[subDir] = subDict

     return compDicts, superDicts

-def readDirs(inDir):
+def reorderDirs(compDirs, meta):
+    " order the directories in compDirs in the order in which they appear in the metadata -> this makes sure that the composites end up in the right order "
+    if len(meta)==0:
+        logging.debug("Not reordering these subdirectories: %s" % compDirs.keys())
+        return compDirs
+
+    # first use the names in the metadata, and put them in the right order
+    newCompDirs = OrderedDict()
+    for dirName in meta.keys():
+        if dirName in compDirs:
+            newCompDirs[dirName] = compDirs[dirName]
+
+    # then add everything else at the end
+    for dirName in compDirs:
+        if dirName not in newCompDirs:
+            newCompDirs[dirName] = compDirs[dirName]
+
+    logging.debug("Reordered input directories based on metadata. New order is: %s" % newCompDirs.keys())
+    return newCompDirs
+
+
+def readDirs(inDir, meta):
     " recurse down into directories and return containerType -> parentName -> fileBase -> type -> list of absPath "
     ret = {}
     subDirs, topFiles = readFnames(inDir)

     ret["top"] = { None : topFiles } # top-level track files have None as the parent

     compDirs, superDirs = readSubdirs(inDir, subDirs)
+    compDirs = reorderDirs(compDirs, meta)
     # superDirs not used yet, no time

     ret["comps"] = compDirs
     ret["supers"] = superDirs
     return ret

 def readFnames(inDir):
     " return dict with basename -> fileType -> filePath "
     fnameDict = defaultdict(dict)
     #tdbDir = abspath(dirname(trackDbPath))

     subDirs = []
     for fname in os.listdir(inDir):
         filePath = join(inDir, fname)
         if isdir(filePath):
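All four metadata formats share the same layout, a map of track name (or "hub") to key/value settings, so a tracks.yaml equivalent of the tracks.json example in the usage text could look like this, with hypothetical track name and labels:

    hub:
        hub: mouse_motor_atac
        shortLabel: scATAC-seq Developing Cranial Motor Neurons
    sample1:
        shortLabel: Sample 1
        visibility: full

Note that yaml.safe_load() returns plain dicts, which keep insertion order on Python 3.7+, so the order of the yaml file carries through to the reordering code below.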
@@ -257,145 +299,199 @@
             writeLn(ofh, indent, "parent %s" % tdb["parent"])
     writeLn(ofh, indent, "type %s" % trackType)
     writeLn(ofh, indent, "visibility %s" % visibility)
     if "bigDataUrl" in tdb:
         writeLn(ofh, indent, "bigDataUrl %s" % tdb["bigDataUrl"])

     for key, val in tdb.items():
         if key in ["track", "shortLabel", "longLabel", "type", "bigDataUrl", "visibility", "parent"]:
             continue
         writeLn(ofh, indent, "%s %s" % (key, val))

     ofh.write("\n")

 def metaOverride(tdb, meta):
     " override track info for one single track, from meta into tdb "
-    if not tdb["track"] in meta:
+    trackName = tdb["track"]
+
+    if trackName not in meta and "__" in trackName:
+        logging.debug("Using only basename of track %s" % trackName)
+        trackName = trackName.split("__")[1]
+
+    if trackName not in meta:
+        logging.debug("No meta info for track %s" % tdb["track"])
         return

-    trackMeta = meta[tdb["track"]]
+    trackMeta = meta[trackName]
     for key, val in trackMeta.items():
-        tdb[key] = trackMeta[key]
+        if val!="": # empty metadata cells do not override existing values
+            tdb[key] = trackMeta[key]

 def allMetaOverride(tdb, meta):
     " override track info for all tracks, from meta into tdb "
     if meta is None:
         return tdb

     for trackName in meta:
         trackMeta = meta[trackName]
         if trackName not in tdb:
             tdb[trackName] = {}
         trackTdb = tdb[trackName]

         for key, val in trackMeta.items():
             trackTdb[key] = val

     return tdb

+def reorderTracks(fileDict, meta):
+    " given an unsorted dictionary of files and ordered metadata, try to sort the files according to the metadata "
+    if len(meta)==0:
+        return fileDict # no metadata -> no ordering necessary
+
+    # meta is an OrderedDict, so its keys define the desired track order
+    trackOrder = list(meta.keys())
+
+    newFiles = OrderedDict()
+    doneTracks = set()
+
+    # first add the tracks in the order given by the metadata
+    for trackBase in trackOrder:
+        # the tsv file can have the track names either as basenames or as full tracknames;
+        # in composite mode, the tracknames contain the parent and the track type
+        if trackBase not in fileDict and "__" in trackBase:
+            trackBase = trackBase.split("__")[1]
+
+        if trackBase in fileDict:
+            newFiles[trackBase] = fileDict[trackBase]
+            doneTracks.add(trackBase)
+
+    logging.debug("Track ordering from metadata used: %s" % newFiles.keys())
+
+    # then add all other tracks
+    for trackBase, fileData in fileDict.items():
+        if trackBase not in doneTracks:
+            newFiles[trackBase] = fileDict[trackBase]
+            logging.debug("Not specified in metadata, so adding at the end: %s" % trackBase)
+
+    logging.debug("Final track order is: %s" % newFiles.keys())
+
+    assert(len(newFiles)==len(fileDict))
+    return newFiles
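As an example of the effect of reorderTracks(), with hypothetical base names: if the files on disk are found in the order sampleC, sampleA, sampleB, a tab-separated tracks.tsv of

    #track      shortLabel
    sampleB     B cells
    sampleA     A cells

yields the output order sampleB, sampleA, sampleC: tracks named in the metadata come first, in metadata order, and all remaining tracks are appended in their original order.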
 def writeTdb(inDir, dirDict, dirType, tdbDir, ofh):
     " given a dict with basename -> type -> filenames, write track entries to ofh "
+    # this code is getting increasingly complex because it supports composites/views and pairing of bigBed/bigWig files
+    # either this needs better comments or maybe a separate code path for this rare use case
     global compCount
     fnameDict = dirDict[dirType]
     for parentName, typeDict in fnameDict.items():
-        if parentName is None: # top level tracks
+        if parentName is None: # top level tracks: use the top-level tracks.tsv
             subDir = inDir
-        else: # container tracks
+        else: # container tracks -> use the tracks.tsv in the subdirectory
             subDir = join(inDir, parentName)

-        parentMeta = parseMeta(inDir)
+
+        parentMeta = parseMeta(subDir)

         indent = 0
         parentHasViews = False
         groupMeta = {}

         if dirType=="comps":
             tdb = {
                 "track" : parentName,
                 "shortLabel": parentName,
                 "visibility" : "dense",
                 "compositeTrack" : "on",
                 "autoScale" : "group",
                 "type" : "bed 4"
             }
             metaOverride(tdb, parentMeta)
+            groupMeta = parentMeta

             parentHasViews = mostFilesArePaired(typeDict)
             if parentHasViews:
                 tdb["subGroup1"] = "view Views PK=Peaks SIG=Signals"
                 logging.info("Container track %s has >80%% of paired files, activating views" % parentName)

             writeStanza(ofh, indent, tdb)
             indent = 4

             if parentHasViews:
+                # this composite has paired files -> write the track stanzas for the two views
                 groupMeta = parseMeta(subDir)

                 tdbViewPeaks = {
                     "track" : parentName+"ViewPeaks",
                     "shortLabel" : parentName+" Peaks",
                     "parent" : parentName,
                     "view" : "PK",
                     "visibility" : "dense",
                     "type" : "bigBed",
                     "scoreFilter" : "off",
                     "viewUi" : "on"
                 }
                 metaOverride(tdbViewPeaks, parentMeta)
                 writeStanza(ofh, indent, tdbViewPeaks)

                 tdbViewSig = {
                     "track" : parentName+"ViewSignal",
                     "shortLabel" : parentName+" Signal",
                     "parent" : parentName,
                     "view" : "SIG",
                     "visibility" : "dense",
                     "type" : "bigWig",
                     "viewUi" : "on"
                 }
                 metaOverride(tdbViewSig, parentMeta)
                 writeStanza(ofh, indent, tdbViewSig)

         else: # no composites
             groupMeta = parseMeta(subDir)

+        typeDict = reorderTracks(typeDict, groupMeta)
+
         for trackBase, typeFnames in typeDict.items():
             for fileType, absFnames in typeFnames.items():
                 assert(len(absFnames)==1) # for now, not sure what to do when we get multiple basenames of the same file type
                 absFname = absFnames[0]
                 fileBase = basename(absFname)
                 relFname = relpath(absFname, tdbDir)

                 labelSuff = ""
                 if parentHasViews:
                     if fileType=="bigWig":
                         labelSuff = " Signal"
                     elif fileType=="bigBed":
                         labelSuff = " Peaks"
                     else:
                         assert(False) # views and non-bigWig/Bed are not supported yet?

                 if parentName is not None:
-                    parentPrefix = parentName+"-"
+                    parentPrefix = parentName+"__"
                 else:
                     parentPrefix = ""

-                trackName = parentPrefix+trackBase+"-"+fileType
+                trackName = parentPrefix+trackBase+"__"+fileType

                 tdb = {
                     "track" : trackName,
                     "shortLabel" : trackBase+labelSuff,
                     "longLabel" : trackBase+labelSuff,
                     "visibility" : "dense",
                     "type" : fileType,
                     "bigDataUrl" : relFname,
                 }

                 if parentName:
                     tdb["parent"] = parentName

                 if parentHasViews:
                     onOff = "on"
                     if trackName in groupMeta and "visibility" in groupMeta[trackName]:
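The switch from "-" to "__" as the separator makes the generated track names parseable again: a composite leaf track is named parent__base__fileType, so split("__")[1] recovers the base name, which is what metaOverride() and reorderTracks() rely on when the metadata uses base names. A hypothetical example of the mapping:

    neurons/sample1.peaks.bigBed  ->  track "neurons__sample1__bigBed"
    "neurons__sample1__bigBed".split("__")[1]  ->  "sample1", the key tried in tracks.tsv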
@@ -476,62 +572,70 @@
     my_client = client.TusClient(serverUrl, headers={})
     logging.info(f"Target server is {serverUrl}")

     for fname in os.listdir(tdbDir):
         fpath = join(tdbDir, fname)
         if isdir(fpath):
             continue
         logging.info(f"Uploading {fpath}")
         meta = {"db":"hg19"}
         uploader = my_client.uploader(fpath, metadata=meta)
         uploader.upload()

 def iterRaStanzas(fname):
     " parse an ra-style (trackDb) file and yield dictionaries "
     data = dict()
+    logging.debug("Parsing %s in trackDb format" % fname)
     with open(fname, "rt") as ifh:
         for l in ifh:
             l = l.lstrip(" ").rstrip("\r\n")
             if len(l)==0:
                 yield data
                 data = dict()
             else:
                 if " " not in l:
                     continue
                 key, val = l.split(" ", maxsplit=1)
                 data[key] = val

     if len(data)!=0:
         yield data

 def raToTab(fname):
     " convert .ra file to .tsv "
     stanzas = []
     allFields = set()
     for stanza in iterRaStanzas(fname):
         if "hub" in stanza or "genome" in stanza:
             continue
         allFields.update(stanza.keys())
         stanzas.append(stanza)

     if "track" in allFields:
         allFields.remove("track")
     if "shortLabel" in allFields:
         allFields.remove("shortLabel")
+    hasLongLabel = False
+    if "longLabel" in allFields:
+        allFields.remove("longLabel")
+        hasLongLabel = True
+
     sortedFields = sorted(list(allFields))
-    # make sure that track and shortLabel are first and always there, handy for manual edits
+    # make sure that track, shortLabel and longLabel come first and are always there, handy for manual edits
+    if hasLongLabel:
+        sortedFields.insert(0, "longLabel")
     sortedFields.insert(0, "shortLabel")
     sortedFields.insert(0, "track")

     ofh = sys.stdout

     ofh.write("#")
     ofh.write("\t".join(sortedFields))
     ofh.write("\n")

     for s in stanzas:
         row = []
         for fieldName in sortedFields:
             row.append(s.get(fieldName, ""))
         ofh.write("\t".join(row))
         ofh.write("\n")
@@ -546,41 +650,45 @@
     if cmd=="up":
         uploadFiles(inDir)
         return

     tdbDir = inDir
     if options.outDir:
         tdbDir = options.outDir

     if cmd=="jbrowse":
         importJbrowse(args[1], args[2], tdbDir)
     elif cmd == "tab":
         raToTab(args[1])
     elif cmd == "make":
         db = args[1]
-        dirFiles = readDirs(inDir)
+
+        meta = parseMeta(inDir)
+        dirFiles = readDirs(inDir, meta)

         hubFname = join(tdbDir, "hub.txt")
         logging.info("Writing %s" % hubFname)
         ofh = open(hubFname, "w")

-        meta = parseMeta(inDir)
         writeHubGenome(ofh, db, meta)

         writeTdb(inDir, dirFiles, "top", tdbDir, ofh)
         writeTdb(inDir, dirFiles, "comps", tdbDir, ofh)

         ofh.close()
+    else:
+        logging.error("Unknown command: '%s'" % cmd)

 # ----------- main --------------
 def main():
     args, options = parseArgs()

     hubt(args, options)

     #if options.test:
         #logging.debug("test is set")
         #f = open(options.file, "r")

 main()
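With these changes, 'tab' and 'make' form a round trip: dump an existing hub to a tab-separated file, bulk-edit it, and rebuild. For a hypothetical hub, 'hubtools tab hub.txt > tracks.tsv' would print a header starting like

    #track      shortLabel      longLabel       bigDataUrl      ...

and 'hubtools make hg38' then re-reads the edited tracks.tsv as metadata (the leading "#" on the header line is now accepted by parseMetaTsv), with its row order defining the composite and track order.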