#!/usr/bin/env python3
# Provenance: this is the removed side of the patch for src/utils/hubMake/hubMake
# from commit 05e67c59a20a5d00b810a981aef3b00c5bef82e1 (max, Fri Sep 20 06:03:18
# 2024 -0700, "more features to hubtools: search in both parent and subdirs,
# better docs"), which deleted that file.
# NOTE(review): the patch text had lost its line breaks and indentation, so the
# block nesting was rebuilt from the statements themselves. The one place where
# that is a real inference is marked in writeTdb() - confirm against history.
"""Create a single-file track hub (hub.txt) from the bigBed/bigWig files of a directory."""

import json
import logging
import optparse
import os
import sys
from collections import defaultdict
from os.path import join, basename, dirname, isfile, relpath, abspath, splitext, isdir

# settings that writeStanza() prints itself, in a fixed order, before all others
_FIXED_ORDER_KEYS = frozenset(
    ["track", "shortLabel", "longLabel", "type", "bigDataUrl", "visibility", "parent"])

# file extension (lower case) -> track type
_EXT_TO_TYPE = {".bw": "bigWig", ".bigwig": "bigWig", ".bb": "bigBed", ".bigbed": "bigBed"}

# per file type: (label suffix, view track suffix, subGroups setting) used when views are active
_VIEW_INFO = {
    "bigBed": (" Peaks", "ViewPeaks", "view=PK"),
    "bigWig": (" Signal", "ViewSignal", "view=SIG"),
}

# ==== functions =====

def parseArgs():
    """Set up logging and parse the command line; -h shows the auto-generated help.

    Prints the help page and exits with status 1 when no <db> argument is given.

    Returns:
        (args, options): the list of positional arguments (args[0] is the db)
        and the optparse options object with inDir, outDir and debug.
    """
    parser = optparse.OptionParser("""usage: %prog [options] <db> - create a track hub for all bigBed/bigWig files under a directory. Creates single-file hub.txt

    - bigBed/bigWig files in the current directory will be top level tracks
    - big* files in subdirectories become composites
    - the part before the first dot is the track base name
    - if a directory has more than 80% of track base names with both a bigBed and bigWig file, views are activated for this composite
    - track attributes can be changed using tracks.tsv, tracks.json or tracks.ra files, in each top or subdirectory
    - tracks.tsv must have the first column named 'track'.""")

    parser.add_option("-i", "--inDir", dest="inDir", action="store",
            help="Input directory where files are stored. Default is current directory")
    parser.add_option("-o", "--outDir", dest="outDir", action="store",
            help="Output directory where hub.txt file is created. Default is same as input directory.")
    parser.add_option("-d", "--debug", dest="debug", action="store_true",
            help="show verbose debug messages")
    (options, args) = parser.parse_args()

    if len(args)==0:
        parser.print_help()
        sys.exit(1)

    logging.basicConfig(level=logging.DEBUG if options.debug else logging.INFO)
    return args, options

def errAbort(msg):
    """Log msg as an error and exit the program with status 1."""
    logging.error(msg)
    sys.exit(1)

def _addRaStanza(ret, fname, trackName, stanzaData):
    " store one finished .ra stanza in ret; abort if it has no track name or repeats one "
    if trackName is None:
        errAbort("File %s has a stanza without a track name" % fname)
    if trackName in ret:
        errAbort("File %s contains two stanzas with the same track name '%s' " % (fname, trackName))
    ret[trackName] = stanzaData

def parseMetaRa(fname):
    """Parse a tracks.ra file and return a dict trackName -> dict of key -> val.

    Stanzas are separated by empty lines, lines starting with '#' are skipped,
    and the first word of every other line is the key, the rest the value
    (empty if there is only a key). The 'track' line names the stanza and is
    not part of its key -> val dict. Aborts via errAbort() on a stanza without
    'track' line or on a repeated track name; the last stanza is checked in
    the same way even if the file does not end with an empty line.
    """
    logging.debug("Reading %s as .ra" % fname)
    ret = {}
    trackName = None
    stanzaData = {}
    with open(fname) as ifh:
        for line in ifh:
            line = line.strip()
            if line.startswith("#"):
                continue

            if line=="":
                if trackName is None and len(stanzaData)==0:
                    # several empty lines in a row
                    continue
                _addRaStanza(ret, fname, trackName, stanzaData)
                trackName, stanzaData = None, {}
                continue

            fields = line.split(None, 1)
            key = fields[0]
            val = fields[1] if len(fields) > 1 else ""
            if key=="track":
                trackName = val
            else:
                stanzaData[key] = val

    if trackName is not None or len(stanzaData)!=0:
        _addRaStanza(ret, fname, trackName, stanzaData)

    logging.debug("Got %s from .ra" % str(ret))
    return ret

def parseMetaTsv(fname):
    """Parse a tracks.tsv file and return a dict trackName -> dict of key -> val.

    The first line is the header and its first column must be 'track'. Every
    other non-empty line must have as many tab-separated fields as the header;
    its first field is the track name, the others are stored under their
    header names. Aborts via errAbort() if the file does not follow this.
    """
    headers = None
    meta = {}
    logging.debug("Parsing %s as tab-sep" % fname)
    with open(fname) as ifh:
        for lineNo, line in enumerate(ifh, start=1):
            line = line.rstrip("\r\n")
            if headers is None:
                headers = line.split("\t")
                if headers[0]!="track":
                    errAbort("File %s: the first column of the header line must be 'track'" % fname)
                continue

            if line=="":
                # tolerate empty lines, e.g. at the end of the file
                continue

            row = line.split("\t")
            if len(row)!=len(headers):
                errAbort("File %s, line %d: found %d fields, but the header line has %d"
                        % (fname, lineNo, len(row), len(headers)))
            meta[row[0]] = dict(zip(headers[1:], row[1:]))
    return meta

def parseMetaJson(fname):
    """Load a tracks.json file and return its content as-is."""
    logging.debug("Reading %s as json" % fname)
    with open(fname) as ifh:
        return json.load(ifh)

def parseMeta(inDir):
    """Read the track override files of inDir and return trackName -> dict of key -> val.

    tracks.tsv, tracks.json and tracks.ra are read in this order, if they
    exist. For the same track and key, a file read later wins.
    """
    parsers = (
        ("tracks.tsv", parseMetaTsv),
        ("tracks.json", parseMetaJson),
        ("tracks.ra", parseMetaRa),
    )
    meta = {}
    for metaBase, parseFunc in parsers:
        fname = join(inDir, metaBase)
        if isfile(fname):
            meta = allMetaOverride(meta, parseFunc(fname))

    logging.debug("Got overrides from %s: %s" % (inDir, str(meta)))
    return meta

def writeHubGenome(ofh, db, inMeta):
    """Write the hub and genome stanzas of a single-file hub.txt to ofh.

    The labels, the email and the optional descriptionUrl come from the 'hub'
    entry of inMeta, with placeholder defaults if that entry or a key is
    missing. Returns ofh.
    """
    meta = inMeta.get("hub", {})
    ofh.write("hub autoHub\n")
    ofh.write("shortLabel %s\n" % meta.get("shortLabel", "Auto-generated hub"))
    ofh.write("longLabel %s\n" % meta.get("longLabel", "Auto-generated hub"))
    if "descriptionUrl" in meta:
        ofh.write("descriptionUrl %s\n" % meta["descriptionUrl"])
    ofh.write("email %s\n" % meta.get("email", "yourEmail@example.com"))
    ofh.write("useOneFile on\n\n")

    ofh.write("genome %s\n\n" % db)
    return ofh

def readSubdirs(inDir, subDirs):
    """Return (compDicts, superDicts) for the directories subDirs found in inDir.

    A directory with track files and no directories of its own becomes a
    composite: compDicts[subDir] is its readFnames() dict. Directories without
    track files are skipped. Directories that have both track files and
    subdirectories are not handled yet, so superDicts is always empty.
    """
    compDicts, superDicts = {}, {}
    for subDir in subDirs:
        subSubDirs, subDict = readFnames(join(inDir, subDir))
        if len(subDict)==0:
            # no track files in this dir
            continue
        if len(subSubDirs)==0:
            compDicts[subDir] = subDict

    return compDicts, superDicts

def readDirs(inDir):
    """Scan inDir and return containerType -> parentName -> fileBase -> fileType -> list of absPath.

    containerType is 'top' (track files directly in inDir, stored under the
    parent name None), 'comps' (one entry per composite subdirectory) or
    'supers' (always empty for now, see readSubdirs).
    """
    subDirs, topFiles = readFnames(inDir)
    compDirs, superDirs = readSubdirs(inDir, subDirs)
    return {
        "top" : { None : topFiles }, # top-level track files have None as the parent
        "comps" : compDirs,
        "supers" : superDirs, # not used yet
    }

def readFnames(inDir):
    """List inDir and return (subDirs, fnameDict).

    subDirs is the list of directory names in inDir. fnameDict maps
    fileBase -> fileType -> list of absolute paths for the files ending in
    .bw/.bigWig (type bigWig) or .bb/.bigBed (type bigBed), in any case.
    Other files are skipped. Entries are visited in sorted order, so the
    result does not depend on the order in which the OS lists the directory.
    """
    fnameDict = {}
    subDirs = []
    for fname in sorted(os.listdir(inDir)):
        filePath = join(inDir, fname)
        if isdir(filePath):
            subDirs.append(fname)
            continue

        fileType = _EXT_TO_TYPE.get(splitext(fname)[1].lower())
        if fileType is None:
            logging.debug("file %s is not bigBed nor bigWig, skipping" % fname)
            continue

        # use the part before the first dot, not the one before the extension, as the track name
        # this means that a.scaled.bigBed and a.raw.bigWig get paired correctly
        fileBase = basename(fname).split(".")[0]
        fnameDict.setdefault(fileBase, {}).setdefault(fileType, []).append(abspath(filePath))
    return subDirs, fnameDict

def mostFilesArePaired(fnameDict):
    """Return True if more than 80% of the base names have both a bigBed and a bigWig file.

    An empty fnameDict has no pairs, so the result is False.
    """
    if len(fnameDict)==0:
        return False

    pairCount = sum(1 for typeDict in fnameDict.values()
            if "bigBed" in typeDict and "bigWig" in typeDict)
    return ( pairCount / len(fnameDict) > 0.8 )

def writeLn(ofh, spaceCount, line):
    """Write line to ofh, with spaceCount spaces before it and a newline after it."""
    ofh.write(" " * spaceCount + line + "\n")

def writeStanza(ofh, indent, tdb):
    """Write one stanza to ofh, given a tdb key -> val dict, followed by an empty line.

    tdb must have 'track' and 'type'. shortLabel defaults to the track name,
    longLabel to the shortLabel, visibility to 'pack'. These settings, parent
    and bigDataUrl are written first, in a fixed order, then all other keys in
    the order of the dict. Every line is indented by indent spaces.
    """
    track = tdb["track"]
    shortLabel = tdb.get("shortLabel", track)
    longLabel = tdb.get("longLabel", shortLabel)

    writeLn(ofh, indent, "track %s" % track)
    writeLn(ofh, indent, "shortLabel %s" % shortLabel)
    if longLabel:
        writeLn(ofh, indent, "longLabel %s" % longLabel)
    if "parent" in tdb:
        writeLn(ofh, indent, "parent %s" % tdb["parent"])
    writeLn(ofh, indent, "type %s" % tdb["type"])
    writeLn(ofh, indent, "visibility %s" % tdb.get("visibility", "pack"))
    if "bigDataUrl" in tdb:
        writeLn(ofh, indent, "bigDataUrl %s" % tdb["bigDataUrl"])

    for key, val in tdb.items():
        if key not in _FIXED_ORDER_KEYS:
            writeLn(ofh, indent, "%s %s" % (key, val))

    ofh.write("\n")

def metaOverride(tdb, meta):
    """Override the settings of one single track, in place: copy meta[tdb['track']] into tdb.

    Does nothing if meta has no entry for this track.
    """
    trackMeta = meta.get(tdb["track"])
    if trackMeta is not None:
        tdb.update(trackMeta)

def allMetaOverride(tdb, meta):
    """Override the settings of all tracks: merge trackName -> key -> val from meta into tdb.

    Tracks missing from tdb are added. tdb is changed in place and returned;
    a meta of None leaves it as it is.
    """
    if meta is None:
        return tdb

    for trackName, trackMeta in meta.items():
        tdb.setdefault(trackName, {}).update(trackMeta)

    return tdb

def _writeViewStanzas(ofh, indent, parentName, parentMeta):
    " write the two view stanzas (Peaks for bigBed, Signal for bigWig) of a composite "
    tdbViewPeaks = {
        "track" : parentName+"ViewPeaks",
        "shortLabel" : parentName+" Peaks",
        "parent" : parentName,
        "view" : "PK",
        "visibility" : "dense",
        "type" : "bigBed",
        "scoreFilter" : "off",
        "viewUi" : "on"
    }
    metaOverride(tdbViewPeaks, parentMeta)
    writeStanza(ofh, indent, tdbViewPeaks)

    tdbViewSig = {
        "track" : parentName+"ViewSignal",
        "shortLabel" : parentName+" Signal",
        "parent" : parentName,
        "view" : "SIG",
        "visibility" : "dense",
        "type" : "bigWig",
        "viewUi" : "on"
    }
    metaOverride(tdbViewSig, parentMeta)
    writeStanza(ofh, indent, tdbViewSig)

def _makeTrackTdb(parentName, parentHasViews, trackBase, fileType, relFname, groupMeta):
    " return the tdb dict of one track file, with the overrides from groupMeta applied "
    labelSuff, viewSuff, subGroups = _VIEW_INFO[fileType] if parentHasViews else ("", None, None)
    parentPrefix = parentName+"-" if parentName is not None else ""
    trackName = parentPrefix+trackBase+"-"+fileType

    tdb = {
        "track" : trackName,
        "shortLabel": trackBase+labelSuff,
        "longLabel": trackBase+labelSuff,
        "visibility" : "dense",
        "type" : fileType,
        "bigDataUrl" : relFname,
    }
    if parentName:
        tdb["parent"] = parentName

    hasVisOverride = trackName in groupMeta and "visibility" in groupMeta[trackName]

    if parentHasViews:
        # in a composite with views, hiding a track means switching it off on the parent line
        onOff = "on"
        if hasVisOverride and groupMeta[trackName]["visibility"]=="hide":
            onOff = "off"
        tdb["parent"] = parentName+viewSuff+" "+onOff
        tdb["subGroups"] = subGroups

    metaOverride(tdb, groupMeta)

    # A visibility override is never written as such: the key is removed, so
    # writeStanza() uses its default visibility.
    # NOTE(review): outside of view composites this discards the override
    # without any other effect - kept as found, confirm that it is intended.
    if hasVisOverride:
        tdb.pop("visibility", None)

    return tdb

def writeTdb(inDir, dirDict, dirType, tdbDir, ofh):
    """Write the track stanzas for dirDict[dirType] to ofh.

    dirDict is the result of readDirs(), dirType one of its keys ('top' or
    'comps'). With 'comps', every parent gets a composite stanza first, plus
    two view stanzas if mostFilesArePaired() is true for its files, and its
    tracks are indented by four spaces. bigDataUrl is the path of the file
    relative to tdbDir. Overrides for composites and views are read from
    inDir, overrides for the tracks from the directory of the track files.
    Aborts via errAbort() if one base name has several files of the same type.
    """
    fnameDict = dirDict[dirType]
    if len(fnameDict)==0:
        return

    # the same for all parents, so read it only once
    parentMeta = parseMeta(inDir)

    for parentName, typeDict in fnameDict.items():
        if parentName is None: # top level tracks
            subDir = inDir
        else: # container tracks
            subDir = join(inDir, parentName)

        indent = 0
        parentHasViews = False

        if dirType=="comps":
            tdb = {
                "track" : parentName,
                "shortLabel": parentName,
                "visibility" : "dense",
                "compositeTrack" : "on",
                "autoScale" : "group",
                "type" : "bed 4"
            }
            metaOverride(tdb, parentMeta)

            parentHasViews = mostFilesArePaired(typeDict)
            if parentHasViews:
                tdb["subGroup1"] = "view Views PK=Peaks SIG=Signals"
                logging.info("Container track %s has >80%% of paired files, activating views" % parentName)

            writeStanza(ofh, indent, tdb)
            indent = 4

        # NOTE(review): the track overrides are read for every dirType, not only
        # for composites; the tracks below need them in both cases. This nesting
        # is inferred, the indentation of the patch was lost.
        groupMeta = parseMeta(subDir)

        if parentHasViews:
            _writeViewStanzas(ofh, indent, parentName, parentMeta)

        for trackBase, typeFnames in typeDict.items():
            for fileType, absFnames in typeFnames.items():
                if len(absFnames)!=1:
                    # not sure what to do with several files of the same type for one base name
                    errAbort("Track base name '%s' has more than one %s file: %s"
                            % (trackBase, fileType, ", ".join(absFnames)))
                relFname = relpath(absFnames[0], tdbDir)
                tdb = _makeTrackTdb(parentName, parentHasViews, trackBase, fileType, relFname, groupMeta)
                writeStanza(ofh, indent, tdb)

def makeTrackHub(db, options):
    """Find the .bw/.bb files under the input directory and write a hub.txt for them.

    The input directory is options.inDir, or the current directory if it is
    not set. hub.txt goes to options.outDir, which is created if needed, or to
    the input directory if it is not set. db is written as the genome of the hub.
    """
    inDir = options.inDir or "."
    tdbDir = options.outDir or inDir

    dirFiles = readDirs(inDir)
    meta = parseMeta(inDir)

    os.makedirs(tdbDir, exist_ok=True)
    hubFname = join(tdbDir, "hub.txt")
    print("Writing %s" % hubFname)
    with open(hubFname, "w") as ofh:
        writeHubGenome(ofh, db, meta)
        writeTdb(inDir, dirFiles, "top", tdbDir, ofh)
        writeTdb(inDir, dirFiles, "comps", tdbDir, ofh)


# ----------- main --------------
def main():
    """Command line entry point: parse the arguments and write the hub for <db>."""
    args, options = parseArgs()
    makeTrackHub(args[0], options)

# only run when started as a program, so that importing the file has no side effects
if __name__ == "__main__":
    main()