fea4e31015340a94c10cec9261bfaaefc733b52f max Thu Sep 19 09:34:17 2024 -0700 adding features to hubtools: yaml support, reorder composites by metadata, order tracks by metadata, allow either baseNames or full trackNames in the metadata, for this and all other metadata lookups

diff --git src/utils/hubtools/hubtools src/utils/hubtools/hubtools
index c6169c3..a9c06a7 100755
--- src/utils/hubtools/hubtools
+++ src/utils/hubtools/hubtools
@@ -1,20 +1,21 @@
 #!/usr/bin/env python3

 import logging, sys, optparse, os, json, subprocess, shutil, string
-from collections import defaultdict
+from collections import defaultdict, OrderedDict
 from os.path import join, basename, dirname, isfile, relpath, abspath, splitext, isdir
+#import yaml # not imported here, so pyyaml is not a hard requirement; it is lazy-loaded in parseMetaYaml()

 # ==== functions =====
 def parseArgs():
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("""usage: %prog [options] <cmd> - create and edit UCSC track hubs

 hubtools make <db>: create a track hub for all bigBed/bigWig files under a directory. Creates single-file hub.txt
 - bigBed/bigWig files in the current directory will be top level tracks
 - big* files in subdirectories become composites
 - for every filename, the part before the first dot becomes the track base name
 - if a directory has more than 80% of track base names with both a bigBed and bigWig file, views are activated for this composite
 - track attributes can be changed using tracks.tsv or tracks.json files, in each top or subdirectory
@@ -23,30 +24,34 @@
 hubtools up: upload files to hubSpace
 - needs ~/.hubt.conf with username and password. Create one with 'hubt conf'
 - uploads all files from the -i directory or the current dir if not specified.

 hubtools jbrowse <url> <db>: convert Jbrowse trackList.json files to hub.txt.
 - <url> is the URL to the Jbrowse2 installation, e.g. http://furlonglab.embl.de/FurlongBrowser/
 - <db> is the assembly identifier

 hubtools tab <fname>: convert a hub.txt or trackDb.txt to tab-sep format, easier to bulk-edit with sed/cut/etc.
 - <fname> is the input filename. "hub" and "genome" stanzas are skipped.
 - output goes to stdout

 Examples:
 hubtools make hg38
 hubtools jbrowse http://furlonglab.embl.de/FurlongBrowser/ dm3
+hubtools tab hub.txt > tracks.tsv
+
+tracks.json can look like this:
+{ "hub" : {"hub": "mouse_motor_atac", "shortLabel":"scATAC-seq Developing Cranial Motor Neurons"} }
 """)

     parser.add_option("-i", "--inDir", dest="inDir", action="store", help="Input directory where files are stored. Default is current directory")
     parser.add_option("-o", "--outDir", dest="outDir", action="store", help="Output directory where the hub.txt file is created. Default is same as input directory.")
     #parser.add_option("", "--igv", dest="igv", action="store", help="import an igv.js trackList.json file hierarchy")
     parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show verbose debug messages")
     parser.add_option("-u", "--upload", dest="upload", action="store_true", help="upload all files from outDir to hubSpace")

     (options, args) = parser.parse_args()

     if len(args)==0:
         parser.print_help()
         exit(1)
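To make the 'make' rules above concrete, a hypothetical input directory like

    myInputDir/
        cells.bigWig                top-level track
        neurons/                    becomes the composite "neurons"
            sample1.peaks.bigBed    track base name "sample1"
            sample1.cov.bigWig      track base name "sample1"
            tracks.tsv              optional metadata for this composite

would produce one top-level track and one composite; since every base name in neurons/ has both a bigBed and a bigWig file, the composite would get Peaks/Signals views.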
@@ -89,113 +94,150 @@
         if key == "track":
             trackName = val
             continue
         else:
             stanzaData[key] = val

     if len(stanzaData)!=0:
         ret[trackName] = stanzaData

     logging.debug("Got %s from .ra" % str(ret))
     return ret

 def parseMetaTsv(fname):
     " parse a tracks.tsv file and return as a dict of trackName -> dict of key->val "
     headers = None
     meta = {}
-    logging.debug("Parsing %s as tab-sep" % fname)
+    logging.debug("Parsing track metadata from %s in tsv format" % fname)
     for line in open(fname):
         row = line.rstrip("\r\n").split("\t")
         if headers is None:
-            assert(line.startswith("track\t"))
+            assert(line.startswith("track\t") or line.startswith("#track"))
+            row[0] = row[0].lstrip("#")
             headers = row
             continue

         assert(len(row)==len(headers))

         key = row[0]
         rowDict = {}
         for header, val in zip(headers[1:], row[1:]):
             rowDict[header] = val
         #row = {k:v for k,v in zip(headers, fs)}
         meta[key] = rowDict
     return meta

 def parseMetaJson(fname):
     " parse a json file and merge it into meta and return "
     logging.debug("Reading %s as json" % fname)
     newMeta = json.load(open(fname))
     return newMeta

+def parseMetaYaml(fname):
+    " parse yaml file "
+    import yaml # if this doesn't work, run 'pip install pyyaml'
+    with open(fname) as stream:
+        try:
+            return yaml.safe_load(stream)
+        except yaml.YAMLError as exc:
+            logging.error(exc)
+
 def parseMeta(inDir):
-    " parse a tab-sep file with headers and return a dict firstField -> dictionary "
+    " parse tracks.tsv / tracks.json / tracks.ra / tracks.yaml in inDir and return an ordered dict trackName -> dict of key->val "
     fname = join(inDir, "tracks.tsv")
-    meta = {}
+    meta = OrderedDict()
     if isfile(fname):
         tsvMeta = parseMetaTsv(fname)
         meta = allMetaOverride(meta, tsvMeta)

     fname = join(inDir, "tracks.json")
     if isfile(fname):
         jsonMeta = parseMetaJson(fname)
         meta = allMetaOverride(meta, jsonMeta)

     fname = join(inDir, "tracks.ra")
     if isfile(fname):
         raMeta = parseMetaRa(fname)
         meta = allMetaOverride(meta, raMeta)

-    logging.debug("Got overrides from %s: %s" % (inDir, str(meta)))
+    fname = join(inDir, "tracks.yaml")
+    if isfile(fname):
+        yamlMeta = parseMetaYaml(fname)
+        meta = allMetaOverride(meta, yamlMeta)
+
+    logging.debug("Got meta from %s: %s" % (inDir, str(meta)))
     return meta

 def writeHubGenome(ofh, db, inMeta):
     " create a hub.txt and genomes.txt file, hub.txt is just a template "
     meta = inMeta.get("hub", {})
     ofh.write("hub autoHub\n")
     ofh.write("shortLabel %s\n" % meta.get("shortLabel", "Auto-generated hub"))
     ofh.write("longLabel %s\n" % meta.get("longLabel", "Auto-generated hub"))
     #ofh.write("genomesFile genomes.txt\n")
     if "descriptionUrl" in meta:
         ofh.write("descriptionUrl %s\n" % meta["descriptionUrl"])
     ofh.write("email %s\n" % meta.get("email", "yourEmail@example.com"))
     ofh.write("useOneFile on\n\n")
     ofh.write("genome %s\n\n" % db)
     return ofh

 def readSubdirs(inDir, subDirs):
-    " given a list of dirs, find those that are composite dirs and those that are supertrack dirs "
+    " given a list of dirs, find those that are composite dirs (not supporting supertracks for now) "
     compDicts, superDicts = {}, {}

     for subDir in subDirs:
         subPath = join(inDir, subDir)
         subSubDirs, subDict = readFnames(subPath)
         if len(subDict)==0: # no files in this dir
             continue
         if len(subSubDirs)==0:
             compDicts[subDir] = subDict
         #else:
             #superDicts[subDir] = subDict

     return compDicts, superDicts

-def readDirs(inDir):
+def reorderDirs(compDirs, meta):
+    " order the directories in compDirs in the order in which they appear in the metadata -> this makes sure that the composites end up in the right order "
+    if len(meta)==0:
+        logging.debug("Not reordering these subdirectories: %s" % compDirs.keys())
+        return compDirs
+
+    # first use the names in the metadata, and put them in the right order
+    newCompDirs = OrderedDict()
+    for dirName in meta.keys():
+        if dirName in compDirs:
+            newCompDirs[dirName] = compDirs[dirName]
+
+    # then add everything else at the end
+    for dirName in compDirs:
+        if dirName not in newCompDirs:
+            newCompDirs[dirName] = compDirs[dirName]
+
+    logging.debug("Reordered input directories based on metadata. New order is: %s" % newCompDirs.keys())
+    return newCompDirs
+
+
+def readDirs(inDir, meta):
     " recurse down into directories and return containerType -> parentName -> fileBase -> type -> list of absPath "
     ret = {}
     subDirs, topFiles = readFnames(inDir)

     ret["top"] = { None : topFiles } # top-level track files have None as the parent

     compDirs, superDirs = readSubdirs(inDir, subDirs)
+    compDirs = reorderDirs(compDirs, meta)
     # superDirs not used yet, no time

     ret["comps"] = compDirs
     ret["supers"] = superDirs
     return ret

 def readFnames(inDir):
     " return dict with basename -> fileType -> filePath "
     fnameDict = defaultdict(dict)
     #tdbDir = abspath(dirname(trackDbPath))

     subDirs = []
     for fname in os.listdir(inDir):
         filePath = join(inDir, fname)
         if isdir(filePath):
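All four metadata formats share the same layout, a map of track name (or "hub") to key/value settings, so a tracks.yaml equivalent of the tracks.json example in the usage text could look like this, with hypothetical track name and labels:

    hub:
        hub: mouse_motor_atac
        shortLabel: scATAC-seq Developing Cranial Motor Neurons
    sample1:
        shortLabel: Sample 1
        visibility: full

Note that yaml.safe_load() returns plain dicts, which keep insertion order on Python 3.7+, so the order of the yaml file carries through to the reordering code below.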
@@ -257,145 +299,199 @@
             writeLn(ofh, indent, "parent %s" % tdb["parent"])
     writeLn(ofh, indent, "type %s" % trackType)
     writeLn(ofh, indent, "visibility %s" % visibility)
     if "bigDataUrl" in tdb:
         writeLn(ofh, indent, "bigDataUrl %s" % tdb["bigDataUrl"])

     for key, val in tdb.items():
         if key in ["track", "shortLabel", "longLabel", "type", "bigDataUrl", "visibility", "parent"]:
             continue
         writeLn(ofh, indent, "%s %s" % (key, val))

     ofh.write("\n")

 def metaOverride(tdb, meta):
     " override track info for one single track, from meta into tdb "
-    if not tdb["track"] in meta:
+    trackName = tdb["track"]
+
+    if trackName not in meta and "__" in trackName:
+        logging.debug("Using only basename of track %s" % trackName)
+        trackName = trackName.split("__")[1]
+
+    if trackName not in meta:
+        logging.debug("No meta info for track %s" % tdb["track"])
         return

-    trackMeta = meta[tdb["track"]]
+    trackMeta = meta[trackName]
     for key, val in trackMeta.items():
-        tdb[key] = trackMeta[key]
+        if val!="": # empty metadata cells do not override existing values
+            tdb[key] = trackMeta[key]

 def allMetaOverride(tdb, meta):
     " override track info for all tracks, from meta into tdb "
     if meta is None:
         return tdb

     for trackName in meta:
         trackMeta = meta[trackName]
         if trackName not in tdb:
             tdb[trackName] = {}
         trackTdb = tdb[trackName]

         for key, val in trackMeta.items():
             trackTdb[key] = val

     return tdb

+def reorderTracks(fileDict, meta):
+    " given an unsorted dictionary of files and ordered metadata, try to sort the files according to the metadata "
+    if len(meta)==0:
+        return fileDict # no metadata -> no ordering necessary
+
+    # meta is an OrderedDict, so its keys define the desired track order
+    trackOrder = list(meta.keys())
+
+    newFiles = OrderedDict()
+    doneTracks = set()
+
+    # first add the tracks in the order given by the metadata
+    for trackBase in trackOrder:
+        # the tsv file can have the track names either as basenames or as full tracknames;
+        # in composite mode, the tracknames contain the parent and the track type
+        if trackBase not in fileDict and "__" in trackBase:
+            trackBase = trackBase.split("__")[1]
+
+        if trackBase in fileDict:
+            newFiles[trackBase] = fileDict[trackBase]
+            doneTracks.add(trackBase)
+
+    logging.debug("Track ordering from metadata used: %s" % newFiles.keys())
+
+    # then add all other tracks
+    for trackBase, fileData in fileDict.items():
+        if trackBase not in doneTracks:
+            newFiles[trackBase] = fileDict[trackBase]
+            logging.debug("Not specified in metadata, so adding at the end: %s" % trackBase)
+
+    logging.debug("Final track order is: %s" % newFiles.keys())
+
+    assert(len(newFiles)==len(fileDict))
+    return newFiles
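As an example of the effect of reorderTracks(), with hypothetical base names: if the files on disk are found in the order sampleC, sampleA, sampleB, a tab-separated tracks.tsv of

    #track      shortLabel
    sampleB     B cells
    sampleA     A cells

yields the output order sampleB, sampleA, sampleC: tracks named in the metadata come first, in metadata order, and all remaining tracks are appended in their original order.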
 def writeTdb(inDir, dirDict, dirType, tdbDir, ofh):
     " given a dict with basename -> type -> filenames, write track entries to ofh "
+    # this code is getting increasingly complex because it supports composites/views and pairing of bigBed/bigWig files
+    # either this needs better comments or maybe a separate code path for this rare use case
     global compCount
     fnameDict = dirDict[dirType]
     for parentName, typeDict in fnameDict.items():
-        if parentName is None: # top level tracks
+        if parentName is None: # top level tracks: use the top-level tracks.tsv
             subDir = inDir
-        else: # container tracks
+        else: # container tracks -> use the tracks.tsv in the subdirectory
             subDir = join(inDir, parentName)

-        parentMeta = parseMeta(inDir)
+
+        parentMeta = parseMeta(subDir)

         indent = 0
         parentHasViews = False
         groupMeta = {}

         if dirType=="comps":
             tdb = {
                 "track" : parentName,
                 "shortLabel": parentName,
                 "visibility" : "dense",
                 "compositeTrack" : "on",
                 "autoScale" : "group",
                 "type" : "bed 4"
             }
             metaOverride(tdb, parentMeta)
+            groupMeta = parentMeta

             parentHasViews = mostFilesArePaired(typeDict)
             if parentHasViews:
                 tdb["subGroup1"] = "view Views PK=Peaks SIG=Signals"
                 logging.info("Container track %s has >80%% of paired files, activating views" % parentName)

             writeStanza(ofh, indent, tdb)
             indent = 4

             if parentHasViews:
+                # this composite has paired files -> write the track stanzas for the two views
                 groupMeta = parseMeta(subDir)

                 tdbViewPeaks = {
                     "track" : parentName+"ViewPeaks",
                     "shortLabel" : parentName+" Peaks",
                     "parent" : parentName,
                     "view" : "PK",
                     "visibility" : "dense",
                     "type" : "bigBed",
                     "scoreFilter" : "off",
                     "viewUi" : "on"
                 }
                 metaOverride(tdbViewPeaks, parentMeta)
                 writeStanza(ofh, indent, tdbViewPeaks)

                 tdbViewSig = {
                     "track" : parentName+"ViewSignal",
                     "shortLabel" : parentName+" Signal",
                     "parent" : parentName,
                     "view" : "SIG",
                     "visibility" : "dense",
                     "type" : "bigWig",
                     "viewUi" : "on"
                 }
                 metaOverride(tdbViewSig, parentMeta)
                 writeStanza(ofh, indent, tdbViewSig)

         else: # no composites
             groupMeta = parseMeta(subDir)

+        typeDict = reorderTracks(typeDict, groupMeta)
+
         for trackBase, typeFnames in typeDict.items():
             for fileType, absFnames in typeFnames.items():
                 assert(len(absFnames)==1) # for now, not sure what to do when we get multiple basenames of the same file type
                 absFname = absFnames[0]
                 fileBase = basename(absFname)
                 relFname = relpath(absFname, tdbDir)

                 labelSuff = ""
                 if parentHasViews:
                     if fileType=="bigWig":
                         labelSuff = " Signal"
                     elif fileType=="bigBed":
                         labelSuff = " Peaks"
                     else:
                         assert(False) # views and non-bigWig/Bed are not supported yet?

                 if parentName is not None:
-                    parentPrefix = parentName+"-"
+                    parentPrefix = parentName+"__"
                 else:
                     parentPrefix = ""

-                trackName = parentPrefix+trackBase+"-"+fileType
+                trackName = parentPrefix+trackBase+"__"+fileType

                 tdb = {
                     "track" : trackName,
                     "shortLabel" : trackBase+labelSuff,
                     "longLabel" : trackBase+labelSuff,
                     "visibility" : "dense",
                     "type" : fileType,
                     "bigDataUrl" : relFname,
                 }

                 if parentName:
                     tdb["parent"] = parentName

                 if parentHasViews:
                     onOff = "on"
                     if trackName in groupMeta and "visibility" in groupMeta[trackName]:
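The switch from "-" to "__" as the separator makes the generated track names parseable again: a composite leaf track is named parent__base__fileType, so split("__")[1] recovers the base name, which is what metaOverride() and reorderTracks() rely on when the metadata uses base names. A hypothetical example of the mapping:

    neurons/sample1.peaks.bigBed  ->  track "neurons__sample1__bigBed"
    "neurons__sample1__bigBed".split("__")[1]  ->  "sample1", the key tried in tracks.tsv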
@@ -476,62 +572,70 @@
     my_client = client.TusClient(serverUrl, headers={})
     logging.info(f"Target server is {serverUrl}")

     for fname in os.listdir(tdbDir):
         fpath = join(tdbDir, fname)
         if isdir(fpath):
             continue
         logging.info(f"Uploading {fpath}")
         meta = {"db":"hg19"}
         uploader = my_client.uploader(fpath, metadata=meta)
         uploader.upload()

 def iterRaStanzas(fname):
     " parse an ra-style (trackDb) file and yield dictionaries "
     data = dict()
+    logging.debug("Parsing %s in trackDb format" % fname)
     with open(fname, "rt") as ifh:
         for l in ifh:
             l = l.lstrip(" ").rstrip("\r\n")
             if len(l)==0:
                 yield data
                 data = dict()
             else:
                 if " " not in l:
                     continue
                 key, val = l.split(" ", maxsplit=1)
                 data[key] = val

     if len(data)!=0:
         yield data

 def raToTab(fname):
     " convert .ra file to .tsv "
     stanzas = []
     allFields = set()
     for stanza in iterRaStanzas(fname):
         if "hub" in stanza or "genome" in stanza:
             continue
         allFields.update(stanza.keys())
         stanzas.append(stanza)

     if "track" in allFields:
         allFields.remove("track")
     if "shortLabel" in allFields:
         allFields.remove("shortLabel")
+    hasLongLabel = False
+    if "longLabel" in allFields:
+        allFields.remove("longLabel")
+        hasLongLabel = True
+
     sortedFields = sorted(list(allFields))
-    # make sure that track and shortLabel are first and always there, handy for manual edits
+    # make sure that track, shortLabel and longLabel come first and are always there, handy for manual edits
+    if hasLongLabel:
+        sortedFields.insert(0, "longLabel")
     sortedFields.insert(0, "shortLabel")
     sortedFields.insert(0, "track")

     ofh = sys.stdout

     ofh.write("#")
     ofh.write("\t".join(sortedFields))
     ofh.write("\n")

     for s in stanzas:
         row = []
         for fieldName in sortedFields:
             row.append(s.get(fieldName, ""))
         ofh.write("\t".join(row))
         ofh.write("\n")
@@ -546,41 +650,45 @@
     if cmd=="up":
         uploadFiles(inDir)
         return

     tdbDir = inDir
     if options.outDir:
         tdbDir = options.outDir

     if cmd=="jbrowse":
         importJbrowse(args[1], args[2], tdbDir)
     elif cmd == "tab":
         raToTab(args[1])
     elif cmd == "make":
         db = args[1]
-        dirFiles = readDirs(inDir)
+
+        meta = parseMeta(inDir)
+        dirFiles = readDirs(inDir, meta)

         hubFname = join(tdbDir, "hub.txt")
         logging.info("Writing %s" % hubFname)
         ofh = open(hubFname, "w")

-        meta = parseMeta(inDir)
         writeHubGenome(ofh, db, meta)

         writeTdb(inDir, dirFiles, "top", tdbDir, ofh)
         writeTdb(inDir, dirFiles, "comps", tdbDir, ofh)

         ofh.close()
+    else:
+        logging.error("Unknown command: '%s'" % cmd)

 # ----------- main --------------
 def main():
     args, options = parseArgs()

     hubt(args, options)

     #if options.test:
         #logging.debug("test is set")
         #f = open(options.file, "r")

 main()
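With these changes, 'tab' and 'make' form a round trip: dump an existing hub to a tab-separated file, bulk-edit it, and rebuild. For a hypothetical hub, 'hubtools tab hub.txt > tracks.tsv' would print a header starting like

    #track      shortLabel      longLabel       bigDataUrl      ...

and 'hubtools make hg38' then re-reads the edited tracks.tsv as metadata (the leading "#" on the header line is now accepted by parseMetaTsv), with its row order defining the composite and track order.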