src/utils/hubtools/hubtools 0f3ca3eaf5792df01b7c600a5428d2d0b2809fcd

0f3ca3eaf5792df01b7c600a5428d2d0b2809fcd
max
  Fri Sep 20 13:18:01 2024 -0700
Revert "more features to hubtools: search in both parent and subdirs, better docs"

This reverts commit 05e67c59a20a5d00b810a981aef3b00c5bef82e1.

diff --git src/utils/hubtools/hubtools src/utils/hubtools/hubtools
index d11fed5..a9c06a7 100755
--- src/utils/hubtools/hubtools
+++ src/utils/hubtools/hubtools
@@ -5,59 +5,50 @@
 from os.path import join, basename, dirname, isfile, relpath, abspath, splitext, isdir
 #import pyyaml   # not loaded here, so it's not a hard requirement, is lazy loaded in parseMetaYaml()
 
 # ==== functions =====
     
 def parseArgs():
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("""usage: %prog [options] <cmd> - create and edit UCSC track hubs
             
     hubtools make <assemblyCode>: create a track hub for all bigBed/bigWig files under a directory. Creates single-file hub.txt
     - bigBed/bigWig files in the current directory will be top level tracks
     - big* files in subdirectories become composites
     - for every filename, the part before the first dot becomes the track base name
     - if a directory has more than 80% of track base names with both a bigBed
       and bigWig file, views are activated for this composite
-    - track attributes can be changed using tracks.tsv or json/ra/yaml files, in
-      each top or subdirectory. Both subdir and top directory are searched, subdir
-      values override any values specified in a parent directory.
-    - tracks.tsv (or tracks.json/ra/yaml) must have a first column named 'track'. For json and yaml,
-      the key is the track name. "hub" is a special key to provide email/shortLabel of the hub.
-    - the track name in this column can be either the full one <parent>__fileBasename__type or just
-      the fileBasename, this makes it easier. In practice, filenames are unique enough and don't
-      usually clash between subdirs or filetype, in practice.
-    - the track order is the one on disk, so relatively random. To order tracks, create a hub,
-      use the "tab" command below, reorder tracks in tsv file, then re-run the "make" command.
+    - track attributes can be changed using tracks.tsv or tracks.json files, in
+      each top or subdirectory
+    - tracks.tsv must have a first column named 'track'.
 
     hubtools up: upload files to hubSpace
     - needs ~/.hubt.conf with username and password. Create one with 'hubt conf'
     - uploads all files from the -i directory or the current dir if not specified.
 
     hubtools jbrowse <url> <db> : convert Jbrowse trackList.json files to hub.txt.
     - <url> is the URL to the Jbrowse2 installation, e.g. http://furlonglab.embl.de/FurlongBrowser/
     - <db> is assembly identifier 
 
     hubtools tab <fname>: convert a hub.txt or trackDb.txt to tab-sep format, easier to bulk-edit with sed/cut/etc.
-       The resulting file if named tracks.tsv can be used as inputs for future runs.
     - <fname> is the input filename. "hub" and "genome" stanzas are skipped.
     - output goes to stdout
 
     Examples:
-    hubtools jbrowse http://furlonglab.embl.de/FurlongBrowser/ dm3
     hubtools make hg38
-    hubtools tab hub.txt | cut -f1-3 > tracks.tsv # then edit tracks.tsv
-    hubtools make hg38 # will pick up tracks.tsv edits to create a new hub.txt
+    hubtools jbrowse http://furlonglab.embl.de/FurlongBrowser/ dm3
+    hubtools tab hub.txt > tracks.tsv
 
     tracks.json can look like this:
     { "hub" : {"hub": "mouse_motor_atac", "shortLabel":"scATAC-seq Developing Cranial Motor Neurons"} }
     """)
 
 
     parser.add_option("-i", "--inDir", dest="inDir", action="store", help="Input directory where files are stored. Default is current directory")
     parser.add_option("-o", "--outDir", dest="outDir", action="store", help="Input directory where hub.txt file is created. Default is same as input directory.")
 
     #parser.add_option("", "--igv", dest="igv", action="store", help="import an igv.js trackList.json file hierarchy")
     parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show verbose debug messages")
     parser.add_option("-u", "--upload", dest="upload", action="store_true", help="upload all files from outDir to hubSpace")
     (options, args) = parser.parse_args()
 
     if len(args)==0:
@@ -137,37 +128,34 @@
 def parseMetaJson(fname):
     " parse the json file fname and return its content (for tracks.json: a dict); merging into meta is done by the caller "
     logging.debug("Reading %s as json" % fname)
     with open(fname) as ifh: # context manager closes the handle right away, the old json.load(open(...)) leaked it
         return json.load(ifh)
 
 def parseMetaYaml(fname):
     " load the yaml file fname with yaml.safe_load and return the result; a yaml.YAMLError is logged and None is returned "
     import yaml # lazy import so pyyaml stays optional; if this fails, run 'pip install pyyaml'
     try:
         with open(fname) as ifh:
             return yaml.safe_load(ifh)
     except yaml.YAMLError as yamlErr:
         logging.error(yamlErr) # no explicit return here: the function falls through and yields None
 
-def parseMeta(inDirs):
-    """ parse a tab-sep file with headers and return an ordered dict firstField -> dictionary.
-    Takes a list of input directories and the last directory overrides values in parent directories.
-    """
-    meta = OrderedDict()
-    for inDir in inDirs:
+def parseMeta(inDir):
+    " parse a tab-sep file with headers and return an ordered dict firstField -> dictionary "
     fname = join(inDir, "tracks.tsv")
+    meta = OrderedDict()
     if isfile(fname):
         tsvMeta = parseMetaTsv(fname)
         meta = allMetaOverride(meta, tsvMeta)
 
     fname = join(inDir, "tracks.json")
     if isfile(fname):
         jsonMeta = parseMetaJson(fname)
         meta = allMetaOverride(meta, jsonMeta)
 
     fname = join(inDir, "tracks.ra")
     if isfile(fname):
         raMeta = parseMetaRa(fname)
         meta = allMetaOverride(meta, raMeta)
 
     fname = join(inDir, "tracks.yaml")
@@ -383,102 +371,102 @@
             doneTracks.add(trackBase)
 
     logging.debug("Track ordering from meta data used: %s" % newFiles.keys())
 
     # then add all other tracks
     for trackBase, fileData in fileDict.items():
         if trackBase not in doneTracks:
             newFiles[trackBase] = fileDict[trackBase]
             logging.debug("Not specified in meta, so adding at the end: %s" % trackBase)
 
     logging.debug("Final track order is: %s" % newFiles.keys())
 
     assert(len(newFiles)==len(fileDict))
     return newFiles
 
-def makeTrackDbEntries(inDir, dirDict, dirType, tdbDir, ofh):
+def writeTdb(inDir, dirDict, dirType, tdbDir, ofh):
     " given a dict with basename -> type -> filenames, write track entries to ofh "
     # this code is getting increasingly complex because it supports composites/views and pairing of bigBed/bigWig files
     # either this needs better comments or maybe a separate code path for this rare use case
     global compCount
 
     fnameDict = dirDict[dirType]
 
     for parentName, typeDict in fnameDict.items():
         if parentName is None: # top level tracks: use top tracks.tsv
-            subDirs = [inDir]
-        else: # container tracks -> use tracks.tsv in the parent and subdirectory
-            subDirs = [inDir, join(inDir, parentName)]
+            subDir = inDir
+        else: # container tracks -> use tracks.tsv in the subdirectory
+            subDir = join(inDir, parentName)
 
-        parentMeta = parseMeta(subDirs)
+        parentMeta = parseMeta(subDir)
 
         indent = 0
         parentHasViews = False
 
         groupMeta = {}
 
         if dirType=="comps":
             tdb = {
                     "track" : parentName,
                     "shortLabel": parentName,
                     "visibility" : "dense",
                     "compositeTrack" : "on",
                     "autoScale" : "group",
                     "type" : "bed 4"
                   }
             metaOverride(tdb, parentMeta)
             groupMeta = parentMeta
 
             parentHasViews = mostFilesArePaired(typeDict)
 
             if parentHasViews:
                 tdb["subGroup1"] = "view Views PK=Peaks SIG=Signals"
                 logging.info("Container track %s has >80%% of paired files, activating views" % parentName)
 
             writeStanza(ofh, indent, tdb)
             indent = 4
 
             if parentHasViews:
                 # we have composites with paired files? -> write the track stanzas for the two views
-                groupMeta = parseMeta(subDirs)
+                groupMeta = parseMeta(subDir)
                 tdbViewPeaks = {
                         "track" : parentName+"ViewPeaks",
                         "shortLabel" : parentName+" Peaks",
                         "parent" : parentName,
                         "view" : "PK",
                         "visibility" : "dense",
                         "type" : "bigBed",
                         "scoreFilter" : "off",
                         "viewUi" : "on"
                         }
                 metaOverride(tdbViewPeaks, parentMeta)
                 writeStanza(ofh, indent, tdbViewPeaks)
 
                 tdbViewSig = {
                         "track" : parentName+"ViewSignal",
                         "shortLabel" : parentName+" Signal",
                         "parent" : parentName,
                         "view" : "SIG",
                         "visibility" : "dense",
                         "type" : "bigWig",
                         "viewUi" : "on"
                         }
                 metaOverride(tdbViewSig, parentMeta)
                 writeStanza(ofh, indent, tdbViewSig)
         else:
             # no composites
-            groupMeta = parseMeta(subDirs)
+            groupMeta = parseMeta(subDir)
 
         typeDict = reorderTracks(typeDict, groupMeta)
 
         for trackBase, typeFnames in typeDict.items():
             for fileType, absFnames in typeFnames.items():
                 assert(len(absFnames)==1) # for now, not sure what to do when we get multiple basenames of the same file type
                 absFname = absFnames[0]
                 fileBase = basename(absFname)
                 relFname = relpath(absFname, tdbDir)
 
                 labelSuff = ""
                 if parentHasViews:
                     if fileType=="bigWig":
                         labelSuff = " Signal"
                     elif fileType=="bigBed":
@@ -663,41 +651,42 @@
     if cmd=="up":
         uploadFiles(inDir)
         return
 
     tdbDir = inDir
     if options.outDir:
         tdbDir = options.outDir
 
     if cmd=="jbrowse":
         importJbrowse(args[1], args[2], tdbDir)
     elif cmd == "tab":
         raToTab(args[1])
     elif cmd == "make":
         db = args[1]
 
-        meta = parseMeta([inDir])
+        meta = parseMeta(inDir)
         dirFiles = readDirs(inDir, meta)
 
         hubFname = join(tdbDir, "hub.txt")
         logging.info("Writing %s" % hubFname)
         ofh = open(hubFname, "w")
 
+        meta = parseMeta(inDir)
         writeHubGenome(ofh, db, meta)
 
-        makeTrackDbEntries(inDir, dirFiles, "top", tdbDir, ofh)
-        makeTrackDbEntries(inDir, dirFiles, "comps", tdbDir, ofh)
+        writeTdb(inDir, dirFiles, "top", tdbDir, ofh)
+        writeTdb(inDir, dirFiles, "comps", tdbDir, ofh)
 
         ofh.close()
     else:
         logging.error("Unknown command: '%s'" % args[1])
 
 
 # ----------- main --------------
 def main():
     args, options = parseArgs()
 
     hubt(args, options)
 
     #if options.test:
         #logging.debug("test is set")
         #f = open(options.file, "r")