f6c44a86607233aaf1755321277340fade45f2b2
max
  Wed Sep 18 07:55:41 2024 -0700
adding a .ra to .tsv converter to hubtools, motivated by MarcP

diff --git src/utils/hubtools/hubtools src/utils/hubtools/hubtools
index 93a8af3..c6169c3 100755
--- src/utils/hubtools/hubtools
+++ src/utils/hubtools/hubtools
@@ -1,45 +1,49 @@
 #!/usr/bin/env python3
 
-import logging, sys, optparse, os, json, subprocess, shutil
+import logging, sys, optparse, os, json, subprocess, shutil, string
 from collections import defaultdict
 from os.path import join, basename, dirname, isfile, relpath, abspath, splitext, isdir
 
 # ==== functions =====
     
 def parseArgs():
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("""usage: %prog [options] <cmd> - create and edit UCSC track hubs
             
     hubtools make <assemblyCode>: create a track hub for all bigBed/bigWig files under a directory. Creates single-file hub.txt
     - bigBed/bigWig files in the current directory will be top level tracks
     - big* files in subdirectories become composites
     - for every filename, the part before the first dot becomes the track base name
     - if a directory has more than 80% of track base names with both a bigBed
       and bigWig file, views are activated for this composite
     - track attributes can be changed using tracks.tsv or tracks.json files, in
       each top or subdirectory
     - tracks.tsv must have a first column named 'track'.
 
     hubtools up: upload files to hubSpace
     - needs ~/.hubt.conf with username and password. Create one with 'hubt conf'
     - uploads all files from the -i directory or the current dir if not specified.
 
     hubtools jbrowse <url> <db> : convert Jbrowse trackList.json files to hub.txt.
     - <url> is the URL to the Jbrowse2 installation, e.g. http://furlonglab.embl.de/FurlongBrowser/
     - <db> is assembly identifier 
 
+    hubtools tab <fname>: convert a hub.txt or trackDb.txt to tab-sep format, easier to bulk-edit with sed/cut/etc.
+    - <fname> is the input filename. "hub" and "genome" stanzas are skipped.
+    - output goes to stdout
+
     Examples:
     hubtools make hg38
     hubtools jbrowse http://furlonglab.embl.de/FurlongBrowser/ dm3
     """)
 
 
     parser.add_option("-i", "--inDir", dest="inDir", action="store", help="Input directory where files are stored. Default is current directory")
     parser.add_option("-o", "--outDir", dest="outDir", action="store", help="Input directory where hub.txt file is created. Default is same as input directory.")
 
     #parser.add_option("", "--igv", dest="igv", action="store", help="import an igv.js trackList.json file hierarchy")
     parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show verbose debug messages")
     parser.add_option("-u", "--upload", dest="upload", action="store_true", help="upload all files from outDir to hubSpace")
     (options, args) = parser.parse_args()
 
     if len(args)==0:
@@ -294,30 +298,32 @@
     " given a dict with basename -> type -> filenames, write track entries to ofh "
     global compCount
 
     fnameDict = dirDict[dirType]
 
     for parentName, typeDict in fnameDict.items():
         if parentName is None: # top level tracks
             subDir = inDir
         else: # container tracks
             subDir = join(inDir, parentName)
         parentMeta = parseMeta(inDir)
 
         indent = 0
         parentHasViews = False
 
+        groupMeta = {}
+
         if dirType=="comps":
             tdb = {
                     "track" : parentName,
                     "shortLabel": parentName,
                     "visibility" : "dense",
                     "compositeTrack" : "on",
                     "autoScale" : "group",
                     "type" : "bed 4"
                   }
             metaOverride(tdb, parentMeta)
 
             parentHasViews = mostFilesArePaired(typeDict)
 
             if parentHasViews:
                 tdb["subGroup1"] = "view Views PK=Peaks SIG=Signals"
@@ -341,30 +347,31 @@
                 metaOverride(tdbViewPeaks, parentMeta)
                 writeStanza(ofh, indent, tdbViewPeaks)
 
                 tdbViewSig = {
                         "track" : parentName+"ViewSignal",
                         "shortLabel" : parentName+" Signal",
                         "parent" : parentName,
                         "view" : "SIG",
                         "visibility" : "dense",
                         "type" : "bigWig",
                         "viewUi" : "on"
                         }
                 metaOverride(tdbViewSig, parentMeta)
                 writeStanza(ofh, indent, tdbViewSig)
         else:
+            # dirType is not "comps": no composite parent stanza, metadata is read from subDir
             groupMeta = parseMeta(subDir)
 
         for trackBase, typeFnames in typeDict.items():
             for fileType, absFnames in typeFnames.items():
                 assert(len(absFnames)==1) # for now, not sure what to do when we get multiple basenames of the same file type
                 absFname = absFnames[0]
                 fileBase = basename(absFname)
                 relFname = relpath(absFname, tdbDir)
 
                 labelSuff = ""
                 if parentHasViews:
                     if fileType=="bigWig":
                         labelSuff = " Signal"
                     elif fileType=="bigBed":
                         labelSuff = " Peaks"
@@ -441,75 +448,126 @@
             tdb["type"] = "bigWig"
 
         dispMode = tl.get("display_mode")
         if dispMode:
             if dispMode=="normal":
                 tdb["visibility"] = "pack"
             elif dispMode=="compact":
                 tdb["visibility"] = "dense"
             else:
                 tdb["visibility"] = "pack"
         else:
             tdb["visibility"] = "pack"
     
         writeStanza(ofh, 0, tdb)
 
-
 def installModule(package):
     " install a package "
     logging.info("Could not find Python module '%s', trying to install with pip" % package)
     subprocess.check_call([sys.executable, "-m", "pip", "install", package])
 
 def uploadFiles(tdbDir):
     "upload files to hubspace"
     try:
         from tusclient import client
     except ModuleNotFoundError:
         installModule("tuspy")
 
     serverUrl="https://hgwdev-hubspace.gi.ucsc.edu/files"
     my_client = client.TusClient(serverUrl, headers={})
     logging.info(f"Target server is {serverUrl}")
 
     for fname in os.listdir(tdbDir):
         fpath = join(tdbDir, fname)
         if isdir(fpath):
             continue
         logging.info(f"Uploading {fpath}")
         meta = {"db":"hg19"}
         uploader = my_client.uploader(fpath, metadata=meta)
         uploader.upload()
 
+def iterRaStanzas(fname):
+    " parse an ra-style (trackDb) file, yield one dict (setting -> value) per stanza. Blank lines separate stanzas, '#' lines are skipped. "
+    data = dict()
+    with open(fname, "rt") as ifh:
+        for line in ifh:
+            parts = line.split(None, 1)  # [key, value]: accepts space or tab as indentation and as separator
+            if len(parts)==0:  # blank or whitespace-only line ends the current stanza
+                if len(data)!=0:  # runs of blank lines must not yield empty stanzas
+                    yield data
+                data = dict()
+            elif parts[0].startswith("#") or len(parts)==1:  # comment, or a key without a value
+                continue
+            else:
+                data[parts[0]] = parts[1].rstrip()  # drop the newline and trailing blanks
+
+    if len(data)!=0:
+        yield data
+
+def raToTab(fname):
+    " convert the .ra file fname to .tsv on stdout: one row per stanza, one column per setting name "
+    stanzas = []
+    allFields = set()
+    for stanza in iterRaStanzas(fname):
+        # a stanza without any settings would only produce a row of empty cells
+        if len(stanza)==0:
+            continue
+        # "hub" and "genome" stanzas are not written to the table
+        if "hub" in stanza or "genome" in stanza:
+            continue
+        allFields.update(stanza.keys())
+        stanzas.append(stanza)
+
+    # make sure that track and shortLabel are first and always there, handy for manual edits.
+    # All other setting names follow in sorted order.
+    allFields.discard("track")
+    allFields.discard("shortLabel")
+    sortedFields = ["track", "shortLabel"] + sorted(allFields)
+
+    ofh = sys.stdout
+    # header line: '#' followed by the tab-separated column names
+    ofh.write("#")
+    ofh.write("\t".join(sortedFields))
+    ofh.write("\n")
+
+    for s in stanzas:
+        # settings missing from a stanza become empty cells; a tab inside a value would shift the columns
+        row = [s.get(fieldName, "").replace("\t", " ") for fieldName in sortedFields]
+        ofh.write("\t".join(row))
+        ofh.write("\n")
+
 def hubt(args, options):
     """ get writeLn(ofh, indent, d .bb files under dirName and create a trackDb.txt for them"""
 
     cmd = args[0]
 
     inDir = "."
     if options.inDir:
         inDir = options.inDir
 
     if cmd=="up":
         uploadFiles(inDir)
         return
 
     tdbDir = inDir
     if options.outDir:
         tdbDir = options.outDir
 
     if cmd=="jbrowse":
         importJbrowse(args[1], args[2], tdbDir)
+    elif cmd == "tab":
+        raToTab(args[1])
     elif cmd == "make":
         db = args[1]
         dirFiles = readDirs(inDir)
 
         hubFname = join(tdbDir, "hub.txt")
         logging.info("Writing %s" % hubFname)
         ofh = open(hubFname, "w")
 
         meta = parseMeta(inDir)
         writeHubGenome(ofh, db, meta)
 
         writeTdb(inDir, dirFiles, "top", tdbDir, ofh)
         writeTdb(inDir, dirFiles, "comps", tdbDir, ofh)
 
         ofh.close()