2adda61e8983ceca1a9bd577c47a686a6f6990f9 max Wed Feb 12 08:14:29 2025 -0800 hubtools gets support for session links, upload, and archives, refs #34405 diff --git src/utils/hubtools/hubtools src/utils/hubtools/hubtools index d91dafc02c4..de74fcdd06c 100755 --- src/utils/hubtools/hubtools +++ src/utils/hubtools/hubtools @@ -1,118 +1,132 @@ #!/usr/bin/env python3 import logging, sys, optparse, os, json, subprocess, shutil, string, glob, tempfile, re import shlex from pathlib import Path from collections import defaultdict, OrderedDict from os.path import join, basename, dirname, isfile, relpath, abspath, splitext, isdir #import pyyaml # not loaded here, so it's not a hard requirement, is lazy loaded in parseMetaYaml() # ==== functions ===== # allowed file types by hubtools up -fileTypes = { - "bb" : "bigBed", - "bigBed" : "bigBed", - "bam" : "bam", - "vcf.gz" : "vcf", - "bigWig" : "bigWig", - "bw" : "bigWig", - "hic" : "hic", - "cram" : "cram", - "txt" : "text", +# copied from the javascript file hgHubConnect.js +fileTypeExtensions = { + "bigBed": [ ".bb", ".bigbed" ], + "bam": [ ".bam" ], + "vcf": [ ".vcf" ], + "vcfTabix": [ ".vcf.gz", ".vcf.bgz" ], + "bigWig": [ ".bw", ".bigwig" ], + "hic": [ ".hic" ], + "cram": [ ".cram" ], + "bigBarChart": [ ".bigbarchart" ], + "bigGenePred": [ ".bgp", ".biggenepred" ], + "bigMaf": [ ".bigmaf" ], + "bigInteract": [ ".biginteract" ], + "bigPsl": [ ".bigpsl" ], + "bigChain": [ ".bigchain" ], + "bamIndex": [ ".bam.bai", ".bai" ], + "tabixIndex": [ ".vcf.gz.tbi", ".vcf.bgz.tbi" ] } asHead = """table bed "Browser extensible data (<=12 fields) " ( """ asLines = """ string chrom; "Chromosome (or contig, scaffold, etc.)" uint chromStart; "Start position in chromosome" uint chromEnd; "End position in chromosome" string name; "Name of item" uint score; "Score from 0-1000" char[1] strand; "+ or -" uint thickStart; "Start of where display should be thick (start codon)" uint thickEnd; "End of where display should be thick (stop codon)" uint
reserved; "Used as itemRgb as of 2004-11-22" int blockCount; "Number of blocks" int[blockCount] blockSizes; "Comma separated list of block sizes" int[blockCount] chromStarts; "Start positions relative to chromStart" """.split("\n") def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("""usage: %prog [options] <cmd> - create and edit UCSC track hubs - hubtools make <assemblyCode>: create a track hub for all bigBed/bigWig files under a directory. Creates single-file hub.txt + hubtools make <assemblyCode e.g. hg38>: create a track hub for all bigBed/bigWig files under a directory. + Creates single-file hub.txt, and tries to guess reasonable settings from the file names: - bigBed/bigWig files in the current directory will be top level tracks - big* files in subdirectories become composites - for every filename, the part before the first dot becomes the track base name - if a directory has more than 80% of track base names with both a bigBed and bigWig file, views are activated for this composite - track attributes can be changed using tracks.ra, tracks.tsv, tracks.json or tracks.yaml files, in each top or subdirectory - The first column of tracks.tsv must be 'track'. - - tracks.json and tracks.yaml must be object at the top level, the attributes are <trackName> or + - tracks.json and tracks.yaml must have objects at the top level, the attributes are <trackName> or the special attribute "hub". - hubtools up: upload files to hubSpace - - needs ~/.hubt.conf with username and password. Create one with 'hubt conf' + hubtools up <hubName>: upload files to hubSpace + - needs ~/.hubtools.conf with a line apiKey="xxxx". Create a key by going to + My Data > Track Hubs > Track Development on https://genome.ucsc.edu - uploads all files from the -i directory or the current dir if not specified. hubtools jbrowse <url> <db> : convert Jbrowse trackList.json files to hub.txt. 
- <url> is the URL to the Jbrowse2 installation, e.g. http://furlonglab.embl.de/FurlongBrowser/ - <db> is assembly identifier hubtools tab <fname>: convert a hub.txt or trackDb.txt to tab-sep format, easier to bulk-edit with sed/cut/etc. - <fname> is the input filename. "hub" and "genome" stanzas are skipped. - output goes to stdout hubtools conv -i myTsvDir - convert .tsv files in inDir to .bigBed files in current directory hubtools archive -i xxx/archive -o outDir - Create a hub from an extracted xxx.tar.gz file "archive" directory. You can download a .tar.gz from in the Genome Browser under My Data > My Session. The archive contains all your custom tracks. + It is usually easier to use the 'ct' command, as this will automatically download an archive + and convert it, given a full genome browser URL with an "hgsid" parameter on the URL. hubtools ct <hgTracksUrl> + - Download all custom tracks from a Genome Browser URL, either as a /s/ stable shortlink or + the temporary hgsid=xxxx URL Examples: hubtools conv -i myTsvs/ hubtools make hg38 hubtools jbrowse http://furlonglab.embl.de/FurlongBrowser/ dm3 hubtools tab hub.txt > tracks.tsv hubtools session tar xvfz SC_20230723_backup.tar.gz hubtools archive -i archive -o hub hubtools ct 'https://genome-euro.ucsc.edu/cgi-bin/hgTracks?db=hg38&position=chr7%3A155798978%2D155810671&hgsid=345826202_8d3Mpumjaw9IXYI2RDaSt5xMoUhH' - - For the "make" step: + For the "hubtools make" step, you can specify additional options for your tracks via tracks.json/.yaml or tracks.tsv: tracks.json can look like this, can have more keys, one per track, or the special key "hub": { "hub" : { "hub": "mouse_motor_atac", "shortLabel":"scATAC-seq Developing Cranial Motor Neurons" }, "myTrack" : { "shortLabel" : "My nice track" } } + tracks.tsv should look like this, other columns can be added: - tracks.tsv should look like this, but can have any number of columns: #track<tab>shortLabel myTrack<tab>My nice track + + For a list of all possible
fields/columns see https://genome.ucsc.edu/goldenpath/help/trackDb/trackDbHub.html: """) parser.add_option("-i", "--inDir", dest="inDir", action="store", help="Input directory where files are stored. Default is current directory") parser.add_option("-o", "--outDir", dest="outDir", action="store", help="Input directory where hub.txt file is created. Default is same as input directory.") #parser.add_option("", "--igv", dest="igv", action="store", help="import an igv.js trackList.json file hierarchy") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show verbose debug messages") parser.add_option("-u", "--upload", dest="upload", action="store_true", help="upload all files from outDir to hubSpace") (options, args) = parser.parse_args() if len(args)==0: parser.print_help() exit(1) @@ -675,103 +689,116 @@ subprocess.check_call([sys.executable, "-m", "pip", "install", package]) def cacheLoad(fname): " load file cache from json file " if not isfile(fname): logging.debug("No upload cache present") return {} logging.debug("Loading "+fname) return json.load(open(fname)) def cacheWrite(uploadCache, fname): logging.debug("Writing "+fname) json.dump(uploadCache, open(fname, "w"), indent=4) +def getFileType(fbase): + " return the file type defined in the hubspace system, given a base file name " + ret = "NA" + for fileType, fileExts in fileTypeExtensions.items(): + for fileExt in fileExts: + if fbase.endswith(fileExt): + ret = fileType + break + if ret!="NA": + break + + logging.debug("file type for %s is %s" % (fbase, ret)) + return ret + def uploadFiles(tdbDir, hubName): "upload files to hubspace. 
Server name and token can come from ~/.hubtools.conf " try: from tusclient import client except ModuleNotFoundError: installModule("tuspy") from tusclient import client serverUrl = cfgOption("tusUrl", "https://hubspace.gi.ucsc.edu/files") cookies = {} cookieNameUser = cfgOption("wiki.userNameCookie", "wikidb_mw1_UserName") cookieNameId = cfgOption("wiki.loggedInCookie", "wikidb_mw1_UserID") apiKey = cfgOption("apiKey") if apiKey is None: - errAbort("To upload files, the file ~/.hubtools.conf must contain a line like apiKey='xxx@yyy').\n" - "Go to https://genome.ucsc.edu/cgi-bin/hgHubConnect#dev to get your apiKey value.") - - if not "@" in apiKey: - errAbort("The apiKey value in ~/.hubtools.conf must contain an @ character.") - - cookieUser, cookieId = apiKey.split("@") - - cookies = {} - cookies[cookieNameUser] = cookieUser - cookies[cookieNameId] = cookieId - cookie_header = "; ".join(f"{key}={value}" for key, value in cookies.items()) - - headers = {"Cookie" : cookie_header} + errAbort("To upload files, the file ~/.hubtools.conf must contain a line like apiKey='xxx').\n" + "Go to https://genome.ucsc.edu/cgi-bin/hgHubConnect#dev to create a new apiKey. 
Then run \n" + " echo 'apiKey=\"xxxx\"' >> ~/.hubtools.conf \n" + "and run the 'hubtools up' command again.") logging.info(f"TUS server URL: {serverUrl}") - logging.debug("HTTP headers: "+repr(headers)) - my_client = client.TusClient(serverUrl, headers=headers) + my_client = client.TusClient(serverUrl) - cacheFname = join(tdbDir, ".hubtools.json") + cacheFname = join(tdbDir, ".hubtools.files.json") uploadCache = cacheLoad(cacheFname) for rootDir, dirs, files in os.walk(tdbDir): for fbase in files: if fbase.startswith("."): continue fpath = join(rootDir, fbase) - fullPath = join(tdbDir, fpath) + localPath = join(tdbDir, fpath) + localMtime = os.stat(localPath).st_mtime # skip files that have not changed their mtime since last upload if fpath in uploadCache: - if os.stat(fullPath).st_mtime == uploadCache[fpath]["mtime"]: - logging.info("%s: file mtime unchanged, not uploading again" % fullPath) + cacheMtime = uploadCache[fpath]["mtime"] + if localMtime == cacheMtime: + logging.info("%s: file mtime unchanged, not uploading again" % localPath) continue + else: + logging.debug("file %s: mtime is %f, cache mtime is %f, need to re-upload" % + (fpath, localMtime, cacheMtime)) + else: + logging.debug("file %s not in upload cache" % fpath) - cache = {} - mtime = os.stat(fullPath).st_mtime - cache["mtime"] = mtime - cache["size"] = os.stat(fullPath).st_size - uploadCache[fpath] = cache - - #timestamp = datetime.fromtimestamp(mtime, tz=timezone.utc).timestamp() # seconds since 1970 + fileType = getFileType(fbase) parentDir = join(hubName, rootDir) meta = { + "apiKey" : apiKey, "parentDir" : parentDir, "genome":"NA", "fileName" : fbase, "hubName" : hubName, - "fileType":"NA", - "lastModified" : str(int(mtime)), + "hubtools" : "true", + "fileType": fileType, + "lastModified" : str(int(localMtime)), } logging.info(f"Uploading {fpath}, meta {meta}") - uploader = my_client.uploader(fullPath, metadata=meta) + uploader = my_client.uploader(localPath, metadata=meta) uploader.upload() 
+ # note that this file was uploaded + cache = {} + mtime = os.stat(localPath).st_mtime + cache["mtime"] = mtime + cache["size"] = os.stat(localPath).st_size + uploadCache[fpath] = cache + cacheWrite(uploadCache, cacheFname) def iterRaStanzas(fname): " parse an ra-style (trackDb) file and yield dictionaries " data = dict() logging.debug("Parsing %s in trackDb format" % fname) with open(fname, "rt") as ifh: for l in ifh: l = l.lstrip(" ").rstrip("\r\n") if len(l)==0: yield data data = dict() else: if " " not in l: continue