afcc702b2a413d9ee8ee3455f0730a5237026e57 max Thu Nov 7 07:00:36 2024 -0800 allow upload to hubspace using an apiKey, skip files that were already uploaded, refs #34405 diff --git src/utils/hubtools/hubtools src/utils/hubtools/hubtools index 37e55bc..b6a9b93 100755 --- src/utils/hubtools/hubtools +++ src/utils/hubtools/hubtools @@ -47,42 +47,47 @@ hubtools up: upload files to hubSpace - needs ~/.hubt.conf with username and password. Create one with 'hubt conf' - uploads all files from the -i directory or the current dir if not specified. hubtools jbrowse <url> <db> : convert Jbrowse trackList.json files to hub.txt. - <url> is the URL to the Jbrowse2 installation, e.g. http://furlonglab.embl.de/FurlongBrowser/ - <db> is assembly identifier hubtools tab <fname>: convert a hub.txt or trackDb.txt to tab-sep format, easier to bulk-edit with sed/cut/etc. - <fname> is the input filename. "hub" and "genome" stanzas are skipped. - output goes to stdout hubtools conv -i myTsvDir - convert .tsv files in inDir to .bigBed files in current directory - hubtools ct -i xxx/archive -o outDir + hubtools archive -i xxx/archive -o outDir - Create a hub from an extracted xxx.tar.gz file "archive" directory. You can download a .tar.gz from in the Genome Browser under My Data > My Session. The archive contains all your custom tracks. 
+ hubtools ct <hgTracksUrl> + Examples: hubtools conv -i myTsvs/ hubtools make hg38 hubtools jbrowse http://furlonglab.embl.de/FurlongBrowser/ dm3 hubtools tab hub.txt > tracks.tsv + hubtools session tar xvfz SC_20230723_backup.tar.gz - hubtools ct -i archive -o hub + hubtools archive -i archive -o hub + hubtools ct 'https://genome-euro.ucsc.edu/cgi-bin/hgTracks?db=hg38&position=chr7%3A155798978%2D155810671&hgsid=345826202_8d3Mpumjaw9IXYI2RDaSt5xMoUhH' + For the "make" step: tracks.json can look like this, can have more keys, one per track, or the special key "hub": { "hub" : { "hub": "mouse_motor_atac", "shortLabel":"scATAC-seq Developing Cranial Motor Neurons" }, "myTrack" : { "shortLabel" : "My nice track" } } tracks.tsv should look like this, but can have any number of columns: #track<tab>shortLabel myTrack<tab>My nice track """) @@ -101,30 +106,77 @@ if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) return args, options def errAbort(msg): " print and abort) " logging.error(msg) sys.exit(1) def makedirs(d): if not isdir(d): os.makedirs(d) +def parseConf(fname): + " parse a hg.conf style file, return as dict key -> value (all strings) " + logging.debug("Parsing "+fname) + conf = {} + for line in open(fname): + line = line.strip() + if line.startswith("#"): + continue + elif line.startswith("include "): + inclFname = line.split()[1] + absFname = normpath(join(dirname(fname), inclFname)) + if os.path.isfile(absFname): + inclDict = parseConf(absFname) + conf.update(inclDict) + elif "=" in line: # string search for "=" + key, value = line.split("=",1) + conf[key] = value + return conf + +# cache of hg.conf contents +hgConf = None + +def parseHgConf(): + """ return hg.conf as dict key:value. 
""" + global hgConf + if hgConf is not None: + return hgConf + + hgConf = dict() # python dict = hash table + + fname = os.path.expanduser("~/.hubtools.conf") + if isfile(fname): + hgConf = parseConf(fname) + else: + fname = os.path.expanduser("~/.hg.conf") + if isfile(fname): + hgConf = parseConf(fname) + +def cfgOption(name, default=None): + " return hg.conf option or default " + global hgConf + + if not hgConf: + parseHgConf() + + return hgConf.get(name, default) + def parseMetaRa(fname): """parse tracks.ra or tracks.txt and return as a dict of trackName -> dict of key->val """ logging.debug("Reading %s as .ra" % fname) trackName = None stanzaData = {} ret = {} for line in open(fname): line = line.strip() if line.startswith("#"): continue if line=="": if len(stanzaData)==0: # double newline continue if trackName is None: @@ -597,50 +649,106 @@ tdb["visibility"] = "pack" elif dispMode=="compact": tdb["visibility"] = "dense" else: tdb["visibility"] = "pack" else: tdb["visibility"] = "pack" writeStanza(ofh, 0, tdb) def installModule(package): " install a package " logging.info("Could not find Python module '%s', trying to install with pip" % package) subprocess.check_call([sys.executable, "-m", "pip", "install", package]) -def uploadFiles(tdbDir): - "upload files to hubspace" +def cacheLoad(fname): + " load file cache from json file " + if not isfile(fname): + logging.debug("No upload cache present") + return {} + + logging.debug("Loading "+fname) + return json.load(open(fname)) + +def cacheWrite(uploadCache, fname): + logging.debug("Writing "+fname) + json.dump(uploadCache, open(fname, "w"), indent=4) + +def uploadFiles(tdbDir, hubName): + "upload files to hubspace. 
Server name and token can come from ~/.hubtools.conf " try: from tusclient import client except ModuleNotFoundError: installModule("tuspy") + from tusclient import client - serverUrl="https://hgwdev-hubspace.gi.ucsc.edu/files" - my_client = client.TusClient(serverUrl, headers={}) - logging.info(f"Target server is {serverUrl}") + serverUrl = cfgOption("tusUrl", "https://hubspace.gi.ucsc.edu/files") - for fname in os.listdir(tdbDir): - fpath = join(tdbDir, fname) - if isdir(fpath): + cookies = {} + cookieNameUser = cfgOption("wiki.userNameCookie", "wikidb_mw1_UserName") + cookieNameId = cfgOption("wiki.loggedInCookie", "wikidb_mw1_UserID") + + apiKey = cfgOption("apiKey") + if apiKey is None: + errAbort("To upload files, the file ~/.hubtools.conf must contain a line like apiKey='xxx@yyy'.\n" + "Go to https://genome.ucsc.edu/cgi-bin/hgHubConnect#dev to get your apiKey value.") + + if not "@" in apiKey: + errAbort("The apiKey value in ~/.hubtools.conf must contain an @ character.") + + cookieUser, cookieId = apiKey.split("@") + + cookies = {} + cookies[cookieNameUser] = cookieUser + cookies[cookieNameId] = cookieId + cookie_header = "; ".join(f"{key}={value}" for key, value in cookies.items()) + + headers = {"Cookie" : cookie_header} + + logging.info(f"TUS server URL: {serverUrl}") + logging.debug("HTTP headers: "+repr(headers)) + my_client = client.TusClient(serverUrl, headers=headers) + + cacheFname = join(tdbDir, ".hubtools.json") + uploadCache = cacheLoad(cacheFname) + + for rootDir, dirs, files in os.walk(tdbDir): + for fbase in files: + if fbase.startswith("."): continue - logging.info(f"Uploading {fpath}") - meta = {"db":"hg19"} - uploader = my_client.uploader(fpath, metadata=meta) + fpath = join(rootDir, fbase) + fullPath = join(tdbDir, fpath) + + # skip files that have not changed their mtime since last upload + if fpath in uploadCache: + if os.stat(fullPath).st_mtime == uploadCache[fpath]["mtime"]: + logging.info("%s: file mtime unchanged, not uploading 
again" % fullPath) + continue + + cache = {} + cache["mtime"] = os.stat(fullPath).st_mtime + cache["size"] = os.stat(fullPath).st_size + uploadCache[fpath] = cache + + meta = {"subdir" : rootDir, "genome":"NA", "hubName" : hubName} + logging.info(f"Uploading {fpath}, meta {meta}") + uploader = my_client.uploader(fullPath, metadata=meta) uploader.upload() + cacheWrite(uploadCache, cacheFname) + def iterRaStanzas(fname): " parse an ra-style (trackDb) file and yield dictionaries " data = dict() logging.debug("Parsing %s in trackDb format" % fname) with open(fname, "rt") as ifh: for l in ifh: l = l.lstrip(" ").rstrip("\r\n") if len(l)==0: yield data data = dict() else: if " " not in l: continue key, val = l.split(" ", maxsplit=1) data[key] = val @@ -992,62 +1100,84 @@ if "bigDataUrl" not in tdb: bedFname = join(outDir, tdb["track"]+".bed") bbFname = join(outDir, tdb["track"]+".bb") stripFirstLine(fname, bedFname) bedToBigBed(bedFname, db, bbFname) os.remove(bedFname) tdb["bigDataUrl"] = tdb["track"]+".bb" tdb["type"] = "bigBed" writeStanza(ofh, 0, tdb) logging.info("Wrote %s" % hubTxtFname) ofh.close() -def convCtDir(inDir, outDir): +def convArchDir(inDir, outDir): " convert a directory created from the .tar.gz file downloaded via our track archive feature " dbContent = os.listdir(inDir) dbDirs = [] for db in dbContent: subDir = join(inDir, db) if isdir(subDir): dbDirs.append((db, subDir)) if len(dbDirs)==0: errAbort("No directories found under %s. Extract the tarfile and point this program at the 'archive' directory." % inDir) for db, inSubDir in dbDirs: logging.debug("Processing %s, db=%s" % (inSubDir, db)) outSubDir = join(outDir, db) convCtDb(db, inSubDir, outSubDir) +def hgsidFromUrl(url): + " return hgsid given a URL. 
written by chatGPT " + # Parse the URL + parsed_url = urllib.parse.urlparse(url) + server_name = f"{parsed_url.scheme}://{parsed_url.netloc}" + query_params = urllib.parse.parse_qs(parsed_url.query) + hgsid = query_params.get('hgsid') + # Return the hgsid value if it exists + if hgsid: + return server_name, hgsid[0] # Extracting the first value from the list + errAbort("hgsid parameter not found in "+url) + +def convCtUrl(url, outDir): + " given an hgTracks URL with an hgsid on it, get all custom track lines and create a hub file for it. Does not download custom tracks. " + serverUrl, hgsid = hgsidFromUrl(url) + cartDumpUrl = serverUrl+"/cgi-bin/cartDump?hgsid="+hgsid + + def hubtools(args, options): """ find files under dirName and create a trackDb.txt for them""" cmd = args[0] inDir = "." if options.inDir: inDir = options.inDir outDir = "." if options.outDir: outDir = options.outDir if cmd=="up": - uploadFiles(inDir) + if len(args)<2: + errAbort("The 'up' command requires one argument, the name of the hub. You can specify any name, " + "ideally a short, meaningful string, e.g. 
atacseq, muscle-rna or yamamoto2022.") + hubName = args[1] + uploadFiles(inDir, hubName) return tdbDir = inDir if options.outDir: tdbDir = options.outDir if cmd=="jbrowse": importJbrowse(args[1], args[2], tdbDir) elif cmd == "tab": raToTab(args[1]) elif cmd == "make": db = args[1] meta = parseMeta(inDir) dirFiles = readDirs(inDir, meta) @@ -1057,30 +1187,34 @@ ofh = open(hubFname, "w") meta = parseMeta(inDir) writeHubGenome(ofh, db, meta) writeTdb(inDir, dirFiles, "top", tdbDir, ofh) writeTdb(inDir, dirFiles, "comps", tdbDir, ofh) ofh.close() elif cmd=="conv": db = args[1] convTsvDir(inDir, db, outDir) + elif cmd=="archive": + convArchDir(inDir, outDir) + elif cmd=="ct": - convCtDir(inDir, outDir) + url = args[1] + convCtUrl(url, outDir) else: logging.error("Unknown command: '%s'" % args[1]) # ----------- main -------------- def main(): args, options = parseArgs() hubtools(args, options) main()