c5a92d81293edb5d95fbeb37cbe381557e61fac0 max Thu Oct 17 09:04:56 2024 -0700
extending hubtools to convert track archives to hubs, only supports BED for now, refs #34472

diff --git src/utils/hubtools/hubtools src/utils/hubtools/hubtools
index a9c06a7..37e55bc 100755
--- src/utils/hubtools/hubtools
+++ src/utils/hubtools/hubtools
@@ -1,83 +1,130 @@
 #!/usr/bin/env python3
-import logging, sys, optparse, os, json, subprocess, shutil, string
+import logging, sys, optparse, os, json, subprocess, shutil, string, glob, tempfile, re
+import shlex
+from pathlib import Path
 from collections import defaultdict, OrderedDict
 from os.path import join, basename, dirname, isfile, relpath, abspath, splitext, isdir
 #import pyyaml # not loaded here, so it's not a hard requirement, is lazy loaded in parseMetaYaml()

 # ==== functions =====
+asHead = """table bed
+"Browser extensible data (<=12 fields) "
+    (
+"""
+
+asLines = """
+    string chrom;      "Chromosome (or contig, scaffold, etc.)"
+    uint   chromStart; "Start position in chromosome"
+    uint   chromEnd;   "End position in chromosome"
+    string name;       "Name of item"
+    uint   score;      "Score from 0-1000"
+    char[1] strand;    "+ or -"
+    uint thickStart;   "Start of where display should be thick (start codon)"
+    uint thickEnd;     "End of where display should be thick (stop codon)"
+    uint reserved;     "Used as itemRgb as of 2004-11-22"
+    int blockCount;    "Number of blocks"
+    int[blockCount] blockSizes; "Comma separated list of block sizes"
+    int[blockCount] chromStarts; "Start positions relative to chromStart"
+""".split("\n")

 def parseArgs():
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("""usage: %prog [options] <cmd> - create and edit UCSC track hubs

     hubtools make <assemblyCode>: create a track hub for all bigBed/bigWig files under a directory. Creates single-file hub.txt
     - bigBed/bigWig files in the current directory will be top level tracks
     - big* files in subdirectories become composites
     - for every filename, the part before the first dot becomes the track base name
     - if a directory has more than 80% of track base names with both a bigBed and bigWig file, views are activated for this composite
-    - track attributes can be changed using tracks.tsv or tracks.json files, in
+    - track attributes can be changed using tracks.ra, tracks.tsv, tracks.json or tracks.yaml files, in
      each top or subdirectory
-    - tracks.tsv must have a first column named 'track'.
+    - The first column of tracks.tsv must be 'track'.
+    - tracks.json and tracks.yaml must be an object at the top level; the attributes are <trackName> or
+      the special attribute "hub".

     hubtools up: upload files to hubSpace
     - needs ~/.hubt.conf with username and password. Create one with 'hubt conf'
     - uploads all files from the -i directory or the current dir if not specified.

     hubtools jbrowse <url> <db> : convert Jbrowse trackList.json files to hub.txt.
     - <url> is the URL to the Jbrowse2 installation, e.g. http://furlonglab.embl.de/FurlongBrowser/
     - <db> is assembly identifier

     hubtools tab <fname>: convert a hub.txt or trackDb.txt to tab-sep format, easier to bulk-edit with sed/cut/etc.
     - <fname> is the input filename. "hub" and "genome" stanzas are skipped.
     - output goes to stdout

+    hubtools conv -i myTsvDir
+    - convert .tsv files in inDir to .bigBed files in the current directory
+
+    hubtools ct -i xxx/archive -o outDir
+    - Create a hub from an extracted xxx.tar.gz file "archive" directory.
+      You can download such a .tar.gz archive in the Genome Browser under My Data > My Session.
+      The archive contains all your custom tracks.
+
+    Examples:
+    hubtools conv -i myTsvs/
     hubtools make hg38
     hubtools jbrowse http://furlonglab.embl.de/FurlongBrowser/ dm3
     hubtools tab hub.txt > tracks.tsv
-    tracks.json can look like this:
-    { "hub" : {"hub": "mouse_motor_atac", "shortLabel":"scATAC-seq Developing Cranial Motor Neurons"} }
+    tar xvfz SC_20230723_backup.tar.gz
+    hubtools ct -i archive -o hub
+
+    For the "make" step:
+
+    tracks.json can look like this, can have more keys, one per track, or the special key "hub":
+    {
+        "hub" : { "hub": "mouse_motor_atac", "shortLabel":"scATAC-seq Developing Cranial Motor Neurons" },
+        "myTrack" : { "shortLabel" : "My nice track" }
+    }
+
+    tracks.tsv should look like this, but can have any number of columns:
+    #track<tab>shortLabel
+    myTrack<tab>My nice track
     """)

     parser.add_option("-i", "--inDir", dest="inDir", action="store", help="Input directory where files are stored. Default is current directory")
     parser.add_option("-o", "--outDir", dest="outDir", action="store", help="Output directory where the hub.txt file is created. Default is same as input directory.")
     #parser.add_option("", "--igv", dest="igv", action="store", help="import an igv.js trackList.json file hierarchy")
     parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show verbose debug messages")
     parser.add_option("-u", "--upload", dest="upload", action="store_true", help="upload all files from outDir to hubSpace")
     (options, args) = parser.parse_args()

     if len(args)==0:
         parser.print_help()
         exit(1)

     if options.debug:
         logging.basicConfig(level=logging.DEBUG)
     else:
         logging.basicConfig(level=logging.INFO)

     return args, options

 def errAbort(msg):
     " print and abort "
     logging.error(msg)
     sys.exit(1)

+def makedirs(d):
+    if not isdir(d):
+        os.makedirs(d)
+
 def parseMetaRa(fname):
     """parse tracks.ra or tracks.txt and return as a dict of trackName -> dict of key->val """
     logging.debug("Reading %s as .ra" % fname)
     trackName = None
     stanzaData = {}
     ret = {}
     for line in open(fname):
         line = line.strip()
         if line.startswith("#"):
             continue
         if line=="":
             if len(stanzaData)==0: # double newline
                 continue
             if trackName is None:
@@ -277,30 +324,32 @@
 def writeLn(ofh, spaceCount, line):
     "write line to ofh, with spaceCount before it "
     ofh.write("".join([" "]*spaceCount))
     ofh.write(line)
     ofh.write("\n")

 def writeStanza(ofh, indent, tdb):
     " write a stanza given a tdb key-val dict "
     track = tdb["track"]

     shortLabel = tdb.get("shortLabel", track.replace("_", " "))
     visibility = tdb.get("visibility", "pack")
     longLabel = tdb.get("longLabel", shortLabel)
+    if "type" not in tdb:
+        errAbort("Track info has no type attribute: %s" % tdb)
     trackType = tdb["type"]

     writeLn(ofh, indent, "track %s" % track)
     writeLn(ofh, indent, "shortLabel %s" % shortLabel)
     if longLabel:
         writeLn(ofh, indent, "longLabel %s" % longLabel)
     if "parent" in tdb:
         writeLn(ofh, indent, "parent %s" % tdb["parent"])
     writeLn(ofh, indent, "type %s" % trackType)
     writeLn(ofh, indent, "visibility %s" % visibility)

     if "bigDataUrl" in tdb:
         writeLn(ofh, indent, "bigDataUrl %s" % tdb["bigDataUrl"])

     for key, val in tdb.items():
         if key in ["track", "shortLabel", "longLabel", "type", "bigDataUrl", "visibility", "parent"]:
@@ -514,31 +563,30 @@
     writeStanza(ofh, indent, tdb)

 def importJbrowse(baseUrl, db, outDir):
     " import an IGV trackList.json hierarchy "
     import requests
     outFn = join(outDir, "hub.txt")
     ofh = open(outFn, "w")
     writeHubGenome(ofh, db, {})

     trackListUrl = baseUrl+"/data/trackList.json"
baseUrl+"/data/trackList.json" logging.info("Loading %s" % trackListUrl) trackList = requests.get(trackListUrl).json() tdbs = [] for tl in trackList["tracks"]: - #print(tl) if "type" in tl and tl["type"]=="SequenceTrack": logging.info("Genome is: "+tl["label"]) continue if tl["storeClass"]=="JBrowse/Store/SeqFeature/NCList": logging.info("NCList found: ",tl) continue tdb = {} tdb["track"] = tl["label"] tdb["shortLabel"] = tl["key"] if "data_file" in tl: tdb["bigDataUrl"] = baseUrl+"/data/"+tl["data_file"] else: tdb["bigDataUrl"] = baseUrl+"/data/"+tl["urlTemplate"] if tl["storeClass"] == "JBrowse/Store/SeqFeature/BigWig": tdb["type"] = "bigWig" @@ -627,68 +675,412 @@ sortedFields.insert(0, "shortLabel") sortedFields.insert(0, "track") ofh = sys.stdout ofh.write("#") ofh.write("\t".join(sortedFields)) ofh.write("\n") for s in stanzas: row = [] for fieldName in sortedFields: row.append(s.get(fieldName, "")) ofh.write("\t".join(row)) ofh.write("\n") -def hubt(args, options): - """ get writeLn(ofh, indent, d .bb files under dirName and create a trackDb.txt for them""" +def guessFieldDesc(fieldNames): + "given a list of field names, try to guess to which fields in bed12 they correspond " + logging.info("No field description specified, guessing fields from TSV headers: %s" % fieldNames) + fieldDesc = {} + skipFields = set() + + for fieldIdx, fieldName in enumerate(fieldNames): + caseField = fieldName.lower() + if caseField in ["chrom", "chromosome"]: + name = "chrom" + elif caseField.endswith(" id") or caseField.endswith("accession"): + name = "name" + elif caseField in ["start", "chromstart", "position"]: + name = "start" + elif caseField in ["Reference"]: + name = "refAllele" + elif caseField in ["strand"]: + name = "strand" + elif caseField in ["score"]: + name = "score" + else: + continue + fieldDesc[name] = fieldIdx + skipFields.add(fieldIdx) + logging.info("TSV <-> BED correspondance: %s" % fieldDesc) + return fieldDesc, skipFields + +def parseFieldDesc(fieldDescStr, fieldNames): + " given a string chrom=1,start=2,end=3,... return dict {chrom:1,...} " + if not fieldDescStr: + return guessFieldDesc(fieldNames) + + fieldDesc, skipFields = guessFieldDesc(fieldNames) + + if fieldDescStr: + for part in fieldDescStr.split(","): + fieldName, fieldIdx = part.split("=") + fieldIdx = int(fieldIdx) + fieldDesc[fieldName] = fieldIdx + skipFields.add(fieldIdx) + + return fieldDesc, skipFields + +def makeBedRow(row, fieldDesc, skipFields, isOneBased): + " given a row of a tsv file and a fieldDesc with BedFieldName -> field-index, return a bed12+ row with extra fields " + bedRow = [] + + # first construct the bed 12 fields + for fieldName in ["chrom", "start", "end", "name", "score", "strand", + "thickStart", "thickEnd", "itemRgb", "blockCount", "blockSizes", "chromStarts"]: + + fieldIdx = fieldDesc.get(fieldName) + if fieldIdx is not None: + if fieldName=="start": + chromStart = int(row[fieldDesc["start"]]) + if isOneBased: + chromStart = chromStart-1 + val = str(chromStart) + elif fieldName=="end": + chromEnd = int(val) + else: + val = row[fieldIdx] + else: + if fieldName=="end": + chromEnd = chromStart+1 + val = str(chromEnd) + elif fieldName=="score": + val = "0" + elif fieldName=="strand": + val = "." 
+ elif fieldName=="thickStart": + val = str(chromStart) + elif fieldName=="thickEnd": + val = str(chromEnd) + elif fieldName=="itemRgb": + val = "0,0,0" + elif fieldName=="blockCount": + val = "1" + elif fieldName=="blockSizes": + val = str(chromEnd-chromStart) + elif fieldName=="chromStarts": + #val = str(chromStart) + val = "0" + else: + logging.error("Cannot find a field for %s" % fieldName) + sys.exit(1) + + bedRow.append(val) + + # now handle all other fields + for fieldIdx, val in enumerate(row): + if not fieldIdx in skipFields: + bedRow.append( row[fieldIdx] ) + + return bedRow + +def fetchChromSizes(db, outputFileName): + " find on local disk or download a <db>.sizes text file " + # Construct the URL based on the database name + url = f"https://hgdownload.cse.ucsc.edu/goldenPath/{db}/database/chromInfo.txt.gz" + # Send a request to download the file + response = requests.get(url, stream=True) + # Check if the request was successful + if response.status_code == 200: + # Open the output gzip file for writing + with gzip.open(outputFileName, 'wt') as outFile: + # Open the response content as a gzip file in text mode + with gzip.GzipFile(fileobj=response.raw, mode='r') as inFile: + # Read the content using csv reader to handle tab-separated values + reader = csv.reader(inFile, delimiter='\t') + writer = csv.writer(outFile, delimiter='\t', lineterminator='\n') + # Iterate through each row, and retain only the first two fields + for row in reader: + writer.writerow(row[:2]) # Write only the first two fields + else: + raise Exception(f"Failed to download file from {url}, status code: {response.status_code}") + logging.info("Downloaded %s to %s" % (db, outputFileName)) + +def getChromSizesFname(db): + " return fname of chrom sizes, download into ~/.local/ucscData/ if not found " + fname = "/hive/data/genomes/%s/chrom.sizes" % db + if isfile(fname): + return fname + + dataDir = "~/.local/hubtools" + fname = join(dataDir, "%s.sizes" % db) + fname = os.path.expanduser(fname) + if not isfile(fname): + makedirs(dataDir) + fetchChromSizes(db, fname) + return fname + +def convTsv(db, tsvFname, outBedFname, outAsFname, outBbFname): + " convert tsv files in inDir to outDir, assume that they all have one column for chrom, start and end. Try to guess these or fail. " + #tsvFname, outBedFname, outAsFname = args + + # join and output merged bed + bigCols = set() # col names of columns with > 255 chars + + #bedFh = open(bedFname, "w") + unsortedBedFh = tempfile.NamedTemporaryFile(suffix=".bed", dir=dirname(outBedFname), mode="wt") + fieldNames = None + #isOneBased = options.oneBased + isOneBased = True + bedFieldsDesc = None # in the future, the user may want to input a string like name=0,start=1, but switch this off for now + + for line in open(tsvFname): + row = line.rstrip("\r\n").split("\t") + if fieldNames is None: + fieldNames = row + fieldDesc, notExtraFields = parseFieldDesc(bedFieldsDesc, fieldNames) + continue + + # note fields with data > 255 chars. 
+        for colName, colData in zip(fieldNames, row):
+            if len(colData)>255:
+                bigCols.add(colName)
+
+        bedRow = makeBedRow(row, fieldDesc, notExtraFields, isOneBased)
+
+        chrom = bedRow[0]
+        if chrom.isdigit() or chrom in ["X", "Y"]:
+            bedRow[0] = "chr"+bedRow[0]
+
+        unsortedBedFh.write( ("\t".join(bedRow)))
+        unsortedBedFh.write("\n")
+    unsortedBedFh.flush()
+
+    cmd = "sort -k1,1 -k2,2n %s > %s" % (unsortedBedFh.name, outBedFname)
+    assert(os.system(cmd)==0)
+    unsortedBedFh.close() # removes the temp file
+
+    # generate autosql
+    # BED fields
+    #bedColCount = int(options.type.replace("bed", "").replace("+" , ""))
+    bedColCount = 12
+    asFh = open(outAsFname, "w")
+    asFh.write(asHead)
+    asFh.write("\n".join(asLines[:bedColCount+1]))
+    asFh.write("\n")
+
+    # extra fields
+    #fieldNames = fieldNames[bedColCount:]
+    for fieldIdx, field in enumerate(fieldNames):
+        if fieldIdx in notExtraFields:
+            continue
+        name = field.replace(" ","")
+        name = name.replace("%","perc_")
+        name = re.sub("[^a-zA-Z0-9]", "", name)
+        name = name[0].lower()+name[1:]
+        fType = "string"
+        if field in bigCols:
+            fType = "lstring"
+        asFh.write('    %s %s; "%s" \n' % (fType, name, field))
+    asFh.write(")\n")
+    asFh.close()
+
+    chromSizesFname = getChromSizesFname(db)
+
+    cmd = ["bedToBigBed", outBedFname, chromSizesFname, outBbFname, "-tab", "-type=bed%d+" % bedColCount, "-as=%s" % outAsFname]
+    subprocess.check_call(cmd)
+
+def convTsvDir(inDir, db, outDir):
+    " find tsv files under inDir and convert them all to .bb "
+    ext = "tsv"
+    pat = join(inDir, '**/*.'+ext)
+    logging.debug("Finding files under %s (%s), writing output to %s" % (inDir, pat, outDir))
+
+    for fname in glob.glob(pat, recursive=True):
+        logging.debug("Found %s" % fname)
+        absFname = abspath(fname)
+        relFname = relpath(absFname, inDir)
+        outPath = Path(join(outDir, relFname))
+
+        if not outPath.parents[0].is_dir():
+            logging.debug("mkdir -p %s" % outPath.parents[0])
+            makedirs(outPath.parents[0])
+
+        bedFname = outPath.with_suffix(".bed")
+        asFname = outPath.with_suffix(".as")
+        bbFname = outPath.with_suffix(".bb")
+
+        convTsv(db, fname, bedFname, asFname, bbFname)
+
+def parseTrackLine(s):
+    " Use shlex to split the string respecting quotes, written by chatGPT "
+    lexer = shlex.shlex(s, posix=True)
+    lexer.whitespace_split = True
+    lexer.wordchars += '='  # Allow '=' as part of words
+
+    tokens = list(lexer)
+
+    # Convert the tokens into a dictionary
+    it = iter(tokens)
+    result = {}
+    for token in it:
+        if '=' in token:
+            key, value = token.split('=', 1)
+            result[key] = value.strip('"')  # Remove surrounding quotes if present
+        else:
+            result[token] = next(it).strip('"')  # Handle tokens without '=', e.g. a key followed by a quoted value
+
+    return result
+
+def readTrackLines(fnames):
+    " read the first line of every file and return a dict of fname -> track line key/value dict "
" + logging.debug("Reading track lines from %s" % fnames) + ret = {} + for fn in fnames: + line1 = open(fn).readline().rstrip("\n") + notTrack = line1.replace("track ", "", 1) + tdb = parseTrackLine(notTrack) + ret[fn] = tdb + return ret + +def stripFirstLine(inputFilename, outputFilename): + " written by chatGpt: copies all lines to output, except the first line " + with open(inputFilename, 'r') as infile, open(outputFilename, 'w') as outfile: + # Skip the first line + next(infile) + # Write the rest of the lines to the output file + for line in infile: + outfile.write(line) + +def bedToBigBed(inFname, db, outFname): + " convert bed to bigbed file, handles chrom.sizes download " + chromSizesFname = getChromSizesFname(db) + cmd = ["bedToBigBed", inFname, chromSizesFname, outFname] + logging.debug("Running %s" % cmd) + logging.info(f'Converting {inFname} to {outFname}. (chromSizes: {chromSizesFname}') + subprocess.check_call(cmd) + +def convCtDb(db, inDir, outDir): + " convert one db part of a track archive to an output directory " + findGlob = join(inDir, "*.ct") + inFnames = glob.glob(findGlob) + if len(inFnames)==0: + logging.info("No *.ct files found in %s" % findGlob) + return + + tdbData = readTrackLines(inFnames) + + makedirs(outDir) + hubTxtFname = join(outDir, "hub.txt") + ofh = open(hubTxtFname, "wt") + + meta = parseMeta(inDir) + writeHubGenome(ofh, db, meta) + + tdbIdx = 0 + for fname, tdb in tdbData.items(): + tdbIdx += 1 + # custom track names can include spaces, spec characters, etc. Strip all those + # append a number to make sure that the result is unique + track = tdb["name"] + track = re.sub('[^A-Za-z0-9]+', '', track)+"_"+str(tdbIdx) + tdb["track"] = track + + tdb["shortLabel"] = tdb["name"] + del tdb["name"] + tdb["longLabel"] = tdb["description"] + del tdb["description"] + + if "bigDataUrl" not in tdb: + bedFname = join(outDir, tdb["track"]+".bed") + bbFname = join(outDir, tdb["track"]+".bb") + stripFirstLine(fname, bedFname) + bedToBigBed(bedFname, db, bbFname) + os.remove(bedFname) + tdb["bigDataUrl"] = tdb["track"]+".bb" + tdb["type"] = "bigBed" + + writeStanza(ofh, 0, tdb) + + logging.info("Wrote %s" % hubTxtFname) + ofh.close() + +def convCtDir(inDir, outDir): + " convert a directory created from the .tar.gz file downloaded via our track archive feature " + dbContent = os.listdir(inDir) + dbDirs = [] + for db in dbContent: + subDir = join(inDir, db) + if isdir(subDir): + dbDirs.append((db, subDir)) + + if len(dbDirs)==0: + errAbort("No directories found under %s. Extract the tarfile and point this program at the 'archive' directory." % inDir) + + for db, inSubDir in dbDirs: + logging.debug("Processing %s, db=%s" % (inSubDir, db)) + outSubDir = join(outDir, db) + convCtDb(db, inSubDir, outSubDir) + +def hubtools(args, options): + """ find files under dirName and create a trackDb.txt for them""" cmd = args[0] inDir = "." if options.inDir: inDir = options.inDir + outDir = "." 
+    if options.outDir:
+        outDir = options.outDir
+
     if cmd=="up":
         uploadFiles(inDir)
         return

     tdbDir = inDir
     if options.outDir:
         tdbDir = options.outDir

     if cmd=="jbrowse":
         importJbrowse(args[1], args[2], tdbDir)
     elif cmd == "tab":
         raToTab(args[1])
     elif cmd == "make":
         db = args[1]
         meta = parseMeta(inDir)
         dirFiles = readDirs(inDir, meta)

         hubFname = join(tdbDir, "hub.txt")
         logging.info("Writing %s" % hubFname)
         ofh = open(hubFname, "w")

         meta = parseMeta(inDir)
         writeHubGenome(ofh, db, meta)

         writeTdb(inDir, dirFiles, "top", tdbDir, ofh)
         writeTdb(inDir, dirFiles, "comps", tdbDir, ofh)

         ofh.close()
+
+    elif cmd=="conv":
+        db = args[1]
+
+        convTsvDir(inDir, db, outDir)
+
+    elif cmd=="ct":
+        convCtDir(inDir, outDir)
+
     else:
         logging.error("Unknown command: '%s'" % args[0])

 # ----------- main --------------
 def main():
     args, options = parseArgs()

-    hubt(args, options)
+    hubtools(args, options)

-    #if options.test:
-        #logging.debug("test is set")
-        #f = open(options.file, "r")

 main()
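
Note on the "conv" command: when a TSV provides only some of the BED columns, makeBedRow() fills the
remaining BED12 fields with defaults (end = start+1, score 0, strand ".", black itemRgb, a single block).
The stand-alone sketch below only illustrates that default-filling idea on one row; the fillBedDefaults()
helper, its signature and the sample values are illustrative and are not part of hubtools.

    #!/usr/bin/env python3
    # minimal sketch: fill BED defaults for a TSV row that only has chrom/start/name columns,
    # assuming 1-based start positions, as convTsv() does by default
    def fillBedDefaults(chrom, start, name, isOneBased=True):
        chromStart = int(start) - 1 if isOneBased else int(start)
        chromEnd = chromStart + 1            # no end column: treat as a single-base feature
        return [chrom, str(chromStart), str(chromEnd), name, "0", "."]

    if __name__ == "__main__":
        # a header like "Chromosome / Position / Variant ID" would be guessed as chrom/start/name
        print("\t".join(fillBedDefaults("chr1", "1000", "rs123")))
        # -> chr1    999    1000    rs123    0    .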
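Note on the "ct" command: each *.ct file in an extracted session archive starts with a "track ..." line
whose key=value settings become the trackDb stanza. The sketch below is a simplified, stand-alone version
of that parsing step; parseTrackLine() above uses a shlex lexer with extra handling for unusual tokens,
and the splitTrackLine() name and the sample line here are illustrative only.

    import shlex

    def splitTrackLine(line):
        # split on whitespace but keep quoted values together, then break key=value pairs
        settings = {}
        for token in shlex.split(line.replace("track ", "", 1)):
            if "=" in token:
                key, value = token.split("=", 1)
                settings[key] = value
        return settings

    print(splitTrackLine('track type=bedDetail name="My Track" description="My favorite regions"'))
    # -> {'type': 'bedDetail', 'name': 'My Track', 'description': 'My favorite regions'}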
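Custom track labels may contain spaces and other characters that are not valid in track names, so
convCtDb() strips them and appends a running number to keep the names unique. A tiny illustration of
that naming scheme, with a hypothetical makeTrackName() helper:

    import re

    def makeTrackName(label, index):
        # keep only letters and digits, then append an index, mirroring convCtDb()
        return re.sub('[^A-Za-z0-9]+', '', label) + "_" + str(index)

    print(makeTrackName("My favorite regions (hg38)", 1))
    # -> Myfavoriteregionshg38_1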