05561c4b23fbcf412ab896628db14146f9802143 max Tue Apr 4 06:00:14 2023 -0700 improving bigGuessDb a little before sending off to NCBI, refs #30316 diff --git src/utils/bigGuessDb/bigGuessDb src/utils/bigGuessDb/bigGuessDb index c57f76e..3021c8e 100755 --- src/utils/bigGuessDb/bigGuessDb +++ src/utils/bigGuessDb/bigGuessDb @@ -1,43 +1,55 @@ #!/usr/bin/env python # pylint: disable=C0103,C0326,C0410,W0402 """ guess the best assembly given a bigWig or bigBed file """ import logging, optparse, sys from collections import defaultdict from os.path import join, isfile, expanduser import os, gzip, subprocess # ==== functions ===== def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " - parser = optparse.OptionParser("usage: %prog [options] filename - given a bigBed or "\ - "bigWig file, " \ - "guess the assembly based on the chrom sizes") + parser = optparse.OptionParser("""usage: %prog [options] inFile - given a bigBed or "\ +bigWig file or URL, +guess the assembly based on the chrom names and sizes. Must have bigBedInfo and +bigWigInfo in PATH. Also requires a bigGuessDb.txt.gz, an alpha version of +which can be downloaded at https://hgwdev.gi.ucsc.edu/~max/bigGuessDb/bigGuessDb.txt.gz + +Example run: + $ wget https://hgwdev.gi.ucsc.edu/~max/bigGuessDb/bigGuessDb.txt.gz + $ bigGuessDb --best https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1014nnn/GSM1014177/suppl/GSM1014177_mm9_wgEncodeUwDnaseNih3t3NihsMImmortalSigRep2.bigWig + mm9 + +""") parser.add_option("-d", "--debug", dest="debug", action="store_true", \ help="show debug messages") parser.add_option("", "--index", dest="index", action="store_true", \ - help="go /hive/data/genomes and build an index file with all chromSizes") + help="used by UCSC staff: go over /hive/data/genomes and build an index of all chromSizes") parser.add_option("-b", "--best", dest="best", action="store_true", \ - help="only print a single string, the best matching assembly, or 'emptyFile' or 'notFound'. ") - parser.add_option("", "--indexFile", dest="indexFname", action="store", \ - help="Use specified index file, default is %default. Use --index to create this file.", default="~/.guessAssembly.txt.gz") + help="only print a single string, the best matching assembly, or 'emptyFile' or 'notFound'. " \ + "If multiple arguments are given or --fromFile is used, a tab-sep table is output.") + parser.add_option("-i", "--indexFile", dest="indexFname", action="store", \ + help="Use specified index file, default is %default. ", default="bigGuessDb.txt.gz") + parser.add_option("", "--fromFile", dest="fromFile", action="store", \ + help="Read URLs to process from input file, can be /dev/stdin") (options, args) = parser.parse_args() - if args==[] and not options.index: + if args==[] and not options.index and not options.fromFile: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) logging.getLogger().setLevel(logging.DEBUG) else: logging.basicConfig(level=logging.INFO) logging.getLogger().setLevel(logging.INFO) return args, options def parseSizes(inFname, doSubset): " given a chrom.sizes file, return the 10 longest and 10 shortest chrom names " logging.info("Reading %s",inFname) @@ -58,30 +70,47 @@ return sizes someSizes = sizes[-20:] # small chroms carry less information and have fewer features return someSizes def writeSizes(allSizes, outFname): " write all sizes to the index file " ofh = gzip.open(outFname, "wt") # "write" "text" for db, dbSizes in allSizes.items(): sizeParts = ["%s=%d" % (chrom, size) for size,chrom in dbSizes] sizeStr = ",".join(sizeParts) ofh.write("%s\t%s\n" % (db, sizeStr)) ofh.close() logging.info("Wrote %s", outFname) +def indexGenarkSizes(allSizes, inDir): + import glob + from os.path import basename + for dbType in ["GCA", "GCF"]: + baseDir = join(inDir, "asmHubs", dbType) + for (dirpath, dirnames, fnames) in os.walk(baseDir): + for fname in fnames: + if not fname.endswith("chrom.sizes.txt"): + continue + fullFname = join(dirpath, fname) + db = fname.split("/")[-1].replace(".chrom.sizes.txt", "") + # e.g. /GCA/000/002/305/GCA_000002305.1/GCA_000002305.1.chrom.sizes.txt + sizes = parseSizes(fullFname, False) + size1 = sizes.pop() + allSizes[db] = [size1] + return allSizes + def buildIndex(inDir, outFname): """ go over all direct subdirectories of inDir and find a chrom.sizes file, compact it to format db -> list of (chrom,size) and write to outFname """ allSizes = dict() import json # this is not style guide conform, but makes sure that these packages don't lead to problems for users of this script from six.moves import urllib # works in python2 and 3 apiData = json.load(urllib.request.urlopen("https://api.genome.ucsc.edu/list/ucscGenomes")) dbList = list() for db, dbData in apiData["ucscGenomes"].items(): orderKey = dbData["orderKey"] dbList.append( (orderKey, db) ) @@ -92,30 +121,32 @@ chromFname = join(subDir, "chrom.sizes") if not isfile(chromFname): chromFname = join(subDir, db+".sizes") if not isfile(chromFname): print("not found "+chromFname) continue doSubset = True if db.startswith("hg") or db.startswith("mm"): doSubset = False if os.path.getsize(chromFname) != 0: allSizes[db] = parseSizes(chromFname, doSubset) + allSizes = indexGenarkSizes(allSizes, inDir) + writeSizes(allSizes, outFname) def readSizeIndex(inFname): " read chrom sizes index and return as dict (chromName, size) - > db " sizeToDbs = defaultdict(list) #sizeToDb = dict() for line in gzip.open(inFname, "rt"): db, sizeStr = line.rstrip("\n").split("\t") sizes = sizeStr.split(",") sizes = [x.split("=") for x in sizes] sizes = [(chrom, int(size)) for (chrom, size) in sizes] for chrom, size in sizes: #assert( (chrom, size) not in sizeToDb ) #sizeToDb[ (chrom, size) ] = db sizeToDbs[ (chrom, size) ].append(db) @@ -160,50 +191,67 @@ def findBestDb(sizeIndex, fileSizes): """ given a list of file sizes, look up all (chrom, size) in chrom size index and report best DBs sorted by number of matches """ dbChromMatch = defaultdict(list) for (chrom, size) in fileSizes: if (chrom, size) in sizeIndex: dbs = sizeIndex[(chrom, size)] for db in dbs: dbChromMatch[db].append(chrom) dbMatches = list(dbChromMatch.items()) # dbMatches is now a list of db -> list of chromosomes dbMatches.sort(key=sortBySecondLen) return dbMatches -def printAllMatches(dbChromMatch): +def printAllMatches(inFname, dbChromMatch): " print all matching dbs as a tsv " - print("#db\tmatchCount\tmatchList") for db, chromList in dbChromMatch: - print("\t".join([db, str(len(chromList)), ",".join(chromList)])) + print("\t".join([inFname, db, str(len(chromList)), ",".join(chromList)])) # ----------- main -------------- def main(): " entry point to script " args, options = parseArgs() indexFname = expanduser(options.indexFname) if options.index: buildIndex("/hive/data/genomes", indexFname) + exit(0) + + if len(args)>0: + inFnames = args else: - inFname = args[0] + inFnames = open(options.fromFile).read().splitlines() + + if options.best: + if len(inFnames)>1: + print("#fname\tbestDb") + else: + print("#fname\tdb\tmatchCount\tmatchList") + + for inFname in inFnames: fileSizes = bigSizes(inFname) if len(fileSizes) == 0: logging.debug("%s is empty. Cannot determine assembly." % inFname) - hits = ( ("emptyFile", 0), ) + hits = ( ("emptyFile", ["0"]), ) else: sizeIndex = readSizeIndex(indexFname) hits = findBestDb(sizeIndex, fileSizes) if options.best: if len(hits)==0: - print("notFound") + bestDb = "notFound" else: #if (len(hits[0][1]) >= 2): # need more than a single match, as chrM often matches - print(hits[0][0]) + # - deactivated for now, as we store only a single size for GenArk genomes + bestDb = hits[0][0] + if len(inFnames)>1: + print(inFname+"\t"+bestDb) + else: + print(bestDb) + else: - printAllMatches(hits) + printAllMatches(inFname, hits) main()