ab5e63af16ca6461e7853f9a665abb74bbe40a7f max Mon Nov 28 05:26:07 2022 -0800 adding bigBed to bigGuessDb, fixing stupid makefile error and checking missing tools with an error message, refs #30316 diff --git src/utils/bigGuessDb/bigGuessDb src/utils/bigGuessDb/bigGuessDb index 0edd832..419b62a 100755 --- src/utils/bigGuessDb/bigGuessDb +++ src/utils/bigGuessDb/bigGuessDb @@ -1,164 +1,184 @@ #!/usr/bin/env python # pylint: disable=C0103,C0326,C0410,W0402 """ guess the best assembly given a bigWig or bigBed file """ -import logging, optparse +import logging, optparse, sys from collections import defaultdict from os.path import join, isfile, expanduser import os, gzip, subprocess # ==== functions ===== def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("usage: %prog [options] filename - given a bigBed or "\ "bigWig file, " \ "guess the assembly based on the chrom sizes") parser.add_option("-d", "--debug", dest="debug", action="store_true", \ help="show debug messages") parser.add_option("", "--index", dest="index", action="store_true", \ help="go /hive/data/genomes and build a file ~/.guessAssembly.txt with all chromSizes") parser.add_option("-b", "--best", dest="best", action="store_true", \ help="only print a single string, the best matching assembly") (options, args) = parser.parse_args() if args==[] and not options.index: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) logging.getLogger().setLevel(logging.DEBUG) else: logging.basicConfig(level=logging.INFO) logging.getLogger().setLevel(logging.INFO) return args, options def parseSizes(inFname, doSubset): " given a chrom.sizes file, return the 10 longest and 10 shortest chrom names " logging.info("Reading %s",inFname) sizes = list() for line in open(inFname): if line.startswith("#"): continue chrom, size = line.rstrip("\n").split("\t")[:2] sizes.append( (int(size), chrom) ) sizes.sort() if not doSubset: return sizes someSizes = sizes[:10] someSizes.extend(sizes[-10:]) return someSizes def writeSizes(allSizes, outFname): " write all sizes to the index file " ofh = gzip.open(outFname, "wt") # "write" "text" for db, dbSizes in allSizes.items(): sizeParts = ["%s=%d" % (chrom, size) for size,chrom in dbSizes] sizeStr = ",".join(sizeParts) ofh.write("%s\t%s\n" % (db, sizeStr)) ofh.close() logging.info("Wrote %s", outFname) def buildIndex(inDir, outFname): """ go over all direct subdirectories of inDir and find a chrom.sizes file, compact it to format db -> list of (chrom,size) and write to outFname """ allSizes = dict() for db in os.listdir(inDir): - if "Patch" in db or db == "sonMus0": + if "Patch" in db or db == "sonMus0" or db.startswith("braNey"): continue subDir = join(inDir, db) chromFname = join(subDir, "chrom.sizes") if not isfile(chromFname): chromFname = join(subDir, db+".sizes") if not isfile(chromFname): print("not found "+chromFname) continue doSubset = True if db.startswith("hg") or db.startswith("mm"): doSubset = False if os.path.getsize(chromFname) != 0: allSizes[db] = parseSizes(chromFname, doSubset) writeSizes(allSizes, outFname) def readSizeIndex(inFname): " read chrom sizes index and return as dict (chromName, size) - > db " sizeToDbs = defaultdict(list) #sizeToDb = dict() for line in gzip.open(inFname, "rt"): db, sizeStr = line.rstrip("\n").split("\t") sizes = sizeStr.split(",") sizes = [x.split("=") for x in sizes] sizes = [(chrom, int(size)) for (chrom, size) in sizes] for chrom, size in sizes: #assert( (chrom, size) not in sizeToDb ) #sizeToDb[ (chrom, size) ] = db sizeToDbs[ (chrom, size) ].append(db) return sizeToDbs -def bigWigSizes(inFname): +def bigSizes(inFname): " return chrom -> size from bigWig file " sizes = list() - cmd = ["bigWigInfo", "-chroms", inFname] + + cmdExec = "bigWigInfo" + inExt = inFname.lower().split(".")[-1] + if inExt in ["bb", ".igbed"]: + cmdExec = "bigBedInfo" + + cmd = [cmdExec, "-chroms", inFname] + doParse = False + if sys.version_info[0]==2: + p = subprocess.Popen(cmd, stdout=subprocess.PIPE) + else: p = subprocess.Popen(cmd, encoding="latin1", stdout=subprocess.PIPE) + + if p.returncode is not None and p.returncode!=0: + logging.error("Could not run '%s'. Are the tools bigBedInfo and bigWigInfo installed and in your PATH?" % (" ".join(cmd))) + sys.exit(1) + for line in p.stdout: if line.startswith("chromCount: "): doParse = True continue if line.startswith("basesCovered: "): doParse = False continue if doParse: chrom, _idx, size = line.strip().split(" ") sizes.append((chrom, int(size))) return sizes +def sortBySecondLen(e): + " sort function that takes the second element of a tuple and returns its negative length -> put longest lists first " + return -len(e[1]) + def findBestDb(sizeIndex, fileSizes): """ given a list of file sizes, look up all (chrom, size) in chrom size index and report best DBs sorted by number of matches """ dbChromMatch = defaultdict(list) for (chrom, size) in fileSizes: if (chrom, size) in sizeIndex: dbs = sizeIndex[(chrom, size)] for db in dbs: dbChromMatch[db].append(chrom) - dbChromMatch = sorted(dbChromMatch.items(), key=len) - return dbChromMatch + dbMatches = list(dbChromMatch.items()) # dbMatches is now a list of db -> list of chromosomes + dbMatches.sort(key=sortBySecondLen) + return dbMatches def printAllMatches(dbChromMatch): " print all matching dbs as a tsv " - print("#db\tchromMatch\tchromList\n") + print("#db\tmatchCount\tmatchList") for db, chromList in dbChromMatch: print("\t".join([db, str(len(chromList)), ",".join(chromList)])) # ----------- main -------------- def main(): " entry point to script " args, options = parseArgs() indexFname = expanduser("~/.guessAssembly.txt.gz") if options.index: buildIndex("/hive/data/genomes", indexFname) else: inFname = args[0] - fileSizes = bigWigSizes(inFname) + fileSizes = bigSizes(inFname) sizeIndex = readSizeIndex(indexFname) hits = findBestDb(sizeIndex, fileSizes) if options.best: print(hits[0][0]) else: printAllMatches(hits) main()