src/utils/bigGuessDb/bigGuessDb ab5e63af16ca6461e7853f9a665abb74bbe40a7f

ab5e63af16ca6461e7853f9a665abb74bbe40a7f
max
  Mon Nov 28 05:26:07 2022 -0800
adding bigBed to bigGuessDb, fixing stupid makefile error and checking missing tools with an error message, refs #30316

diff --git src/utils/bigGuessDb/bigGuessDb src/utils/bigGuessDb/bigGuessDb
index 0edd832..419b62a 100755
--- src/utils/bigGuessDb/bigGuessDb
+++ src/utils/bigGuessDb/bigGuessDb
@@ -1,164 +1,184 @@
 #!/usr/bin/env python
 # pylint: disable=C0103,C0326,C0410,W0402
 
 """ guess the best assembly given a bigWig or bigBed file """
 
-import logging, optparse
+import logging, optparse, sys
 from collections import defaultdict
 from os.path import join, isfile, expanduser
 import os, gzip, subprocess
 
 # ==== functions =====
 def parseArgs():
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("usage: %prog [options] filename - given a bigBed or "\
             "bigWig file, " \
         "guess the assembly based on the chrom sizes")
 
     parser.add_option("-d", "--debug", dest="debug", action="store_true", \
         help="show debug messages")
     parser.add_option("", "--index", dest="index", action="store_true", \
         help="go /hive/data/genomes and build a file ~/.guessAssembly.txt with all chromSizes")
     parser.add_option("-b", "--best", dest="best", action="store_true", \
         help="only print a single string, the best matching assembly")
     (options, args) = parser.parse_args()
 
     if args==[] and not options.index:
         parser.print_help()
         exit(1)
 
     if options.debug:
         logging.basicConfig(level=logging.DEBUG)
         logging.getLogger().setLevel(logging.DEBUG)
     else:
         logging.basicConfig(level=logging.INFO)
         logging.getLogger().setLevel(logging.INFO)
 
     return args, options
 
 def parseSizes(inFname, doSubset):
     " given a chrom.sizes file, return the 10 longest and 10 shortest chrom names "
     logging.info("Reading %s",inFname)
     sizes = list()
     for line in open(inFname):
         if line.startswith("#"):
             continue
         chrom, size = line.rstrip("\n").split("\t")[:2]
         sizes.append( (int(size), chrom) )
     sizes.sort()
 
     if not doSubset:
         return sizes
 
     someSizes = sizes[:10]
     someSizes.extend(sizes[-10:])
     return someSizes
 
 def writeSizes(allSizes, outFname):
     " write all sizes to the index file "
     ofh = gzip.open(outFname, "wt") # "write" "text"
     for db, dbSizes in allSizes.items():
         sizeParts = ["%s=%d" % (chrom, size) for size,chrom in dbSizes]
         sizeStr = ",".join(sizeParts)
         ofh.write("%s\t%s\n" % (db, sizeStr))
     ofh.close()
     logging.info("Wrote %s", outFname)
 
 def buildIndex(inDir, outFname):
     """ go over all direct subdirectories of inDir and find a chrom.sizes file,
     compact it to format db -> list of (chrom,size) and write to outFname """
     allSizes = dict()
     for db in os.listdir(inDir):
-        if "Patch" in db or db == "sonMus0":
+        if "Patch" in db or db == "sonMus0" or db.startswith("braNey"):
             continue
 
         subDir = join(inDir, db)
         chromFname = join(subDir, "chrom.sizes")
         if not isfile(chromFname):
             chromFname = join(subDir, db+".sizes")
 
         if not isfile(chromFname):
             print("not found "+chromFname)
             continue
 
         doSubset = True
         if db.startswith("hg") or db.startswith("mm"):
             doSubset = False
 
         if os.path.getsize(chromFname) != 0:
             allSizes[db] = parseSizes(chromFname, doSubset)
 
     writeSizes(allSizes, outFname)
 
 def readSizeIndex(inFname):
     " read chrom sizes index and return as dict (chromName, size) - > db "
     sizeToDbs = defaultdict(list)
     #sizeToDb = dict()
     for line in gzip.open(inFname, "rt"):
         db, sizeStr = line.rstrip("\n").split("\t")
         sizes = sizeStr.split(",")
         sizes = [x.split("=") for x in sizes]
         sizes = [(chrom, int(size)) for (chrom, size) in sizes]
         for chrom, size in sizes:
             #assert( (chrom, size) not in sizeToDb )
             #sizeToDb[ (chrom, size) ] = db
             sizeToDbs[ (chrom, size) ].append(db)
     return sizeToDbs
 
-def bigWigSizes(inFname):
+def bigSizes(inFname):
     " return chrom -> size from bigWig file "
     sizes = list()
-    cmd = ["bigWigInfo", "-chroms", inFname]
+
+    cmdExec = "bigWigInfo"
+    inExt = inFname.lower().split(".")[-1]
+    if inExt in ["bb", ".igbed"]:
+        cmdExec = "bigBedInfo"
+
+    cmd = [cmdExec, "-chroms", inFname]
+
     doParse = False
+    if sys.version_info[0]==2:
+        p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+    else:
         p = subprocess.Popen(cmd, encoding="latin1", stdout=subprocess.PIPE)
+
+    if p.returncode is not None and p.returncode!=0:
+        logging.error("Could not run '%s'. Are the tools bigBedInfo and bigWigInfo installed and in your PATH?" % (" ".join(cmd)))
+        sys.exit(1)
+
     for line in p.stdout:
         if line.startswith("chromCount: "):
             doParse = True
             continue
         if line.startswith("basesCovered: "):
             doParse = False
             continue
         if doParse:
             chrom, _idx, size = line.strip().split(" ")
             sizes.append((chrom, int(size)))
     return sizes
 
+def sortBySecondLen(e):
+    " sort function that takes the second element of a tuple and returns its negative length -> put longest lists first "
+    return -len(e[1])
+
 def findBestDb(sizeIndex, fileSizes):
     """ given a list of file sizes, look up all (chrom, size) in chrom size index
     and report best DBs sorted by number of matches """
     dbChromMatch = defaultdict(list)
     for (chrom, size) in fileSizes:
         if (chrom, size) in sizeIndex:
             dbs = sizeIndex[(chrom, size)]
             for db in dbs:
                 dbChromMatch[db].append(chrom)
 
-    dbChromMatch = sorted(dbChromMatch.items(), key=len)
-    return dbChromMatch
+    dbMatches = list(dbChromMatch.items()) # dbMatches is now a list of db -> list of chromosomes
+    dbMatches.sort(key=sortBySecondLen)
+    return dbMatches
 
 def printAllMatches(dbChromMatch):
     " print all matching dbs as a tsv "
-    print("#db\tchromMatch\tchromList\n")
+    print("#db\tmatchCount\tmatchList")
     for db, chromList in dbChromMatch:
         print("\t".join([db, str(len(chromList)), ",".join(chromList)]))
 
 # ----------- main --------------
 def main():
     " entry point to script "
     args, options = parseArgs()
 
     indexFname = expanduser("~/.guessAssembly.txt.gz")
     if options.index:
         buildIndex("/hive/data/genomes", indexFname)
     else:
         inFname = args[0]
-        fileSizes = bigWigSizes(inFname)
+        fileSizes = bigSizes(inFname)
 
         sizeIndex = readSizeIndex(indexFname)
         hits = findBestDb(sizeIndex, fileSizes)
         if options.best:
             print(hits[0][0])
         else:
             printAllMatches(hits)
 
 main()