05561c4b23fbcf412ab896628db14146f9802143
max
  Tue Apr 4 06:00:14 2023 -0700
improving bigGuessDb a little before sending off to NCBI, refs #30316

diff --git src/utils/bigGuessDb/bigGuessDb src/utils/bigGuessDb/bigGuessDb
index c57f76e..3021c8e 100755
--- src/utils/bigGuessDb/bigGuessDb
+++ src/utils/bigGuessDb/bigGuessDb
@@ -1,43 +1,55 @@
 #!/usr/bin/env python
 # pylint: disable=C0103,C0326,C0410,W0402
 
 """ guess the best assembly given a bigWig or bigBed file """
 
 import logging, optparse, sys
 from collections import defaultdict
 from os.path import join, isfile, expanduser
 import os, gzip, subprocess
 
 # ==== functions =====
 def parseArgs():
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
-    parser = optparse.OptionParser("usage: %prog [options] filename - given a bigBed or "\
-            "bigWig file, " \
-        "guess the assembly based on the chrom sizes")
+    parser = optparse.OptionParser("""usage: %prog [options] inFile - given a bigBed or
+bigWig file or URL,
+guess the assembly based on the chrom names and sizes. Must have bigBedInfo and
+bigWigInfo in PATH. Also requires a bigGuessDb.txt.gz, an alpha version of
+which can be downloaded at https://hgwdev.gi.ucsc.edu/~max/bigGuessDb/bigGuessDb.txt.gz
+
+Example run:
+    $ wget https://hgwdev.gi.ucsc.edu/~max/bigGuessDb/bigGuessDb.txt.gz
+    $ bigGuessDb --best https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1014nnn/GSM1014177/suppl/GSM1014177_mm9_wgEncodeUwDnaseNih3t3NihsMImmortalSigRep2.bigWig
+    mm9
+
+""")
 
     parser.add_option("-d", "--debug", dest="debug", action="store_true", \
         help="show debug messages")
     parser.add_option("", "--index", dest="index", action="store_true", \
-        help="go /hive/data/genomes and build an index file with all chromSizes")
+            help="used by UCSC staff: go over /hive/data/genomes and build an index of all chromSizes")
     parser.add_option("-b", "--best", dest="best", action="store_true", \
-        help="only print a single string, the best matching assembly, or 'emptyFile' or 'notFound'. ")
-    parser.add_option("", "--indexFile", dest="indexFname", action="store", \
-        help="Use specified index file, default is %default. Use --index to create this file.", default="~/.guessAssembly.txt.gz")
+        help="only print a single string, the best matching assembly, or 'emptyFile' or 'notFound'. " \
+        "If multiple arguments are given or --fromFile is used, a tab-sep table is output.")
+    parser.add_option("-i", "--indexFile", dest="indexFname", action="store", \
+        help="Use specified index file, default is %default. ", default="bigGuessDb.txt.gz")
+    parser.add_option("", "--fromFile", dest="fromFile", action="store", \
+        help="Read URLs to process from input file, can be /dev/stdin")
     (options, args) = parser.parse_args()
 
-    if args==[] and not options.index:
+    if args==[] and not options.index and not options.fromFile:
         parser.print_help()
         exit(1)
 
     if options.debug:
         logging.basicConfig(level=logging.DEBUG)
         logging.getLogger().setLevel(logging.DEBUG)
     else:
         logging.basicConfig(level=logging.INFO)
         logging.getLogger().setLevel(logging.INFO)
 
     return args, options
 
 def parseSizes(inFname, doSubset):
     " given a chrom.sizes file, return the 10 longest and 10 shortest chrom names "
     logging.info("Reading %s",inFname)
@@ -58,30 +70,47 @@
         return sizes
 
     someSizes = sizes[-20:] # small chroms carry less information and have fewer features
     return someSizes
 
 def writeSizes(allSizes, outFname):
     " write all sizes to the index file "
     ofh = gzip.open(outFname, "wt") # "write" "text"
     for db, dbSizes in allSizes.items():
         sizeParts = ["%s=%d" % (chrom, size) for size,chrom in dbSizes]
         sizeStr = ",".join(sizeParts)
         ofh.write("%s\t%s\n" % (db, sizeStr))
     ofh.close()
     logging.info("Wrote %s", outFname)
 
+def indexGenarkSizes(allSizes, inDir):
+    import glob
+    from os.path import basename
+    for dbType in ["GCA", "GCF"]:
+        baseDir = join(inDir, "asmHubs", dbType)
+        for (dirpath, dirnames, fnames) in os.walk(baseDir):
+            for fname in fnames:
+                if not fname.endswith("chrom.sizes.txt"):
+                    continue
+                fullFname = join(dirpath, fname)
+                db = fname.split("/")[-1].replace(".chrom.sizes.txt", "")
+                # e.g. /GCA/000/002/305/GCA_000002305.1/GCA_000002305.1.chrom.sizes.txt 
+                sizes = parseSizes(fullFname, False)
+                size1 = sizes.pop()
+                allSizes[db] = [size1]
+    return allSizes
+
 def buildIndex(inDir, outFname):
     """ go over all direct subdirectories of inDir and find a chrom.sizes file,
     compact it to format db -> list of (chrom,size) and write to outFname """
     allSizes = dict()
 
     import json # this is not style guide conform, but makes sure that these packages don't lead to problems for users of this script
     from six.moves import urllib # works in python2 and 3
 
     apiData = json.load(urllib.request.urlopen("https://api.genome.ucsc.edu/list/ucscGenomes"))
 
     dbList = list()
     for db, dbData in apiData["ucscGenomes"].items():
         orderKey = dbData["orderKey"]
         dbList.append( (orderKey, db) )
 
@@ -92,30 +121,32 @@
         chromFname = join(subDir, "chrom.sizes")
         if not isfile(chromFname):
             chromFname = join(subDir, db+".sizes")
 
         if not isfile(chromFname):
             print("not found "+chromFname)
             continue
 
         doSubset = True
         if db.startswith("hg") or db.startswith("mm"):
             doSubset = False
 
         if os.path.getsize(chromFname) != 0:
             allSizes[db] = parseSizes(chromFname, doSubset)
 
+    allSizes = indexGenarkSizes(allSizes, inDir)
+
     writeSizes(allSizes, outFname)
 
 def readSizeIndex(inFname):
     " read chrom sizes index and return as dict (chromName, size) - > db "
     sizeToDbs = defaultdict(list)
     #sizeToDb = dict()
     for line in gzip.open(inFname, "rt"):
         db, sizeStr = line.rstrip("\n").split("\t")
         sizes = sizeStr.split(",")
         sizes = [x.split("=") for x in sizes]
         sizes = [(chrom, int(size)) for (chrom, size) in sizes]
         for chrom, size in sizes:
             #assert( (chrom, size) not in sizeToDb )
             #sizeToDb[ (chrom, size) ] = db
             sizeToDbs[ (chrom, size) ].append(db)
@@ -160,50 +191,67 @@
 
 def findBestDb(sizeIndex, fileSizes):
     """ given a list of file sizes, look up all (chrom, size) in chrom size index
     and report best DBs sorted by number of matches """
     dbChromMatch = defaultdict(list)
     for (chrom, size) in fileSizes:
         if (chrom, size) in sizeIndex:
             dbs = sizeIndex[(chrom, size)]
             for db in dbs:
                 dbChromMatch[db].append(chrom)
 
     dbMatches = list(dbChromMatch.items()) # dbMatches is now a list of db -> list of chromosomes
     dbMatches.sort(key=sortBySecondLen)
     return dbMatches
 
-def printAllMatches(dbChromMatch):
+def printAllMatches(inFname, dbChromMatch):
     " print all matching dbs as a tsv "
-    print("#db\tmatchCount\tmatchList")
     for db, chromList in dbChromMatch:
-        print("\t".join([db, str(len(chromList)), ",".join(chromList)]))
+        print("\t".join([inFname, db, str(len(chromList)), ",".join(chromList)]))
 
 # ----------- main --------------
 def main():
     " entry point to script "
     args, options = parseArgs()
 
     indexFname = expanduser(options.indexFname)
     if options.index:
         buildIndex("/hive/data/genomes", indexFname)
+        exit(0)
+
+    if len(args)>0:
+        inFnames = args
     else:
-        inFname = args[0]
+        inFnames = open(options.fromFile).read().splitlines()
+
+    if options.best:
+        if len(inFnames)>1:
+            print("#fname\tbestDb")
+    else:
+        print("#fname\tdb\tmatchCount\tmatchList")
+
+    for inFname in inFnames:
         fileSizes = bigSizes(inFname)
 
         if len(fileSizes) == 0:
             logging.debug("%s is empty. Cannot determine assembly." % inFname)
-            hits = ( ("emptyFile", 0),  )
+            hits = ( ("emptyFile", ["0"]),  )
         else:
             sizeIndex = readSizeIndex(indexFname)
             hits = findBestDb(sizeIndex, fileSizes)
 
         if options.best:
             if len(hits)==0:
-                print("notFound")
+                bestDb = "notFound"
             else:
                 #if (len(hits[0][1]) >= 2): # need more than a single match, as chrM often matches
-                print(hits[0][0])
+                # - deactivated for now, as we store only a single size for GenArk genomes 
+                bestDb = hits[0][0]
+            if len(inFnames)>1:
+                print(inFname+"\t"+bestDb)
+            else:
+                print(bestDb)
+
         else:
-            printAllMatches(hits)
+            printAllMatches(inFname, hits)
 
 main()