src/utils/bigGuessDb/bigGuessDb dc1e0e76dbe49861bd0ebe8db64e27f587737794

dc1e0e76dbe49861bd0ebe8db64e27f587737794
max
  Mon Mar 30 15:40:03 2026 -0700
adding two more phased variants tracks, refs #37306

diff --git src/utils/bigGuessDb/bigGuessDb src/utils/bigGuessDb/bigGuessDb
index 016d0840274..87fa6542a97 100755
--- src/utils/bigGuessDb/bigGuessDb
+++ src/utils/bigGuessDb/bigGuessDb
@@ -1,283 +1,290 @@
 #!/usr/bin/env python
 # pylint: disable=C0103,C0326,C0410,W0402
 
 """ guess the best assembly given a bigWig, bigBed, or VCF file """
 
-import logging, optparse, sys
-from collections import defaultdict
-from os.path import join, isfile, expanduser
+import logging, optparse, sys, re, glob
 import os, gzip, subprocess
+from collections import defaultdict
+from os.path import join, isfile, expanduser, basename
 
 # ==== functions =====
 def parseArgs():
     """ setup logging, parse command line arguments and options. -h shows auto-generated help page
 
     Returns a tuple (args, options): the positional arguments (input files or URLs) and the
     optparse options object. Prints the help page and exits with status 1 if there is nothing
     to do, i.e. no positional argument was given and neither --index nor --fromFile is set.
     """
     parser = optparse.OptionParser("""usage: %prog [options] inFile - given a bigBed, bigWig, or VCF file or URL,
 guess the assembly based on the chrom names and sizes. Must have bigBedInfo and
 bigWigInfo in PATH for big* files. VCF files (.vcf, .vcf.gz) are parsed directly
 from their ##contig header lines. Also requires a bigGuessDb.txt.gz, an alpha version of
 which can be downloaded at https://hgwdev.gi.ucsc.edu/~max/bigGuessDb/bigGuessDb.txt.gz
 
 Example run:
     $ wget https://hgwdev.gi.ucsc.edu/~max/bigGuessDb/bigGuessDb.txt.gz
     $ bigGuessDb --best https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1014nnn/GSM1014177/suppl/GSM1014177_mm9_wgEncodeUwDnaseNih3t3NihsMImmortalSigRep2.bigWig
     mm9
 
 """)
 
     parser.add_option("-d", "--debug", dest="debug", action="store_true",
         help="show debug messages")
     parser.add_option("", "--index", dest="index", action="store_true",
             help="used by UCSC staff: go over /hive/data/genomes and build an index of all chromSizes")
     parser.add_option("-b", "--best", dest="best", action="store_true",
         help="only print a single string, the best matching assembly, or 'emptyFile' or 'notFound'. "
         "If multiple arguments are given or --fromFile is used, a tab-sep table is output.")
     parser.add_option("-i", "--indexFile", dest="indexFname", action="store",
         help="Use specified index file, default is %default. ", default="bigGuessDb.txt.gz")
     parser.add_option("", "--fromFile", dest="fromFile", action="store",
         help="Read URLs to process from input file, can be /dev/stdin")
     (options, args) = parser.parse_args()
 
     # nothing to work on: show the help page instead of failing later
     if not args and not options.index and not options.fromFile:
         parser.print_help()
         # sys.exit() rather than the exit() builtin, which only exists when the site module is loaded
         sys.exit(1)
 
     # set the level on the root logger too, as basicConfig() does nothing if logging is already configured
     level = logging.DEBUG if options.debug else logging.INFO
     logging.basicConfig(level=level)
     logging.getLogger().setLevel(level)
 
     return args, options
 
 def parseSizes(inFname, doSubset):
     """ given a chrom.sizes file, return a list of (size, chrom) tuples sorted by ascending size.
 
     Lines starting with "#" are skipped; the first two tab-separated fields of every other
     line are taken as chrom name and size. If doSubset is true, only the 20 largest entries
     are returned, otherwise all of them. Returns None if the file has no data lines.
     """
-    logging.info("Reading %s",inFname)
     sizes = list()
+    logging.info("Reading %s",inFname)
     for line in open(inFname):
         if line.startswith("#"):
             continue
         chrom, size = line.rstrip("\n").split("\t")[:2]
         # size goes first so that the plain sort() below orders by size
         sizes.append( (int(size), chrom) )
 
     if len(sizes)==0:
         logging.error("bigBed/bigWig file is empty. Cannot guess assembly.")
         return None
 
     sizes.sort()
 
     if not doSubset:
         return sizes
 
     # the list is sorted ascending, so the last 20 entries are the 20 largest chroms
     someSizes = sizes[-20:] # small chroms carry less information and have fewer features
     return someSizes
 
 def writeSizes(allSizes, outFname):
     """ write all sizes to the index file
 
     allSizes maps db -> list of (size, chrom) tuples. Every db becomes one line of the
     gzip-compressed text file outFname, in the format: db <tab> chrom=size,chrom=size,...
     """
     # "wt" = "write" "text"; the with-block closes the file even if formatting a row fails
     with gzip.open(outFname, "wt") as ofh:
         for db, dbSizes in allSizes.items():
             sizeStr = ",".join("%s=%d" % (chrom, size) for size, chrom in dbSizes)
             ofh.write("%s\t%s\n" % (db, sizeStr))
     logging.info("Wrote %s", outFname)
 
 def indexGenarkSizes(allSizes, inDir):
     """ add one entry per *chrom.sizes.txt file found below inDir/asmHubs/GCA and
     inDir/asmHubs/GCF to allSizes and return allSizes.
 
     The key is the file name without its ".chrom.sizes.txt" suffix, the value is a
     one-element list holding only the largest (size, chrom) tuple of that file.
     """
-    import glob
-    from os.path import basename
     for dbType in ["GCA", "GCF"]:
         baseDir = join(inDir, "asmHubs", dbType)
         for (dirpath, dirnames, fnames) in os.walk(baseDir):
             for fname in fnames:
                 if not fname.endswith("chrom.sizes.txt"):
                     continue
                 fullFname = join(dirpath, fname)
-                db = fname.split("/")[-1].replace(".chrom.sizes.txt", "")
                 # e.g. /GCA/000/002/305/GCA_000002305.1/GCA_000002305.1.chrom.sizes.txt
+                if not isfile(fullFname):
+                    logging.error(fullFname+" does not exist.")
+                    continue
+
+                db = fname.split("/")[-1].replace(".chrom.sizes.txt", "")
                 # doSubset=False: get the complete list, sorted by ascending size
                 # NOTE(review): parseSizes() returns None for a file without data lines,
                 # in which case the pop() below raises AttributeError
                 sizes = parseSizes(fullFname, False)
                 # pop() takes the last element of the sorted list, i.e. the largest chrom
                 size1 = sizes.pop()
                 allSizes[db] = [size1]
     return allSizes
 
 def buildIndex(inDir, outFname):
     """ collect the chrom sizes of every database listed by the UCSC API that has a
     chrom.sizes (or <db>.sizes) file in its subdirectory of inDir, add the assemblies found
     by indexGenarkSizes() and write everything to outFname """
     # imported locally, against the style guide, so that these packages cannot cause
     # problems for users who only run the script and never build the index
     import json
     from six.moves import urllib # works in python2 and 3
 
     apiData = json.load(urllib.request.urlopen("https://api.genome.ucsc.edu/list/ucscGenomes"))
 
     # go through the databases ordered by their orderKey
     orderedDbs = sorted((info["orderKey"], db) for db, info in apiData["ucscGenomes"].items())
 
     sizesByDb = dict()
     for _orderKey, db in orderedDbs:
         dbDir = join(inDir, db)
         sizesFname = join(dbDir, "chrom.sizes")
         if not isfile(sizesFname):
             # fall back to the alternative file name
             sizesFname = join(dbDir, db+".sizes")
             if not isfile(sizesFname):
                 print("not found "+sizesFname)
                 continue
 
         # zero-byte files are left out of the index
         if os.path.getsize(sizesFname) == 0:
             continue
 
         # dbs starting with "hg" or "mm" keep all their chroms, all others only a subset
         keepAll = db.startswith("hg") or db.startswith("mm")
         sizesByDb[db] = parseSizes(sizesFname, not keepAll)
 
     sizesByDb = indexGenarkSizes(sizesByDb, inDir)
 
     writeSizes(sizesByDb, outFname)
 
 def readSizeIndex(inFname):
     """ read chrom sizes index and return as dict (chromName, size) -> list of dbs
 
     inFname is a gzip-compressed text file with one line per db in the format
     db <tab> chrom=size,chrom=size,...
     The same (chromName, size) pair can occur in several dbs, so every value is a list.
     The result is a defaultdict(list): test membership with "in" to avoid creating keys.
     """
     sizeToDbs = defaultdict(list)
     # the with-block makes sure the file is closed, also if a line cannot be parsed
     with gzip.open(inFname, "rt") as ifh:
         for line in ifh:
             db, sizeStr = line.rstrip("\n").split("\t")
             for chromSize in sizeStr.split(","):
                 # split at the last "=" only, so a chrom name containing "=" still parses
                 chrom, size = chromSize.rsplit("=", 1)
                 sizeToDbs[ (chrom, int(size)) ].append(db)
     return sizeToDbs
 
 def vcfSizes(inFname):
     """ return list of (chrom, size) from VCF ##contig header lines
 
     Files ending in .gz or .bgz are read through gzip, everything else as plain text.
     ##contig lines that lack an ID= or a numeric length= are ignored. Reading stops
     at the first line that does not start with "#".
     """
-    import re
     sizes = list()
     # NOTE(review): this check is case-sensitive while isVcf() lowercases the name, so a
     # name ending in e.g. .VCF.GZ passes isVcf() but is opened here as plain text
-    opener = gzip.open if inFname.endswith(".gz") else open
+    opener = gzip.open if inFname.endswith(".gz") or inFname.endswith(".bgz") else open
     # "rt" = read as text, so lines are str for both openers
     with opener(inFname, "rt") as fh:
         for line in fh:
             if line.startswith("##contig="):
                 # handles both <ID=chr1,length=123> and <ID=chr1,assembly=None,length=123,species=>
                 m = re.search(r'ID=([^,>]+)', line)
                 mLen = re.search(r'length=(\d+)', line)
                 if m and mLen:
                     sizes.append((m.group(1), int(mLen.group(1))))
             elif not line.startswith("#"):
                 break  # stop at first data line
     return sizes
 
 def isVcf(inFname):
     " tell from the file name extension (.vcf, .vcf.gz or .vcf.bgz, any case) whether inFname is a VCF "
     vcfExts = (".vcf", ".vcf.gz", ".vcf.bgz")
     return inFname.lower().endswith(vcfExts)
 
 def bigSizes(inFname):
     """ return a list of (chrom, size) tuples from a bigWig, bigBed, or VCF file
 
     VCF files are parsed directly by vcfSizes(). For all other files, bigBedInfo (extensions
     .bb and .bigBed, any case) or bigWigInfo (everything else) is run with -chroms and the
     chromosome list is parsed from its output. Exits with status 1 if the tool cannot be
     started. If the tool runs but fails, a warning is logged and whatever was parsed so far
     (usually an empty list) is returned.
     """
     if isVcf(inFname):
         return vcfSizes(inFname)
 
     cmdExec = "bigWigInfo"
     # split(".")[-1] never contains the dot, so compare against the bare extensions
     inExt = inFname.lower().split(".")[-1]
     if inExt in ["bb", "bigbed"]:
         cmdExec = "bigBedInfo"
 
     cmd = [cmdExec, "-chroms", inFname]
 
     try:
         if sys.version_info[0]==2:
             p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
         else:
             p = subprocess.Popen(cmd, encoding="latin1", stdout=subprocess.PIPE)
     except OSError:
         # Popen raises if the executable cannot be found or started. Its returncode is always
         # None right after construction, so it cannot be used to detect a missing tool.
         logging.error("Could not run '%s'. Are the tools bigBedInfo and bigWigInfo installed and in your PATH?", " ".join(cmd))
         sys.exit(1)
 
     sizes = list()
     # the chrom lines are those between the "chromCount: " and "basesCovered: " lines
     doParse = False
     for line in p.stdout:
         if line.startswith("chromCount: "):
             doParse = True
             continue
         if line.startswith("basesCovered: "):
             doParse = False
             continue
         if doParse:
             chrom, _idx, size = line.strip().split(" ")
             sizes.append((chrom, int(size)))
 
     # reap the child process; a failing tool is reported, but stays non-fatal so that one bad
     # input does not abort a run over many files
     p.stdout.close()
     if p.wait() != 0:
         logging.warning("'%s' exited with status %d", " ".join(cmd), p.returncode)
     return sizes
 
 def sortBySecondLen(e):
     " sort key: the negated length of the second element of tuple e, so that the longest lists come first "
     secondElem = e[1]
     return -len(secondElem)
 
 def findBestDb(sizeIndex, fileSizes):
     """ look up every (chrom, size) pair of fileSizes in the chrom size index and return a
     list of (db, list of matching chrom names) tuples, the dbs with the most matches first """
     chromsByDb = defaultdict(list)
     for chrom, size in fileSizes:
         key = (chrom, size)
         # membership test first: a plain lookup could create the key if the index is a defaultdict
         if key not in sizeIndex:
             continue
         for db in sizeIndex[key]:
             chromsByDb[db].append(chrom)
 
     # sorted() is stable, so dbs with equally many matches keep their order of first appearance
     return sorted(chromsByDb.items(), key=sortBySecondLen)
 
 def printAllMatches(inFname, dbChromMatch):
     " write one tab-separated row per matching db to stdout: file name, db, number of matching chroms, chrom names "
     for db, chroms in dbChromMatch:
         row = (inFname, db, str(len(chroms)), ",".join(chroms))
         print("\t".join(row))
 
 # ----------- main --------------
 def main():
     " entry point to script "
     args, options = parseArgs()
 
     # pick the index file
     # NOTE(review): parseArgs() gives --indexFile a non-empty default, so the first branch is
     # only reached when an empty value is passed explicitly - confirm that this is intended
+    if not options.indexFname:
+        indexFname = "/hive/data/genomes/bigGuessDb.txt.gz"
+        if not isfile(indexFname):
+            indexFname = "bigGuessDb.txt.gz"
+    else:
         indexFname = expanduser(options.indexFname)
+
     if options.index:
         # --index: (re)build the index file and stop
         buildIndex("/hive/data/genomes", indexFname)
         exit(0)
 
     # input names come from the command line or, if none were given, one per line from --fromFile
     if len(args)>0:
         inFnames = args
     else:
         inFnames = open(options.fromFile).read().splitlines()
 
     # header line: none for a single --best result, as that prints just the db name
     if options.best:
         if len(inFnames)>1:
             print("#fname\tbestDb")
     else:
         print("#fname\tdb\tmatchCount\tmatchList")
 
     for inFname in inFnames:
         fileSizes = bigSizes(inFname)
 
         if len(fileSizes) == 0:
             logging.debug("%s is empty. Cannot determine assembly." % inFname)
             # pseudo-hit in the same (db, chromList) shape that findBestDb() returns
             hits = ( ("emptyFile", ["0"]),  )
         else:
             # NOTE(review): the index is re-read for every input file with chroms
             sizeIndex = readSizeIndex(indexFname)
             hits = findBestDb(sizeIndex, fileSizes)
 
         if options.best:
             if len(hits)==0:
                 bestDb = "notFound"
             else:
                 #if (len(hits[0][1]) >= 2): # need more than a single match, as chrM often matches
                 # - deactivated for now, as we store only a single size for GenArk genomes
                 # hits are sorted by number of matches, so the first one is the best
                 bestDb = hits[0][0]
             if len(inFnames)>1:
                 print(inFname+"\t"+bestDb)
             else:
                 print(bestDb)
 
         else:
             printAllMatches(inFname, hits)
 
 # only run when executed as a script, so that importing this file has no side effects
 if __name__ == "__main__":
     main()