src/utils/bigGuessDb/bigGuessDb 80f3f48aa2295da051eebc3b19c56a46ee39788b

80f3f48aa2295da051eebc3b19c56a46ee39788b
max
  Thu Mar 26 06:43:36 2026 -0700
adding VCF support to bigGuessDb, no redmine

diff --git src/utils/bigGuessDb/bigGuessDb src/utils/bigGuessDb/bigGuessDb
index 3021c8ef039..016d0840274 100755
--- src/utils/bigGuessDb/bigGuessDb
+++ src/utils/bigGuessDb/bigGuessDb
@@ -1,32 +1,33 @@
 #!/usr/bin/env python
 # pylint: disable=C0103,C0326,C0410,W0402
 
-""" guess the best assembly given a bigWig or bigBed file """
+""" guess the best assembly given a bigWig, bigBed, or VCF file """
 
 import logging, optparse, sys
 from collections import defaultdict
 from os.path import join, isfile, expanduser
 import os, gzip, subprocess
 
 # ==== functions =====
 def parseArgs():
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
-    parser = optparse.OptionParser("""usage: %prog [options] inFile - given a bigBed or "\
-bigWig file or URL, 
+    parser = optparse.OptionParser("""usage: %prog [options] inFile - given a bigBed, "\
+bigWig, or VCF file or URL,
 guess the assembly based on the chrom names and sizes. Must have bigBedInfo and
-bigWigInfo in PATH. Also requires a bigGuessDb.txt.gz, an alpha version of
+bigWigInfo in PATH for big* files. VCF files (.vcf, .vcf.gz) are parsed directly
+from their ##contig header lines. Also requires a bigGuessDb.txt.gz, an alpha version of
 which can be downloaded at https://hgwdev.gi.ucsc.edu/~max/bigGuessDb/bigGuessDb.txt.gz
 
 Example run:
     $ wget https://hgwdev.gi.ucsc.edu/~max/bigGuessDb/bigGuessDb.txt.gz
     $ bigGuessDb --best https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1014nnn/GSM1014177/suppl/GSM1014177_mm9_wgEncodeUwDnaseNih3t3NihsMImmortalSigRep2.bigWig
     mm9
 
 """)
 
     parser.add_option("-d", "--debug", dest="debug", action="store_true", \
         help="show debug messages")
     parser.add_option("", "--index", dest="index", action="store_true", \
             help="used by UCSC staff: go over /hive/data/genomes and build an index of all chromSizes")
     parser.add_option("-b", "--best", dest="best", action="store_true", \
         help="only print a single string, the best matching assembly, or 'emptyFile' or 'notFound'. " \
@@ -140,32 +141,57 @@
 def readSizeIndex(inFname):
     " read chrom sizes index and return as dict (chromName, size) - > db "
     sizeToDbs = defaultdict(list)
     #sizeToDb = dict()
     for line in gzip.open(inFname, "rt"):
         db, sizeStr = line.rstrip("\n").split("\t")
         sizes = sizeStr.split(",")
         sizes = [x.split("=") for x in sizes]
         sizes = [(chrom, int(size)) for (chrom, size) in sizes]
         for chrom, size in sizes:
             #assert( (chrom, size) not in sizeToDb )
             #sizeToDb[ (chrom, size) ] = db
             sizeToDbs[ (chrom, size) ].append(db)
     return sizeToDbs
 
+def vcfSizes(inFname):
+    " return list of (chrom, size) from VCF ##contig header lines "
+    import re
+    sizes = list()
+    opener = gzip.open if inFname.endswith(".gz") else open
+    with opener(inFname, "rt") as fh:
+        for line in fh:
+            if line.startswith("##contig="):
+                # handles both <ID=chr1,length=123> and <ID=chr1,assembly=None,length=123,species=>
+                m = re.search(r'ID=([^,>]+)', line)
+                mLen = re.search(r'length=(\d+)', line)
+                if m and mLen:
+                    sizes.append((m.group(1), int(mLen.group(1))))
+            elif not line.startswith("#"):
+                break  # stop at first data line
+    return sizes
+
+def isVcf(inFname):
+    " return True if the filename looks like a VCF "
+    lower = inFname.lower()
+    return lower.endswith(".vcf") or lower.endswith(".vcf.gz") or lower.endswith(".vcf.bgz")
+
 def bigSizes(inFname):
-    " return chrom -> size from bigWig file "
+    " return chrom -> size from bigWig, bigBed, or VCF file "
+    if isVcf(inFname):
+        return vcfSizes(inFname)
+
     sizes = list()
 
     cmdExec = "bigWigInfo"
     inExt = inFname.lower().split(".")[-1]
     if inExt in ["bb", ".igbed"]:
         cmdExec = "bigBedInfo"
 
     cmd = [cmdExec, "-chroms", inFname]
 
     doParse = False
     if sys.version_info[0]==2:
         p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
     else:
         p = subprocess.Popen(cmd, encoding="latin1", stdout=subprocess.PIPE)