80f3f48aa2295da051eebc3b19c56a46ee39788b max Thu Mar 26 06:43:36 2026 -0700 adding VCF support to bigGuessDb, no redmine diff --git src/utils/bigGuessDb/bigGuessDb src/utils/bigGuessDb/bigGuessDb index 3021c8ef039..016d0840274 100755 --- src/utils/bigGuessDb/bigGuessDb +++ src/utils/bigGuessDb/bigGuessDb @@ -1,32 +1,33 @@ #!/usr/bin/env python # pylint: disable=C0103,C0326,C0410,W0402 -""" guess the best assembly given a bigWig or bigBed file """ +""" guess the best assembly given a bigWig, bigBed, or VCF file """ import logging, optparse, sys from collections import defaultdict from os.path import join, isfile, expanduser import os, gzip, subprocess # ==== functions ===== def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " - parser = optparse.OptionParser("""usage: %prog [options] inFile - given a bigBed or "\ -bigWig file or URL, + parser = optparse.OptionParser("""usage: %prog [options] inFile - given a bigBed, "\ +bigWig, or VCF file or URL, guess the assembly based on the chrom names and sizes. Must have bigBedInfo and -bigWigInfo in PATH. Also requires a bigGuessDb.txt.gz, an alpha version of +bigWigInfo in PATH for big* files. VCF files (.vcf, .vcf.gz) are parsed directly +from their ##contig header lines. Also requires a bigGuessDb.txt.gz, an alpha version of which can be downloaded at https://hgwdev.gi.ucsc.edu/~max/bigGuessDb/bigGuessDb.txt.gz Example run: $ wget https://hgwdev.gi.ucsc.edu/~max/bigGuessDb/bigGuessDb.txt.gz $ bigGuessDb --best https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1014nnn/GSM1014177/suppl/GSM1014177_mm9_wgEncodeUwDnaseNih3t3NihsMImmortalSigRep2.bigWig mm9 """) parser.add_option("-d", "--debug", dest="debug", action="store_true", \ help="show debug messages") parser.add_option("", "--index", dest="index", action="store_true", \ help="used by UCSC staff: go over /hive/data/genomes and build an index of all chromSizes") parser.add_option("-b", "--best", dest="best", action="store_true", \ help="only print a single string, the best matching assembly, or 'emptyFile' or 'notFound'. " \ @@ -140,32 +141,57 @@ def readSizeIndex(inFname): " read chrom sizes index and return as dict (chromName, size) - > db " sizeToDbs = defaultdict(list) #sizeToDb = dict() for line in gzip.open(inFname, "rt"): db, sizeStr = line.rstrip("\n").split("\t") sizes = sizeStr.split(",") sizes = [x.split("=") for x in sizes] sizes = [(chrom, int(size)) for (chrom, size) in sizes] for chrom, size in sizes: #assert( (chrom, size) not in sizeToDb ) #sizeToDb[ (chrom, size) ] = db sizeToDbs[ (chrom, size) ].append(db) return sizeToDbs +def vcfSizes(inFname): + " return list of (chrom, size) from VCF ##contig header lines " + import re + sizes = list() + opener = gzip.open if inFname.endswith(".gz") else open + with opener(inFname, "rt") as fh: + for line in fh: + if line.startswith("##contig="): + # handles both <ID=chr1,length=123> and <ID=chr1,assembly=None,length=123,species=> + m = re.search(r'ID=([^,>]+)', line) + mLen = re.search(r'length=(\d+)', line) + if m and mLen: + sizes.append((m.group(1), int(mLen.group(1)))) + elif not line.startswith("#"): + break # stop at first data line + return sizes + +def isVcf(inFname): + " return True if the filename looks like a VCF " + lower = inFname.lower() + return lower.endswith(".vcf") or lower.endswith(".vcf.gz") or lower.endswith(".vcf.bgz") + def bigSizes(inFname): - " return chrom -> size from bigWig file " + " return chrom -> size from bigWig, bigBed, or VCF file " + if isVcf(inFname): + return vcfSizes(inFname) + sizes = list() cmdExec = "bigWigInfo" inExt = inFname.lower().split(".")[-1] if inExt in ["bb", ".igbed"]: cmdExec = "bigBedInfo" cmd = [cmdExec, "-chroms", inFname] doParse = False if sys.version_info[0]==2: p = subprocess.Popen(cmd, stdout=subprocess.PIPE) else: p = subprocess.Popen(cmd, encoding="latin1", stdout=subprocess.PIPE)