dc1e0e76dbe49861bd0ebe8db64e27f587737794 max Mon Mar 30 15:40:03 2026 -0700 adding two more phased variants tracks, refs #37306 diff --git src/utils/bigGuessDb/bigGuessDb src/utils/bigGuessDb/bigGuessDb index 016d0840274..87fa6542a97 100755 --- src/utils/bigGuessDb/bigGuessDb +++ src/utils/bigGuessDb/bigGuessDb @@ -1,24 +1,24 @@ #!/usr/bin/env python # pylint: disable=C0103,C0326,C0410,W0402 """ guess the best assembly given a bigWig, bigBed, or VCF file """ -import logging, optparse, sys -from collections import defaultdict -from os.path import join, isfile, expanduser +import logging, optparse, sys, re, glob import os, gzip, subprocess +from collections import defaultdict +from os.path import join, isfile, expanduser, basename # ==== functions ===== def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("""usage: %prog [options] inFile - given a bigBed, "\ bigWig, or VCF file or URL, guess the assembly based on the chrom names and sizes. Must have bigBedInfo and bigWigInfo in PATH for big* files. VCF files (.vcf, .vcf.gz) are parsed directly from their ##contig header lines. Also requires a bigGuessDb.txt.gz, an alpha version of which can be downloaded at https://hgwdev.gi.ucsc.edu/~max/bigGuessDb/bigGuessDb.txt.gz Example run: $ wget https://hgwdev.gi.ucsc.edu/~max/bigGuessDb/bigGuessDb.txt.gz $ bigGuessDb --best https://ftp.ncbi.nlm.nih.gov/geo/samples/GSM1014nnn/GSM1014177/suppl/GSM1014177_mm9_wgEncodeUwDnaseNih3t3NihsMImmortalSigRep2.bigWig mm9 @@ -41,72 +41,74 @@ if args==[] and not options.index and not options.fromFile: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) logging.getLogger().setLevel(logging.DEBUG) else: logging.basicConfig(level=logging.INFO) logging.getLogger().setLevel(logging.INFO) return args, options def parseSizes(inFname, doSubset): " given a chrom.sizes file, return the 10 longest and 10 shortest chrom names " - logging.info("Reading %s",inFname) sizes = list() + logging.info("Reading %s",inFname) for line in open(inFname): if line.startswith("#"): continue chrom, size = line.rstrip("\n").split("\t")[:2] sizes.append( (int(size), chrom) ) if len(sizes)==0: logging.error("bigBed/bigWig file is empty. Cannot guess assembly.") return None sizes.sort() if not doSubset: return sizes someSizes = sizes[-20:] # small chroms carry less information and have fewer features return someSizes def writeSizes(allSizes, outFname): " write all sizes to the index file " ofh = gzip.open(outFname, "wt") # "write" "text" for db, dbSizes in allSizes.items(): sizeParts = ["%s=%d" % (chrom, size) for size,chrom in dbSizes] sizeStr = ",".join(sizeParts) ofh.write("%s\t%s\n" % (db, sizeStr)) ofh.close() logging.info("Wrote %s", outFname) def indexGenarkSizes(allSizes, inDir): - import glob - from os.path import basename for dbType in ["GCA", "GCF"]: baseDir = join(inDir, "asmHubs", dbType) for (dirpath, dirnames, fnames) in os.walk(baseDir): for fname in fnames: if not fname.endswith("chrom.sizes.txt"): continue fullFname = join(dirpath, fname) - db = fname.split("/")[-1].replace(".chrom.sizes.txt", "") # e.g. /GCA/000/002/305/GCA_000002305.1/GCA_000002305.1.chrom.sizes.txt + if not isfile(fullFname): + logging.error(fullFname+" does not exist.") + continue + + db = fname.split("/")[-1].replace(".chrom.sizes.txt", "") sizes = parseSizes(fullFname, False) size1 = sizes.pop() allSizes[db] = [size1] return allSizes def buildIndex(inDir, outFname): """ go over all direct subdirectories of inDir and find a chrom.sizes file, compact it to format db -> list of (chrom,size) and write to outFname """ allSizes = dict() import json # this is not style guide conform, but makes sure that these packages don't lead to problems for users of this script from six.moves import urllib # works in python2 and 3 apiData = json.load(urllib.request.urlopen("https://api.genome.ucsc.edu/list/ucscGenomes")) @@ -143,33 +145,32 @@ sizeToDbs = defaultdict(list) #sizeToDb = dict() for line in gzip.open(inFname, "rt"): db, sizeStr = line.rstrip("\n").split("\t") sizes = sizeStr.split(",") sizes = [x.split("=") for x in sizes] sizes = [(chrom, int(size)) for (chrom, size) in sizes] for chrom, size in sizes: #assert( (chrom, size) not in sizeToDb ) #sizeToDb[ (chrom, size) ] = db sizeToDbs[ (chrom, size) ].append(db) return sizeToDbs def vcfSizes(inFname): " return list of (chrom, size) from VCF ##contig header lines " - import re sizes = list() - opener = gzip.open if inFname.endswith(".gz") else open + opener = gzip.open if inFname.endswith(".gz") or inFname.endswith(".bgz") else open with opener(inFname, "rt") as fh: for line in fh: if line.startswith("##contig="): # handles both and m = re.search(r'ID=([^,>]+)', line) mLen = re.search(r'length=(\d+)', line) if m and mLen: sizes.append((m.group(1), int(mLen.group(1)))) elif not line.startswith("#"): break # stop at first data line return sizes def isVcf(inFname): " return True if the filename looks like a VCF " lower = inFname.lower() @@ -227,31 +228,37 @@ dbMatches = list(dbChromMatch.items()) # dbMatches is now a list of db -> list of chromosomes dbMatches.sort(key=sortBySecondLen) return dbMatches def printAllMatches(inFname, dbChromMatch): " print all matching dbs as a tsv " for db, chromList in dbChromMatch: print("\t".join([inFname, db, str(len(chromList)), ",".join(chromList)])) # ----------- main -------------- def main(): " entry point to script " args, options = parseArgs() + if not options.indexFname: + indexFname = "/hive/data/genomes/bigGuessDb.txt.gz" + if not isfile(indexFname): + indexFname = "bigGuessDb.txt.gz" + else: indexFname = expanduser(options.indexFname) + if options.index: buildIndex("/hive/data/genomes", indexFname) exit(0) if len(args)>0: inFnames = args else: inFnames = open(options.fromFile).read().splitlines() if options.best: if len(inFnames)>1: print("#fname\tbestDb") else: print("#fname\tdb\tmatchCount\tmatchList")