a3f24a9e80f42401d17724d6b88e6a9df1c24f24 max Tue Feb 2 06:39:57 2021 -0800 changes after code review, refs #26903 diff --git src/utils/chromToUcsc/chromToUcsc src/utils/chromToUcsc/chromToUcsc index 6e17808..4c4bb8d 100755 --- src/utils/chromToUcsc/chromToUcsc +++ src/utils/chromToUcsc/chromToUcsc @@ -1,203 +1,206 @@ #!/usr/bin/env python import logging, optparse, gzip from sys import stdin, stdout, stderr, exit, modules from os.path import basename try: from urllib.request import urlopen # py2 except ImportError: from urllib2 import urlopen # py3 try: from cStringIO import StringIO # py2 except ImportError: from io import BytesIO # py3 # ==== functions ===== def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("""usage: %prog [options] filename - change NCBI or Ensembl chromosome names to UCSC names in tabular or wiggle files, using a chromAlias table. Supports these UCSC file formats: BED, genePred, PSL, wiggle (all formats), bedGraph, VCF, SAM ... or any other csv or tsv format where the sequence (chromosome) name is a separate field. Requires a <genome>.chromAlias.tsv file which can be downloaded like this: %prog --get hg19 # download the file hg19.chromAlias.tsv into current directory If you do not want to use the --get option to retrieve the mapping tables, you can also download the alias mapping files yourself, e.g. for mm10 with 'wget https://hgdownload.soe.ucsc.edu/goldenPath/mm10/database/chromAlias.txt.gz' + For NCBI RefSeq assemblies that are available as UCSC track hubs, the chromAlias files can be obtained like this: + 'wget https://hgdownload.soe.ucsc.edu/hubs/GCF/000/001/405/GCF_000001405.39/GCF_000001405.39.chromAlias.txt' + Then the script can be run like this: %prog -i in.bed -o out.bed -a hg19.chromAlias.tsv %prog -i in.bed -o out.bed -a https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/chromAlias.txt.gz Or in pipes, like this: cat test.bed | %prog -a mm10.chromAlias.tsv > test.ucsc.bed For BAM files use in a pipe with samtools: samtools view -h in.bam | ./chromToUcsc -a mm10.chromAlias.tsv | samtools -bS > out.bam """) parser.add_option("", "--get", dest="downloadDb", action="store", help="download a chrom alias table from UCSC for the genomeDb into the current directory and exit") parser.add_option("-a", "--chromAlias", dest="aliasFname", action="store", help="a UCSC chromAlias file in tab-sep format or the http/https URL to one") parser.add_option("-i", "--in", dest="inFname", action="store", help="input filename, default: /dev/stdin") parser.add_option("-o", "--out", dest="outFname", action="store", help="output filename, default: /dev/stdout") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") parser.add_option("-k", "--field", dest="fieldNo", action="store", type="int", \ help="index of field to convert, default is %default (first field is 1), for most formats, e.g. BED. For genePred, set this option to 2, for PSL it is 10 or 14. If any line starts with @ (SAM format), this field is automatically set to 2.", default=1) (options, args) = parser.parse_args() if options.downloadDb is None and options.aliasFname is None: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) return args, options # ----------- main -------------- def splitLines(ifh): " yield (chromName, restOfLine) for all lines of ifh " sep = -1 #if (sys.version_info > (3, 0)): lineNo = 0 for line in ifh: if line.startswith("fixedStep") or line.startswith("variableStep"): row = line.split() yield lineNo, " ", row, line # *step lines are always space-separated else: if sep==-1: # why test first for space? Because after fixedStep, the lines are just single numbers if " " in line: sep = None # = split on any whitespace, consec. whitespc counts as one else: sep = "\t" # default is to split on tab row = line.rstrip("\n\t").split(sep) lineNo += 1 if sep is None: sep = " " yield lineNo, sep, row, line def parseAlias(fname): " parse tsv file with at least two columns, orig chrom name and new chrom name " toUcsc = {} if fname.startswith("http://") or fname.startswith("https://"): ifh = urlopen(fname) if fname.endswith(".gz"): data = gzip.GzipFile(fileobj=ifh).read().decode() ifh = data.splitlines() elif fname.endswith(".gz"): ifh = gzip.open(fname, "rt") else: ifh = open(fname) for line in ifh: if line.startswith("alias"): continue row = line.rstrip("\n").split("\t") toUcsc[row[0]] = row[1] return toUcsc def chromToUcsc(aliasFname, fieldIdx, ifh, ofh): " convert column number fieldIdx to UCSC-style chrom names " toUcsc = parseAlias(aliasFname) ucscChroms = set(toUcsc.values()) isSam = False for lineNo, sep, row, line in splitLines(ifh): if len(row)<=2: pass # fixedStep or variableStep lines are just passed through elif row[0]=="fixedStep" or row[0]=="variableStep": # fixedStep chrom=NC_033660.1 start=15778865 step=1 key, chrom = row[1].split("=") assert(key=="chrom") # we assume that the first field has the chrom, that saves a lot of time ucscChrom = toUcsc.get(chrom) if ucscChrom is None: logging.error("line %d: chrom name %s is not in chromAlias table" % (lineNo, repr(chrom))) exit(1) row[1] = "chrom="+ucscChrom elif row[0][0]== "@": fieldIdx = 2 # SAM format detected isSam = True if row[0]=="@SQ": # @SQ SN:1 LN:195471971 chrom = row[1].split(":")[1] if chrom not in ucscChroms: ucscChrom = toUcsc.get(chrom) if ucscChrom is None: logging.error("SAM format, header line %d: chrom name %s is not in chromAlias table" % (lineNo, repr(chrom))) exit(1) row[1] = "SN:%s" % ucscChrom else: # just pass through any UCSC chrom names chrom = row[fieldIdx] if row[0].startswith("#") or chrom in ucscChroms or chrom=="*": ucscChrom = chrom else: ucscChrom = toUcsc.get(chrom) if ucscChrom is None: logging.error("line %d: chrom name %s is not in chromAlias table" % (lineNo, repr(chrom))) exit(1) if isSam: mateChrom = row[6] if mateChrom!="=": row[6] = toUcsc[mateChrom] row[fieldIdx] = ucscChrom line = sep.join(row) ofh.write(line) ofh.write("\n") def download(db): url = "http://hgdownload.soe.ucsc.edu/goldenPath/%s/database/chromAlias.txt.gz" % db gzData = urlopen(url).read() if 'cStringIO' in modules: data = StringIO(gzData) else: data = BytesIO(gzData) data = gzip.GzipFile(fileobj=data).read().decode() outFname = db+".chromAlias.tsv" open(outFname, "w").write(data) print("Wrote %s to %s" % (url, outFname)) print("You can now convert a file with 'chromToUcsc -a %s -i infile.bed -o outfile.bed'" % outFname) exit(0) def main(): args, options = parseArgs() aliasFname = options.aliasFname inFname = options.inFname outFname = options.outFname if options.downloadDb: download(options.downloadDb) if aliasFname is None: logging.error("You need to provide an alias table with the -a option or use --get to download one.") exit(1) if inFname is None: ifh = stdin elif inFname.endswith(".gz"): ifh = gzip.open(inFname, "rt") else: ifh = open(inFname) if outFname is None: ofh = stdout elif outFname.endswith(".gz"): ofh = gzip.open(outFname, "wt") else: ofh = open(outFname, "w") fieldIdx = options.fieldNo-1 chromToUcsc(aliasFname, fieldIdx, ifh, ofh) main()