a1c986a99e519e259360bb99682b8b8842666bd9 max Tue Mar 3 02:40:14 2020 -0800 tiny docs change to chromToUcsc diff --git src/utils/chromToUcsc/chromToUcsc src/utils/chromToUcsc/chromToUcsc index 4a699e0..5051800 100755 --- src/utils/chromToUcsc/chromToUcsc +++ src/utils/chromToUcsc/chromToUcsc @@ -1,36 +1,41 @@ #!/usr/bin/env python import logging, optparse, gzip from sys import stdin, stdout, stderr, exit, modules +from os.path import basename + try: from urllib.request import urlopen # py2 except ImportError: from urllib2 import urlopen # py3 try: from cStringIO import StringIO # py2 except ImportError: from io import BytesIO # py3 # ==== functions ===== def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("""usage: %prog [options] filename - change NCBI or Ensembl chromosome names to UCSC names using the chromAlias table of the genome browser. Examples: %prog -g hg19 --get %prog -i test2.bed -o test2.ucsc.bed -a hg19.chromAlias.tsv -g hg19 cat test.bed | %prog -a mm10.chromAlias.tsv > test.ucsc.bed + + If you do not want to use the --get option to retrieve the mapping tables, you can also download the alias mapping + files yourself, e.g. for mm10: wget http://hgdownload.soe.ucsc.edu/goldenPath/mm10/database/chromAlias.txt.gz """) parser.add_option("", "--get", dest="doDownload", action="store_true", help="download a chrom alias table from UCSC for --genomeDb into the current directory and exit") parser.add_option("-a", "--chromAlias", dest="aliasFname", action="store", help="a UCSC chromAlias table in tab-sep format. The alias tables for hg19 or hg38 are hardcoded in the script, they do not require a chromAlias table. Use the -g option for those.") parser.add_option("-g", "--genomeDb", dest="db", action="store", help="a UCSC assembly ID, like hg19. Not required. Activates assembly-specific warning messages, only for hg19 right now.") parser.add_option("-i", "--in", dest="inFname", action="store", help="input filename, default: /dev/stdin") parser.add_option("-o", "--out", dest="outFname", action="store", help="output filename, default: /dev/stdout") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") (options, args) = parser.parse_args() if options.db is None and options.aliasFname is None: parser.print_help() exit(1) if options.debug: @@ -60,36 +65,39 @@ toUcsc = {} for line in open(fname): if line.startswith("alias"): continue row = line.rstrip("\n").split("\t") toUcsc[row[0]] = row[1] return toUcsc def chromToUcsc(db, aliasFname, ifh, ofh): " convert the first column to UCSC-style chrom names " toUcsc = parseAlias(aliasFname) ucscChroms = set(toUcsc.values()) mtSkipCount = 0 + + isHg19 = (db=="hg19" or basename(aliasFname).startswith("hg19")) + for lineNo, sep, chrom, rest in splitLines(ifh): # just pass through any UCSC chrom names if chrom in ucscChroms: ucscChrom = chrom else: - if db=="hg19" and (chrom=="MT" or chrom=="M"): + if isHg19 and (chrom=="MT" or chrom=="M"): mtSkipCount += 1 continue ucscChrom = toUcsc.get(chrom) if ucscChrom is None: logging.error("line %d: chrom name %s is not in chromAlias table" % (lineNo, repr(chrom))) exit(1) continue ofh.write(ucscChrom) ofh.write(sep) ofh.write(rest) if mtSkipCount!=0: stderr.write("%d features were skipped because they were located on the M or MT chromosome. hg19 includes an older version of the mitochondrial genome and these features cannot be mapped yet.\n" % mtSkipCount)