c5ade4059d7dd62cb99f0801773f0623c050fabb max Tue Feb 9 07:02:15 2021 -0800 adding GTF support for chromToUcsc, refs #26959 diff --git src/utils/chromToUcsc/chromToUcsc src/utils/chromToUcsc/chromToUcsc index 4c4bb8d..dc15eed 100755 --- src/utils/chromToUcsc/chromToUcsc +++ src/utils/chromToUcsc/chromToUcsc @@ -6,91 +6,92 @@ try: from urllib.request import urlopen # py2 except ImportError: from urllib2 import urlopen # py3 try: from cStringIO import StringIO # py2 except ImportError: from io import BytesIO # py3 # ==== functions ===== def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("""usage: %prog [options] filename - change NCBI or Ensembl chromosome names to UCSC names in tabular or wiggle files, using a chromAlias table. Supports these UCSC file formats: - BED, genePred, PSL, wiggle (all formats), bedGraph, VCF, SAM + BED, genePred, PSL, wiggle (all formats), bedGraph, VCF, SAM, GTF, Chain ... or any other csv or tsv format where the sequence (chromosome) name is a separate field. Requires a .chromAlias.tsv file which can be downloaded like this: %prog --get hg19 # download the file hg19.chromAlias.tsv into current directory If you do not want to use the --get option to retrieve the mapping tables, you can also download the alias mapping files yourself, e.g. for mm10 with 'wget https://hgdownload.soe.ucsc.edu/goldenPath/mm10/database/chromAlias.txt.gz' - For NCBI RefSeq assemblies that are available as UCSC track hubs, the chromAlias files can be obtained like this: - 'wget https://hgdownload.soe.ucsc.edu/hubs/GCF/000/001/405/GCF_000001405.39/GCF_000001405.39.chromAlias.txt' - Then the script can be run like this: %prog -i in.bed -o out.bed -a hg19.chromAlias.tsv %prog -i in.bed -o out.bed -a https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/chromAlias.txt.gz Or in pipes, like this: cat test.bed | %prog -a mm10.chromAlias.tsv > test.ucsc.bed For BAM files use in a pipe with samtools: samtools view -h in.bam | ./chromToUcsc -a mm10.chromAlias.tsv | samtools -bS > out.bam """) parser.add_option("", "--get", dest="downloadDb", action="store", help="download a chrom alias table from UCSC for the genomeDb into the current directory and exit") parser.add_option("-a", "--chromAlias", dest="aliasFname", action="store", help="a UCSC chromAlias file in tab-sep format or the http/https URL to one") parser.add_option("-i", "--in", dest="inFname", action="store", help="input filename, default: /dev/stdin") parser.add_option("-o", "--out", dest="outFname", action="store", help="output filename, default: /dev/stdout") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") parser.add_option("-k", "--field", dest="fieldNo", action="store", type="int", \ - help="index of field to convert, default is %default (first field is 1), for most formats, e.g. BED. For genePred, set this option to 2, for PSL it is 10 or 14. If any line starts with @ (SAM format), this field is automatically set to 2.", default=1) + help="index of field to convert, default is %default (first field is 1). This works for BED, bedGraph, GTF, wiggle, VCF. For genePred, set this option to 2, for PSL use 10 (target) or 14 (query), for Chain use 2 (target) or 7 (query). If any line starts with @ (SAM format), this field is automatically set to 2.", default=1) (options, args) = parser.parse_args() if options.downloadDb is None and options.aliasFname is None: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) return args, options # ----------- main -------------- def splitLines(ifh): " yield (chromName, restOfLine) for all lines of ifh " sep = -1 - #if (sys.version_info > (3, 0)): lineNo = 0 for line in ifh: + lineNo += 1 + if line.startswith("#"): + yield lineNo, None, None, line.rstrip("\n\r") + else: if line.startswith("fixedStep") or line.startswith("variableStep"): row = line.split() yield lineNo, " ", row, line # *step lines are always space-separated else: if sep==-1: # why test first for space? Because after fixedStep, the lines are just single numbers - if " " in line: + if "\t" in line: + sep = "\t" + elif " " in line: sep = None # = split on any whitespace, consec. whitespc counts as one else: sep = "\t" # default is to split on tab - row = line.rstrip("\n\t").split(sep) - lineNo += 1 + row = line.rstrip("\n\r").split(sep) if sep is None: sep = " " yield lineNo, sep, row, line def parseAlias(fname): " parse tsv file with at least two columns, orig chrom name and new chrom name " toUcsc = {} if fname.startswith("http://") or fname.startswith("https://"): ifh = urlopen(fname) if fname.endswith(".gz"): data = gzip.GzipFile(fileobj=ifh).read().decode() ifh = data.splitlines() elif fname.endswith(".gz"): ifh = gzip.open(fname, "rt") else: @@ -99,32 +100,36 @@ for line in ifh: if line.startswith("alias"): continue row = line.rstrip("\n").split("\t") toUcsc[row[0]] = row[1] return toUcsc def chromToUcsc(aliasFname, fieldIdx, ifh, ofh): " convert column number fieldIdx to UCSC-style chrom names " toUcsc = parseAlias(aliasFname) ucscChroms = set(toUcsc.values()) isSam = False for lineNo, sep, row, line in splitLines(ifh): - if len(row)<=2: - pass # fixedStep or variableStep lines are just passed through + if line.startswith("#"): + ofh.write(line) # comments (e.g. gtf) are just passed through + ofh.write("\n") + continue + elif len(row)<=2: + pass # fixedStep or variableStep value lines are passed through elif row[0]=="fixedStep" or row[0]=="variableStep": # fixedStep chrom=NC_033660.1 start=15778865 step=1 key, chrom = row[1].split("=") assert(key=="chrom") # we assume that the first field has the chrom, that saves a lot of time ucscChrom = toUcsc.get(chrom) if ucscChrom is None: logging.error("line %d: chrom name %s is not in chromAlias table" % (lineNo, repr(chrom))) exit(1) row[1] = "chrom="+ucscChrom elif row[0][0]== "@": fieldIdx = 2 # SAM format detected isSam = True if row[0]=="@SQ": # @SQ SN:1 LN:195471971