8cbd73320598db4c37c75666e518622f82fbc326 markd Sun Jun 6 20:53:15 2021 -0700 added option for chromToUcsc to skip unmapped chromosomes rather than error diff --git src/utils/chromToUcsc/chromToUcsc src/utils/chromToUcsc/chromToUcsc index dc15eed..2e8cdf1 100755 --- src/utils/chromToUcsc/chromToUcsc +++ src/utils/chromToUcsc/chromToUcsc @@ -30,30 +30,31 @@ Then the script can be run like this: %prog -i in.bed -o out.bed -a hg19.chromAlias.tsv %prog -i in.bed -o out.bed -a https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/chromAlias.txt.gz Or in pipes, like this: cat test.bed | %prog -a mm10.chromAlias.tsv > test.ucsc.bed For BAM files use in a pipe with samtools: samtools view -h in.bam | ./chromToUcsc -a mm10.chromAlias.tsv | samtools -bS > out.bam """) parser.add_option("", "--get", dest="downloadDb", action="store", help="download a chrom alias table from UCSC for the genomeDb into the current directory and exit") parser.add_option("-a", "--chromAlias", dest="aliasFname", action="store", help="a UCSC chromAlias file in tab-sep format or the http/https URL to one") parser.add_option("-i", "--in", dest="inFname", action="store", help="input filename, default: /dev/stdin") parser.add_option("-o", "--out", dest="outFname", action="store", help="output filename, default: /dev/stdout") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") + parser.add_option("-s", "--skipUnknown", dest="skipUnknown", action="store_true", help="skip unknown sequence rather than generate an error.") parser.add_option("-k", "--field", dest="fieldNo", action="store", type="int", \ help="index of field to convert, default is %default (first field is 1). This works for BED, bedGraph, GTF, wiggle, VCF. For genePred, set this option to 2, for PSL use 10 (target) or 14 (query), for Chain use 2 (target) or 7 (query). If any line starts with @ (SAM format), this field is automatically set to 2.", default=1) (options, args) = parser.parse_args() if options.downloadDb is None and options.aliasFname is None: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) return args, options @@ -92,76 +93,90 @@ if fname.endswith(".gz"): data = gzip.GzipFile(fileobj=ifh).read().decode() ifh = data.splitlines() elif fname.endswith(".gz"): ifh = gzip.open(fname, "rt") else: ifh = open(fname) for line in ifh: if line.startswith("alias"): continue row = line.rstrip("\n").split("\t") toUcsc[row[0]] = row[1] return toUcsc -def chromToUcsc(aliasFname, fieldIdx, ifh, ofh): + +def handledUnmappedChrom(chrom, skipUnknown, skipWarned, message): + "either generate an error or warning when an unknown chromosome is encountered." + if skipUnknown: + if chrom not in skipWarned: + logging.warning(message) + skipWarned.add(chrom) + else: + logging.error(message) + exit(1) + +def chromToUcsc(aliasFname, fieldIdx, skipUnknown, ifh, ofh): " convert column number fieldIdx to UCSC-style chrom names " toUcsc = parseAlias(aliasFname) + skipWarned = set() ucscChroms = set(toUcsc.values()) isSam = False for lineNo, sep, row, line in splitLines(ifh): if line.startswith("#"): ofh.write(line) # comments (e.g. gtf) are just passed through ofh.write("\n") continue elif len(row)<=2: pass # fixedStep or variableStep value lines are passed through elif row[0]=="fixedStep" or row[0]=="variableStep": # fixedStep chrom=NC_033660.1 start=15778865 step=1 key, chrom = row[1].split("=") assert(key=="chrom") # we assume that the first field has the chrom, that saves a lot of time ucscChrom = toUcsc.get(chrom) if ucscChrom is None: logging.error("line %d: chrom name %s is not in chromAlias table" % (lineNo, repr(chrom))) exit(1) row[1] = "chrom="+ucscChrom elif row[0][0]== "@": fieldIdx = 2 # SAM format detected isSam = True if row[0]=="@SQ": # @SQ SN:1 LN:195471971 chrom = row[1].split(":")[1] if chrom not in ucscChroms: ucscChrom = toUcsc.get(chrom) if ucscChrom is None: - logging.error("SAM format, header line %d: chrom name %s is not in chromAlias table" % (lineNo, repr(chrom))) - exit(1) + handledUnmappedChrom(chrom, skipUnknown, skipWarned, + "SAM format, header line %d: chrom name %s is not in chromAlias table" % (lineNo, repr(chrom))) + continue row[1] = "SN:%s" % ucscChrom else: # just pass through any UCSC chrom names chrom = row[fieldIdx] if row[0].startswith("#") or chrom in ucscChroms or chrom=="*": ucscChrom = chrom else: ucscChrom = toUcsc.get(chrom) if ucscChrom is None: - logging.error("line %d: chrom name %s is not in chromAlias table" % (lineNo, repr(chrom))) - exit(1) + handledUnmappedChrom(chrom, skipUnknown, skipWarned, + "line %d: chrom name %s is not in chromAlias table" % (lineNo, repr(chrom))) + continue if isSam: mateChrom = row[6] if mateChrom!="=": row[6] = toUcsc[mateChrom] row[fieldIdx] = ucscChrom line = sep.join(row) ofh.write(line) ofh.write("\n") def download(db): url = "http://hgdownload.soe.ucsc.edu/goldenPath/%s/database/chromAlias.txt.gz" % db gzData = urlopen(url).read() @@ -171,41 +186,42 @@ data = BytesIO(gzData) data = gzip.GzipFile(fileobj=data).read().decode() outFname = db+".chromAlias.tsv" open(outFname, "w").write(data) print("Wrote %s to %s" % (url, outFname)) print("You can now convert a file with 'chromToUcsc -a %s -i infile.bed -o outfile.bed'" % outFname) exit(0) def main(): args, options = parseArgs() aliasFname = options.aliasFname inFname = options.inFname outFname = options.outFname + skipUnknown = options.skipUnknown if options.downloadDb: download(options.downloadDb) if aliasFname is None: logging.error("You need to provide an alias table with the -a option or use --get to download one.") exit(1) if inFname is None: ifh = stdin elif inFname.endswith(".gz"): ifh = gzip.open(inFname, "rt") else: ifh = open(inFname) if outFname is None: ofh = stdout elif outFname.endswith(".gz"): ofh = gzip.open(outFname, "wt") else: ofh = open(outFname, "w") fieldIdx = options.fieldNo-1 - chromToUcsc(aliasFname, fieldIdx, ifh, ofh) + chromToUcsc(aliasFname, fieldIdx, skipUnknown, ifh, ofh) main()