src/utils/chromToUcsc/chromToUcsc 56e876a710840529715cd8a5496f46e3e9d372cb

56e876a710840529715cd8a5496f46e3e9d372cb
max
  Fri Aug 21 03:57:10 2020 -0700
fixing general usage problems with chromToUcsc found by Matt, refs

diff --git src/utils/chromToUcsc/chromToUcsc src/utils/chromToUcsc/chromToUcsc
index c0fe74d..8d95d5f 100755
--- src/utils/chromToUcsc/chromToUcsc
+++ src/utils/chromToUcsc/chromToUcsc
@@ -1,147 +1,165 @@
 #!/usr/bin/env python
 import logging, optparse, gzip
 from sys import stdin, stdout, stderr, exit, modules
 from os.path import basename
 
 try:
     from urllib.request import urlopen # py3
 except ImportError:
     from urllib2 import urlopen # py2
 try:
     from cStringIO import StringIO # py2
 except ImportError:
     from io import BytesIO # py3
 
 # ==== functions =====
 def parseArgs(argv=None):
     """Set up logging and parse the command line; -h shows the auto-generated help page.

     argv: optional list of argument strings to parse instead of sys.argv[1:]
           (mainly useful for tests). The default, None, parses the real command line.

     Returns (args, options). options.doDownload holds the assembly name given
     to --get (or None), options.aliasFname the alias file/URL, options.fieldNo
     the 1-based column to convert.

     Exits with status 1 after printing the help page when neither --get nor -a
     was given, and with status 2 (optparse error) when -k is smaller than 1.
     """
     parser = optparse.OptionParser("""usage: %prog [options] filename - change NCBI or Ensembl chromosome names to UCSC names using the chromAlias table of the genome browser.

     Requires a <genome>.chromAlias.tsv file which can be downloaded like this:
         %prog --get hg19              # download the file hg19.chromAlias.tsv into current directory

     If you do not want to use the --get option to retrieve the mapping tables, you can also download the alias mapping
     files yourself, e.g. for mm10 with 'wget https://hgdownload.soe.ucsc.edu/goldenPath/mm10/database/chromAlias.txt.gz'

     Then the script can be run like this:
         %prog -i in.bed -o out.bed -a hg19.chromAlias.tsv
         %prog -i in.bed -o out.bed -a https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/chromAlias.txt.gz
     Or in pipes, like this:
         cat test.bed | %prog -a mm10.chromAlias.tsv > test.ucsc.bed

     """)

     parser.add_option("", "--get", dest="doDownload", action="store", help="download a chrom alias table from UCSC for the genomeDb into the current directory and exit")
     parser.add_option("-a", "--chromAlias", dest="aliasFname", action="store", help="a UCSC chromAlias file in tab-sep format. or a URL to one")
     parser.add_option("-i", "--in", dest="inFname", action="store", help="input filename, default: /dev/stdin")
     parser.add_option("-o", "--out", dest="outFname", action="store", help="output filename, default: /dev/stdout")
     parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
     parser.add_option("-k", "--field", dest="fieldNo", action="store", type="int",
             help="index of field to convert, default is %default (1-based), for most formats, e.g. BED. For genePred, the chromosome is field 2, for PSL it is 10 or 14.", default=1)

     (options, args) = parser.parse_args(argv)

     # --get takes the assembly name as its own argument now and optparse
     # already rejects a bare "--get", so no separate -g/options.db check is
     # needed (the -g option no longer exists).
     if options.doDownload is None and options.aliasFname is None:
         parser.print_help()
         exit(1)

     # the field number is 1-based; 0 or negative values would silently
     # address columns from the end of the row after the -1 conversion
     if options.fieldNo < 1:
         parser.error("the field index given with -k/--field is 1-based and must be at least 1")

     if options.debug:
         logging.basicConfig(level=logging.DEBUG)
     else:
         logging.basicConfig(level=logging.INFO)
     return args, options
 
 # ----------- main --------------
 def splitLines(ifh):
     """Yield (lineNo, sep, row) for every line of ifh.

     lineNo is the 1-based line number and row the list of fields of the line.
     sep is the field separator, detected once from the first line: a tab if
     the first line contains one, otherwise None, which means "split on any
     whitespace, consecutive whitespace counts as one" (str.split semantics).

     Only the trailing newline is removed before splitting. Trailing tabs are
     kept, so empty columns at the end of a line are not lost.
     """
     sep = -1 # sentinel for "not detected yet", as None is a valid separator
     for lineNo, line in enumerate(ifh, 1):
         if sep == -1:
             sep = "\t" if "\t" in line else None
         yield lineNo, sep, line.rstrip("\n").split(sep)
 
 def parseAlias(fname):
     """Parse a chromAlias table and return a dict: original chrom name -> new chrom name.

     fname is a local file name or a http:// or https:// URL. The table is
     tab-separated with at least two columns, the original name and the new
     name. Names ending with ".gz" are decompressed first. Empty lines and
     lines starting with "alias" (the header) are skipped.
     """
     import io # function-level import: BytesIO is only imported at file level on Python 3

     # Always read raw bytes and decode once. gzip.open() and urlopen() return
     # bytes on Python 3, so comparing their lines against str would fail.
     # Alias tables are small, so holding one in memory is not a problem.
     if fname.startswith(("http://", "https://")):
         resp = urlopen(fname)
         try:
             rawData = resp.read()
         finally:
             resp.close()
     else:
         with open(fname, "rb") as ifh:
             rawData = ifh.read()

     if fname.endswith(".gz"):
         rawData = gzip.GzipFile(fileobj=io.BytesIO(rawData)).read()

     toUcsc = {}
     for line in rawData.decode("utf8").splitlines():
         if not line or line.startswith("alias"):
             continue
         row = line.split("\t")
         toUcsc[row[0]] = row[1]
     return toUcsc
 
def chromToUcsc(aliasFname, fieldIdx, ifh, ofh):
    """Copy ifh to ofh, converting the chrom name in one column to its UCSC name.

    aliasFname: chromAlias table (file name or URL), read with parseAlias()
    fieldIdx:   0-based index of the column that holds the chrom name
    ifh, ofh:   input and output file objects

    Names that already are UCSC names are passed through unchanged. The
    script exits with status 1 on the first line whose chrom name is not in
    the alias table or that does not have the requested column.
    """
    toUcsc = parseAlias(aliasFname)
    ucscChroms = set(toUcsc.values())

    for lineNo, sep, row in splitLines(ifh):
        try:
            chrom = row[fieldIdx]
        except IndexError:
            logging.error("line %d: cannot convert field %d, the line has only %d fields" % (lineNo, fieldIdx+1, len(row)))
            exit(1)

        # just pass through any UCSC chrom names
        if chrom not in ucscChroms:
            ucscChrom = toUcsc.get(chrom)
            if ucscChrom is None:
                logging.error("line %d: chrom name %s is not in chromAlias table" % (lineNo, repr(chrom)))
                exit(1)
            row[fieldIdx] = ucscChrom

        # splitLines() reports sep=None for whitespace-separated input; None
        # cannot be used to join the fields again, so use a single space then
        outSep = " " if sep is None else sep
        ofh.write(outSep.join(row))
        ofh.write("\n")
 
 def download(db):
     """Download the chromAlias table of assembly db from UCSC and exit.

     The table is fetched from hgdownload.soe.ucsc.edu, decompressed and
     written to <db>.chromAlias.tsv in the current directory. This function
     does not return: it ends the script with exit status 0.
     """
     import io # function-level import: BytesIO is only imported at file level on Python 3

     # https, like the URLs shown in the usage message
     url = "https://hgdownload.soe.ucsc.edu/goldenPath/%s/database/chromAlias.txt.gz" % db
     resp = urlopen(url)
     try:
         gzData = resp.read()
     finally:
         resp.close()

     data = gzip.GzipFile(fileobj=io.BytesIO(gzData)).read()
     outFname = db+".chromAlias.tsv"
     # write the decompressed bytes as they are, no decode/encode round trip,
     # and make sure the file is flushed and closed before exiting
     with open(outFname, "wb") as ofh:
         ofh.write(data)
     print("Wrote %s to %s" % (url, outFname))
     print("You can now convert a file with 'chromToUcsc -a %s -i infile.bed -o outfile.bed'" % outFname)
     exit(0)
 
 def main():
     """Entry point: parse the command line, then either download an alias table or convert the input."""
     args, options = parseArgs()

     aliasFname = options.aliasFname
     inFname = options.inFname
     outFname = options.outFname

     # --get carries the assembly name itself (e.g. "--get hg19"); download() exits the script
     if options.doDownload:
         download(options.doDownload)

     if aliasFname is None:
         logging.error("You need to provide an alias table with the -a option or use --get to download one.")
         exit(1)

     ifh = stdin if inFname is None else open(inFname)
     ofh = stdout if outFname is None else open(outFname, "w")

     fieldIdx = options.fieldNo-1 # the command line option is 1-based
     try:
         chromToUcsc(aliasFname, fieldIdx, ifh, ofh)
     finally:
         # also runs when chromToUcsc() stops the script via exit(); only
         # close files opened here, never the standard streams
         if ifh is not stdin:
             ifh.close()
         if ofh is not stdout:
             ofh.close()
 
 # run only when executed as a script, so the functions can be imported without side effects
 if __name__ == "__main__":
     main()