src/utils/chromToUcsc/chromToUcsc a3f24a9e80f42401d17724d6b88e6a9df1c24f24

a3f24a9e80f42401d17724d6b88e6a9df1c24f24
max
  Tue Feb 2 06:39:57 2021 -0800
changes after code review, refs #26903

diff --git src/utils/chromToUcsc/chromToUcsc src/utils/chromToUcsc/chromToUcsc
index 6e17808..4c4bb8d 100755
--- src/utils/chromToUcsc/chromToUcsc
+++ src/utils/chromToUcsc/chromToUcsc
@@ -1,203 +1,206 @@
 #!/usr/bin/env python
 import logging, optparse, gzip
 from sys import stdin, stdout, stderr, exit, modules
 from os.path import basename
 
 try:
     from urllib.request import urlopen # py2
 except ImportError:
     from urllib2 import urlopen # py3
 try:
     from cStringIO import StringIO # py2
 except ImportError:
     from io import BytesIO # py3
 
 # ==== functions =====
 def parseArgs():
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("""usage: %prog [options] filename - change NCBI or Ensembl chromosome names to UCSC names in tabular or wiggle files, using a chromAlias table.
 
     Supports these UCSC file formats:
     BED, genePred, PSL, wiggle (all formats), bedGraph, VCF, SAM
     ... or any other csv or tsv format where the sequence (chromosome) name is a separate field.
 
     Requires a <genome>.chromAlias.tsv file which can be downloaded like this:
         %prog --get hg19              # download the file hg19.chromAlias.tsv into current directory
 
     If you do not want to use the --get option to retrieve the mapping tables, you can also download the alias mapping
     files yourself, e.g. for mm10 with 'wget https://hgdownload.soe.ucsc.edu/goldenPath/mm10/database/chromAlias.txt.gz'
 
+    For NCBI RefSeq assemblies that are available as UCSC track hubs, the chromAlias files can be obtained like this:
+       'wget https://hgdownload.soe.ucsc.edu/hubs/GCF/000/001/405/GCF_000001405.39/GCF_000001405.39.chromAlias.txt'
+
     Then the script can be run like this:
         %prog -i in.bed -o out.bed -a hg19.chromAlias.tsv
         %prog -i in.bed -o out.bed -a https://hgdownload.soe.ucsc.edu/goldenPath/hg19/database/chromAlias.txt.gz
     Or in pipes, like this:
         cat test.bed | %prog -a mm10.chromAlias.tsv > test.ucsc.bed
     For BAM files use in a pipe with samtools:
         samtools view -h in.bam | ./chromToUcsc -a mm10.chromAlias.tsv | samtools -bS > out.bam
 
     """)
 
     parser.add_option("", "--get", dest="downloadDb", action="store", help="download a chrom alias table from UCSC for the genomeDb into the current directory and exit")
     parser.add_option("-a", "--chromAlias", dest="aliasFname", action="store", help="a UCSC chromAlias file in tab-sep format or the http/https URL to one")
     parser.add_option("-i", "--in", dest="inFname", action="store", help="input filename, default: /dev/stdin")
     parser.add_option("-o", "--out", dest="outFname", action="store", help="output filename, default: /dev/stdout")
     parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
     parser.add_option("-k", "--field", dest="fieldNo", action="store", type="int", \
             help="index of field to convert, default is %default (first field is 1), for most formats, e.g. BED. For genePred, set this option to 2, for PSL it is 10 or 14. If any line starts with @ (SAM format), this field is automatically set to 2.", default=1)
 
     (options, args) = parser.parse_args()
 
     if options.downloadDb is None and options.aliasFname is None:
         parser.print_help()
         exit(1)
 
     if options.debug:
         logging.basicConfig(level=logging.DEBUG)
     else:
         logging.basicConfig(level=logging.INFO)
     return args, options
 
 # ----------- main --------------
 def splitLines(ifh):
     " yield (chromName, restOfLine) for all lines of ifh "
     sep = -1
     #if (sys.version_info > (3, 0)):
     lineNo = 0
     for line in ifh:
         if line.startswith("fixedStep") or line.startswith("variableStep"):
             row = line.split()
             yield lineNo, " ", row, line # *step lines are always space-separated
         else:
             if sep==-1:
                 # why test first for space? Because after fixedStep, the lines are just single numbers
                 if " " in line:
                     sep = None # = split on any whitespace, consec. whitespc counts as one
                 else:
                     sep = "\t" # default is to split on tab
             row = line.rstrip("\n\t").split(sep)
             lineNo += 1
             if sep is None:
                 sep = " "
             yield lineNo, sep, row, line
 
 def parseAlias(fname):
     " parse tsv file with at least two columns, orig chrom name and new chrom name "
     toUcsc = {}
     if fname.startswith("http://") or fname.startswith("https://"):
         ifh = urlopen(fname)
         if fname.endswith(".gz"):
             data = gzip.GzipFile(fileobj=ifh).read().decode()
             ifh = data.splitlines()
     elif fname.endswith(".gz"):
         ifh = gzip.open(fname, "rt")
     else:
         ifh = open(fname)
 
     for line in ifh:
         if line.startswith("alias"):
             continue
         row = line.rstrip("\n").split("\t")
         toUcsc[row[0]] = row[1]
     return toUcsc
 
 def chromToUcsc(aliasFname, fieldIdx, ifh, ofh):
     " convert column number fieldIdx to UCSC-style chrom names "
     toUcsc = parseAlias(aliasFname)
 
     ucscChroms = set(toUcsc.values())
     isSam = False
 
     for lineNo, sep, row, line in splitLines(ifh):
         if len(row)<=2:
             pass # fixedStep or variableStep lines are just passed through
         elif row[0]=="fixedStep" or row[0]=="variableStep":
             # fixedStep chrom=NC_033660.1 start=15778865 step=1
             key, chrom = row[1].split("=")
             assert(key=="chrom") # we assume that the first field has the chrom, that saves a lot of time
             ucscChrom = toUcsc.get(chrom)
             if ucscChrom is None:
                 logging.error("line %d: chrom name %s is not in chromAlias table" % (lineNo, repr(chrom)))
                 exit(1)
             row[1] = "chrom="+ucscChrom
 
         elif row[0][0]== "@":
             fieldIdx = 2 # SAM format detected
             isSam = True
             if row[0]=="@SQ":
                 # @SQ     SN:1    LN:195471971
                 chrom = row[1].split(":")[1]
                 if chrom not in ucscChroms:
                     ucscChrom = toUcsc.get(chrom)
                     if ucscChrom is None:
                         logging.error("SAM format, header line %d: chrom name %s is not in chromAlias table" % (lineNo, repr(chrom)))
                         exit(1)
                     row[1] = "SN:%s" % ucscChrom
         else:
             # just pass through any UCSC chrom names
             chrom = row[fieldIdx]
             if row[0].startswith("#") or chrom in ucscChroms or chrom=="*":
                 ucscChrom = chrom
             else:
                 ucscChrom = toUcsc.get(chrom)
                 if ucscChrom is None:
                     logging.error("line %d: chrom name %s is not in chromAlias table" % (lineNo, repr(chrom)))
                     exit(1)
             if isSam:
                 mateChrom = row[6]
                 if mateChrom!="=":
                     row[6] = toUcsc[mateChrom]
 
             row[fieldIdx] = ucscChrom
 
         line = sep.join(row)
         ofh.write(line)
         ofh.write("\n")
 
 def download(db):
     url = "http://hgdownload.soe.ucsc.edu/goldenPath/%s/database/chromAlias.txt.gz" % db
     gzData = urlopen(url).read()
 
     if 'cStringIO' in modules:
         data = StringIO(gzData)
     else:
         data = BytesIO(gzData)
     
     data = gzip.GzipFile(fileobj=data).read().decode()
     outFname = db+".chromAlias.tsv"
     open(outFname, "w").write(data)
     print("Wrote %s to %s" % (url, outFname))
     print("You can now convert a file with 'chromToUcsc -a %s -i infile.bed -o outfile.bed'" % outFname)
     exit(0)
 
 def main():
     args, options = parseArgs()
 
     aliasFname = options.aliasFname
     inFname = options.inFname
     outFname = options.outFname
 
     if options.downloadDb:
         download(options.downloadDb)
 
     if aliasFname is None:
         logging.error("You need to provide an alias table with the -a option or use --get to download one.")
         exit(1)
 
     if inFname is None:
         ifh = stdin
     elif inFname.endswith(".gz"):
         ifh = gzip.open(inFname, "rt")
     else:
         ifh = open(inFname)
 
     if outFname is None:
         ofh = stdout
     elif outFname.endswith(".gz"):
         ofh = gzip.open(outFname, "wt")
     else:
         ofh = open(outFname, "w")
 
     fieldIdx = options.fieldNo-1
     chromToUcsc(aliasFname, fieldIdx, ifh, ofh)
 
 main()