src/utils/doLocusName 3c1d42b5846d46612b495f1aeece9a20857480ec

3c1d42b5846d46612b495f1aeece9a20857480ec
max
  Fri May 25 16:22:04 2018 -0700
crispr track for ce11, refs #21517

diff --git src/utils/doLocusName src/utils/doLocusName
index 077e516..c5db8aa 100755
--- src/utils/doLocusName
+++ src/utils/doLocusName
@@ -1,99 +1,101 @@
 #!/usr/bin/env python
 
 import logging, sys, optparse, subprocess, tempfile, os
 from collections import defaultdict
 from os.path import join, basename, dirname, isfile
 
 # ==== functions =====
     
 def parseArgs():
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("usage: %prog [options] db geneTableName - create and fill locusNames, a BED4 table that covers all chromosomes and assigns a name to all ranges. ensGene usually works well as the gene table.")
 
     parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
     parser.add_option("-o", "--onlyFirst", dest="onlyFirst", action="store_true", help="Only use the first gene name if there are many alternatives. Use this if you get 'Warning 1265 Data truncated for column 'name' at row xxx' messages. This happens when transcripts have no gene names and every locus gets annotated with all flanking transcripts, not genes.")
     (options, args) = parser.parse_args()
 
     if args==[]:
         parser.print_help()
         exit(1)
 
     if options.debug:
         logging.basicConfig(level=logging.DEBUG)
     else:
         logging.basicConfig(level=logging.INFO)
     return args, options
 
 def runSql(db, query):
     " yield rows from sql query "
     proc = subprocess.Popen(['hgsql', db, "-NB", "-e", query], stdout=subprocess.PIPE)
     for line in  proc.stdout:
         yield line.rstrip("\n").split("\t")
 
 def runCmd(cmd, mustRun=True):
     " wrapper around os.system that prints error messages "
     print ("Running: "+ cmd)
     ret = os.system(cmd)
     if ret!=0 and mustRun:
         print "Could not run command %s" % cmd
         sys.exit(1)
     return ret
 
 def doLocusName(db, geneTableName, options):
     " create and fill the locusName table for db "
     if geneTableName=="ensGene":
         query = "SELECT * from ensemblToGeneName"
     elif geneTableName=="xenoRefGene":
         query = "SELECT name,name2 from xenoRefGene"
     elif geneTableName=="knownGene":
         query = "SELECT kgID, geneSymbol from kgXref where geneSymbol<>'' and geneSymbol not like '% %'"
+    elif geneTableName.startswith("ws"):
+        query = "SELECT name,name from %s" % geneTableName
     else:
         assert(False) # no support for this gene table yet
 
     tempFh = tempfile.NamedTemporaryFile()
     #tempFh = open("temp.tmp", "w")
     for row in runSql(db, query):
         tempFh.write("\t".join(row)+"\n")
     tempFh.flush()
     tempFname = tempFh.name
 
     chromSizesFname = "/cluster/data/%s/chrom.sizes" % db
     outTmp = "locusName.tmp.bed"
 
     cmdParts = []
     # get genePred
     cmdParts.append("hgsql %(db)s -NB -e 'SELECT * from %(geneTableName)s'| ")
     # only known Gene has no bin field
     if geneTableName!="knownGene":
         cmdParts.append("cut -f2- |")
     # break genes into exons
     cmdParts.append("genePredToBed stdin stdout | bedToExons stdin stdout | ")
     # replace transcript with symbol, skip dupes
     cmdParts.append("tabRepl %(tempFname)s 3 /dev/stdin | sort -u | ")
 
     # merge adjacent exons
 
     overlapOpt = ""
     if options.onlyFirst:
         overlapOpt = "-o"
     cmdParts.append("bedSort stdin stdout | bedOverlapMerge /dev/stdin /dev/stdout %s |" % overlapOpt)
 
     # annotate regions
     cmdParts.append("bedBetween stdin /dev/stdout -a -s %(chromSizesFname)s -m 100 |")
     # and sort
     cmdParts.append("bedSort stdin %(outTmp)s")
     runCmd(("".join(cmdParts)) % locals())
 
     cmd = "hgLoadBed -tab %s locusName %s" % (db, outTmp)
     runCmd(cmd)
 
     os.remove(outTmp)
 
 # ----------- main --------------
 def main():
     args, options = parseArgs()
 
     db, geneTable = args
     doLocusName(db, geneTable, options)
 
 main()