9b8c6b6168c9f5a70ef7270a78bd2536bb2ce8b6
max
  Fri Sep 30 01:25:57 2016 -0700
allow doLocusName to work with knowGene, refs #17235

diff --git src/utils/doLocusName src/utils/doLocusName
index 8dee4d8..f2c0fa3 100755
--- src/utils/doLocusName
+++ src/utils/doLocusName
@@ -1,79 +1,90 @@
 #!/usr/bin/env python
 
 import logging, sys, optparse, subprocess, tempfile, os
 from collections import defaultdict
 from os.path import join, basename, dirname, isfile
 
 # ==== functions =====
     
 def parseArgs():
     " setup logging, parse command line arguments and options. -h shows auto-generated help page "
     parser = optparse.OptionParser("usage: %prog [options] db geneTableName - create and fill locusNames, a BED4 table that covers all chromosomes and assigns a name to all ranges. ensGene usually works well as the gene table.")
 
     parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
     (options, args) = parser.parse_args()
 
     if args==[]:
         parser.print_help()
         exit(1)
 
     if options.debug:
         logging.basicConfig(level=logging.DEBUG)
     else:
         logging.basicConfig(level=logging.INFO)
     return args, options
 
 def runSql(db, query):
     " yield rows from sql query "
     proc = subprocess.Popen(['hgsql', db, "-NB", "-e", query], stdout=subprocess.PIPE)
     for line in  proc.stdout:
         yield line.rstrip("\n").split("\t")
 
 def runCmd(cmd, mustRun=True):
     " wrapper around os.system that prints error messages "
     print ("Running: "+ cmd)
     ret = os.system(cmd)
     if ret!=0 and mustRun:
         print "Could not run command %s" % cmd
         sys.exit(1)
     return ret
 
 def doLocusName(db, geneTableName):
     " create and fill the locusName table for db "
+    if geneTableName=="ensGene":
+        query = "SELECT * from ensemblToGeneName"
+    elif geneTableName=="knownGene":
+        query = "SELECT kgID, geneSymbol from kgXref where geneSymbol<>'' and geneSymbol not like '% %'"
+    else:
+        assert(False) # no support for this gene table yet
+
     tempFh = tempfile.NamedTemporaryFile()
-    for row in runSql(db, "SELECT * from ensemblToGeneName"):
+    #tempFh = open("temp.tmp", "w")
+    for row in runSql(db, query):
         tempFh.write("\t".join(row)+"\n")
     tempFh.flush()
     tempFname = tempFh.name
 
     chromSizesFname = "/cluster/data/%s/chrom.sizes" % db
     outTmp = "locusName.tmp.bed"
 
     cmdParts = []
     # get genePred
-    cmdParts.append("hgsql %(db)s -NB -e 'SELECT * from %(geneTableName)s'| cut -f2- |")
+    cmdParts.append("hgsql %(db)s -NB -e 'SELECT * from %(geneTableName)s'| ")
+    # only known Gene has no bin field
+    if geneTableName!="knownGene":
+        cmdParts.append("cut -f2- |")
     # break genes into exons
     cmdParts.append("genePredToBed stdin stdout | bedToExons stdin stdout | ")
     # replace transcript with symbol, skip dupes
     cmdParts.append("tabRepl %(tempFname)s 3 /dev/stdin | sort -u | ")
     # merge adjacent exons
     cmdParts.append("bedSort stdin stdout | bedOverlapMerge /dev/stdin /dev/stdout |")
     # annotate regions
     cmdParts.append("bedBetween stdin /dev/stdout -a -s %(chromSizesFname)s -m 100 |")
     # and sort
     cmdParts.append("bedSort stdin %(outTmp)s")
     runCmd(("".join(cmdParts)) % locals())
 
     cmd = "hgLoadBed %s locusName %s" % (db, outTmp)
     runCmd(cmd)
 
     os.remove(outTmp)
 
 # ----------- main --------------
 def main():
     args, options = parseArgs()
 
     db, geneTable = args
     doLocusName(db, geneTable)
 
 main()