9b8c6b6168c9f5a70ef7270a78bd2536bb2ce8b6 max Fri Sep 30 01:25:57 2016 -0700 allow doLocusName to work with knowGene, refs #17235 diff --git src/utils/doLocusName src/utils/doLocusName index 8dee4d8..f2c0fa3 100755 --- src/utils/doLocusName +++ src/utils/doLocusName @@ -1,79 +1,90 @@ #!/usr/bin/env python import logging, sys, optparse, subprocess, tempfile, os from collections import defaultdict from os.path import join, basename, dirname, isfile # ==== functions ===== def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("usage: %prog [options] db geneTableName - create and fill locusNames, a BED4 table that covers all chromosomes and assigns a name to all ranges. ensGene usually works well as the gene table.") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") (options, args) = parser.parse_args() if args==[]: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) return args, options def runSql(db, query): " yield rows from sql query " proc = subprocess.Popen(['hgsql', db, "-NB", "-e", query], stdout=subprocess.PIPE) for line in proc.stdout: yield line.rstrip("\n").split("\t") def runCmd(cmd, mustRun=True): " wrapper around os.system that prints error messages " print ("Running: "+ cmd) ret = os.system(cmd) if ret!=0 and mustRun: print "Could not run command %s" % cmd sys.exit(1) return ret def doLocusName(db, geneTableName): " create and fill the locusName table for db " + if geneTableName=="ensGene": + query = "SELECT * from ensemblToGeneName" + elif geneTableName=="knownGene": + query = "SELECT kgID, geneSymbol from kgXref where geneSymbol<>'' and geneSymbol not like '% %'" + else: + assert(False) # no support for this gene table yet + tempFh = tempfile.NamedTemporaryFile() - for row in runSql(db, "SELECT * from ensemblToGeneName"): + #tempFh = open("temp.tmp", "w") + for row in runSql(db, query): tempFh.write("\t".join(row)+"\n") tempFh.flush() tempFname = tempFh.name chromSizesFname = "/cluster/data/%s/chrom.sizes" % db outTmp = "locusName.tmp.bed" cmdParts = [] # get genePred - cmdParts.append("hgsql %(db)s -NB -e 'SELECT * from %(geneTableName)s'| cut -f2- |") + cmdParts.append("hgsql %(db)s -NB -e 'SELECT * from %(geneTableName)s'| ") + # only known Gene has no bin field + if geneTableName!="knownGene": + cmdParts.append("cut -f2- |") # break genes into exons cmdParts.append("genePredToBed stdin stdout | bedToExons stdin stdout | ") # replace transcript with symbol, skip dupes cmdParts.append("tabRepl %(tempFname)s 3 /dev/stdin | sort -u | ") # merge adjacent exons cmdParts.append("bedSort stdin stdout | bedOverlapMerge /dev/stdin /dev/stdout |") # annotate regions cmdParts.append("bedBetween stdin /dev/stdout -a -s %(chromSizesFname)s -m 100 |") # and sort cmdParts.append("bedSort stdin %(outTmp)s") runCmd(("".join(cmdParts)) % locals()) cmd = "hgLoadBed %s locusName %s" % (db, outTmp) runCmd(cmd) os.remove(outTmp) # ----------- main -------------- def main(): args, options = parseArgs() db, geneTable = args doLocusName(db, geneTable) main()