a823a26a0ed014c44482f50eb1c4072b3860f9de max Mon Oct 31 14:27:59 2016 -0700 small fix to doLocusName to handle genes with with spaces in names diff --git src/utils/doLocusName src/utils/doLocusName index f2c0fa3..8de63da 100755 --- src/utils/doLocusName +++ src/utils/doLocusName @@ -1,90 +1,90 @@ #!/usr/bin/env python import logging, sys, optparse, subprocess, tempfile, os from collections import defaultdict from os.path import join, basename, dirname, isfile # ==== functions ===== def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("usage: %prog [options] db geneTableName - create and fill locusNames, a BED4 table that covers all chromosomes and assigns a name to all ranges. ensGene usually works well as the gene table.") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") (options, args) = parser.parse_args() if args==[]: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) return args, options def runSql(db, query): " yield rows from sql query " proc = subprocess.Popen(['hgsql', db, "-NB", "-e", query], stdout=subprocess.PIPE) for line in proc.stdout: yield line.rstrip("\n").split("\t") def runCmd(cmd, mustRun=True): " wrapper around os.system that prints error messages " print ("Running: "+ cmd) ret = os.system(cmd) if ret!=0 and mustRun: print "Could not run command %s" % cmd sys.exit(1) return ret def doLocusName(db, geneTableName): " create and fill the locusName table for db " if geneTableName=="ensGene": query = "SELECT * from ensemblToGeneName" elif geneTableName=="knownGene": query = "SELECT kgID, geneSymbol from kgXref where geneSymbol<>'' and geneSymbol not like '% %'" else: assert(False) # no support for this gene table yet tempFh = tempfile.NamedTemporaryFile() #tempFh = open("temp.tmp", "w") for row in runSql(db, query): tempFh.write("\t".join(row)+"\n") tempFh.flush() tempFname = tempFh.name chromSizesFname = "/cluster/data/%s/chrom.sizes" % db outTmp = "locusName.tmp.bed" cmdParts = [] # get genePred cmdParts.append("hgsql %(db)s -NB -e 'SELECT * from %(geneTableName)s'| ") # only known Gene has no bin field if geneTableName!="knownGene": cmdParts.append("cut -f2- |") # break genes into exons cmdParts.append("genePredToBed stdin stdout | bedToExons stdin stdout | ") # replace transcript with symbol, skip dupes cmdParts.append("tabRepl %(tempFname)s 3 /dev/stdin | sort -u | ") # merge adjacent exons cmdParts.append("bedSort stdin stdout | bedOverlapMerge /dev/stdin /dev/stdout |") # annotate regions cmdParts.append("bedBetween stdin /dev/stdout -a -s %(chromSizesFname)s -m 100 |") # and sort cmdParts.append("bedSort stdin %(outTmp)s") runCmd(("".join(cmdParts)) % locals()) - cmd = "hgLoadBed %s locusName %s" % (db, outTmp) + cmd = "hgLoadBed -tab %s locusName %s" % (db, outTmp) runCmd(cmd) os.remove(outTmp) # ----------- main -------------- def main(): args, options = parseArgs() db, geneTable = args doLocusName(db, geneTable) main()