c7723adc566681d4f69232076df356c36aa5a1cf max Fri Sep 9 16:03:22 2016 -0700 adding first versin of pipeline for crispr tracks and trackDb statements supporting external extra fields, refs #17235 diff --git src/utils/doLocusName src/utils/doLocusName new file mode 100755 index 0000000..aef5fe9 --- /dev/null +++ src/utils/doLocusName @@ -0,0 +1,72 @@ +#!/usr/bin/env python + +import logging, sys, optparse, subprocess, tempfile, os +from collections import defaultdict +from os.path import join, basename, dirname, isfile + +# ==== functions ===== + +def parseArgs(): + " setup logging, parse command line arguments and options. -h shows auto-generated help page " + parser = optparse.OptionParser("usage: %prog [options] db geneTableName - create and fill locusNames, a BED4 table that covers all chromosomes and assigns a name to all ranges. ensGene usually works well as the gene table.") + + parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") + (options, args) = parser.parse_args() + + if args==[]: + parser.print_help() + exit(1) + + if options.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + return args, options + +def runSql(db, query): + " yield rows from sql query " + proc = subprocess.Popen(['hgsql', db, "-NB", "-e", query], stdout=subprocess.PIPE) + for line in proc.stdout: + yield line.rstrip("\n").split("\t") + +def runCmd(cmd, mustRun=True): + " wrapper around os.system that prints error messages " + print ("Running: "+ cmd) + ret = os.system(cmd) + if ret!=0 and mustRun: + print "Could not run command %s" % cmd + sys.exit(1) + return ret + +def doLocusName(db, geneTableName): + " create and fill the locusName table for db " + tempFh = tempfile.NamedTemporaryFile() + for row in runSql(db, "SELECT * from ensemblToGeneName"): + tempFh.write("\t".join(row)+"\n") + tempFh.flush() + tempFname = tempFh.name + + chromSizesFname = "/cluster/data/%s/chrom.sizes" % db + outTmp = "locusName.tmp.bed" + + cmd = "hgsql %(db)s -NB -e 'SELECT * from %(geneTableName)s'| cut -f2- |" # get genePred + "genePredToBed stdin stdout | bedToExons stdin stdout | " # break into exons + "tabRepl %(tempFname)s 3 /dev/stdin | sort -u | " # replace transcript with symbol, skip dupes + "bedSort stdin stdout | bedOverlapMerge /dev/stdin /dev/stdout |" # merge adjacent exons + "bedBetween stdin /dev/stdout -a -s %(chromSizesFname)s |" # annotate regions + "bedSort stdin %(outTmp)s" % locals() + runCmd(cmd) + + cmd = "hgLoadBed %s locusName %s" % (db, outTmp) + runCmd(cmd) + + os.remove(outTmp) + +# ----------- main -------------- +def main(): + args, options = parseArgs() + + db, geneTable = args + doLocusName(db, geneTable) + +main()