557415034ca7f233f8f4b5be7201261136ed80ae max Wed Sep 21 16:13:31 2016 -0700 small fix to two crispr-related utils diff --git src/utils/doLocusName src/utils/doLocusName index aef5fe9..8dee4d8 100755 --- src/utils/doLocusName +++ src/utils/doLocusName @@ -1,72 +1,79 @@ #!/usr/bin/env python import logging, sys, optparse, subprocess, tempfile, os from collections import defaultdict from os.path import join, basename, dirname, isfile # ==== functions ===== def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("usage: %prog [options] db geneTableName - create and fill locusNames, a BED4 table that covers all chromosomes and assigns a name to all ranges. ensGene usually works well as the gene table.") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") (options, args) = parser.parse_args() if args==[]: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) return args, options def runSql(db, query): " yield rows from sql query " proc = subprocess.Popen(['hgsql', db, "-NB", "-e", query], stdout=subprocess.PIPE) for line in proc.stdout: yield line.rstrip("\n").split("\t") def runCmd(cmd, mustRun=True): " wrapper around os.system that prints error messages " print ("Running: "+ cmd) ret = os.system(cmd) if ret!=0 and mustRun: print "Could not run command %s" % cmd sys.exit(1) return ret def doLocusName(db, geneTableName): " create and fill the locusName table for db " tempFh = tempfile.NamedTemporaryFile() for row in runSql(db, "SELECT * from ensemblToGeneName"): tempFh.write("\t".join(row)+"\n") tempFh.flush() tempFname = tempFh.name chromSizesFname = "/cluster/data/%s/chrom.sizes" % db outTmp = "locusName.tmp.bed" - cmd = "hgsql %(db)s -NB -e 'SELECT * from %(geneTableName)s'| cut -f2- |" # get genePred - "genePredToBed stdin stdout | bedToExons stdin stdout | " # break into exons - "tabRepl %(tempFname)s 3 /dev/stdin | sort -u | " # replace transcript with symbol, skip dupes - "bedSort stdin stdout | bedOverlapMerge /dev/stdin /dev/stdout |" # merge adjacent exons - "bedBetween stdin /dev/stdout -a -s %(chromSizesFname)s |" # annotate regions - "bedSort stdin %(outTmp)s" % locals() - runCmd(cmd) + cmdParts = [] + # get genePred + cmdParts.append("hgsql %(db)s -NB -e 'SELECT * from %(geneTableName)s'| cut -f2- |") + # break genes into exons + cmdParts.append("genePredToBed stdin stdout | bedToExons stdin stdout | ") + # replace transcript with symbol, skip dupes + cmdParts.append("tabRepl %(tempFname)s 3 /dev/stdin | sort -u | ") + # merge adjacent exons + cmdParts.append("bedSort stdin stdout | bedOverlapMerge /dev/stdin /dev/stdout |") + # annotate regions + cmdParts.append("bedBetween stdin /dev/stdout -a -s %(chromSizesFname)s -m 100 |") + # and sort + cmdParts.append("bedSort stdin %(outTmp)s") + runCmd(("".join(cmdParts)) % locals()) cmd = "hgLoadBed %s locusName %s" % (db, outTmp) runCmd(cmd) os.remove(outTmp) # ----------- main -------------- def main(): args, options = parseArgs() db, geneTable = args doLocusName(db, geneTable) main()