634275bbaa417bb81e44cca7aaa1757ee7b57b82 max Fri May 12 11:19:51 2017 -0700 fixing up crispr track pipeline for CHO/Hiram diff --git src/utils/doLocusName src/utils/doLocusName index 8de63da..d14ab6a 100755 --- src/utils/doLocusName +++ src/utils/doLocusName @@ -1,90 +1,97 @@ #!/usr/bin/env python import logging, sys, optparse, subprocess, tempfile, os from collections import defaultdict from os.path import join, basename, dirname, isfile # ==== functions ===== def parseArgs(): " setup logging, parse command line arguments and options. -h shows auto-generated help page " parser = optparse.OptionParser("usage: %prog [options] db geneTableName - create and fill locusNames, a BED4 table that covers all chromosomes and assigns a name to all ranges. ensGene usually works well as the gene table.") parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") + parser.add_option("-o", "--onlyFirst", dest="onlyFirst", action="store_true", help="Only use the first gene name if there are many alternatives. Use this if you get 'Warning 1265 Data truncated for column 'name' at row xxx' messages. This happens when transcripts have no gene names and every locus gets annotated with all flanking transcripts, not genes.") (options, args) = parser.parse_args() if args==[]: parser.print_help() exit(1) if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) return args, options def runSql(db, query): " yield rows from sql query " proc = subprocess.Popen(['hgsql', db, "-NB", "-e", query], stdout=subprocess.PIPE) for line in proc.stdout: yield line.rstrip("\n").split("\t") def runCmd(cmd, mustRun=True): " wrapper around os.system that prints error messages " print ("Running: "+ cmd) ret = os.system(cmd) if ret!=0 and mustRun: print "Could not run command %s" % cmd sys.exit(1) return ret -def doLocusName(db, geneTableName): +def doLocusName(db, geneTableName, options): " create and fill the locusName table for db " if geneTableName=="ensGene": query = "SELECT * from ensemblToGeneName" elif geneTableName=="knownGene": query = "SELECT kgID, geneSymbol from kgXref where geneSymbol<>'' and geneSymbol not like '% %'" else: assert(False) # no support for this gene table yet tempFh = tempfile.NamedTemporaryFile() #tempFh = open("temp.tmp", "w") for row in runSql(db, query): tempFh.write("\t".join(row)+"\n") tempFh.flush() tempFname = tempFh.name chromSizesFname = "/cluster/data/%s/chrom.sizes" % db outTmp = "locusName.tmp.bed" cmdParts = [] # get genePred cmdParts.append("hgsql %(db)s -NB -e 'SELECT * from %(geneTableName)s'| ") # only known Gene has no bin field if geneTableName!="knownGene": cmdParts.append("cut -f2- |") # break genes into exons cmdParts.append("genePredToBed stdin stdout | bedToExons stdin stdout | ") # replace transcript with symbol, skip dupes cmdParts.append("tabRepl %(tempFname)s 3 /dev/stdin | sort -u | ") + # merge adjacent exons - cmdParts.append("bedSort stdin stdout | bedOverlapMerge /dev/stdin /dev/stdout |") + + overlapOpt = "" + if options.onlyFirst: + overlapOpt = "-o" + cmdParts.append("bedSort stdin stdout | bedOverlapMerge /dev/stdin /dev/stdout %s |" % overlapOpt) + # annotate regions cmdParts.append("bedBetween stdin /dev/stdout -a -s %(chromSizesFname)s -m 100 |") # and sort cmdParts.append("bedSort stdin %(outTmp)s") runCmd(("".join(cmdParts)) % locals()) cmd = "hgLoadBed -tab %s locusName %s" % (db, outTmp) runCmd(cmd) os.remove(outTmp) # ----------- main -------------- def main(): args, options = parseArgs() db, geneTable = args - doLocusName(db, geneTable) + doLocusName(db, geneTable, options) main()