273486dcbd4a0fd80d7f09be639f32cd58598881 max Wed Jul 20 15:03:46 2016 -0700 adding uniprot track for hg38 diff --git src/utils/pslSameGene src/utils/pslSameGene new file mode 100755 index 0000000..7924817 --- /dev/null +++ src/utils/pslSameGene @@ -0,0 +1,54 @@ +#!/usr/bin/env python + +import logging, sys, optparse +from collections import defaultdict +from os.path import join, basename, dirname, isfile + +# ==== functions ===== + +def parseArgs(): + " setup logging, parse command line arguments and options. -h shows auto-generated help page " + parser = optparse.OptionParser("""usage: %prog [options] queryGenes targetGenes inPsl - pick out match of query against target where both query and target are from the same gene + + queryGenes and targetGenes are two files with columns queryId,geneId + """) + + parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") + #parser.add_option("-f", "--file", dest="file", action="store", help="run on file") + #parser.add_option("", "--test", dest="test", action="store_true", help="do something") + (options, args) = parser.parse_args() + + if args==[]: + parser.print_help() + exit(1) + + if options.debug: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + return args, options + +# ----------- main -------------- +def readDict(fname): + " return map query (transcript) -> target (geneId) " + ret = {} + for line in open(fname): + query, target = line.strip().split() + ret[query] = target + return ret + +def main(): + args, options = parseArgs() + + queryGeneFname, targetGeneFname, inPslFname = args + queryGenes = readDict(queryGeneFname) + targetGenes = readDict(targetGeneFname) + + for line in open(inPslFname): + row = line.strip("\n").split("\t") + qName, tName = row[9], row[13] + if queryGenes.get(qName, 1) != targetGenes.get(tName, 2): # no gene? -> 1!=2 -> skip + continue + print line, + +main()