ae427c75bc1aa63033fb656a0f84ddc0c45486bf max Fri Apr 28 18:03:13 2017 -0700 Adding two more tools required for building the gene interactions track, refs #13634 diff --git src/utils/bedBestTranscript src/utils/bedBestTranscript new file mode 100755 index 0000000..cf50bbe --- /dev/null +++ src/utils/bedBestTranscript @@ -0,0 +1,49 @@ +#!/usr/bin/python + +import sys +from optparse import OptionParser +import tabfile, bed + +# -------- OPTIONS ------------ +parser = OptionParser("usage: %prog [options] transcriptBedFile transcriptGeneFile\nselects only the longest or median-length transcript for each gene, transcriptBedFile has format: transcriptID<tab>geneId") +parser.add_option("-m", "--median", dest="median", action="store_true", help="use median-length transcript, instead of longest (which is the default)",default=False) +(options, args) = parser.parse_args() +if len(args)==0: + parser.print_help() + sys.exit(1) + +# ------ MAIN ----------------- + +bedFile, geneTransFile = args +transcripts = bed.parseBedFilename(bedFile) +transToGene = tabfile.slurpdict(geneTransFile) + +# index transcripts by gene id +geneToTransList = {} +for trans in transcripts: + gene = transToGene.get(trans.name, "unknownGene") + geneToTransList.setdefault(gene, []) + geneToTransList[gene].append(trans) + +transNames = [t.name for t in geneToTransList.get("unknownGene",[])] +sys.stderr.write("The following %d transcripts are not associated to any gene:\n" % len(transNames)) +sys.stderr.write(",".join(transNames)) +sys.stderr.write("\n") + +# iterate over gene and keep only best transcript +bestTransList = bed.Features() +for gene, transList in geneToTransList.iteritems(): + transList.sort(key= lambda trans: trans.end - trans.start) + if options.median: + bestTrans = transList[len(transList)/2] + else: + bestTrans = transList[-1] # -1 = last element in list + #bestTrans.name += ("|"+gene) + bestTrans.name = gene + bestTransList.append(bestTrans) + +for trans in bestTransList: + print trans +sys.stderr.write("%d best transcripts found\n" % len(bestTransList)) +sys.stderr.write("%d transcripts are not associated to any gene\n" % len(transNames)) +