c7723adc566681d4f69232076df356c36aa5a1cf max Fri Sep 9 16:03:22 2016 -0700 adding first versin of pipeline for crispr tracks and trackDb statements supporting external extra fields, refs #17235 diff --git src/utils/bedBetween src/utils/bedBetween new file mode 100755 index 0000000..1e065c3 --- /dev/null +++ src/utils/bedBetween @@ -0,0 +1,360 @@ +#!/usr/bin/env python2.7 + +# Max Haeussler, 6/2007, 6/2014 + +from sys import * +import sys +from optparse import OptionParser +import logging + +# === COMMAND LINE INTERFACE, OPTIONS AND HELP === +parser = OptionParser("usage: %prog [options] inSortedBedFile outfile (stdout ok): given sorted feats., create features between them and annotated these with the neighboring bednames. Regions around chromosomes are limited to 50kbp.") + +parser.add_option("-d", "", dest="debug", action="store_true", \ + help="show debug messages") +parser.add_option("-s", "--chromSizes", dest="chromSizes", action="store", \ + help="use this file with chrom size - lines to get chrom sizes") +parser.add_option("-u", "--upstream", dest="upstream", action="store_true", \ + help="only report intergenic regions that are upstream (if between -/+ neighbors, two features are created)", default=False) +parser.add_option("-l", "--limit", dest="limit", action="store", \ + help="if -u is set: limit the length of upstream regions to this size", default=0, type="int") +parser.add_option("", "--uniqueNames", dest="uniqueNames", action="store_true", \ + help="keep only the first feature, if identical names") +parser.add_option("-a", "--all", dest="all", action="store_true", \ + help="output all features (input and spacers between them) and annotate with 'ex:', 'in:', 'ig:' (exon, intron, intergenic)") + +(options, args) = parser.parse_args() + +doAll = False + +# ==== FUNCTIONs ===== + +def truncSize(start, end, limit, onLeft): + """ if limit is true, cut down start end to max limit length and return as pair """ + if limit!=0 and end-start > limit: + if onLeft: + return (start, start+limit) + else: + return (end-limit, end) + return start, end + +def writeBeds(outf, chrom, posList, names, type): + for i in range(0, len(names)): + name = names[i] + start, end = posList[i] + # skip 0-length features + if start==end: + continue + outf.write("%s\t%d\t%d\t%s\n" % (chrom, start, end, name)) + +def output(fh, left, right, upstream, limit, type, chromSizes=None): + """ given two flanking beds, create lists of names for the regions between and write to fh""" + """ this can return 0, 1 or 2 different names. + type can be "ig", "ex" or "in" = intergenic, exon, intron + """ + + global outCount + global dublCount + names = [] + posList = [] + + # basic info + if left==None: + chrom = right.chrom + start = 0 + end = right.start + leftname = "Gap" + rightname = right.name + elif right==None: + chrom = left.chrom + start = left.end + end = chromSizes[left.chrom] + leftname = left.name + rightname = "Gap" + else: + chrom = left.chrom + start = left.end + end = right.start + leftname = left.name + rightname = right.name + + if not upstream: + posList = [ (start, end) ] + if doAll: + if type=="ig": + names = ["ig:"+leftname+"|"+rightname] + elif type=="in": + assert(leftname==rightname) + names = ["in:"+leftname] + else: + names = [leftname+"|"+rightname] + else: + # + everything that depends on strands (-u option) + if left!=None and right!=None and left.strand=="-" and right.strand=="+": + start1, end1 = truncSize(start, end, limit, True) + start2, end2 = truncSize(start, end, limit, False) + posList = [(start1,end1), (start2, end2)] + names = [leftname, rightname] + dublCount+=1 + elif left!=None and left.strand=="-": + start, end = truncSize(start, end, limit, True) + posList = [ (start, end) ] + names = [leftname] + elif right!=None and right.strand=="+": + start, end = truncSize(start, end, limit, False) + posList = [ (start, end) ] + names = [rightname] + else: + return + + writeBeds(fh, chrom, posList, names, type) + outCount += len(names) + lastLeft = left + +def slurpdict(fname): + """ read in tab delimited file """ + dict = {} + for l in open(fname, "r"): + l = l.strip("\n") + fs = l.split("\t") + if not len(fs)>1: + continue + key = fs[0] + val = int(fs[1]) + if key not in dict: + dict[key] = val + else: + sys.stderr.write("info: hit key %s two times: %s -> %s\n" %(key, key, val)) + return dict + +# --- generic bed stuff, from bed.py ----- + +def coordOverlap(start1, end1, start2, end2): + """ returns true if two Features overlap """ + result = (( start2 <= start1 and end2 > start1) or \ + (start2 < end1 and end2 >= end1) or \ + (start1 >= start2 and end1 <= end2) or \ + (start2 >= start1 and end2 <= end1)) + return result + +def hasOverlap(f1, f2): + """ returns true if two Features overlap """ + if f1.chrom != f2.chrom: + return False + result = coordOverlap(f1.start, f1.end, f2.start, f2.end) + return result + +def parseBedFilename(fname, reqSorted=False, quiet=False): + if not quiet: + sys.stderr.write("Reading %s...\n" % fname) + if fname=="stdin": + return parseBedFile(sys.stdin, reqSorted) + else: + return parseBedFile(open(fname,"r"), reqSorted) + +class Features(list): + def __repr__(self): + buf = [] + for i in self: + buf.append(repr(i)) + return "\n".join(buf) + def __sort__(self): + self.sort(Feature.sort) + + def writeToFileHandle(self, fh): + for b in self: + fh.write(str(b)+"\n") + + def writeToFile(self, fname): + if fname!="stdout": + fh = open(fname, "w") + else: + fh = sys.stdout + self.writeToFileHandle(fh) + fh.close() + + def countChromosomes(self): + chroms = set() + for b in self: + chroms.add(b.chrom) + return len(chroms) + +class Feature: + def __init__(self, line): + l = line.split() + count = len(l) + self.chrom = l[0] + self.start = int(l[1]) + self.end = int(l[2]) + if count >= 4: + self.name = l[3] + if count >= 5: + self.strand= l[4] + if count >= 6: + self.score = int(l[5]) + + def __init__(self,seqid="",start=0,end=0,name="",score=0,strand="+"): + self.chrom = seqid + self.start = int(start) + self.end = int(end) + self.name = name + self.score = int(score) + self.strand = strand + + def __repr__(self): + fields = [self.chrom,str(self.start),str(self.end)] + if "name" in self.__dict__: + fields.append(self.name) + if "score" in self.__dict__: + fields.append(str(self.score)) + if "strand" in self.__dict__: + fields.append(self.strand) + if "blockStarts" in self.__dict__: + fields.append(str(self.thickStart)) + fields.append(str(self.thickEnd)) + fields.append(self.itemRgb) + fields.append(str(self.blockCount)) + fields.append(self.blockSizes) + fields.append(self.blockStarts) + + return "\t".join(fields) + + def includes(self, f, tol=0): + if self.chrom!=f.chrom: + return False + else: + if self.start-tol <= f.start and self.end+tol >= f.end: + return True + + def overlaps(self, f1): + """ returns true if two Features overlap """ + return hasOverlap(self,f1) + +def parseBedLine(line): + f = Feature() + fields = line.split("\t") + f.chrom = fields[0] + f.start = int(fields[1]) + f.end = int(fields[2]) + if len(fields)>3: + f.name = fields[3] + if len(fields)>4: + f.score = int(fields[4]) + if len(fields)>5: + f.strand = fields[5] + if len(fields)>6: + f.thickStart=int(fields[6]) + f.thickEnd=int(fields[7]) + f.itemRgb=fields[8] + f.blockCount=int(fields[9]) + f.blockSizes=fields[10] + f.blockStarts=fields[11] + return f + +def parseBedFile(lines, reqSort=False): + """ return a Features() object """ + features = Features() + lastStart = -1 + for l in lines: + l = l.strip() + if l.startswith("track") or l.startswith("browser") or l.startswith("#") or l=="": + continue + f = parseBedLine(l) + if reqSort and lastStart > f.start: + sys.stderr.write("error: bed file is not sorted, violating feature: %s" % str(f)) + sys.exit(1) + features.append(f) + return features +# ----------- MAIN -------------- +if args==[]: + parser.print_help() + exit(1) + +if options.debug: + logging.basicConfig(level=logging.DEBUG) +if options.all: + doAll = True + +# parse input arguments +infile = args[0] +outfile = args[1] +upstream = options.upstream +limit = options.limit +chromSize=None +if options.chromSizes!=None: + chromSize = slurpdict(options.chromSizes) + +stderr.write("Reading beds ...\n") +beds = parseBedFilename(infile) +stderr.write("%d features read\n" % len(beds)) + +if options.uniqueNames: + newBeds = [] + seenNames = set() + for b in beds: + if b.name not in seenNames: + newBeds.append(b) + seenNames.add(b.name) + logging.info("Removing features with duplicate names, feature count %d -> %d" % \ + (len(beds), len(newBeds))) + beds = newBeds + +# open output file +if outfile=="stdout": + outf = stdout +else: + outf = open(outfile, "w") + +# init stats counters +sizeMissing=0 +outCount = 0 +dublCount = 0 +lastLeft=None +included = 0 +# output flank around first feature +output(outf, None, beds[0], upstream, limit, "ig", chromSize) +# these are the main iterator variables for the following loop +left = None +right = None + +# iterate over all bed features from input file +for i in range(0, len(beds)-1): + left = beds[i] + right = beds[i+1] + + # generate features "on the edges" and break where chrom ends or starts + if left.chrom != right.chrom: + if chromSize!=None: + output(outf, left, None, upstream, limit, "ig", chromSize) + else: + sizeMissing+=1 + output(outf, None, right, upstream, limit, "ig", chromSize) + lastLeft = left + continue + + assert(not left.overlaps(right)) # features cannot overlap + + # sanity check + if left.start > right.start: + stderr.write("error: bed file does not seem to be sorted\n") + stderr.write("check these features: %s \n<->\n %s\n" % (left, right)) + exit(1) + + if left.name==right.name: + rangeType = "in" + else: + rangeType = "ig" + output(outf, left, right, upstream, limit, rangeType, chromSize) + + if doAll: + outf.write("%s\t%d\t%d\tex:%s\n" % (left.chrom, left.start, left.end, left.name)) + + lastLeft = left + +# write some overall statistics to stderr +stderr.write("%d features written\n" % outCount) +stderr.write("%d genes/exons were dropped because one includes the other\n" % included) +if chromSize==None: + stderr.write("%d features are missing because of missing chromSizes file\n" % sizeMissing) +if upstream: + stderr.write("%d features are dublicated because of bidirectional genes and upstream option\n" % dublCount)