557415034ca7f233f8f4b5be7201261136ed80ae max Wed Sep 21 16:13:31 2016 -0700 small fix to two crispr-related utils

diff --git src/utils/bedBetween src/utils/bedBetween
index 1e065c3..ec454ed 100755
--- src/utils/bedBetween
+++ src/utils/bedBetween
@@ -1,360 +1,369 @@
 #!/usr/bin/env python2.7
 # Max Haeussler, 6/2007, 6/2014
 
 from sys import *
 import sys
 from optparse import OptionParser
 import logging
 
 # === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
 parser = OptionParser("usage: %prog [options] inSortedBedFile outfile (stdout ok): given sorted feats., create features between them and annotated these with the neighboring bednames. Regions around chromosomes are limited to 50kbp.")
 parser.add_option("-d", "", dest="debug", action="store_true", \
     help="show debug messages")
 parser.add_option("-s", "--chromSizes", dest="chromSizes", action="store", \
     help="use this file with chrom size - lines to get chrom sizes")
 parser.add_option("-u", "--upstream", dest="upstream", action="store_true", \
     help="only report intergenic regions that are upstream (if between -/+ neighbors, two features are created)", default=False)
 parser.add_option("-l", "--limit", dest="limit", action="store", \
     help="if -u is set: limit the length of upstream regions to this size", default=0, type="int")
+parser.add_option("-m", "--maxLen", dest="maxLen", action="store", \
+    help="limit names of features to this many characters in output bed file", default=None, type="int")
 parser.add_option("", "--uniqueNames", dest="uniqueNames", action="store_true", \
     help="keep only the first feature, if identical names")
 parser.add_option("-a", "--all", dest="all", action="store_true", \
     help="output all features (input and spacers between them) and annotate with 'ex:', 'in:', 'ig:' (exon, intron, intergenic)")
 (options, args) = parser.parse_args()
 
 doAll = False
+maxNameLen = None
+
 # ==== FUNCTIONs =====
 def truncSize(start, end, limit, onLeft):
     """ if limit is true, cut down start end to max limit length and return as pair """
     if limit!=0 and end-start > limit:
         if onLeft:
             return (start, start+limit)
         else:
             return (end-limit, end)
     return start, end
 
 def writeBeds(outf, chrom, posList, names, type):
     for i in range(0, len(names)):
         name = names[i]
+        if maxNameLen is not None and len(name)>maxNameLen:
+            name = name[:maxNameLen-3]+"..."
+
         start, end = posList[i]
         # skip 0-length features
         if start==end:
             continue
         outf.write("%s\t%d\t%d\t%s\n" % (chrom, start, end, name))
 
 def output(fh, left, right, upstream, limit, type, chromSizes=None):
     """ given two flanking beds, create lists of names for the regions between and write to fh"""
     """ this can return 0, 1 or 2 different names.
     type can be "ig", "ex" or "in" = intergenic, exon, intron """
     global outCount
     global dublCount
 
     names = []
     posList = []
 
     # basic info
     if left==None:
         chrom = right.chrom
         start = 0
         end = right.start
         leftname = "Gap"
         rightname = right.name
     elif right==None:
         chrom = left.chrom
         start = left.end
         end = chromSizes[left.chrom]
         leftname = left.name
         rightname = "Gap"
     else:
         chrom = left.chrom
         start = left.end
         end = right.start
         leftname = left.name
         rightname = right.name
 
     if not upstream:
         posList = [ (start, end) ]
         if doAll:
             if type=="ig":
                 names = ["ig:"+leftname+"|"+rightname]
             elif type=="in":
                 assert(leftname==rightname)
                 names = ["in:"+leftname]
         else:
             names = [leftname+"|"+rightname]
     else:
         # + everything that depends on strands (-u option)
         if left!=None and right!=None and left.strand=="-" and right.strand=="+":
             start1, end1 = truncSize(start, end, limit, True)
             start2, end2 = truncSize(start, end, limit, False)
             posList = [(start1,end1), (start2, end2)]
             names = [leftname, rightname]
             dublCount+=1
         elif left!=None and left.strand=="-":
             start, end = truncSize(start, end, limit, True)
             posList = [ (start, end) ]
             names = [leftname]
         elif right!=None and right.strand=="+":
             start, end = truncSize(start, end, limit, False)
             posList = [ (start, end) ]
             names = [rightname]
         else:
             return
 
     writeBeds(fh, chrom, posList, names, type)
     outCount += len(names)
     lastLeft = left
 
 def slurpdict(fname):
     """ read in tab delimited file """
     dict = {}
     for l in open(fname, "r"):
         l = l.strip("\n")
         fs = l.split("\t")
         if not len(fs)>1:
             continue
         key = fs[0]
         val = int(fs[1])
         if key not in dict:
             dict[key] = val
         else:
             sys.stderr.write("info: hit key %s two times: %s -> %s\n" %(key, key, val))
     return dict
 
 # --- generic bed stuff, from bed.py -----
 def coordOverlap(start1, end1, start2, end2):
     """ returns true if two Features overlap """
     result = (( start2 <= start1 and end2 > start1) or \
         (start2 < end1 and end2 >= end1) or \
         (start1 >= start2 and end1 <= end2) or \
         (start2 >= start1 and end2 <= end1))
     return result
 
 def hasOverlap(f1, f2):
     """ returns true if two Features overlap """
     if f1.chrom != f2.chrom:
         return False
     result = coordOverlap(f1.start, f1.end, f2.start, f2.end)
     return result
 
 def parseBedFilename(fname, reqSorted=False, quiet=False):
     if not quiet:
         sys.stderr.write("Reading %s...\n" % fname)
     if fname=="stdin":
         return parseBedFile(sys.stdin, reqSorted)
     else:
         return parseBedFile(open(fname,"r"), reqSorted)
 
 class Features(list):
     def __repr__(self):
         buf = []
         for i in self:
             buf.append(repr(i))
         return "\n".join(buf)
 
     def __sort__(self):
         self.sort(Feature.sort)
 
     def writeToFileHandle(self, fh):
         for b in self:
             fh.write(str(b)+"\n")
 
     def writeToFile(self, fname):
         if fname!="stdout":
             fh = open(fname, "w")
         else:
             fh = sys.stdout
         self.writeToFileHandle(fh)
         fh.close()
 
     def countChromosomes(self):
         chroms = set()
         for b in self:
             chroms.add(b.chrom)
         return len(chroms)
 
 class Feature:
     def __init__(self, line):
         l = line.split()
         count = len(l)
         self.chrom = l[0]
         self.start = int(l[1])
         self.end = int(l[2])
         if count >= 4:
             self.name = l[3]
         if count >= 5:
             self.strand= l[4]
         if count >= 6:
             self.score = int(l[5])
 
     def __init__(self,seqid="",start=0,end=0,name="",score=0,strand="+"):
         self.chrom = seqid
         self.start = int(start)
         self.end = int(end)
         self.name = name
         self.score = int(score)
         self.strand = strand
 
     def __repr__(self):
         fields = [self.chrom,str(self.start),str(self.end)]
         if "name" in self.__dict__:
             fields.append(self.name)
         if "score" in self.__dict__:
             fields.append(str(self.score))
         if "strand" in self.__dict__:
             fields.append(self.strand)
         if "blockStarts" in self.__dict__:
             fields.append(str(self.thickStart))
             fields.append(str(self.thickEnd))
             fields.append(self.itemRgb)
             fields.append(str(self.blockCount))
             fields.append(self.blockSizes)
             fields.append(self.blockStarts)
         return "\t".join(fields)
 
     def includes(self, f, tol=0):
         if self.chrom!=f.chrom:
             return False
         else:
             if self.start-tol <= f.start and self.end+tol >= f.end:
                 return True
 
     def overlaps(self, f1):
         """ returns true if two Features overlap """
         return hasOverlap(self,f1)
 
 def parseBedLine(line):
     f = Feature()
     fields = line.split("\t")
     f.chrom = fields[0]
     f.start = int(fields[1])
     f.end = int(fields[2])
     if len(fields)>3:
         f.name = fields[3]
     if len(fields)>4:
         f.score = int(fields[4])
     if len(fields)>5:
         f.strand = fields[5]
     if len(fields)>6:
         f.thickStart=int(fields[6])
         f.thickEnd=int(fields[7])
         f.itemRgb=fields[8]
         f.blockCount=int(fields[9])
         f.blockSizes=fields[10]
         f.blockStarts=fields[11]
     return f
 
 def parseBedFile(lines, reqSort=False):
     """ return a Features() object """
     features = Features()
     lastStart = -1
     for l in lines:
         l = l.strip()
         if l.startswith("track") or l.startswith("browser") or l.startswith("#") or l=="":
             continue
         f = parseBedLine(l)
         if reqSort and lastStart > f.start:
             sys.stderr.write("error: bed file is not sorted, violating feature: %s" % str(f))
             sys.exit(1)
         features.append(f)
     return features
 
 # ----------- MAIN --------------
 if args==[]:
     parser.print_help()
     exit(1)
 
 if options.debug:
     logging.basicConfig(level=logging.DEBUG)
 if options.all:
     doAll = True
 
 # parse input arguments
 infile = args[0]
 outfile = args[1]
 upstream = options.upstream
 limit = options.limit
 
 chromSize=None
 if options.chromSizes!=None:
     chromSize = slurpdict(options.chromSizes)
 
+maxNameLen = options.maxLen
+
 stderr.write("Reading beds ...\n")
 beds = parseBedFilename(infile)
 stderr.write("%d features read\n" % len(beds))
 
 if options.uniqueNames:
     newBeds = []
     seenNames = set()
     for b in beds:
         if b.name not in seenNames:
             newBeds.append(b)
             seenNames.add(b.name)
     logging.info("Removing features with duplicate names, feature count %d -> %d" % \
         (len(beds), len(newBeds)))
     beds = newBeds
 
 # open output file
 if outfile=="stdout":
     outf = stdout
 else:
     outf = open(outfile, "w")
 
 # init stats counters
 sizeMissing=0
 outCount = 0
 dublCount = 0
 lastLeft=None
 included = 0
 
 # output flank around first feature
 output(outf, None, beds[0], upstream, limit, "ig", chromSize)
 
 # these are the main iterator variables for the following loop
 left = None
 right = None
 
 # iterate over all bed features from input file
 for i in range(0, len(beds)-1):
     left = beds[i]
     right = beds[i+1]
 
     # generate features "on the edges" and break where chrom ends or starts
     if left.chrom != right.chrom:
         if chromSize!=None:
             output(outf, left, None, upstream, limit, "ig", chromSize)
         else:
             sizeMissing+=1
         output(outf, None, right, upstream, limit, "ig", chromSize)
         lastLeft = left
         continue
 
     assert(not left.overlaps(right)) # features cannot overlap
 
     # sanity check
     if left.start > right.start:
         stderr.write("error: bed file does not seem to be sorted\n")
         stderr.write("check these features: %s \n<->\n %s\n" % (left, right))
         exit(1)
 
     if left.name==right.name:
         rangeType = "in"
     else:
         rangeType = "ig"
     output(outf, left, right, upstream, limit, rangeType, chromSize)
 
     if doAll:
         outf.write("%s\t%d\t%d\tex:%s\n" % (left.chrom, left.start, left.end, left.name))
     lastLeft = left
 
 # write some overall statistics to stderr
 stderr.write("%d features written\n" % outCount)
 stderr.write("%d genes/exons were dropped because one includes the other\n" % included)
 if chromSize==None:
     stderr.write("%d features are missing because of missing chromSizes file\n" % sizeMissing)
 if upstream:
     stderr.write("%d features are dublicated because of bidirectional genes and upstream option\n" % dublCount)
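The new -m/--maxLen option in this commit only shortens long feature names before they are written to the output BED file. Below is a minimal standalone sketch of that truncation rule; the helper name truncName and the example feature name are made up for illustration and are not part of the commit itself.

# Illustrative sketch (hypothetical helper, not part of bedBetween):
# keep at most maxNameLen characters of a feature name, ending the cut with "..."
def truncName(name, maxNameLen):
    if maxNameLen is not None and len(name) > maxNameLen:
        return name[:maxNameLen-3] + "..."
    return name

# e.g. running "bedBetween -m 10 in.bed out.bed" would shorten a long pair name like this:
print truncName("geneA|geneB_long_suffix", 10)   # -> geneA|g...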