557415034ca7f233f8f4b5be7201261136ed80ae max Wed Sep 21 16:13:31 2016 -0700 small fix to two crispr-related utils diff --git src/utils/bedBetween src/utils/bedBetween index 1e065c3..ec454ed 100755 --- src/utils/bedBetween +++ src/utils/bedBetween @@ -6,53 +6,60 @@ import sys from optparse import OptionParser import logging # === COMMAND LINE INTERFACE, OPTIONS AND HELP === parser = OptionParser("usage: %prog [options] inSortedBedFile outfile (stdout ok): given sorted feats., create features between them and annotated these with the neighboring bednames. Regions around chromosomes are limited to 50kbp.") parser.add_option("-d", "", dest="debug", action="store_true", \ help="show debug messages") parser.add_option("-s", "--chromSizes", dest="chromSizes", action="store", \ help="use this file with chrom size - lines to get chrom sizes") parser.add_option("-u", "--upstream", dest="upstream", action="store_true", \ help="only report intergenic regions that are upstream (if between -/+ neighbors, two features are created)", default=False) parser.add_option("-l", "--limit", dest="limit", action="store", \ help="if -u is set: limit the length of upstream regions to this size", default=0, type="int") +parser.add_option("-m", "--maxLen", dest="maxLen", action="store", \ + help="limit names of features to this many characters in output bed file", default=None, type="int") parser.add_option("", "--uniqueNames", dest="uniqueNames", action="store_true", \ help="keep only the first feature, if identical names") parser.add_option("-a", "--all", dest="all", action="store_true", \ help="output all features (input and spacers between them) and annotate with 'ex:', 'in:', 'ig:' (exon, intron, intergenic)") (options, args) = parser.parse_args() doAll = False +maxNameLen = None + # ==== FUNCTIONs ===== def truncSize(start, end, limit, onLeft): """ if limit is true, cut down start end to max limit length and return as pair """ if limit!=0 and end-start > limit: if onLeft: return (start, start+limit) else: return (end-limit, end) return start, end def writeBeds(outf, chrom, posList, names, type): for i in range(0, len(names)): name = names[i] + if maxNameLen is not None and len(name)>maxNameLen: + name = name[:maxNameLen-3]+"..." + start, end = posList[i] # skip 0-length features if start==end: continue outf.write("%s\t%d\t%d\t%s\n" % (chrom, start, end, name)) def output(fh, left, right, upstream, limit, type, chromSizes=None): """ given two flanking beds, create lists of names for the regions between and write to fh""" """ this can return 0, 1 or 2 different names. type can be "ig", "ex" or "in" = intergenic, exon, intron """ global outCount global dublCount names = [] @@ -272,30 +279,32 @@ if options.debug: logging.basicConfig(level=logging.DEBUG) if options.all: doAll = True # parse input arguments infile = args[0] outfile = args[1] upstream = options.upstream limit = options.limit chromSize=None if options.chromSizes!=None: chromSize = slurpdict(options.chromSizes) +maxNameLen = options.maxLen + stderr.write("Reading beds ...\n") beds = parseBedFilename(infile) stderr.write("%d features read\n" % len(beds)) if options.uniqueNames: newBeds = [] seenNames = set() for b in beds: if b.name not in seenNames: newBeds.append(b) seenNames.add(b.name) logging.info("Removing features with duplicate names, feature count %d -> %d" % \ (len(beds), len(newBeds))) beds = newBeds