9b4e3092e23d37d97634b3f09002bec996bd633e hiram Thu Jul 14 16:21:14 2016 -0700 bed inversion, show gaps between elements refs #13553 diff --git src/utils/bedInvert.pl src/utils/bedInvert.pl new file mode 100755 index 0000000..636061f --- /dev/null +++ src/utils/bedInvert.pl @@ -0,0 +1,95 @@ +#!/bin/env perl +# +# bedInvert - invert a bed file, output regions not covered by the bed file + +# DO NOT EDIT the /cluster/bin/scripts copy of this file -- +# edit ~/kent/src/utils/bedInvert.pl instead. + +use strict; +use warnings; + +my $argc = scalar(@ARGV); + +if ($argc != 2) { + printf STDERR "usage: bedInvert.pl [chrom.sizes] [file.bed] > invert.bed\n"; + printf STDERR "will output bed elements for gaps between elements in file.bed\n"; + printf STDERR "The file.bed does not have to be sorted, it will be\n"; + printf STDERR "\tsorted here.\n"; + printf STDERR "The chrom.sizes is used to get the last possible element\n"; + printf STDERR "\tto the end of the chromosome.\n"; + exit 255 +} + +my ($sizesFile, $bedFile) = @ARGV; + +my %chromSizes; # key is chrom name, value is length +if ($sizesFile =~ m/^stdin$/) { + open (FH, ") { + chomp $line; + my ($chr, $size) = split('\s+', $line); + $chromSizes{$chr} = $size; +} +close (FH); +my %chromDone; # key is chrom name, value is 1 to indicate done this chrom +my $chr = ""; +my $start = 0; +my $end = 0; +my $size = 0; +if ($bedFile =~ m/^stdin$/) { + open (FH, "grep -v '^ *#' /dev/stdin | sort -k1,1 -k2,2n |") or die "can not read /dev/stdin"; +} elsif ($bedFile =~ m/.gz$/) { + open (FH, "zcat $bedFile | grep -v '^ *#' | sort -k1,1 -k2,2n|") or die "can not read $bedFile"; +} else { + open (FH, "grep -v '^ *#' $bedFile | sort -k1,1 -k2,2n|") or die "can not read $bedFile"; +} +while (my $line = ) { + chomp $line; + my ($c, $s, $e, $rest) = split('\s+', $line, 4); + if (length($chr) > 1) { + if ($chr ne $c) { + my $chromEnd = $chromSizes{$chr}; + $size = $chromEnd - $end; # may be last element previous chrom + $chromDone{$chr} = 1; + printf "%s\t%d\t%d\t%d\n", $chr, $end, $chromEnd, $size if ($size > 0); + $chr = $c; $start = $s; $end = $e; + $size = $start - 0; + # might be the first gap this new chrom + $chromDone{$chr} = 1; + printf "%s\t0\t%d\t%d\n", $chr, $start, $size if ($size > 0); + } else { + if ($s <= $end) { # next element overlaps or joins previous element + $end = $e; + } else { # there is a gap here from $end to $s + $size = $s - $end; + $chromDone{$chr} = 1; + printf "%s\t%d\t%d\t%d\n", $chr, $end, $s, + $size if ($size > 0); + $chr = $c; $start = $s; $end = $e; + } + } + } else { # first chromosome, first element + $chr = $c; $start = $s; $end = $e; + $size = $start - 0; + $chromDone{$chr} = 1; + # might be the first gap this new chrom + printf "%s\t0\t%d\t%d\n", $chr, $start, $size if ($size > 0); + } +} +# might be final gap on last chrom mentioned +my $chromEnd = $chromSizes{$chr}; +$size = $chromEnd - $end; +$chromDone{$chr} = 1; +printf "%s\t%d\t%d\t%d\n", $chr, $end, $chromEnd, $size if ($size > 0);; +close (FH); + +# output gaps on chroms never mentioned yet, they had no elements +foreach $chr (keys %chromSizes) { + next if (exists($chromDone{$chr})); + printf "%s\t0\t%d\t%d\n", $chr, $chromSizes{$chr}, $chromSizes{$chr}; +}