c937ed5d9ffb20f8cfcbe18152a3f65929a3853c hiram Thu Aug 18 12:11:37 2022 -0700 maf sort command used to clean up maf files refs #29898 diff --git src/utils/mafSort.pl src/utils/mafSort.pl new file mode 100755 index 0000000..4ffc11c --- /dev/null +++ src/utils/mafSort.pl @@ -0,0 +1,106 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Basename; + +my $argc = scalar(@ARGV); + +if ($argc != 2 ) { + printf STDERR "usage: mafSort.pl targetId file.maf > sorted.maf\n"; + printf STDERR "this sorts the maf blocks by the chromStart position of the target\n"; + printf STDERR "reference species. This is a two pass read of the input file.maf, thus it\n"; + printf STDERR "can not be a stdin reference, it must be an actual file, AND it can *not* be\n"; + printf STDERR "gzipped, it must be plain text. The target reference species is expected\n"; + printf STDERR "to be found on the 's' line beginning:\n"; + printf STDERR "s targetId.chromName ...\n"; + printf STDERR "in the case of overlapping blocks, it will still be out sequence, and those\n"; + printf STDERR "lines will be noted.\n"; + exit 255; +} + +sub vmPeak() { + my $pid = $$; + my $vmPeak = `grep -m 1 -i vmpeak /proc/$pid/status`; + chomp $vmPeak; + printf STDERR "# %s\n", $vmPeak; +} + +my @header; # first lines beginning with ^# +my @blocks; # array index is block number starting at 0 + # value is the 'tell' byte position in the file where this + # block starts +my %blockStarts; # key is block number starting at 0 + # value is start position of this block +my %blockLength; # key is block number starting at 0 + # value is this block length +my $blockCount = 0; +my $prevEnd = 0; + +my $targetId = shift; +my $inFile = shift; + +open (FH, "<$inFile") or die "can not read $inFile"; + +while (my $line = ) { + chomp $line; + if ($line =~ m/^#/) { + if ($blockCount > 0) { + printf STDERR "ERROR: why are there header lines after block $blockCount ?\n"; + printf STDERR "%s\n", $line; + exit 255 + } + push (@header, $line); + } elsif ($line =~ m/^a /) { + my $bytePosition = tell(FH); + $bytePosition -= length($line) + 1; + push (@blocks, $bytePosition); + } elsif ($line =~ m/^s $targetId/) { + my (undef, $chr, $s, $l, undef) = split('\s+', $line, 5); + $chr =~ s/$targetId.//; + $blockLength{$blockCount} = $l; + $blockStarts{$blockCount++} = $s; + if ($s < $prevEnd) { + printf STDERR "out of sequence %d < %d at block %d\n", $s, $prevEnd, $blockCount; + } + $prevEnd = $s + $l; + } +} +close (FH); + +printf STDERR "read %d blocks in $inFile\n", $blockCount; + +foreach my $headLine (@header) { + printf "%s\n", $headLine; +} +$prevEnd = 0; +open (FH, "<$inFile") or die "can not read $inFile"; +foreach my $blockId (sort { $blockStarts{$a} <=> $blockStarts{$b} } keys %blockStarts) { + my $s = $blockStarts{$blockId}; + my $l = $blockLength{$blockId}; + my $bytePosition = $blocks[$blockId]; + seek(FH, $bytePosition, 0); + my $line = ; + chomp $line; + if ($line !~ m/^a /) { + printf "ERROR: expected line to begin ^a, instead found:\n'%s'\n", $line; + exit 255; + } + printf "%s\n", $line; + while ($line = ) { + chomp $line; + if (length($line) > 0) { + printf "%s\n", $line; + } else { + last; + } + } + printf "\n"; + + if ($s < $prevEnd) { + printf STDERR "still out of sequence %d < %d for blockId %d\n", $s, $prevEnd, $blockId; + } + $prevEnd = $s + $l; +} +close (FH); +vmPeak();