src/hg/encode/encodeValidate/doEncodeValidate.pl 1.166
1.166 2009/03/14 00:12:54 mikep
mods to keep massive sorts under control; sort files independently with mem limit, then do merge sort of all
Index: src/hg/encode/encodeValidate/doEncodeValidate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v
retrieving revision 1.165
retrieving revision 1.166
diff -b -B -U 4 -r1.165 -r1.166
--- src/hg/encode/encodeValidate/doEncodeValidate.pl 7 Mar 2009 23:24:29 -0000 1.165
+++ src/hg/encode/encodeValidate/doEncodeValidate.pl 14 Mar 2009 00:12:54 -0000 1.166
@@ -65,12 +65,12 @@
our $quickCount=100;
our $time0 = time;
our $timeStart = time;
our %chromInfo; # chromInfo from assembly for chrom validation
-our $maxBedRows=50_000_000; # number of rows to allow in a bed-type file
+our $maxBedRows=80_000_000; # number of rows to allow in a bed-type file
our %tableNamesUsed;
our ($grants, $fields, $daf);
-
+our $SORT_BUF = " -S 5G ";
sub usage {
print STDERR <<END;
usage: encodeValidate.pl submission-type project-submission-dir
@@ -518,8 +518,9 @@
# XXXX why not do the whole thing, rather than just 1000 lines?
push(@cmds, "head -1000 $filePath");
push(@cmds, "/cluster/bin/x86_64/wigEncode -noOverlapSpanData stdin /dev/null /dev/null");
}
+ # This can produce /data/tmp/SafePipe_NNN_.err files
my $safe = SafePipe->new(CMDS => \@cmds, STDOUT => "/dev/null", DEBUG => $opt_verbose - 1);
if(my $err = $safe->exec()) {
my $err = $safe->stderr();
chomp($err);
@@ -1505,14 +1506,14 @@
for my $file (@{$alignmentLine->{files}}) {
# Unzip any zipped files - only works if they are with .gz suffix
my ($fbase,$dir,$suf) = fileparse($file, ".gz");
if ($suf eq ".gz") {
- # If the zipped file exists and has not already been unzipped then unzip it
+ # If the zipped file exists then unzip it (do this each time, in case zip file is updated)
# This check is also done above at the stage where we are testign the files in the ddf exist
- if (-s $file and ! -s "$dir/$fbase") {
- my $err = system("gunzip $file");
+ if (-s $file) {
+ my $err = system("gunzip -c $file > $dir/$fbase");
if ($err) {
- die ("File \'$file\' failed gunzip $file\n");
+ die ("File \'$file\' failed gunzip $file to [$dir/$fbase]\n");
}
HgAutomate::verbose(2, "File \'$file\' gunzipped to \'$fbase\'\n");
}
if ( ! -s "$dir/$fbase") {
@@ -1539,12 +1540,24 @@
my $sortFiles;
if(defined($daf->{medianFragmentLength})) {
push(@cmds, "/cluster/bin/x86_64/bedExtendRanges $daf->{assembly} $daf->{medianFragmentLength} $files");
$sortFiles = " -";
+ # sorting stdin, so have to sort in mem (and control how much mem we use)
+ push @cmds, "sort $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n $sortFiles";
} else {
$sortFiles = $files;
+ # sort each file in place, controlling mem usage, then do merge sort
+ my @sortList = split(/\s+/, $sortFiles);
+ foreach my $f (@sortList) {
+ my $err = system("sort $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n -o $f $f ");
+ if ($err) {
+ die ("File \'$f\' failed sort\n");
+ }
+ HgAutomate::verbose(2, "File \'$f\' sorted\n");
+ }
+ # Now do the mergesort in the pipeline
+ push @cmds, "sort -m $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n $sortFiles";
}
- push @cmds, "sort -T $Encode::tempDir -k1,1 -k2,2n $sortFiles";
push @cmds, "grep -v -E \"^track\" ";
push @cmds, "gawk '\$6 == \"+\" {print}'" if $newView eq "PlusRawSignal";
push @cmds, "gawk '\$6 == \"-\" {print}'" if $newView eq "MinusRawSignal";
push @cmds, "bedItemOverlapCount $daf->{assembly} stdin";