src/hg/encode/encodeValidate/doEncodeValidate.pl 1.166

1.166 2009/03/14 00:12:54 mikep
mods to keep massive sorts under control; sort files independently with mem limit, then do merge sort of all
Index: src/hg/encode/encodeValidate/doEncodeValidate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v
retrieving revision 1.165
retrieving revision 1.166
diff -b -B -U 4 -r1.165 -r1.166
--- src/hg/encode/encodeValidate/doEncodeValidate.pl	7 Mar 2009 23:24:29 -0000	1.165
+++ src/hg/encode/encodeValidate/doEncodeValidate.pl	14 Mar 2009 00:12:54 -0000	1.166
@@ -65,12 +65,12 @@
 our $quickCount=100;
 our $time0 = time;
 our $timeStart = time;
 our %chromInfo;         # chromInfo from assembly for chrom validation
-our $maxBedRows=50_000_000; # number of rows to allow in a bed-type file
+our $maxBedRows=80_000_000; # number of rows to allow in a bed-type file
 our %tableNamesUsed;
 our ($grants, $fields, $daf);
-
+our $SORT_BUF = " -S 5G ";
 
 sub usage {
     print STDERR <<END;
 usage: encodeValidate.pl submission-type project-submission-dir
@@ -518,8 +518,9 @@
         # XXXX why not do the whole thing, rather than just 1000 lines?
         push(@cmds, "head -1000 $filePath");
         push(@cmds, "/cluster/bin/x86_64/wigEncode -noOverlapSpanData stdin /dev/null /dev/null");
     }
+    # This can produce /data/tmp/SafePipe_NNN_.err files
     my $safe = SafePipe->new(CMDS => \@cmds, STDOUT => "/dev/null", DEBUG => $opt_verbose - 1);
     if(my $err = $safe->exec()) {
         my $err = $safe->stderr();
         chomp($err);
@@ -1505,14 +1506,14 @@
                 for my $file (@{$alignmentLine->{files}}) {
                     # Unzip any zipped files - only works if they are with .gz suffix
                     my ($fbase,$dir,$suf) = fileparse($file, ".gz");
                     if ($suf eq ".gz") {
-                        # If the zipped file exists and has not already been unzipped then unzip it
+                        # If the zipped file exists then unzip it (do this each time, in case zip file is updated)
                         # This check is also done above at the stage where we are testign the files in the ddf exist
-                        if (-s $file and ! -s "$dir/$fbase") {
-                            my $err = system("gunzip $file");
+                        if (-s $file) {
+                            my $err = system("gunzip -c $file > $dir/$fbase");
                             if ($err) {
-                                die ("File \'$file\' failed gunzip $file\n");
+                                die ("File \'$file\' failed gunzip $file to [$dir/$fbase]\n");
                             }
                             HgAutomate::verbose(2, "File \'$file\' gunzipped to \'$fbase\'\n");
                         }
                         if ( ! -s "$dir/$fbase") {
@@ -1539,12 +1540,24 @@
                         my $sortFiles;
                         if(defined($daf->{medianFragmentLength})) {
                             push(@cmds, "/cluster/bin/x86_64/bedExtendRanges $daf->{assembly} $daf->{medianFragmentLength} $files");
                             $sortFiles = " -";
+			    # sorting stdin, so have to sort in mem (and control how much mem we use)
+			    push @cmds, "sort $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n $sortFiles";
                         } else {
                             $sortFiles = $files;
+			    # sort each file in place, controlling mem usage, then do merge sort
+			    my @sortList = split(/\s+/, $sortFiles);
+			    foreach my $f (@sortList) {
+				my $err = system("sort $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n -o $f $f ");
+				if ($err) {
+				    die ("File \'$f\' failed sort\n");
+				}
+				HgAutomate::verbose(2, "File \'$f\' sorted\n");
+			    }
+			    # Now do the mergesort in the pipeline
+			    push @cmds, "sort -m $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n $sortFiles";
                         }
-                        push @cmds, "sort -T $Encode::tempDir -k1,1 -k2,2n $sortFiles";
 			push @cmds, "grep -v -E \"^track\" ";
 			push @cmds, "gawk '\$6 == \"+\" {print}'" if $newView eq "PlusRawSignal";
 			push @cmds, "gawk '\$6 == \"-\" {print}'" if $newView eq "MinusRawSignal";
                         push @cmds, "bedItemOverlapCount $daf->{assembly} stdin";