src/hg/encode/encodeValidate/doEncodeValidate.pl 1.225

1.225 2010/05/21 17:38:00 braney
add checking of the cell type's sex for BAMs
Index: src/hg/encode/encodeValidate/doEncodeValidate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v
retrieving revision 1.224
retrieving revision 1.225
diff -b -B -U 4 -r1.224 -r1.225
--- src/hg/encode/encodeValidate/doEncodeValidate.pl	11 May 2010 20:25:02 -0000	1.224
+++ src/hg/encode/encodeValidate/doEncodeValidate.pl	21 May 2010 17:38:00 -0000	1.225
@@ -180,9 +180,9 @@
 
 sub validateFiles {
     # Validate array of filenames, ordered by part
     # Check files exist and are of correct data format
-    my ($files, $type, $track, $daf) = @_;
+    my ($files, $type, $track, $daf, $cell) = @_;
     my @newFiles;
     my @errors;
     my $regex = "\`\|\\\|\|\"\|\'";
     doTime("beginning validateFiles") if $opt_timing;
@@ -212,9 +212,9 @@
             pushError(\@errors, "File \'$file\' is empty");
         } elsif(!(-r $file)) {
             pushError(\@errors, "File \'$file\' is un-readable");
         } else {
-            pushError(\@errors, checkDataFormat($daf->{TRACKS}{$track}{type}, $file));
+            pushError(\@errors, checkDataFormat($daf->{TRACKS}{$track}{type}, $file, $cell));
         }
     }
     $files = \@newFiles;
     doTime("done validateFiles") if $opt_timing;
@@ -758,13 +758,29 @@
 }
 
 sub validateBam
 {
-    my ($path, $file, $type) = @_;
+    my ($path, $file, $type, $cell) = @_;
     doTime("beginning validateBam") if $opt_timing;
     HgAutomate::verbose(2, "validateBam($path,$file,$type)\n");
     my $paramList = validationSettings("validateFiles","bam");
-    my $safe = SafePipe->new(CMDS => ["validateFiles $quickOpt $paramList -type=BAM -chromDb=$daf->{assembly} $file"]);
+    my $sex = $terms{'Cell Line'}->{$cell}->{'sex'};
+    my $downloadDir = "/hive/groups/encode/dcc/pipeline/downloads/$assembly/referenceSequences";
+    my $infoFile =  "$downloadDir/female.$assembly.chrom.sizes";
+    my $twoBitFile =  "$downloadDir/female.$assembly.2bit";
+    if ($sex ne "F")  {
+        $infoFile =  "$downloadDir/male.$assembly.chrom.sizes";
+        $twoBitFile =  "$downloadDir/male.$assembly.2bit";
+    }
+
+    # index the BAM file
+    my $safe = SafePipe->new(CMDS => ["samtools index $file"]);
+    if(my $err = $safe->exec()) {
+	print STDERR  "ERROR: failed samtools index : " . $safe->stderr() . "\n";
+	# don't show end-user pipe error(s)
+	return("failed validateBam for '$file'");
+    }
+    $safe = SafePipe->new(CMDS => ["validateFiles $quickOpt $paramList -type=BAM -chromInfo=$infoFile -genome=$twoBitFile $file"]);
     if(my $err = $safe->exec()) {
 	print STDERR  "ERROR: failed validateBam : " . $safe->stderr() . "\n";
 	# don't show end-user pipe error(s)
 	return("failed validateBam for '$file'");
@@ -951,21 +967,21 @@
 # Misc subroutines
 
 sub validateDdfField {
     # validate value for type of field
-    my ($type, $val, $track, $daf) = @_;
+    my ($type, $val, $track, $daf, $cell) = @_;
     $type =~ s/ /_/g;
     HgAutomate::verbose(4, "Validating $type: " . (defined($val) ? $val : "") . "\n");
     if($validators{$type}) {
-        return $validators{$type}->($val, $type, $track, $daf);
+        return $validators{$type}->($val, $type, $track, $daf, $cell);
     } else {
         return $validators{'default'}->($val, $type, $track, $daf); # Considers the term controlled vocab
     }
 }
 
 sub checkDataFormat {
     # validate file type
-    my ($format, $file) = @_;
+    my ($format, $file, $cell) = @_;
     HgAutomate::verbose(3, "Checking data format for $file: $format\n");
     my $type = $format;
     if ($format =~ m/(bed) (\d+)/) {
         $format = $1;
@@ -973,9 +989,9 @@
     if ($format =~ m/(bedGraph) (\d+)/) {
         $format = $1;
     }
     $formatCheckers{$format} || return "Data format \'$format\' is unknown\n";
-    return $formatCheckers{$format}->($submitPath, $file, $type);
+    return $formatCheckers{$format}->($submitPath, $file, $type, $cell);
     HgAutomate::verbose(3, "Done checking data format for $file: $format\n");
 }
 
 sub ddfKey
@@ -1464,9 +1480,10 @@
         }
         $line{files} = \@filenames;
         my @metadataErrors;
         for my $field (keys %line) {
-            push(@metadataErrors, validateDdfField($field, $line{$field}, $view, $daf));
+            my $cell = $line{cell};
+            push(@metadataErrors, validateDdfField($field, $line{$field}, $view, $daf, $cell));
         }
         if(@metadataErrors) {
             pushError(\@errors, @metadataErrors);
         } else {