src/hg/encode/encodeValidate/doEncodeValidate.pl 1.171

1.171 2009/04/02 22:50:33 mikep
fix warning on comparison with undefined; add fasta file validation by validateFiles program
Index: src/hg/encode/encodeValidate/doEncodeValidate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v
retrieving revision 1.170
retrieving revision 1.171
diff -b -B -U 4 -r1.170 -r1.171
--- src/hg/encode/encodeValidate/doEncodeValidate.pl	26 Mar 2009 07:01:20 -0000	1.170
+++ src/hg/encode/encodeValidate/doEncodeValidate.pl	2 Apr 2009 22:50:33 -0000	1.171
@@ -838,44 +838,26 @@
     return ();
 }
 
 sub validateFasta
-# Wold lab fasta files; they dont have fastq format. 
+# Wold lab & Helicos have fasta files; no quality, one line per sequence
 # Sample fasta lines are:
 #>HWI-EAS229_75_30DY0AAXX:7:1:0:949/1
 #NGCGGATGTTCTCAGTGTCCACAGCGCAGGTGAAATAAGGGAAGCAGTAGCGACGCCCATCTCCACGCGCAGCGC
 #>HWI-EAS229_75_30DY0AAXX:7:1:0:1739/1
 #NAGCCATCAGGAAAGCAAGGAGGGGGCATTAAAGGACAATCAAGGGGTTTGGAGGAAGGAGCAGGCCGGAGGCAA
 {
-    # Wold lab has fasta files, like fastq format without quality
     my ($path, $file, $type) = @_;
     doTime("beginning validateFasta") if $opt_timing;
     HgAutomate::verbose(2, "validateFasta($path,$file,$type)\n");
-    return () if $opt_skipValidateFastQ;
-    doTime("beginning validateFasta") if $opt_timing;
-    my $fh = Encode::openUtil($file, $path);
-    my $line = 0;
-    my $state = 'firstLine';
-    my $seqName;
-    my $seqNameRegEx = "[A-Za-z0-9_.:/-]+";
-    my $seqRegEx = "[A-Za-z\n\.~]+";
-    my $states = {firstLine => {REGEX => ">($seqNameRegEx)", NEXT => 'seqLine'},
-                  seqLine => {REGEX => $seqRegEx, NEXT => 'firstLine'}};
-    while(<$fh>) {
-        chomp;
-        $line++;
-        my $errorPrefix = "Invalid $type file; line $line in file '$file' is invalid [validateFasta]";
-        my $regex = $states->{$state}{REGEX};
-        if(/^${regex}$/) {
-	        $state = $states->{$state}{NEXT};
-        } else {
-	         return("$errorPrefix (expecting $state):\nline: $_");
-        }
-        last if($opt_quick && $line >= $quickCount);
+    my $safe = SafePipe->new(CMDS => ["validateFiles -type=fasta $file"]);
+    if(my $err = $safe->exec()) {
+	print STDERR  "ERROR: failed validateFasta : " . $safe->stderr() . "\n";
+	# don't show end-user pipe error(s)
+	return("failed validateFasta for '$file'");
      }
-    $fh->close();
     HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
-    doTime("done validateFasta", $line) if $opt_timing;
+    doTime("done validateFasta") if $opt_timing;
     return ();
 }
 
 sub validateRpkm
@@ -1421,9 +1403,9 @@
 	# note this loop assumes these are on a per replicate basis.
 	# Also note that any project (like transcriptome) that doesnt have replicates should also use
 	# this for their auto-create signals.
 	HgAutomate::verbose(2, "ddfReplicateSets loop key=[$key] aln=[".(defined($ddfReplicateSets{$key}{VIEWS}{Alignments}))."] rawsig=[".(defined($ddfReplicateSets{$key}{VIEWS}{RawSignal}))."]\n");
-        if($daf->{noAutoCreate} ne "yes" && defined($ddfReplicateSets{$key}{VIEWS}{Alignments})
+        if( ( !defined($daf->{noAutoCreate}) || $daf->{noAutoCreate} ne "yes") && defined($ddfReplicateSets{$key}{VIEWS}{Alignments})
 		&& !defined($ddfReplicateSets{$key}{VIEWS}{RawSignal})
 		&& !defined($ddfReplicateSets{$key}{VIEWS}{PlusRawSignal})
 		&& !defined($ddfReplicateSets{$key}{VIEWS}{MinusRawSignal})
 		&& ($daf->{dataType} ne 'MethylSeq')) {