src/hg/encode/encodeValidate/doEncodeValidate.pl 1.171
1.171 2009/04/02 22:50:33 mikep
fix warning on comparison with undefined; add fasta file validation by validateFiles program
Index: src/hg/encode/encodeValidate/doEncodeValidate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v
retrieving revision 1.170
retrieving revision 1.171
diff -b -B -U 4 -r1.170 -r1.171
--- src/hg/encode/encodeValidate/doEncodeValidate.pl 26 Mar 2009 07:01:20 -0000 1.170
+++ src/hg/encode/encodeValidate/doEncodeValidate.pl 2 Apr 2009 22:50:33 -0000 1.171
@@ -838,44 +838,26 @@
return ();
}
sub validateFasta
-# Wold lab fasta files; they dont have fastq format.
+# Wold lab & Helicos have fasta files; no quality, one line per sequence
# Sample fasta lines are:
#>HWI-EAS229_75_30DY0AAXX:7:1:0:949/1
#NGCGGATGTTCTCAGTGTCCACAGCGCAGGTGAAATAAGGGAAGCAGTAGCGACGCCCATCTCCACGCGCAGCGC
#>HWI-EAS229_75_30DY0AAXX:7:1:0:1739/1
#NAGCCATCAGGAAAGCAAGGAGGGGGCATTAAAGGACAATCAAGGGGTTTGGAGGAAGGAGCAGGCCGGAGGCAA
{
- # Wold lab has fasta files, like fastq format without quality
my ($path, $file, $type) = @_;
doTime("beginning validateFasta") if $opt_timing;
HgAutomate::verbose(2, "validateFasta($path,$file,$type)\n");
- return () if $opt_skipValidateFastQ;
- doTime("beginning validateFasta") if $opt_timing;
- my $fh = Encode::openUtil($file, $path);
- my $line = 0;
- my $state = 'firstLine';
- my $seqName;
- my $seqNameRegEx = "[A-Za-z0-9_.:/-]+";
- my $seqRegEx = "[A-Za-z\n\.~]+";
- my $states = {firstLine => {REGEX => ">($seqNameRegEx)", NEXT => 'seqLine'},
- seqLine => {REGEX => $seqRegEx, NEXT => 'firstLine'}};
- while(<$fh>) {
- chomp;
- $line++;
- my $errorPrefix = "Invalid $type file; line $line in file '$file' is invalid [validateFasta]";
- my $regex = $states->{$state}{REGEX};
- if(/^${regex}$/) {
- $state = $states->{$state}{NEXT};
- } else {
- return("$errorPrefix (expecting $state):\nline: $_");
- }
- last if($opt_quick && $line >= $quickCount);
+ my $safe = SafePipe->new(CMDS => ["validateFiles -type=fasta $file"]);
+ if(my $err = $safe->exec()) {
+ print STDERR "ERROR: failed validateFasta : " . $safe->stderr() . "\n";
+ # don't show end-user pipe error(s)
+ return("failed validateFasta for '$file'");
}
- $fh->close();
HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
- doTime("done validateFasta", $line) if $opt_timing;
+ doTime("done validateFasta") if $opt_timing;
return ();
}
sub validateRpkm
@@ -1421,9 +1403,9 @@
# note this loop assumes these are on a per replicate basis.
# Also note that any project (like transcriptome) that doesnt have replicates should also use
# this for their auto-create signals.
HgAutomate::verbose(2, "ddfReplicateSets loop key=[$key] aln=[".(defined($ddfReplicateSets{$key}{VIEWS}{Alignments}))."] rawsig=[".(defined($ddfReplicateSets{$key}{VIEWS}{RawSignal}))."]\n");
- if($daf->{noAutoCreate} ne "yes" && defined($ddfReplicateSets{$key}{VIEWS}{Alignments})
+ if( ( !defined($daf->{noAutoCreate}) || $daf->{noAutoCreate} ne "yes") && defined($ddfReplicateSets{$key}{VIEWS}{Alignments})
&& !defined($ddfReplicateSets{$key}{VIEWS}{RawSignal})
&& !defined($ddfReplicateSets{$key}{VIEWS}{PlusRawSignal})
&& !defined($ddfReplicateSets{$key}{VIEWS}{MinusRawSignal})
&& ($daf->{dataType} ne 'MethylSeq')) {