src/hg/encode/encodeValidate/doEncodeValidate.pl 1.167
1.167 2009/03/14 07:26:56 mikep
Adding a downloadOnly property to Views in the DAF; adding validateFiles support for broadPeak, fastq, and tagAlign so that CSHL data can be validated in a reasonable time.
Index: src/hg/encode/encodeValidate/doEncodeValidate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v
retrieving revision 1.166
retrieving revision 1.167
diff -b -B -U 4 -r1.166 -r1.167
--- src/hg/encode/encodeValidate/doEncodeValidate.pl 14 Mar 2009 00:12:54 -0000 1.166
+++ src/hg/encode/encodeValidate/doEncodeValidate.pl 14 Mar 2009 07:26:56 -0000 1.167
@@ -29,8 +29,9 @@
use Carp qw(cluck);
use Cwd;
use IO::File;
use File::Basename;
+use Data::Dumper; # MJP
use lib "/cluster/bin/scripts";
use Encode;
use HgAutomate;
@@ -673,32 +674,28 @@
sub validateTagAlign
{
my ($path, $file, $type) = @_;
- # MJP: for now, allow colorspace sequences as well as DNA + dot
- my @list = ({TYPE => "chrom", NAME => "chrom"},
- {TYPE => "uint", NAME => "chromStart"},
- {TYPE => "uint", NAME => "chromEnd"},
- {REGEX => "[0-3ATCGN\\.]+", NAME => "sequence"},
- {TYPE => "uint", NAME => "score"},
- {REGEX => "[+-\\.]", NAME => "strand"});
- # MJP: for now, allow 10x more tagAlign records for Cshl project as we are not loading them
- return validateWithList($path, $file, $type, 10*$maxBedRows, "validateTagAlign", \@list);
+ my $safe = SafePipe->new(CMDS => ["validateFiles -type=tagAlign $file"]);
+ if(my $err = $safe->exec()) {
+ print STDERR "ERROR: failed validateTagAlign : " . $safe->stderr() . "\n";
+ # don't show pipe error(s) to the end user
+ return("failed validateTagAlign for '$file'");
+ }
+ return ();
}
sub validatePairedTagAlign
# This is like tag align but with two additional sequence fields appended; seq1 and seq2
{
my ($path, $file, $type) = @_;
- my @list = ({TYPE => "chrom", NAME => "chrom"},
- {TYPE => "uint", NAME => "chromStart"},
- {TYPE => "uint", NAME => "chromEnd"},
- {TYPE => "string", NAME => "sequence"},
- {TYPE => "uint", NAME => "score"},
- {REGEX => "[+-\\.]", NAME => "strand"},
- {REGEX => "[ACGTNacgtn]*", NAME => "seq1"},
- {REGEX => "[ACGTNacgtn]*", NAME => "seq2"});
- return validateWithList($path, $file, $type, $maxBedRows, "validatePairedTagAlign", \@list);
+ my $safe = SafePipe->new(CMDS => ["validateFiles -type=pairedTagAlign $file"]);
+ if(my $err = $safe->exec()) {
+ print STDERR "ERROR: failed validatePairedTagAlign : " . $safe->stderr() . "\n";
+ # don't show pipe error(s) to the end user
+ return("failed validatePairedTagAlign for '$file'");
+ }
+ return ();
}
sub validateNarrowPeak
{
@@ -718,18 +715,15 @@
sub validateBroadPeak
{
my ($path, $file, $type) = @_;
- my @list = ({TYPE => "chrom", NAME => "chrom"},
- {TYPE => "uint", NAME => "chromStart"},
- {TYPE => "uint", NAME => "chromEnd"},
- {TYPE => "string", NAME => "name"},
- {TYPE => "uint", NAME => "score"},
- {REGEX => "[+-\\.]", NAME => "strand"},
- {TYPE => "float", NAME => "signalValue"},
- {TYPE => "float", NAME => "pValue"},
- {TYPE => "float", NAME => "qValue"});
- return validateWithList($path, $file, $type, $maxBedRows, "validateBroadPeak", \@list);
+ my $safe = SafePipe->new(CMDS => ["validateFiles -type=broadPeak $file"]);
+ if(my $err = $safe->exec()) {
+ print STDERR "ERROR: failed validateBroadPeak : " . $safe->stderr() . "\n";
+ # don't show pipe error(s) to the end user
+ return("failed validateBroadPeak for '$file'");
+ }
+ return ();
}
sub validateGappedPeak
{
@@ -765,41 +759,14 @@
# - fastq produced directly from Solexa has a 'solexa' quality score
# - fastq defined by Sanger has a 'PHRED' quality score
# - The 2 urls above show how to convert between both
my ($path, $file, $type) = @_;
- HgAutomate::verbose(2, "validateFastQ($path,$file,$type)\n");
- return () if $opt_skipValidateFastQ;
- doTime("beginning validateFastQ") if $opt_timing;
- my $fh = openUtil($path, $file);
- my $line = 0;
- my $state = 'firstLine';
- my $seqName;
- my $seqNameRegEx = "[A-Za-z0-9_.:/-]+";
- my $seqRegEx = "[A-Za-z\n\.~]+";
- my $qualRegEx = "[!-~\n]+"; # ord(!)=33, ord(~)=126
- my $states = {firstLine => {REGEX => "\@($seqNameRegEx)", NEXT => 'seqLine'},
- seqLine => {REGEX => $seqRegEx, NEXT => 'plusLine'},
- plusLine => {REGEX => "\\\+([A-Za-z0-9_.:/-]*)", NEXT => 'qualLine'},
- qualLine => {REGEX => $qualRegEx, NEXT => 'firstLine'}};
- while(<$fh>) {
- chomp;
- $line++;
- my $errorPrefix = "Invalid $type file; line $line in file '$file' is invalid [validateFastQ]";
- my $regex = $states->{$state}{REGEX};
- if(/^${regex}$/) {
- $seqName = $1 if($state eq 'firstLine');
- if($state eq 'plusLine' && defined($1) && $1 && $1 ne $seqName) {
- return("$errorPrefix: seqence name '$1' does not match previous seqence name '$seqName'\nline: $_");
- }
- $state = $states->{$state}{NEXT};
- } else {
- return("$errorPrefix (expecting $state):\nline: $_");
- }
- last if($opt_quick && $line >= $quickCount);
+ my $safe = SafePipe->new(CMDS => ["validateFiles -type=fastq $file"]);
+ if(my $err = $safe->exec()) {
+ print STDERR "ERROR: failed validateFastQ : " . $safe->stderr() . "\n";
+ # don't show pipe error(s) to the end user
+ return("failed validateFastQ for '$file'");
}
- $fh->close();
- HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
- doTime("done validateFastQ", $line) if $opt_timing;
return ();
}
sub validateCsfasta
@@ -1084,16 +1051,17 @@
}
}
sub isDownloadOnly {
- my ($view, $grant, $lab) = @_;
+ my ($view, $grant, $lab, $daf) = @_;
+ # Added a 'downloadOnly' setting (value 'yes') to DAF views so these rules can be explicit rather than hardcoded
# Dont load any RawData* or Comparative views,
# Dont load Alignments unless they are from Gingeras or Wold labs (RNA folks like to see their RNAs)
# Riken group have RawData and RawData2 because they have colorspace fasta and quality files
# Wold group have RawData, RawData[2-7]
# Wold group alignments are called 'Aligns', 'Splices', 'Paired'
- return ($view =~ m/^RawData\d*$/ or $view eq 'Comparative'
- or ($view eq 'Alignments' and $grant ne "Gingeras" and $grant ne "Wold")) ? 1 : 0;
+ return ( (($daf->{TRACKS}->{$view}->{downloadOnly} || "") eq 'yes') or ($view =~ m/^RawData\d*$/ or $view eq 'Comparative'
+ or ($view eq 'Alignments' and $grant ne "Gingeras" and $grant ne "Wold"))) ? 1 : 0;
}
sub printCompositeTdbSettings {
# prints out trackDb.ra settings for the composite track
@@ -1110,9 +1078,9 @@
# Cycle through to get best view to default labels and to get all views and terms
for my $view (keys %{$daf->{TRACKS}}) {
for my $key (keys %ddfSets) {
if(defined($ddfSets{$key}{VIEWS}{$view})) {
- my $downloadOnly = isDownloadOnly($view, $daf->{grant}, $daf->{lab});
+ my $downloadOnly = isDownloadOnly($view, $daf->{grant}, $daf->{lab}, $daf);
if(!$downloadOnly) {
$setting = $setting . " " . $view . "=" . $view;
$visDefault = $visDefault . " " . $view . "=";
if($view eq "Peaks") {
@@ -1471,10 +1439,9 @@
# note this loop assumes these are on a per replicate basis.
# Also note that any project (like transcriptome) that doesnt have replicates should also use
# this for their auto-create signals.
HgAutomate::verbose(2, "ddfReplicateSets loop key=[$key] aln=[".(defined($ddfReplicateSets{$key}{VIEWS}{Alignments}))."] rawsig=[".(defined($ddfReplicateSets{$key}{VIEWS}{RawSignal}))."]\n");
-
- if(defined($ddfReplicateSets{$key}{VIEWS}{Alignments})
+ if($daf->{noAutoCreate} ne "yes" && defined($ddfReplicateSets{$key}{VIEWS}{Alignments})
&& !defined($ddfReplicateSets{$key}{VIEWS}{RawSignal})
&& !defined($ddfReplicateSets{$key}{VIEWS}{PlusRawSignal})
&& !defined($ddfReplicateSets{$key}{VIEWS}{MinusRawSignal})
&& ($daf->{dataType} ne 'MethylSeq')) {
@@ -1735,9 +1702,9 @@
# XXXX Move the decision about which views have tracks into the DAF?
# Already this is used in 2 places so made it a function,
# would be better in the DAF except we'd have to go change all the DAFs :(
- my $downloadOnly = isDownloadOnly($view, $daf->{grant}, $daf->{lab});
+ my $downloadOnly = isDownloadOnly($view, $daf->{grant}, $daf->{lab}, $daf);
print LOADER_RA "tablename $tableName\n";
print LOADER_RA "view $view\n";
print LOADER_RA "type $type\n";