src/hg/encode/encodeValidate/doEncodeValidate.pl 1.177
1.177 2009/04/16 19:03:30 tdreszer
Support for DDF level fragLength for Alignments to RawSignal. Now creating metadata setting for trackDb and new fileDb.ra
Index: src/hg/encode/encodeValidate/doEncodeValidate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v
retrieving revision 1.176
retrieving revision 1.177
diff -b -B -U 4 -r1.176 -r1.177
--- src/hg/encode/encodeValidate/doEncodeValidate.pl 7 Apr 2009 18:21:30 -0000 1.176
+++ src/hg/encode/encodeValidate/doEncodeValidate.pl 16 Apr 2009 19:03:30 -0000 1.177
@@ -41,8 +41,9 @@
use vars qw/
$opt_allowReloads
$opt_configDir
$opt_fileType
+ $opt_justFileDb
$opt_metaDataOnly
$opt_outDir
$opt_quick
$opt_skipAll
@@ -86,8 +87,9 @@
-allowReloads Allow reloads of existing tables
-configDir=dir Path of configuration directory, containing
metadata .ra files (default: submission-dir/../config)
-fileType=type used only with validateFile option; e.g. narrowPeak
+ -justFileDb Just generate the fileDb.ra file which contains all metadata
-metaDataOnly Process DAF/DDF and just update the projects.metadata field;
equal to -allowReloads -skipAll
-quick Validate only first $quickCount lines of files
-skipAll Turn on all "-skip..." options
@@ -1079,9 +1082,8 @@
############################################################################
# Main
my $now = time();
-my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($now);
my @ddfHeader; # list of field names on the first line of DDF file
my %ddfHeader = (); # convenience hash version of @ddfHeader (maps name to field index)
my @ddfLines = (); # each line in DDF (except for fields header); value is a hash; e.g. {files => 'foo.bed', cell => 'HeLa-S3', ...}
my %ddfSets = (); # info about DDF entries broken down by ddfKey
@@ -1096,8 +1098,9 @@
"quick",
"timing",
"skipAll",
"skipAutoCreation",
+ "justFileDb",
"skipOutput",
"skipValidateFiles",
"skipValidateFastQ",
"validateDaf",
@@ -1107,8 +1110,11 @@
);
usage() if (!$ok);
$opt_verbose = 1 if (!defined $opt_verbose);
$opt_sendEmail = 0 if (!defined $opt_sendEmail);
+if($opt_justFileDb) {
+ $opt_skipAll = $opt_quick = $opt_allowReloads =1;
+}
$quickOpt = " -quick " if defined ($opt_quick);
if($opt_skipAll) {
$opt_skipAutoCreation = $opt_skipOutput = $opt_skipValidateFiles = 1;
@@ -1236,8 +1242,11 @@
# Open dataset descriptor file (DDF)
my @glob = glob "*.DDF";
push(@glob, glob "*.ddf");
my $ddfFile = Encode::newestFile(@glob);
+my $fileTime = (stat($ddfFile))->ctime;
+my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = gmtime($fileTime);
+
HgAutomate::verbose(2, "Using newest DDF file \'$ddfFile\'\n");
my $lines = Encode::readFile($ddfFile);
my $ddfLineNumber = 0;
@@ -1298,8 +1307,12 @@
}
my $i = 0;
my %line;
for my $val (split('\t', $line)) {
+ if($ddfHeader[$i] ne "files" && $val =~ / /) {
+ $val =~ s/\"/\\"/g if $val =~ /\"/;
+ $val = '"' . $val . '"';
+ }
$line{$ddfHeader[$i]} = $val;
$i++;
}
if(my @tmp = Encode::validateValueList(\%line, $fields, 'ddf')) {
@@ -1378,11 +1391,8 @@
&& !defined($ddfReplicateSets{$key}{VIEWS}{RawSignal})
&& !defined($ddfReplicateSets{$key}{VIEWS}{PlusRawSignal})
&& !defined($ddfReplicateSets{$key}{VIEWS}{MinusRawSignal})
&& ($daf->{dataType} ne 'MethylSeq')) {
- if($daf->{dataType} eq 'ChipSeq' && !defined($daf->{medianFragmentLength})) {
- pushError(\@errors, "Missing medianFragmentLength field; this field is required for dataType '$daf->{dataType}' when RawSignal view is not provided");
- } else {
# hack for case where they have removed RawSignal view in the DAF
# - if no (Plus|Minus|)RawSignal is defined, assume RawSignal is required
if(!defined($daf->{TRACKS}{RawSignal}{order})
&& !defined($daf->{TRACKS}{PlusRawSignal}{order})
@@ -1397,8 +1407,21 @@
foreach my $newView (@newViews) #loop around making them
{
my $alignmentLine = $ddfReplicateSets{$key}{VIEWS}{Alignments};
+ # Time to check for fragLength by replicate (in alignments line) (die, don't just push error and build anyway)
+ if($newView eq "RawSignal") {
+ if(!defined($alignmentLine->{fragLength})) {
+ if(!defined($daf->{medianFragmentLength})) {
+ die (\@errors, "Missing fragLength field for building $daf->{dataType} '$newView' for replicate $alignmentLine->{replicate}\nThe fragLength is required and is the median fragment length used in generating this replicate.\n");
+ } else { # Letting medianFragmentLength stand in for per-replicate fragLength
+ $alignmentLine->{fragLength} = $daf->{medianFragmentLength};
+ }
+ }
+ if ($alignmentLine->{fragLength} < 0 || $alignmentLine->{fragLength} > 10000) {
+ die (\@errors, "Missing or invalid fragLength field for building $daf->{dataType} '$newView' for replicate $alignmentLine->{replicate}\nThe fragLength is required and is the median fragment length used in generating this replicate.\n");
+ }
+ }
my %line = %{$alignmentLine};
$line{view} = $newView;
$line{type} = 'wig';
$ddfReplicateSets{$key}{VIEWS}{$newView} = \%line;
@@ -1435,13 +1458,14 @@
HgAutomate::verbose(2, "Skipping auto-creating view '$newView' for key '$key'\n");
} else {
HgAutomate::verbose(2, "Auto-creating view '$newView' for key '$key' in file '$tmpFile'\n");
doTime("beginning Auto-create of view $newView in file $tmpFile") if $opt_timing;
+
# XXXX gzip before saving to disk?
my @cmds;
my $sortFiles;
- if(defined($daf->{medianFragmentLength})) {
- push(@cmds, "/cluster/bin/x86_64/bedExtendRanges $daf->{assembly} $daf->{medianFragmentLength} $files");
+ if(defined($alignmentLine->{fragLength})) {
+ push(@cmds, "/cluster/bin/x86_64/bedExtendRanges $daf->{assembly} $alignmentLine->{fragLength} $files");
$sortFiles = " -";
# sorting stdin, so have to sort in mem (and control how much mem we use)
push @cmds, "sort $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n $sortFiles";
} else {
@@ -1473,9 +1497,8 @@
$line{files} = [$tmpFile];
push(@ddfLines, \%line);
} # End foreach newView loop
}
- }
} # End replicate sets loop
doTime("done ddfReplicateSets loop") if $opt_timing;
}
@@ -1505,8 +1528,13 @@
open(LOADER_RA, ">$outPath/$Encode::loadFile") || die "SYS ERROR: Can't write \'$outPath/$Encode::loadFile\' file; error: $!\n";
open(TRACK_RA, ">$outPath/$Encode::trackFile") || die "SYS ERROR: Can't write \'$outPath/$Encode::trackFile\' file; error: $!\n";
open(README, ">$outPath/README.txt") || die "SYS ERROR: Can't write '$outPath/READEME.txt' file; error: $!\n";
}
+if($opt_justFileDb || !$opt_skipOutput) {
+ open(FILE_RA, ">$outPath/$Encode::fileDbFile") || die "SYS ERROR: Can't write '$outPath/$Encode::fileDbFile' file; error: $!\n";
+} else {
+ open(FILE_RA, ">>/dev/null");
+}
# Create a composite track entry if the trackDb.ra entry was not found
if(!$opt_skipOutput && !$compositeExists) {
printCompositeTdbSettings(*TRACK_RA,$daf,%ddfSets);
@@ -1522,8 +1550,35 @@
$ddfLineNumber++;
my $diePrefix = "ERROR on DDF lineNumber $ddfLineNumber:";
my $view = $ddfLine->{view};
my $type = $daf->{TRACKS}{$view}{type} || die "Missing DAF entry for view '$view'\n";
+ my $metadata = "project=wgEncode grant=$daf->{grant} lab=$daf->{lab} dataType=$daf->{dataType}";
+ $metadata .= " cell=$ddfLine->{cell}" if $ddfLine->{cell}; # force some order
+ $metadata .= " antibody=$ddfLine->{antibody}" if $ddfLine->{antibody};
+ for my $key (keys %{$ddfLine}) {
+ my $value = $ddfLine->{$key};
+ if($value
+ && $key ne 'files'
+ && $key ne 'cell'
+ && $key ne 'antibody'
+ && $key ne 'view'
+ && $key ne 'replicate'
+ && $key ne 'labVersion'
+ && $key ne 'softwareVersion') {
+ $metadata .= " $key=$value"; # and the rest
+ }
+ }
+ $metadata .= " view=$view";
+ $metadata .= " replicate=$ddfLine->{replicate}" if $ddfLine->{replicate} && $daf->{TRACKS}{$view}{hasReplicates};
+ $metadata .= " labVersion=$ddfLine->{labVersion}" if $ddfLine->{labVersion};
+ $metadata .= " softwareVersion=$ddfLine->{softwareVersion}" if $ddfLine->{softwareVersion};
+ $metadata .= ' dataVersion="' . $Encode::dataVersion .'"';
+ if($submitDir =~ /(\d+)$/) {
+ $metadata .= " subId=$1";
+ }
+ my (undef, undef, undef, $rMDay, $rMon, $rYear) = Encode::restrictionDate($now);
+ $metadata .= sprintf(" dateSubmitted=%04d-%02d-%02d", 1900 + $year, $mon + 1, $mday);
+ $metadata .= sprintf(" dateUnrestricted=%04d-%02d-%02d", 1900 + $rYear, $rMon + 1, $rMDay);
HgAutomate::verbose(2, " View: $view\n");
my $replicate;
if($hasReplicates && $daf->{TRACKS}{$view}{hasReplicates}) {
@@ -1533,12 +1588,15 @@
die "$diePrefix invalid or missing replicate value\n";
}
}
# Construct table name from track name and variables
- my $tableName = "$compositeTrack$view";
+ my $tableName = "$compositeTrack";
+ if($Encode::dafVersion le "1.0") {
+ $tableName .= $view;
if(defined($replicate)) {
$tableName .= "Rep$replicate";
}
+ }
if(!defined($daf->{TRACKS}{$view}{shortLabelPrefix})) {
$daf->{TRACKS}{$view}{shortLabelPrefix} = "";
}
my $shortLabel = defined($daf->{TRACKS}{$view}{shortLabelPrefix}) ? $daf->{TRACKS}{$view}{shortLabelPrefix} : "";
@@ -1546,9 +1604,9 @@
if(defined($replicate)) {
$longLabel .= " Replicate $replicate";
}
my $subGroups = "view=$view";
- my $additional = "\n";
+ my $additional = "";
my $pushQDescription = "";
my $species;
my $tier1 = 0;
if (@variables) {
@@ -1566,9 +1624,9 @@
my $longSuffix;
my %shortViewMap = (Peaks => 'Pk', Signal => 'Sig', RawSignal => 'Raw', PlusRawSignal => 'PlusRaw', MinusRawSignal => 'MinusRaw');
if($hash{'antibody'} && $hash{'cell'}) {
$pushQDescription = "$hash{'antibody'} in $hash{'cell'}";
- $shortSuffix = "$hash{'antibody'} $hash{'cell'}";
+ $shortSuffix = "$hash{'cell'} $hash{'antibody'}";
$longSuffix = "$hash{'antibody'} in $hash{'cell'} cells";
} elsif($hash{'ripAntibody'} && $hash{'ripTgtProtein'} && $hash{'cell'}) {
$longSuffix = "$hash{'ripTgtProtein'} in $hash{'cell'} cells using $hash{'ripAntibody'}";
$pushQDescription = $longSuffix;
@@ -1618,8 +1676,14 @@
$subGroups .= " $groupVar=$hash{$var}";
$additional = " $var $hash{$var}\n" . $additional;
}
}
+ if($Encode::dafVersion gt "1.0") {
+ $tableName .= "$view";
+ if(defined($replicate)) {
+ $tableName .= "Rep$replicate";
+ }
+ }
# mysql doesn't allow hyphens in table names and our naming convention doesn't allow underbars; to be
# safe, we strip non-alphanumerics.
$tableName =~ s/[^A-Za-z0-9]//g;
@@ -1640,8 +1704,26 @@
# Already this is used in 2 places so made it a function,
# would be better in the DAF except we'd have to go change all the DAFs :(
my $downloadOnly = isDownloadOnly($view, $daf->{grant}, $daf->{lab}, $daf);
+ print FILE_RA " filename $tableName.$type.gz\n";
+ $metadata .= " composite=$compositeTrack";
+
+ if($downloadOnly) {
+ my $parentTable = $tableName;
+ $parentTable =~ s/RawData/RawSignal/ if $parentTable =~ /RawData/;
+ $parentTable =~ s/Alignments/RawSignal/ if $parentTable =~ /Alignments/;
+ print FILE_RA " parentTable $parentTable\n";
+ $metadata .= " parentTable=$parentTable";
+ } else {
+ print FILE_RA " tableName $tableName\n";
+ $metadata .= " tableName=$tableName";
+ }
+ print FILE_RA " composite $compositeTrack\n";
+ my $fileType = $type;
+ $fileType =~ s/ //g;
+ $metadata .= " fileName=$tableName.$fileType.gz";
+
print LOADER_RA "tablename $tableName\n";
print LOADER_RA "view $view\n";
print LOADER_RA "type $type\n";
if($species) {
@@ -1653,9 +1735,9 @@
print LOADER_RA "downloadOnly $downloadOnly\n";
print LOADER_RA "pushQDescription $pushQDescription\n";
print LOADER_RA "\n";
- my (undef, undef, undef, $rMDay, $rMon, $rYear) = Encode::restrictionDate($now);
+ print FILE_RA sprintf(" metadata %s\n\n", $metadata);
if($downloadOnly || ($type eq "wig" && !grep(/$Encode::autoCreatedPrefix/, @{$ddfLine->{files}}))) {
# adds entries to README.txt for download only files AND wig data (excepting wig data generated by us)
print README "file: $tableName.$type.gz\n";
@@ -1697,32 +1779,17 @@
print TRACK_RA sprintf(" dataVersion %s\n", $Encode::dataVersion);
if(defined($ddfLine->{accession}) && length($ddfLine->{accession}) > 0) {
print TRACK_RA sprintf(" accession %s\n",$ddfLine->{accession});
}
- print TRACK_RA " priority " . ($priority + $daf->{TRACKS}{$view}{order}) . "\n";
- # noInherit is necessary b/c composite track will often have a different dummy type setting.
- print TRACK_RA " noInherit on\n";
- # removing default individual subtrack configurability, for performance reasons
- # add back as needed, by wrangler discretion
- #if($view eq 'RawSignal' and 0) { # Sorry tim, you will have to list your projects here
- #print TRACK_RA " configurable off\n";
- #} else {
- #print TRACK_RA " configurable on\n";
- #}
- if($type eq 'wig') {
- print TRACK_RA <<END;
- spanList first
- windowingFunction mean
- maxHeightPixels 100:16:16
-END
- } elsif($type eq 'bed 5 +') {
- print TRACK_RA " useScore 1\n";
- }
print TRACK_RA $additional;
+ # metadata project=wgEncode lab=Yale cell=GM12878 antibody=Pol2 labVersion="PeakSeq 1.2 ..." dataVersion="ENCODE Feb 2009 Freeze"
+ print TRACK_RA sprintf(" metadata %s\n", $metadata);
+ print TRACK_RA "\n";
}
}
close(LOADER_RA);
close(TRACK_RA);
+close(FILE_RA);
close(README);
doTime("done out files") if $opt_timing;
if($submitDir =~ /(\d+)$/) {