src/hg/encode/encodeValidate/doEncodeValidate.pl 1.170
1.170 2009/03/26 07:01:20 mikep
adding chrom/size checking against hg18
Index: src/hg/encode/encodeValidate/doEncodeValidate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v
retrieving revision 1.169
retrieving revision 1.170
diff -b -B -U 1000000 -r1.169 -r1.170
--- src/hg/encode/encodeValidate/doEncodeValidate.pl 22 Mar 2009 02:37:47 -0000 1.169
+++ src/hg/encode/encodeValidate/doEncodeValidate.pl 26 Mar 2009 07:01:20 -0000 1.170
@@ -1,1780 +1,1782 @@
#!/usr/bin/env perl
# encodeValidate.pl - validate an ENCODE data submission generated by the
# automated submission pipeline
#
# Verifies that all files and metadata are present and of correct formats
# Creates a load file (load.ra) and track configuration (trackDb.ra) for the datasets
#
# Returns 0 if validation succeeds
#
# Error reporting:
#
# We die immediately (with a human readable message) when internal errors are encountered (e.g. file I/O errors or misconfiguration).
#
# In order to facilitate debugging of often very large file uploads, we try to accumulate multiple user errors (e.g. DAF, DAS or
# file syntax errors) before die'ing with a message with a list of errors.
# DO NOT EDIT the /cluster/bin/scripts copy of this file --
# edit the CVS'ed source at:
# $Header$
use warnings;
use strict;
use File::stat;
use File::Basename;
use Getopt::Long;
use English;
use Carp qw(cluck);
use Cwd;
use IO::File;
use File::Basename;
-use Data::Dumper; # MJP
use lib "/cluster/bin/scripts";
use Encode;
use HgAutomate;
use HgDb;
use RAFile;
use SafePipe;
use vars qw/
$opt_allowReloads
$opt_configDir
$opt_fileType
$opt_metaDataOnly
$opt_outDir
$opt_quick
$opt_skipAll
$opt_skipAutoCreation
$opt_skipOutput
$opt_skipValidateFiles
$opt_skipValidateFastQ
$opt_validateDaf
$opt_validateFile
$opt_sendEmail
$opt_verbose
$opt_timing
/;
# Global variables
# Package-level state shared by the subs in this script.
our $submitPath; # full path of data submission directory
our $configPath; # full path of configuration directory
our $outPath; # full path of output directory
our %terms; # controlled vocabulary
our $quickCount=100; # number of lines checked per file when -quick is given
our $time0 = time; # time of the previous doTime() call (doTime resets it)
our $timeStart = time; # time at which the script was loaded
our %chromInfo; # chromInfo from assembly for chrom validation
our $maxBedRows=80_000_000; # number of rows to allow in a bed-type file
our %tableNamesUsed; # NOTE(review): presumably table names already seen; not referenced in this part of the file
our ($grants, $fields, $daf); # $daf: DAF hash; $grants: looked up by $daf->{grant} (see dieTellWrangler); $fields: not referenced in this part of the file
our $SORT_BUF = " -S 5G "; # NOTE(review): looks like a buffer-size option for sort(1) -- confirm against its users
sub usage {
# Print the command-line help text to STDERR and exit with status 1.
# The heredoc below is interpolated: it embeds $Encode::dafVersion,
# $Encode::loadFile, $Encode::trackFile and $quickCount.
print STDERR <<END;
usage: encodeValidate.pl submission-type project-submission-dir
submission-type is currently ignored.
Current dafVersion is: $Encode::dafVersion
Creates the following output files: $Encode::loadFile, $Encode::trackFile and README.txt
options:
-allowReloads Allow reloads of existing tables
-configDir=dir Path of configuration directory, containing
metadata .ra files (default: submission-dir/../config)
-fileType=type used only with validateFile option; e.g. narrowPeak
-metaDataOnly Process DAF/DDF and just update the projects.metadata field;
equal to -allowReloads -skipAll
-quick Validate only first $quickCount lines of files
-skipAll Turn on all "-skip..." options
-skipAutoCreation Tells script skip creating the auto-created files (e.g. RawSignal, PlusRawSignal, MinusRawSignal)
this can save you a lot of time when you are debugging and re-running the script on large projects
-skipOutput Dont write the various output files
-skipValidateFiles Tells script skip the file validation step; to save a lot of time during testing
-validateDaf exit after validating DAF file (project-submission-dir is the DAF file name).
-validateFile exit after validating file (project-submission-dir is the file name;
requires -fileType option as well)
-verbose=num Set verbose level to num (default 1).
-outDir=dir Path of output directory, for validation files
(default: submission-dir/out)
END
exit 1;
}
sub pushError
{
    # Append zero or more error messages to the caller's error list.
    #
    # $errors is a reference to the array being accumulated; the remaining
    # arguments are the new messages. Nothing is appended (and nothing is
    # logged) when no new messages are passed, which lets callers hand over
    # the possibly-empty result of another validator directly.
    my $errors = shift;
    my @new = @_;
    if(@new) {
        push(@{$errors}, @new);
        my $listing = join("\n\t", @new);
        HgAutomate::verbose(2, "pushing errors:\n\t$listing\n");
    }
}
sub doTime
# Report (via warn) the number of seconds elapsed since the previous call to
# this function, or since the script was loaded, then restart the clock.
#
# Arguments: an optional message label and an optional line count. When a
# positive line count is given, a lines/sec rate is appended; the elapsed time
# is then clamped to at least 1 second so the rate never divides by zero.
{
    my ($msg, $lines) = @_;
    $msg ||= "";
    $lines ||= 0;
    my $t = time() - $time0;
    my $rate = "";
    if ($lines > 0) {
        $t = 1 if $t < 1;
        $rate = " ($lines lines, " . int($lines / $t) . " lines/sec)";
    }
    warn("# $msg : $t secs" . $rate);
    $time0 = time;
}
sub dieTellWrangler
{
    # die with $msg, followed by a request to contact the wrangler.
    #
    # The wrangler's email address is appended when the global $grants hash
    # has a true 'wranglerEmail' entry for the grant named in the global DAF
    # ($daf->{grant}); otherwise the generic sentence is used.
    my $msg = shift;
    my $grantInfo = $grants->{$daf->{grant}};
    my $email;
    $email = $grantInfo->{wranglerEmail} if $grantInfo && $grantInfo->{wranglerEmail};
    my $contact = defined($email) ? " at $email" : "";
    die $msg . "Please contact your wrangler" . $contact . "\n";
}
############################################################################
# Validators for DDF columns -- extend when adding new metadata fields
#
# validators should return list of errors encountered (empty list means no errors were found).
#
# validator callbacks are called thus:
#
# validator(value, track, daf);
#
# value is value in DDF column
# track is track/view value
# daf is daf hash
# dispatch table
# Maps a DDF column name to a reference to the sub that validates values in
# that column. Columns that need no checking point at validateNoValidation.
our %validators = (
files => \&validateFiles,
view => \&validateDatasetName,
labVersion => \&validateNoValidation,
softwareVersion => \&validateNoValidation,
accession => \&validateNoValidation,
cell => \&validateCellLine,
gene => \&validateGeneType,
promoter => \&validatePromoter,
antibody => \&validateAntibody,
rnaExtract => \&validateRnaExtract,
localization => \&validateLocalization,
mapAlgorithm => \&validateMapAlgorithm,
ripAntibody => \&validateRipAntibody,
ripTgtProtein => \&validateRipTgtProtein,
fragSize => \&validateFragSize,
readType => \&validateReadType,
freezeDate => \&validateFreezeDate,
replicate => \&validateReplicate,
species => \&validateSpecies,
);
# standard validators (required or optional for all projects)
sub validateFiles {
    # Validator for the DDF 'files' column.
    #
    # $files is a reference to an array of file names / glob patterns (ordered
    # by part), $track is the track/view name and $daf is the DAF hash.
    # Each entry is glob-expanded; every resulting file must have a safe name,
    # exist, be non-empty and readable, and pass checkDataFormat() for the
    # type the DAF declares for $track.
    #
    # Returns the list of errors found (empty list if none). When
    # -skipValidateFiles is set an empty list is returned right after the glob
    # expansion, so glob failures are not reported in that mode.
    my ($files, $track, $daf) = @_;
    my (@newFiles, @errors);
    # characters that are unsafe in shell commands: ` | " '
    my $regex = "\`\|\\\|\|\"\|\'";
    doTime("beginning validateFiles") if $opt_timing;

    for my $pattern (@{$files}) {
        my @matches = glob $pattern;
        unless(@matches) {
            pushError(\@errors, "File '$pattern' does not exist (possibly bad glob?)");
            next;
        }
        push(@newFiles, @matches);
    }
    HgAutomate::verbose(3, " Track: $track Files: " . join (' ', @newFiles) . "\n");
    return () if $opt_skipValidateFiles;

    for my $file (@newFiles) {
        # Check if the file has been replaced with an unzipped version
        # This check is also done where we auto create the RawSignal view from the Alignments
        my ($fbase,$dir,$suf) = fileparse($file, ".gz");
        if ($suf eq ".gz" and ! -e $file and -s "$dir/$fbase") {
            $file = "$dir/$fbase";
        }

        # Do not allow filenames with suspicious characters (b/c filename will be used in shell commands).
        if($file =~ /($regex)/) {
            pushError(\@errors, "File '$file' has invalid characters; files cannot contain following characters: \"'`|");
            next;
        }
        if(!-e $file) {
            pushError(\@errors, "File \'$file\' does not exist");
            next;
        }
        if(!(-s $file)) {
            pushError(\@errors, "File \'$file\' is empty");
            next;
        }
        if(!(-r $file)) {
            pushError(\@errors, "File \'$file\' is un-readable");
            next;
        }
        pushError(\@errors, checkDataFormat($daf->{TRACKS}{$track}{type}, $file));
    }
    # NOTE(review): this only rebinds the local $files; the caller's array is not updated.
    $files = \@newFiles;
    doTime("done validateFiles") if $opt_timing;
    return @errors;
}
sub validateDatasetName {
    # Validator for the 'view' column: every value is accepted.
    # Returns an empty list (no errors).
    return ();
}
sub validateDataType {
    # Placeholder validator: every value is accepted.
    # Returns an empty list (no errors).
    return ();
}
sub validateRawDataAcc {
    # Placeholder validator: arguments are ignored and no errors are reported.
    my @noErrors = ();
    return @noErrors;
}
sub validateNoValidation {
    # Validator for columns that need no checking (e.g. labVersion):
    # arguments are ignored and no errors are reported.
    my @noErrors = ();
    return @noErrors;
}
# project-specific validators
sub validateCellLine {
    # A cell value is accepted when it is a 'Cell Line' term or a 'control'
    # term in the controlled vocabulary (%terms).
    # Returns an empty list when known, else a one-element list with the error.
    my ($val) = @_;
    my $entry = $terms{'Cell Line'}{$val} || $terms{'control'}{$val};
    return () if defined($entry);
    return ("Cell line \'$val\' is not known");
}
sub validateRnaExtract {
    # Accept $val only if it is a defined 'rnaExtract' term in %terms.
    # Returns an empty list when known, else a one-element list with the error.
    my ($val) = @_;
    return () if defined($terms{'rnaExtract'}{$val});
    return ("rnaExtract \'$val\' is not known");
}
sub validateLocalization {
    # Accept $val only if it is a defined 'localization' term in %terms.
    # Returns an empty list when known, else a one-element list with the error.
    my ($val) = @_;
    unless(defined($terms{'localization'}{$val})) {
        return ("localization \'$val\' is not known");
    }
    return ();
}
sub validateMapAlgorithm {
    # Accept $val only if it is a defined 'mapAlgorithm' term in %terms.
    # Returns an empty list when known, else a one-element list with the error.
    my ($val) = @_;
    return () if defined($terms{'mapAlgorithm'}{$val});
    return ("mapAlgorithm \'$val\' is not known");
}
sub validateRipAntibody {
    # A ripAntibody value is accepted when it is a 'ripAntibody' term or a
    # 'control' term in %terms.
    # Returns an empty list when known, else a one-element list with the error.
    my ($val) = @_;
    # TODO: Remove Encode::isControlInput after testing
    # return defined(lc($val) eq 'input' || lc($val) eq 'control' || $terms{'ripAntibody'}{$val}) ? () : ("ripAntibody \'$val\' is not known");
    my $entry = $terms{'ripAntibody'}{$val} || $terms{'control'}{$val};
    return () if defined($entry);
    return ("ripAntibody \'$val\' is not known");
}
sub validateRipTgtProtein {
    # Accept $val only if it is a defined 'ripTgtProtein' term in %terms.
    # Returns an empty list when known, else a one-element list with the error.
    my ($val) = @_;
    unless(defined($terms{'ripTgtProtein'}{$val})) {
        return ("ripTgtProtein \'$val\' is not known");
    }
    return ();
}
sub validateFragSize {
    # Accept $val only if it is a defined 'fragSize' term in %terms.
    # Returns an empty list when known, else a one-element list with the error.
    my ($val) = @_;
    return () if defined($terms{'fragSize'}{$val});
    return ("fragSize \'$val\' is not known");
}
sub validateReadType {
    # Accept $val only if it is a defined 'readType' term in %terms.
    # Returns an empty list when known, else a one-element list with the error.
    my ($val) = @_;
    unless(defined($terms{'readType'}{$val})) {
        return ("readType \'$val\' is not known");
    }
    return ();
}
sub validateGeneType {
    # Accept $val only if it is a defined 'Gene Type' term in %terms.
    # Returns an empty list when known, else a one-element list with the error.
    my ($val) = @_;
    return () if defined($terms{'Gene Type'}{$val});
    return ("Gene type \'$val\' is not known");
}
sub validatePromoter {
    # Accept $val only if it is a defined 'promoter' term in %terms.
    # Returns an empty list when known, else a one-element list with the error.
    my ($val) = @_;
    unless(defined($terms{'promoter'}{$val})) {
        return ("promoter \'$val\' is not known");
    }
    return ();
}
sub validateAntibody {
    # An antibody value is accepted when it is a defined 'Antibody' term or a
    # defined 'control' term in %terms (checked in that order).
    # Returns an empty list when known, else a one-element list with the error.
    my ($val) = @_;
    return () if defined($terms{'Antibody'}{$val});
    return () if defined($terms{'control'}{$val});
    return ("Antibody \'$val\' is not known");
}
sub validateFreezeDate {
    # Accept $val only if it is a defined 'freezeDate' term in %terms.
    # Returns an empty list when known, else a one-element list with the error.
    my ($val) = @_;
    return () if defined($terms{'freezeDate'}{$val});
    return ("freezeDate \'$val\' is not known");
}
sub validateReplicate {
    # Validator for the 'replicate' column: no checks are performed.
    my @noErrors = ();
    return @noErrors;
}
sub validateSpecies {
    # Accept $val only if it is a defined 'species' term in %terms.
    # Returns an empty list when known, else a one-element list with the error.
    my ($val) = @_;
    unless(defined($terms{'species'}{$val})) {
        return ("species \'$val\' is not known");
    }
    return ();
}
############################################################################
# Format checkers - check file format for given types; extend when adding new
# data formats
#
# Some of the checkers use regular expressions to validate syntax of the files.
# Others pass first 10 lines to utility loaders; the latter has:
# advantages:
# checks semantics as well as syntax
# disadvantages:
# only checks the beginning of the file
# some of the loaders tolerate invalid files (but then give incorrect results)
# dispatch table
# Maps a data format name to a reference to the sub that checks files of that
# format. The checker subs in this file all take ($path, $file, $type) and
# return a list of errors (empty when the file passes); some die instead.
our %formatCheckers = (
wig => \&validateWig,
bed => \&validateBed,
bedGraph => \&validateBedGraph,
bed5FloatScore => \&validateBed,
genePred => \&validateGene,
gtf => \&validateGtf,
tagAlign => \&validateTagAlign,
pairedTagAlign => \&validatePairedTagAlign,
narrowPeak => \&validateNarrowPeak,
broadPeak => \&validateBroadPeak,
gappedPeak => \&validateGappedPeak,
fastq => \&validateFastQ,
csfasta => \&validateCsfasta,
csqual => \&validateCsqual,
rpkm => \&validateRpkm,
fasta => \&validateFasta,
bowtie => \&validateBowtie,
psl => \&validatePsl,
cBiP => \&validateFreepass, # TODO: this is a dodge, because bed file is for different species, so chrom violations
);
# Regular expression (as a string) for a float: optional sign, optional
# integer part, optional '.', at least one digit, optional exponent.
# Note that it contains a capturing group (the exponent), which affects $N
# numbering in any pattern it is embedded in.
my $floatRegEx = "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?";
# my $floatRegEx = "[+-]?(?:\\.\\d+|\\d+(?:\\.\\d+|[eE]{1}?[+-]{1}?\\d+))"; # Tim's attempt
# my $floatRegEx = "[+-]?(?:\\.\\d+|\\d+(?:\\.\\d+|))"; # Original
# Maps a field TYPE name (as used in validation lists) to its regular
# expression; the special TYPE 'chrom' is handled separately by the users of
# this map.
my %typeMap = (int => "[+-]?\\d+", uint => "\\d+", float => $floatRegEx, string => "\\S+");
sub listToRegExp
{
    # Build one whole-line regular expression (returned as a string) from a
    # validation list.
    #
    # $validateList is a reference to a list of hashes with: {NAME, REGEX or TYPE}.
    # Fields are joined with \s+ and the result is anchored with ^ and $.
    # A field of TYPE 'chrom' becomes a capturing (\S+), so the caller should
    # look the captured value up in %chromInfo after a successful match.
    #
    # This is a speedup hack: callers match every line against this pattern
    # and only call the (slow) validateWithListUtil on lines that fail, to
    # obtain a field-specific error message.
    #
    # Dies with a PROGRAM ERROR for an unknown TYPE or for a field that has
    # neither TYPE nor REGEX.
    my ($validateList) = @_;
    my @parts;
    foreach my $field (@{$validateList}) {
        my $type = $field->{TYPE};
        if(defined($type) && $type eq 'chrom') {
            push(@parts, "(\\S+)");
            next;
        }
        my $pattern = $type ? $typeMap{$type} : $field->{REGEX};
        if(!$pattern) {
            die "PROGRAM ERROR: invalid TYPE: $type\n" if $type;
            die "PROGRAM ERROR: invalid type list (missing required REGEX or TYPE)\n";
        }
        push(@parts, $pattern);
    }
    return "^" . join("\\s+", @parts) . "\$";
}
sub validateWithListUtil
{
    # Validate $line field by field against a validation list.
    # Returns an error string describing the first problem found, or undef if
    # the line passes.
    #
    # The line is split on whitespace; the number of fields must equal the
    # number of entries in $validateList. A field of TYPE 'chrom' must be a
    # key with a true value in %chromInfo; any other field must fully match
    # the regular expression for its TYPE (from %typeMap) or its own REGEX.
    #
    # This is designed to give better feedback to user; ideally we would load
    # the validation list from the .as files.
    my ($line, $validateList) = @_;
    my @values = split(/\s+/, $line);
    my $seen = scalar(@values);
    my $expected = scalar(@{$validateList});
    my $fieldError = "; saw '$seen' fields; expected: '$expected'";
    return "not enough fields" . $fieldError if $seen < $expected;
    return "too many fields" . $fieldError if $seen > $expected;

    for my $i (0 .. $#{$validateList}) {
        my $field = $validateList->[$i];
        my $val = $values[$i];
        my $type = $field->{TYPE};
        if(defined($type) && $type eq 'chrom') {
            return "value '$val' for field '$field->{NAME}' is an invalid chromosome" if !$chromInfo{$val};
            next;
        }
        my $regex = $type ? $typeMap{$type} : $field->{REGEX};
        if(!$regex) {
            die "PROGRAM ERROR: invalid TYPE: $type\n" if $type;
            die "PROGRAM ERROR: invalid type list (missing required REGEX or TYPE)\n";
        }
        next if $val =~ /^$regex$/;
        my $error = "value '$val' is an invalid value for field '$field->{NAME}'";
        $error .= "; must be type '$type'" if $type;
        return $error;
    }
    return undef;
}
sub validateWithList
{
    # Open a file and validate each line with $validateList.
    #
    # Arguments:
    #   $path, $file   passed to Encode::openUtil to open the file
    #   $type          format name, used in messages
    #   $maxRows       maximum number of lines allowed in the file
    #   $name          the caller's subroutine name (used in error and debug messages)
    #   $validateList  reference to a list of {NAME, TYPE or REGEX} hashes
    #
    # Returns an empty list if the file passes, else a one-element list with
    # the first error found. Lines starting with '#' are skipped. With -quick
    # only the first $quickCount lines are checked.
    #
    # Each line is first matched against the single regexp built by
    # listToRegExp (fast path); only on failure is validateWithListUtil called
    # to produce a field-specific message.
    my ($path, $file, $type, $maxRows, $name, $validateList) = @_;
    my $lineNumber = 0;
    my $fh = Encode::openUtil($file, $path);
    my $regexp = listToRegExp($validateList);
    # NOTE(review): keyed on NAME here, while listToRegExp captures on TYPE 'chrom';
    # the two agree for the lists used in this part of the file.
    my $hasChrom = 0;
    for my $rec (@{$validateList}) {
        $hasChrom++ if($rec->{NAME} eq "chrom");
    }
    doTime("beginning validateWithList $name,$type,$maxRows") if $opt_timing;
    while(my $line = <$fh>) {
        chomp $line;
        $lineNumber++;
        return ("Invalid $type file; line $lineNumber in file '$file';\nerror: exceeded maximum number of rows allowed ($maxRows) \nline: $line") if $lineNumber > $maxRows;
        next if($line =~ m/^#/); # allow comment lines, consistent with lineFile and hgLoadBed
        if($line =~ /$regexp/) {
            if($hasChrom) {
                my $chrom = $1;
                if(!$chromInfo{$chrom}) {
                    return ("Invalid $type file; line $lineNumber in file '$file';\nerror: invalid chrom '$chrom';\nline: $line");
                }
            }
        } else {
            if(my $error = validateWithListUtil($line, $validateList)) {
                return ("Invalid $type file; line $lineNumber in file '$file' is invalid;\n$error;\nline: $line");
            } elsif($line =~ /^\s|\s$/) {
                # The whole-line regexp rejects leading/trailing whitespace (including the
                # "\r" left behind by DOS line endings), but the split() in
                # validateWithListUtil silently drops trailing whitespace. That is a
                # problem with the user's file, not an internal inconsistency.
                return ("Invalid $type file; line $lineNumber in file '$file' is invalid;\nerror: line has leading or trailing whitespace (possibly DOS line endings);\nline: $line");
            } else {
                die "PROGRAM ERROR: inconsistent results from validateWithListUtil\n";
            }
        }
        last if($opt_quick && $lineNumber >= $quickCount);
    }
    $fh->close();
    HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
    doTime("done validateWithList $name,$type,$maxRows",$lineNumber) if $opt_timing;
    return ();
}
sub validateFreepass
{
    # Checker for formats that are accepted without inspection: the file is
    # opened (so an unopenable file is still noticed by Encode::openUtil) and
    # closed again without reading any lines.
    # Always returns an empty list (no errors).
    my ($path, $file, $type) = @_;
    if($opt_timing) {
        doTime("beginning validateFreepass");
    }
    my $fh = Encode::openUtil($file, $path);
    # Line-by-line reading is intentionally disabled:
    #my $lineNumber = 0;
    #while(<$fh>) {
    # chomp;
    # $lineNumber++;
    # last if($opt_quick && $lineNumber >= $quickCount);
    #}
    $fh->close();
    HgAutomate::verbose(2, "File \'$file\' free pass on validation\n");
    if($opt_timing) {
        doTime("done validateFreepass");
    }
    return ();
}
sub validateWig
{
    # Validate a wiggle file by running it through wigEncode with both outputs
    # discarded (/dev/null).
    #
    # Returns an empty list on success, or a one-element list holding the
    # error message (including wigEncode's stderr) on failure.
    my ($path, $file, $type) = @_;
    my $filePath = $file;
    $filePath = "$path/$file" if defined($path);
    doTime("beginning validateWig") if $opt_timing;
    HgAutomate::verbose(2, "validateWig($file,$type) -> wigEncode\n");
    # wigEncode knows how to handle zipped files so we do not need to special case them.
    my $wigEncode = "/cluster/bin/x86_64/wigEncode -noOverlapSpanData $filePath /dev/null /dev/null";
    # This can produce /data/tmp/SafePipe_NNN_.err files
    my $safe = SafePipe->new(CMDS => [$wigEncode], STDOUT => "/dev/null", DEBUG => $opt_verbose - 1);
    if($safe->exec()) {
        my $stderr = $safe->stderr();
        chomp($stderr);
        return "File \'$file\' failed wiggle validation: " . $stderr;
    }
    HgAutomate::verbose(2, "File \'$file\' passed wiggle validation\n");
    doTime("done validateWig") if $opt_timing;
    return ();
}
sub validateBed {
    # Validate each line of a bed 5 or greater file (also used for the
    # bed5FloatScore type, which additionally requires a float in field 6).
    #
    # Arguments: $path and $file are passed to Encode::openUtil; $type is the
    # format name ('bed' or 'bed5FloatScore').
    #
    # Lines starting with '#', empty lines and track/browser lines are
    # skipped. The first invalid line makes the sub die with a message naming
    # the file, line and field. Returns an empty list when the file passes.
    # With -quick only the first $quickCount lines are checked.
    my ($path, $file, $type) = @_;
    my $lineNumber = 0;
    doTime("beginning validateBed") if $opt_timing;
    my $fh = Encode::openUtil($file, $path);
    while(<$fh>) {
        chomp;
        $lineNumber++;
        next if m/^#/; # allow comment lines, consistent with lineFile and hgLoadBed
        my @fields = split /\s+/;
        my $fieldCount = @fields;
        next if(!$fieldCount);
        my $prefix = "Failed bed validation, file '$file'; line $lineNumber:";
        if(/^(track|browser)/) {
            ;
        } elsif($fieldCount < 5) {
            die "$prefix not enough fields; " . scalar(@fields) . " present; at least 5 are required\n";
        } elsif (!$chromInfo{$fields[0]}) {
            die "$prefix field 1 value ($fields[0]) is invalid; not a valid chrom name\n";
        } elsif ($fields[1] !~ /^\d+$/) {
            die "$prefix field 2 value ($fields[1]) is invalid; value must be a positive number\n";
        } elsif ($fields[2] !~ /^\d+$/) {
            die "$prefix field 3 value ($fields[2]) is invalid; value must be a positive number\n";
        } elsif ($fields[2] < $fields[1]) {
            die "$prefix field 3 value ($fields[2]) is less than field 2 value ($fields[1])\n";
        } elsif ($fields[4] !~ /^\d+$/ && $fields[4] !~ /^\d+\.\d+$/) {
            die "$prefix field 5 value ($fields[4]) is invalid; value must be a positive number\n";
        } elsif ($fields[4] > 1000) {
            # a negative score cannot reach this point: the previous test only lets unsigned numbers through
            die "$prefix field 5 value ($fields[4]) is invalid; score must be 0-1000\n";
        } elsif ($type eq 'bed5FloatScore' && $fieldCount < 6) {
            # trailing newline keeps Perl from appending "at <script> line N" to this user message
            die "$prefix field 6 invalid; bed5FloatScore requires 6 fields\n";
        } elsif ($type eq 'bed5FloatScore' && $fields[5] !~ /^$floatRegEx$/) {
            die "$prefix field 6 value '$fields[5]' is invalid; must be a float\n";
        } else {
            ;
        }
        last if($opt_quick && $lineNumber >= $quickCount);
    }
    $fh->close();
    HgAutomate::verbose(2, "File \'$file\' passed bed validation\n");
    doTime("done validateBed",$lineNumber) if $opt_timing;
    return ();
}
sub validateBedGraph {
    # Validate each line of a bedGraph file: exactly 4 fields
    # (chrom, start, end, float value).
    #
    # Lines starting with '#', empty lines and track/browser lines are
    # skipped. The first invalid line makes the sub die with a message naming
    # the file, line and field. Returns an empty list when the file passes.
    # With -quick only the first $quickCount lines are checked.
    my ($path, $file, $type) = @_;
    my $lineNumber = 0;
    doTime("beginning validateBedGraph") if $opt_timing;
    my $fh = Encode::openUtil($file, $path);
    while(<$fh>) {
        chomp;
        $lineNumber++;
        next if m/^#/; # allow comment lines, consistent with lineFile and hgLoadBed
        my @fields = split /\s+/;
        my $fieldCount = @fields;
        next if(!$fieldCount);
        my $prefix = "Failed bedGraph validation, file '$file'; line $lineNumber:";
        unless(/^(track|browser)/) {
            # checks run in order; the first one that fails ends the program
            die "$prefix found " . scalar(@fields) . " fields; need 4\n"
                if $fieldCount != 4;
            my ($chrom, $start, $end, $value) = @fields;
            die "$prefix field 1 value ($chrom) is invalid; not a valid chrom name\n"
                if !$chromInfo{$chrom};
            die "$prefix field 2 value ($start) is invalid; value must be a positive number\n"
                if $start !~ /^\d+$/;
            die "$prefix field 3 value ($end) is invalid; value must be a positive number\n"
                if $end !~ /^\d+$/;
            die "$prefix field 3 value ($end) is less than field 2 value ($start)\n"
                if $end < $start;
            die "$prefix field 4 value '$value' is invalid; must be a float [$floatRegEx]\n"
                if $value !~ /^$floatRegEx$/;
        }
        last if($opt_quick && $lineNumber >= $quickCount);
    }
    $fh->close();
    HgAutomate::verbose(2, "File \'$file\' passed bedGraph validation\n");
    doTime("done validateBedGraph", $lineNumber) if $opt_timing;
    return ();
}
sub validateGtf {
    # Validate a GTF file by converting it to genePred with gtfToGenePred and
    # then validating the converted file with validateGene.
    #
    # Arguments: $path/$file locate the GTF file; $type is the format name.
    # gzipped input is not supported (dies). If gtfToGenePred fails, its
    # captured output is used as the die message. Returns whatever
    # validateGene returns; the temporary converted file is removed only when
    # validateGene reported no errors.
    my ($path, $file, $type) = @_;
    # NOTE(review): $path is used here without the defined() check applied to $filePath below.
    my $errFile = "$path/doEncodeValidate.gtf.err";
    doTime("beginning validateGtf") if $opt_timing;
    my $filePath = defined($path) ? "$path/$file" : $file;
    my $outFile = "$path/doEncodeValidate.gtf.bed";
    if(Encode::isZipped($filePath)) {
        # XXXX should be modified to handle zipped files.
        die "We don't currently support gzipped gtf files\n";
    }
    HgAutomate::verbose(2, "validateGtf(path=$path,file=$file,type=$type)\n");
    # XXXX Add support for $opt_quick
    my $err = system ( "gtfToGenePred $filePath $outFile >$errFile 2>&1");
    if ($err) {
        print STDERR "File \'$file\' failed GTF validation\n";
        # lexical handle + 3-arg open: no shared global ERR handle, no mode injection via the name
        open(my $errFh, '<', $errFile) || die "ERROR: Can't open gtfToGenePred error file \'$errFile\': $!\n";
        my @err = <$errFh>;
        close($errFh);
        die "@err\n";
    }
    unlink $errFile;
    HgAutomate::verbose(2, "File \'$file\' passed gtfToGenePred conversion \n");
    doTime("done validateGtf") if $opt_timing;
    my @res = validateGene(undef,$outFile,$type);
    if (scalar(@res)==0) { # no errors so remove the temp .bed file
        HgAutomate::verbose(2, "File \'$file\' passed gtf gene validation \n");
        unlink $outFile;
    }
    return @res;
}
sub validateGene {
    # Validate a gene (GFF/genePred-style) file by piping it, minus
    # track/browser lines, through ldHgGene in the output directory ($outPath).
    #
    # Arguments: $path/$file locate the file ($path may be undef); $type is
    # the format name. gzipped input is not supported (dies). If the pipeline
    # fails, its captured output is used as the die message. Returns an empty
    # list on success.
    my ($path, $file, $type) = @_;
    my $outFile = "validateGene.out";
    doTime("beginning validateGene") if $opt_timing;
    my $filePath = defined($path) ? "$path/$file" : $file;
    if(Encode::isZipped($filePath)) {
        # XXXX should be modified to handle zipped files.
        die "We don't currently supporte gzipped gene files\n";
    }
    # XXXX Add support for $opt_quick
    # The alternation is grouped so that only lines *starting* with "track" or
    # "browser" are dropped; ungrouped, '^track|browser' also drops every data
    # line that merely contains "browser".
    my $err = system (
        "cd $outPath; egrep -v '^(track|browser)' $filePath | ldHgGene -out=genePred.tab -genePredExt hg18 testTable stdin >$outFile 2>&1");
    if ($err) {
        print STDERR "File \'$file\' failed GFF validation\n";
        # lexical handle + 3-arg open: no shared global ERR handle, no mode injection via the name
        open(my $errFh, '<', "$outPath/$outFile") || die "ERROR: Can't open GFF validation file \'$outPath/$outFile\': $!\n";
        my @err = <$errFh>;
        close($errFh);
        die "@err\n";
    } else {
        HgAutomate::verbose(2, "File \'$file\' passed GFF validation\n");
    }
    doTime("done validateGene") if $opt_timing;
    return ();
}
sub validateTagAlign
{
# Validate a tagAlign file by running the external validateFiles program
# through SafePipe. $path and $type are accepted but not used in the body.
# Returns an empty list on success, or a one-element list with a generic
# failure message; the detailed pipe error goes to STDERR only.
my ($path, $file, $type) = @_;
- my $safe = SafePipe->new(CMDS => ["validateFiles -type=tagAlign $file"]);
+ # validate chroms, chromSize, etc. Assume hg18 like elsewhere
+ my $safe = SafePipe->new(CMDS => ["validateFiles -chromDb=hg18 -type=tagAlign $file"]);
if(my $err = $safe->exec()) {
print STDERR "ERROR: failed validateTagAlign : " . $safe->stderr() . "\n";
# don't show end-user pipe error(s)
return("failed validateTagAlign for '$file'");
}
return ();
}
sub validatePairedTagAlign
# This is like tag align but with two additional sequence fields appended; seq1 and seq2
# Validation is delegated to the external validateFiles program, run through
# SafePipe. $path and $type are accepted but not used in the body.
# Returns an empty list on success, or a one-element list with a generic
# failure message; the detailed pipe error goes to STDERR only.
{
my ($path, $file, $type) = @_;
- my $safe = SafePipe->new(CMDS => ["validateFiles -type=pairedTagAlign $file"]);
+ # validate chroms, chromSize, etc. Assume hg18 like elsewhere
+ my $safe = SafePipe->new(CMDS => ["validateFiles -chromDb=hg18 -type=pairedTagAlign $file"]);
if(my $err = $safe->exec()) {
print STDERR "ERROR: failed validatePairedTagAlign : " . $safe->stderr() . "\n";
# don't show end-user pipe error(s)
return("failed validatePairedTagAlign for '$file'");
}
return ();
}
sub validateNarrowPeak
{
    # Validate a narrowPeak file (10 whitespace-separated fields per line)
    # using validateWithList, allowing at most $maxBedRows lines.
    # Returns validateWithList's result: an empty list when the file passes,
    # else a one-element list with the first error.
    my ($path, $file, $type) = @_;
    my @list = ({TYPE => "chrom", NAME => "chrom"},
                {TYPE => "uint", NAME => "chromStart"},
                {TYPE => "uint", NAME => "chromEnd"},
                {TYPE => "string", NAME => "name"},
                {TYPE => "uint", NAME => "score"},
                # '-' comes first so it is a literal; written as "[+-\\.]" the class is the
                # range '+' .. '.', which also accepts ','
                {REGEX => "[-+.]", NAME => "strand"},
                {TYPE => "float", NAME => "signalValue"},
                {TYPE => "float", NAME => "pValue"},
                {TYPE => "float", NAME => "qValue"},
                {TYPE => "int", NAME => "peak"});
    return validateWithList($path, $file, $type, $maxBedRows, "validateNarrowPeak", \@list);
}
sub validateBroadPeak
{
# Validate a broadPeak file by running the external validateFiles program
# through SafePipe. $path and $type are accepted but not used in the body.
# Returns an empty list on success, or a one-element list with a generic
# failure message; the detailed pipe error goes to STDERR only.
my ($path, $file, $type) = @_;
- my $safe = SafePipe->new(CMDS => ["validateFiles -type=broadPeak $file"]);
+ # validate chroms, chromSize, etc. Assume hg18 like elsewhere
+ my $safe = SafePipe->new(CMDS => ["validateFiles -chromDb=hg18 -type=broadPeak $file"]);
if(my $err = $safe->exec()) {
print STDERR "ERROR: failed validateBroadPeak : " . $safe->stderr() . "\n";
# don't show end-user pipe error(s)
return("failed validateBroadPeak for '$file'");
}
return ();
}
sub validateGappedPeak
{
    # Validate a gappedPeak file (15 whitespace-separated fields per line)
    # using validateWithList, allowing at most $maxBedRows lines.
    # Returns validateWithList's result: an empty list when the file passes,
    # else a one-element list with the first error.
    my ($path, $file, $type) = @_;
    my @list = ({TYPE => "chrom", NAME => "chrom"},
                {TYPE => "uint", NAME => "chromStart"},
                {TYPE => "uint", NAME => "chromEnd"},
                {TYPE => "string", NAME => "name"},
                {TYPE => "uint", NAME => "score"},
                # '-' comes first so it is a literal; written as "[+-\\.]" the class is the
                # range '+' .. '.', which also accepts ','
                {REGEX => "[-+.]", NAME => "strand"},
                {TYPE => "uint", NAME => "thickStart"},
                {TYPE => "uint", NAME => "thickEnd"},
                {TYPE => "string", NAME => "itemRgb"},
                {TYPE => "uint", NAME => "blockCount"},
                {TYPE => "string", NAME => "blockSizes"},
                {TYPE => "string", NAME => "blockStarts"},
                {TYPE => "float", NAME => "signalValue"},
                {TYPE => "float", NAME => "pValue"},
                {TYPE => "float", NAME => "qValue"}
        );
    return validateWithList($path, $file, $type, $maxBedRows, "validateGappedPeak", \@list);
}
sub validateFastQ
{
    # Validate a fastq file by running the external validateFiles program
    # through SafePipe. $path and $type are accepted but not used.
    # Returns an empty list on success, or a one-element list with a generic
    # failure message; the detailed pipe error goes to STDERR only.
    #
    # Background kept from the original implementation notes:
    # Syntax per http://maq.sourceforge.net/fastq.shtml
    # '/' was allowed in the sequence name and plus line even though it is not
    # in the spec, because that is what Colin Kingswood (Gingeras project) gets
    # in the fastq files from GIS for the GisPet project, which are sent on to us.
    # Note on "FASTQ Quality scores":- http://maq.sourceforge.net/qual.shtml
    # Fastq has 2 different semantics for the score field.
    # - fastq produced directly from Solexa has a 'solexa' quality score
    # - fastq defined by Sanger has a 'PHRED' quality score
    # - The 2 urls above show how to convert between both
    my ($path, $file, $type) = @_;
    my $safe = SafePipe->new(CMDS => ["validateFiles -type=fastq $file"]);
    return () unless $safe->exec();
    print STDERR "ERROR: failed validateFastQ : " . $safe->stderr() . "\n";
    # don't show end-user pipe error(s)
    return("failed validateFastQ for '$file'");
}
sub validateCsfasta
{
    # Validate a csfasta (colorspace fasta) file: after skipping '#' lines,
    # lines must alternate between a '>' header line and a sequence line
    # starting with G or T followed by digits.
    #
    # Returns an empty list when the file passes, or a one-element list
    # describing the first line that does not match what is expected.
    # With -quick only the first $quickCount lines are checked.
    #
    # Syntax per http://marketing.appliedbiosystems.com/mk/submit/SOLID_KNOWLEDGE_RD?_JS=T&rd=dm
    # Sample:-
    # # Cwd: /home/pipeline
    # # Title: S0033_20080723_2_I22_EA_
    # >461_19_90_F3
    # T203033330010111011221200302001
    # >461_19_209_F3
    # T022213002230311203200200322000
    # Files from GIS have this header:
    # >920_22_656_F3,1.-152654094.1.35.35.0###,19.43558664.1.35.35.0###
    # T01301010111200210102321210100112312
    my ($path, $file, $type) = @_;
    doTime("beginning validateCsfasta") if $opt_timing;
    my $fh = Encode::openUtil($file, $path);
    my $line = 0;
    # pattern each kind of line must match, and the kind expected after it
    my %patternFor = (header => "^>\\d+_\\d+_\\d+_\.\\d+.*",
                      seq => "^[GT]\\d+");
    my %nextState = (header => 'seq', seq => 'header');
    my $expecting = 'header';
    while(<$fh>) {
        chomp;
        $line++;
        next if m/^#/;
        my $regex = $patternFor{$expecting};
        unless(/^${regex}$/) {
            return("Invalid $type file; line $line in file '$file' is invalid [validateCsfasta] (expecting $expecting):\nline: $_");
        }
        $expecting = $nextState{$expecting};
        last if($opt_quick && $line >= $quickCount);
    }
    $fh->close();
    HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
    doTime("done validateCsfasta", $line) if $opt_timing;
    return ();
}
sub validateCsqual
{
    # Validate a csqual (colorspace quality) file: after skipping '#' lines,
    # lines must alternate between a '>' header line and a line of quality
    # values. As written, the quality pattern requires every number on the
    # line (including the last) to be followed by a single space.
    #
    # Returns an empty list when the file passes, or a one-element list
    # describing the first line that does not match what is expected.
    # With -quick only the first $quickCount lines are checked.
    #
    # Syntax per http://marketing.appliedbiosystems.com/mk/submit/SOLID_KNOWLEDGE_RD?_JS=T&rd=dm
    # Sample:-
    # # Cwd: /home/pipeline
    # # Title: S0033_20080723_2_I22_EA_
    # >461_19_90_F3
    # 20 10 8 13 8 10 20 7 7 24 15 22 21 14 14 8 11 15 5 20 6 5 8 22 6 24 3 16 7 11
    # >461_19_209_F3
    # 16 8 5 12 20 24 19 8 13 17 11 23 8 24 8 7 17 4 20 8 29 7 3 16 3 4 8 20 17 9
    my ($path, $file, $type) = @_;
    doTime("beginning validateCsqual") if $opt_timing;
    my $fh = Encode::openUtil($file, $path);
    my $line = 0;
    # pattern each kind of line must match, and the kind expected after it
    my %patternFor = (header => "^>\\d+_\\d+_\\d+_\.\\d+",
                      qual => "^(\\d+ )+");
    my %nextState = (header => 'qual', qual => 'header');
    my $expecting = 'header';
    while(<$fh>) {
        chomp;
        $line++;
        next if m/^#/;
        my $regex = $patternFor{$expecting};
        unless(/^${regex}$/) {
            return("Invalid $type file; line $line in file '$file' is invalid [validateCsqual] (expecting $expecting) [regex=$regex]:\nline: [$_]");
        }
        $expecting = $nextState{$expecting};
        last if($opt_quick && $line >= $quickCount);
    }
    $fh->close();
    HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
    doTime("done validateCsqual", $line) if $opt_timing;
    return ();
}
sub validateFasta
# Wold lab fasta files; they dont have fastq format.
# Sample fasta lines are:
#>HWI-EAS229_75_30DY0AAXX:7:1:0:949/1
#NGCGGATGTTCTCAGTGTCCACAGCGCAGGTGAAATAAGGGAAGCAGTAGCGACGCCCATCTCCACGCGCAGCGC
#>HWI-EAS229_75_30DY0AAXX:7:1:0:1739/1
#NAGCCATCAGGAAAGCAAGGAGGGGGCATTAAAGGACAATCAAGGGGTTTGGAGGAAGGAGCAGGCCGGAGGCAA
#
# Lines must strictly alternate between a '>name' line and a single sequence
# line. Returns an empty list when the file passes (or when
# -skipValidateFastQ is set), else a one-element list describing the first
# offending line. With -quick only the first $quickCount lines are checked.
{
    # Wold lab has fasta files, like fastq format without quality
    my ($path, $file, $type) = @_;
    # The "beginning" timing message is emitted exactly once, before the skip check.
    doTime("beginning validateFasta") if $opt_timing;
    HgAutomate::verbose(2, "validateFasta($path,$file,$type)\n");
    return () if $opt_skipValidateFastQ;
    my $fh = Encode::openUtil($file, $path);
    my $line = 0;
    my $state = 'firstLine';
    my $seqNameRegEx = "[A-Za-z0-9_.:/-]+";
    my $seqRegEx = "[A-Za-z\n\.~]+";
    my $states = {firstLine => {REGEX => ">($seqNameRegEx)", NEXT => 'seqLine'},
                  seqLine => {REGEX => $seqRegEx, NEXT => 'firstLine'}};
    while(<$fh>) {
        chomp;
        $line++;
        my $errorPrefix = "Invalid $type file; line $line in file '$file' is invalid [validateFasta]";
        my $regex = $states->{$state}{REGEX};
        if(/^${regex}$/) {
            $state = $states->{$state}{NEXT};
        } else {
            return("$errorPrefix (expecting $state):\nline: $_");
        }
        last if($opt_quick && $line >= $quickCount);
    }
    $fh->close();
    HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
    doTime("done validateFasta", $line) if $opt_timing;
    return ();
}
sub validateRpkm
# Wold lab RPKM format. Only the number of whitespace-separated columns is
# checked: every non-comment line must have 3, 5 or 7 columns.
#
# Example format 1 (3 cols):-
# HBG2 0.583 1973.85
# RPS20 0.523 1910.01
# RPLP0 1.312 1800.51
#
# Example format 2 (7 cols):- (*.accepted.rpkm)
# ENSG00000003056 chr12 8989051 8989354 2.43 303 M6PR
# ENSG00000006015 chr19 18560887 18561077 1.10 190 C19orf60
# ENSG00000008516 chr16 3047223 3047380 0.61 157 MMP25
#
# Example format 3 (5 cols): (*.final.rpkm)
#GID gene len_kb RPKM multi/all
# OTTHUMG00000151214 IGLC2 0.722 3579.34 0.84
# FAR3664 FAR3664 0.200 3216.32 0.94
# OTTHUMG00000021144 TMSB4X 3.551 2767.52 0.35
#
# Dies on the first line with any other column count; returns an empty list
# when the file passes. With -quick only the first $quickCount lines are checked.
{
    my ($path, $file, $type) = @_;
    doTime("beginning validateRpkm") if $opt_timing;
    my %allowedColumns = map { $_ => 1 } (3, 5, 7);
    my $lineNumber = 0;
    my $fh = Encode::openUtil($file, $path);
    while(<$fh>) {
        chomp;
        $lineNumber++;
        next if m/^#/;
        my @fields = split /\s+/;
        my $cols = scalar(@fields);
        if(!$allowedColumns{$cols}) {
            die "Failed $type validation, file '$file'; line $lineNumber: line=[$_]\n";
        }
        # a stricter per-field check was once considered:
        # unless m/^([^\t]+)\t(\d+\.\d+)\t(\d+\.\d+)$/;
        last if($opt_quick && $lineNumber >= $quickCount);
    }
    $fh->close();
    HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
    doTime("done validateRpkm", $lineNumber) if $opt_timing;
    return ();
}
sub validateBowtie
# Unknown format (for download) from Wold lab.
# Assume last column is optional
# NOTE(review): the pattern below makes the last column's content optional,
# but still requires the tab that precedes it.
# Sample lines:-
# HWI-EAS229_75_30DY0AAXX:7:1:0:1545/1 + chr1 5983615 NCGTCCATCTCACATCGTCAGGAAAGGGGGAAGCACTGGATGGCTGTGGCCTCACAGGCAGGGAGAGTGGGGTCC IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII 0 0:G>N
# HWI-EAS229_75_30DY0AAXX:7:1:0:1591/1 - uc002fcb.1|22|70699936 45 CTATTTCCACCAAGCAGCCAAGCTCAAGGGAATCGGGGAGTACGTGAACATCCGCACAGGGATGCCCTGCCACTN IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII 0 0:T>N]
# HWI-EAS229_75_30DY0AAXX:7:1:0:1766/1 - chr18 72954304 GCAGCCACCAGAAGCGGGAAGAGGTGAAGACAGAGCCTCCTGCAGAGCTCCCACTCTGCCAACGCCTTGACTTTN IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII 0 0:G>N,59:T>G
#
# Every non-comment line must be 8 tab-separated columns matching the pattern
# below. Dies on the first line that does not match; returns an empty list
# when the file passes. With -quick only the first $quickCount lines are checked.
{
    my ($path, $file, $type) = @_;
    # exactly one "beginning" timing message, labelled with this sub's own name
    doTime("beginning validateBowtie") if $opt_timing;
    my $lineNumber = 0;
    my $fh = Encode::openUtil($file, $path);
    while(<$fh>) {
        chomp;
        $lineNumber++;
        next if m/^#/; # allow comment lines, consistent with lineFile and hgLoadBed
        die "Failed bowtie validation, file '$file'; line $lineNumber: line=[$_]\n"
            unless $_ =~ m/^([A-Za-z0-9:>_,\.\|\/-]+)\t([+-])\t([A-Za-z0-9:>_,\.\|\/-]+)\t(\d+)\t(\w+)\t(\w+)\t(\d+)\t([A-Za-z0-9:>_,\.\|\/-]+)?$/;
        last if($opt_quick && $lineNumber >= $quickCount);
    }
    $fh->close();
    HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
    doTime("done validateBowtie", $lineNumber) if $opt_timing;
    return ();
}
sub validatePsl
# PSL format (for download) from Wold lab.
# EXAMPLE FROM http://genome.ucsc.edu/FAQ/FAQformat#format2
# This adds 2 columns (sequence,<tab>sequence,) to the standard 21 columns
# Only the first 21 are validated
#
# The standard 5-line PSL header is accepted when each header line appears at
# its expected line number; every other line must start with 21 tab-separated
# PSL columns. Dies on the first line that does not; returns an empty list
# when the file passes. With -quick only the first $quickCount lines are checked.
#
# Sample first 6 lines
#psLayout version 3
#
#match mis- rep. N's Q gap Q gap T gap T gap strand Q Q Q Q T T T T block blockSizes qStarts tStarts
# match match count bases count bases name size start end name size start end count
#---------------------------------------------------------------------------------------------------------------------------------------------------------------
#71 3 0 0 0 0 0 0 - HWI-EAS229_75_30DY0AAXX:4:1:0:743/1 75 1 75 chr2 242951149 184181032 184181106 1 74, 0, 184181032, agccttttacagcaacacctttacctctgctagatctttctgtagctcgtctgaagccatgggggctgggtcag, agccttttccagcaacacctttacctcttctagatctttctgtagctcttctgaagccatgggggctgggtcag,
{
    my ($path, $file, $type) = @_;
    my $lineNumber = 0;
    # header lines, keyed by the only line number at which each is tolerated
    my %headerAt = (1 => qr/^psLayout version \d+/, # check first line
                    2 => qr/^$/,
                    3 => qr/^match/,
                    4 => qr/^\s+match/,
                    5 => qr/^------/);
    # the 21 standard columns; anything after them is not checked
    my $pslColumns = qr/^(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t([+-][+-]?)\t([A-Za-z0-9:>\|\/_-]+)\t(\d+)\t(\d+)\t(\d+)\t(\w+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t([0-9,]+)\t([0-9,]+)\t([0-9,]+)/;
    doTime("beginning validatePsl") if $opt_timing;
    my $fh = Encode::openUtil($file, $path);
    while(<$fh>) {
        chomp;
        $lineNumber++;
        my $header = $headerAt{$lineNumber};
        next if $header and $_ =~ $header;
        if($_ !~ $pslColumns) {
            die "Failed $type validation, file '$file'; line $lineNumber: line=[$_]\n";
        }
        last if($opt_quick && $lineNumber >= $quickCount);
    }
    $fh->close();
    HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
    doTime("done validatePsl", $lineNumber) if $opt_timing;
    return ();
}
############################################################################
# Misc subroutines
sub validateDdfField {
    # Run the validator registered in %validators for one DDF field.
    #   $type  - field name; spaces are turned into underscores before lookup
    #   $val   - the field's value (may be undefined)
    #   $track - view name handed through to the validator
    #   $daf   - DAF hash handed through to the validator
    # Returns whatever the validator returns; dies when no validator exists.
    my ($type, $val, $track, $daf) = @_;
    $type =~ s/ /_/g;
    my $shown = defined($val) ? $val : "";
    HgAutomate::verbose(4, "Validating $type: " . $shown . "\n");
    my $validator = $validators{$type};
    die "Validator for type '$type' is missing" unless $validator;
    return $validator->($val, $track, $daf);
}
sub checkDataFormat {
    # Validate one data file against the checker registered for its format.
    #   $format - type string, e.g. "bed 5 +" or "bedGraph 4"; the trailing
    #             field count is dropped to find the checker, but the full
    #             string is still handed to the checker as its $type
    #   $file   - file name, resolved by the checker relative to $submitPath
    # Returns the checker's list of errors (empty on success), or a single
    # message when the format has no registered checker.
    my ($format, $file) = @_;
    HgAutomate::verbose(3, "Checking data format for $file: $format\n");
    my $type = $format;
    if ($format =~ m/(bed) (\d+)/) {
        $format = $1;
    }
    if ($format =~ m/(bedGraph) (\d+)/) {
        $format = $1;
    }
    $formatCheckers{$format} || return "Data format \'$format\' is unknown\n";
    # Collect the result first so the completion trace below is actually
    # reached (it used to sit after the return statement and never ran).
    my @errors = $formatCheckers{$format}->($submitPath, $file, $type);
    HgAutomate::verbose(3, "Done checking data format for $file: $format\n");
    return @errors;
}
sub ddfKey
{
    # Build the key identifying the data set a DDF line belongs to, e.g.
    # "antibody=$antibody;cell=$cell" for ChIP-Seq data: one "name=value" pair
    # per DAF variable, in sorted variable order, joined with ";".
    # When $includeReplicate is true and the line has a replicate value, that
    # value is appended as a further ";"-separated element.
    # Returns undef when the DAF defines no variables (e.g. Sanger Gencode).
    my ($fields, $ddfHeader, $daf, $includeReplicate) = @_;
    return undef unless defined($daf->{variables});

    my $delim = ";";
    my @pairs;
    for my $name (sort @{$daf->{variableArray}}) {
        push(@pairs, "$name=" . $fields->{$name});
    }
    my $key = join($delim, @pairs);
    if($includeReplicate && defined($fields->{replicate})) {
        $key = $key . $delim . $fields->{replicate};
    }
    return $key;
}
sub isDownloadOnly {
    # Decide whether a view is offered for download only (1) or also loaded
    # as a track (0).  $lab is accepted but not consulted.
    my ($view, $grant, $lab, $daf) = @_;

    # The DAF can mark a view explicitly with 'downloadOnly yes', so the rule
    # need not be hardcoded here.
    my $explicit = $daf->{TRACKS}->{$view}->{downloadOnly} || "";
    return 1 if $explicit eq 'yes';

    # Never load RawData* views (Riken have RawData and RawData2 for colorspace
    # fasta and quality files; Wold have RawData, RawData[2-7]) ...
    return 1 if $view =~ m/^RawData\d*$/;
    # ... nor Comparative views.
    return 1 if $view eq 'Comparative';

    # Alignments are loaded only for the Gingeras and Wold labs (RNA folks like
    # to see their RNAs).  Wold alignments are called 'Aligns', 'Splices',
    # 'Paired', so they never reach this test anyway.
    if($view eq 'Alignments') {
        return 1 if $grant ne "Gingeras" and $grant ne "Wold";
    }
    return 0;
}
sub printCompositeTdbSettings {
    # Print the trackDb.ra stanza for the composite (parent) track.
    #   first argument - glob of the output file handle, e.g. *TRACK_RA
    #   $daf           - DAF hash (lab, grant, dataType, TRACKS, variables, ...)
    #   %ddfSets       - DDF entries keyed by ddfKey (without replicate);
    #                    each value holds a {VIEWS}{$view} entry per supplied view
    # Each view and each variable term is listed exactly once in its subGroup
    # line, however many data sets share it.
    local *OUT_FILE = shift;
    my ($daf,%ddfSets) = @_;
    my $compositeTrack = Encode::compositeTrackName($daf);
    print OUT_FILE "track\t\t$compositeTrack\n";
    print OUT_FILE "compositeTrack\ton\n";
    my $setting = "subGroup1\tview Views";
    my $visDefault = "visibilityViewDefaults\t";
    # Collect every view that is used by at least one data set and is loaded as a track
    for my $view (keys %{$daf->{TRACKS}}) {
        my $viewUsed = 0;
        for my $key (keys %ddfSets) {
            if(defined($ddfSets{$key}{VIEWS}{$view})) {
                $viewUsed = 1;
                last; # one data set is enough; listing the view again would duplicate it
            }
        }
        next if !$viewUsed;
        next if isDownloadOnly($view, $daf->{grant}, $daf->{lab}, $daf);
        $setting = $setting . " " . $view . "=" . $view;
        $visDefault = $visDefault . " " . $view . "=";
        if($view eq "Peaks") {
            $visDefault = $visDefault . "dense";
        } elsif($view eq "Signal") {
            $visDefault = $visDefault . "full";
        } else {
            $visDefault = $visDefault . "hide";
        }
    }
    print OUT_FILE "shortLabel\t" . $daf->{lab} . " " . $daf->{dataType} . "\n"; # Default to lab datatype
    print OUT_FILE "longLabel\tENCODE " . $daf->{lab} . " " . $daf->{grant} . " " . $daf->{dataType} . "\n"; # Default to lab grant datatype
    print OUT_FILE "group\t\tregulation\n"; # This is just a guess. Buyer beware
    print OUT_FILE $setting . "\n"; # "subGroup1\tview Views Peaks=Peaks Signal=Signal RawSignal=Raw_Signal\n";
    # Need to create N subgroups with M members each
    if (defined($daf->{variables})) {
        my $grpNo = 1; # subGroup1 is the view group, so variables start at subGroup2
        my $sortOrder = "sortOrder\t";
        my $dimensions = "dimensions";
        my $controlledVocab = "controlledVocabulary\tencode/cv.ra";
        my @variables = @{$daf->{variableArray}};
        for my $variable (@variables) {
            $grpNo++;
            # The var name is over-ridden for antibody and cell
            my $groupVar = $variable;
            $groupVar = "factor" if $variable eq "antibody";
            $groupVar = "cellType" if $variable eq "cell";
            if($grpNo < 5) {
                # groups 2, 3 and 4 become dimensionX, dimensionY and dimensionZ
                $dimensions .= "\tdimension" . chr(86 + $grpNo) . "=" . $groupVar;
            }
            $sortOrder = "$sortOrder$groupVar=+ ";
            $controlledVocab = "$controlledVocab $groupVar";
            $setting = "subGroup$grpNo\t$groupVar " . ucfirst($groupVar);
            $setting = "subGroup$grpNo\t$groupVar " . "Cell_Line" if $variable eq "cell";
            # The data set keys look like "antibody=X;cell=Y"; pick out this
            # variable's terms, adding each distinct term only once.
            my %seenTerm;
            for my $key (keys %ddfSets) {
                my @pairs = split(';', $key);
                for my $pair (@pairs) {
                    my ($var, $term) = split('=', $pair);
                    if($var eq $variable && !$seenTerm{$term}++) {
                        $setting = "$setting $term=$term";
                    }
                }
            }
            print OUT_FILE $setting . "\n"; # "subGroup2\tcellType Cell_Line ???\n";
        }
        $setting = $sortOrder . "view=+";
        print OUT_FILE $dimensions . "\n"; # "dimensions dimensionX=cellType dimensionY=factor"
        print OUT_FILE $setting . "\n"; # "sortOrder\tcellType=+ factor=+ view=+\n";
        print OUT_FILE $controlledVocab . "\n"; # "controlledVocabulary\tencode/cv.ra cellType factor\n";
    }
    print OUT_FILE "dragAndDrop\tsubTracks\n";
    print OUT_FILE $visDefault . "\n"; #"visibilityViewDefaults\tPeaks=dense Signal=full RawSignal=hide\n";
    print OUT_FILE "priority\t0\n";
    print OUT_FILE "type\t\tbed 3\n";
    print OUT_FILE "wgEncode\t1\n\n";
}
############################################################################
# Main
# Submission time; its broken-down form supplies the dateSubmitted setting.
my $now = time();
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($now);

# DDF bookkeeping
my @ddfHeader; # list of field names on the first line of DDF file
my %ddfHeader = (); # convenience hash version of @ddfHeader (maps name to field index)
my @ddfLines = (); # each line in DDF (except for fields header); value is a hash; e.g. {files => 'foo.bed', cell => 'HeLa-S3', ...}
my %ddfSets = (); # info about DDF entries broken down by ddfKey
my %ddfReplicateSets = (); # info about DDF entries broken down by ddfKey (including replicate)

my $wd = cwd();

# Each option is stored in the matching $opt_<name> package variable.
my $ok = GetOptions(qw(
    allowReloads
    configDir=s
    fileType=s
    metaDataOnly
    outDir=s
    quick
    timing
    skipAll
    skipAutoCreation
    skipOutput
    skipValidateFiles
    skipValidateFastQ
    validateDaf
    validateFile
    sendEmail
    verbose=i
));
usage() unless $ok;

# Defaults for options that were not given
$opt_verbose = 1 unless defined $opt_verbose;
$opt_sendEmail = 0 unless defined $opt_sendEmail;

# -skipAll and -metaDataOnly both switch off auto-creation, output and file
# validation; -metaDataOnly additionally allows reloads.
if($opt_skipAll || $opt_metaDataOnly) {
    $opt_skipAutoCreation = 1;
    $opt_skipOutput = 1;
    $opt_skipValidateFiles = 1;
}
$opt_allowReloads = 1 if $opt_metaDataOnly;

usage() if @ARGV < 2;

# Get command-line args ($submitType is currently not used)
my ($submitType, $submitDir) = @ARGV[0, 1];

$ENV{TMPDIR} = $Encode::tempDir;
# Stand-alone mode (-validateFile with -fileType): the second command-line
# argument is a single data file rather than a submission directory.
# Validate it and exit without touching any DAF or DDF.
if($opt_validateFile && $opt_fileType) {
    # kludgy, but we need chromInfo populated to validate files, so we assume we are using hg18
    my $db = HgDb->new(DB => 'hg18');
    $db->getChromInfo(\%chromInfo);
    my @errors = checkDataFormat($opt_fileType, $submitDir);
    if(@errors) {
        die "Invalid file: " . join(", ", @errors) . "\n";
    }
    exit(0);
}
# Work out the submission, configuration and output directory paths.
# A path given relative on the command line is taken relative to the
# directory the script was started in ($wd).
HgAutomate::verbose(2, "Validating submission in directory \'$submitDir\'\n");
$submitPath = ($submitDir =~ /^\/.*/) ? $submitDir : "$wd/$submitDir";
HgAutomate::verbose(4, "Submission directory path: \'$submitPath\'\n");

# Configuration directory: -configDir, else "config" beside the submission directory
if (!defined $opt_configDir) {
    $configPath = "$submitPath/../config";
} elsif ($opt_configDir =~ /^\//) {
    $configPath = $opt_configDir;
} else {
    $configPath = "$wd/$opt_configDir";
}
unless (-d $configPath) {
    die "configPath '$configPath' is invalid; Can't find the config directory\n";
}
HgAutomate::verbose(4, "Config directory path: \'$configPath\'\n");

# Output directory: -outDir, else "out" inside the submission directory
if (!defined $opt_outDir) {
    $outPath = "$submitPath/out";
} elsif ($opt_outDir =~ /^\//) {
    $outPath = $opt_outDir;
} else {
    $outPath = "$wd/$opt_outDir";
}
HgAutomate::verbose(4, "Output directory path: '$outPath'; submitPath: '$submitPath'\n");
# Unless we are only validating a DAF, work inside the submission directory
# and make sure the output directory exists.
if(!$opt_validateDaf) {
    # Change dir to submission directory
    if(!chdir($submitPath)) {
        die ("SYS ERR; Can't change to submission directory \'$submitPath\': $OS_ERROR\n");
    }
    HgAutomate::verbose(3, "Creating output in directory \'$outPath\'\n");
    if(!(-d $outPath)) {
        # The parentheses matter: written as "mkdir $outPath || die ..." the ||
        # applies to $outPath, so a failed mkdir was never reported.
        mkdir($outPath) || die ("SYS ERR: Can't create out directory \'$outPath\': $OS_ERROR\n");
    }
}
# Load the grant and field definitions from the configuration directory.
# What used to be the list of labs is now in fact the list of grants (labs are
# within grants, and are not currently validated).
$grants = Encode::getGrants($configPath);
$fields = Encode::getFields($configPath);

# -validateDaf: check the DAF only, then stop.  The argument may be the DAF
# file itself or a directory that Encode::getDaf is given to search.
if($opt_validateDaf) {
    my $dafReader = (-f $submitDir) ? \&Encode::parseDaf : \&Encode::getDaf;
    $dafReader->($submitDir, $grants, $fields);
    print STDERR "DAF is valid\n";
    exit(0);
}

# Normal run: read the DAF, then fetch chromInfo for its assembly.
$daf = Encode::getDaf($submitDir, $grants, $fields);
my $db = HgDb->new(DB => $daf->{assembly});
$db->getChromInfo(\%chromInfo);
# -sendEmail: tell the grant's wrangler that a submission has arrived.
# Best effort only -- a mail failure is reported but never stops validation.
if($opt_sendEmail) {
    if($grants->{$daf->{grant}} && $grants->{$daf->{grant}}{wranglerEmail}) {
        my $email = $grants->{$daf->{grant}}{wranglerEmail};
        my $subject = "ENCODE data from $daf->{grant}/$daf->{lab} lab has been submitted for validation.";
        # Run /bin/mail through a list-form pipe so no shell is involved: the
        # lab name comes from the submitter's DAF and is not validated, so it
        # must not be interpolated into a shell command line.
        # Whitespace-separated addresses are passed as separate recipients,
        # as the shell used to do.
        my @recipients = split(' ', $email);
        # a mailer that exits early must not kill us with SIGPIPE
        local $SIG{PIPE} = 'IGNORE';
        if(open(my $mail, '|-', '/bin/mail', '-s', $subject, @recipients)) {
            print $mail "dir: $submitPath\n";
            close($mail) || warn "Warning: /bin/mail failed for '$email' (status $?)\n";
        } else {
            warn "Warning: can't run /bin/mail: $!\n";
        }
    }
}
# Every variable named in the DAF becomes a required DDF column.
if (defined($daf->{variables})) {
    for my $variable (keys %{$daf->{variableHash}}) {
        @{$fields->{$variable}}{qw(required file)} = (1, 'ddf');
    }
}

# Scan the DAF views: total up the hasReplicates flags and find the highest
# 'order' value (needed later if a RawSignal view has to be added).
my $hasReplicates = 0;
my $maxOrder = 0;
for my $track (values %{$daf->{TRACKS}}) {
    $hasReplicates += $track->{hasReplicates};
    $maxOrder = $track->{order} if $track->{order} > $maxOrder;
}

# make replicate column required when appropriate.
$fields->{replicate}{required} = 1 if $hasReplicates;
# Open dataset descriptor file (DDF): the newest *.DDF / *.ddf file in the
# submission directory.
my @glob = (glob("*.DDF"), glob("*.ddf"));
my $ddfFile = Encode::newestFile(@glob);
HgAutomate::verbose(2, "Using newest DDF file \'$ddfFile\'\n");
my $lines = Encode::readFile($ddfFile);
my $ddfLineNumber = 0;

# Get header containing column names: the first line that is neither empty
# nor a comment.  Lines are consumed from @$lines as they are read, so the
# data lines are what is left afterwards.
while(@{$lines}) {
    my $line = shift(@{$lines});
    $ddfLineNumber++;
    # remove leading and trailing spaces and newline
    $line =~ s/^\s+//;
    $line =~ s/\s+$//;
    # ignore empty lines and comments
    next if $line =~ /^(?:#|$)/;
    die "ERROR: The DDF header has no tabs; the DDF is required to be tab delimited\n"
        unless $line =~ /\t/;
    @ddfHeader = split(/\t/, $line);
    # map each column name to its index
    @ddfHeader{@ddfHeader} = (0 .. $#ddfHeader);
    last;
}

# Check the column names against the field definitions; any problem is fatal.
my @errors = Encode::validateFieldList(\@ddfHeader, $fields, 'ddf');
die "ERROR in DDF '$ddfFile':\n" . join("\n", @errors) . "\n" if @errors;
%terms = Encode::getControlledVocab($configPath);

# The DAF's experimental variables, in DAF order.
# Hubbard Sanger Gencode project has no variables.
my @variables = defined($daf->{variables}) ? @{$daf->{variableArray}} : ();
my %metadataHash;

# Process lines in DDF file. Create a list with one entry per line;
# the entry is field/value hash (fields per @ddfHeader).
# User errors are accumulated in @errors rather than being fatal here.
while (@{$lines}) {
    my $line = shift(@{$lines});
    $ddfLineNumber++;
    my $errorPrefix = "DDF lineNumber $ddfLineNumber:";
    HgAutomate::verbose(2, "Parsing ddf line $ddfLineNumber\n");

    # trim, then skip comments and empty lines
    $line =~ s/^\s+//;
    $line =~ s/\s+$//;
    next if $line =~ /^(?:#|$)/;
    unless($line =~ /\t/) {
        pushError(\@errors, "$errorPrefix line has no tabs; the DDF is required to be tab delimited");
        next;
    }

    # pair each value with the header name in the same column
    my %line;
    my @values = split('\t', $line);
    for my $column (0 .. $#values) {
        $line{$ddfHeader[$column]} = $values[$column];
    }

    my @valueErrors = Encode::validateValueList(\%line, $fields, 'ddf');
    if(@valueErrors) {
        pushError(\@errors, $errorPrefix . "\n" . join("\n", @valueErrors));
        next;
    }

    my $view = $line{view};
    HgAutomate::verbose(2,"Parsing $view\n");
    if(!$daf->{TRACKS}{$view}) {
        pushError(\@errors, "$errorPrefix undefined view '$view'");
    } else {
        my $files = $line{files};

        # a view with replicates needs a non-empty replicate value
        if($fields->{replicate}{required} && $daf->{TRACKS}{$view}{hasReplicates}) {
            my $replicate = $line{replicate};
            unless(defined($replicate) && length($replicate)) {
                pushError(\@errors, "$errorPrefix missing replicate number for view '$view'");
            }
        }

        # Expand the comma separated file list.
        # Use glob explicitly so our error messages have the list of files actually used;
        # an entry matching nothing is kept as written.
        my @filenames;
        for my $pattern (split(',', $files)) {
            my @matches = glob($pattern);
            push(@filenames, @matches ? @matches : $pattern);
        }
        $line{files} = \@filenames;

        # run every field of the line through its validator
        my @metadataErrors;
        for my $name (keys %line) {
            my @fieldErrors = validateDdfField($name, $line{$name}, $view, $daf);
            push(@metadataErrors, @fieldErrors);
        }

        if(@metadataErrors) {
            pushError(\@errors, @metadataErrors);
        } else {
            # avoid spurious errors by not putting invalid lines into %ddfSets
            # ddfKey returns undef if there are no variables defined
            my $replicateKey = ddfKey(\%line, \%ddfHeader, $daf, 1);
            if(defined($replicateKey)) {
                my $setKey = ddfKey(\%line, \%ddfHeader, $daf, 0);
                $ddfSets{$setKey}{VIEWS}{$view} = \%line;
                $ddfReplicateSets{$replicateKey}{VIEWS}{$view} = \%line;
                # remember each distinct combination of variable values
                my $str = join(", ", map($line{$_}, sort(@variables)));
                $metadataHash{$str} = 1;
            }
        }
        push(@ddfLines, \%line);
    }
    HgAutomate::verbose(2, "End of parsing ddf line $ddfLineNumber\n");
}
# Counter that makes the name of every auto-created file unique.
my $tmpCount = 1;
if(!@errors) {
    # Look for missing required views and create missing, optional views,
    # but don't bother if we have already encountered errors.
    # Could also look for replicate inconsistency here (e.g. Alignments for replicate 3 but not fastq for replicate 3).
    for my $key (keys %ddfSets) {
        for my $view (keys %{$daf->{TRACKS}}) {
            if($daf->{TRACKS}{$view}{required}) {
                if(!defined($ddfSets{$key}{VIEWS}{$view})) {
                    pushError(\@errors, "view '$view' missing for $key");
                }
            }
        }
    }
    doTime("beginning ddfReplicateSets loop") if $opt_timing;
    for my $key (keys %ddfReplicateSets) {
        # create missing optional views (e.g. ChIP-Seq RawSignal or transcriptome project PlusRawSignal and MinusRawSignal)
        # note this loop assumes these are on a per replicate basis.
        # Also note that any project (like transcriptome) that doesnt have replicates should also use
        # this for their auto-create signals.
        HgAutomate::verbose(2, "ddfReplicateSets loop key=[$key] aln=[".(defined($ddfReplicateSets{$key}{VIEWS}{Alignments}))."] rawsig=[".(defined($ddfReplicateSets{$key}{VIEWS}{RawSignal}))."]\n");
        # Auto-create only when Alignments were supplied without any raw signal
        # view, the DAF has not opted out, and the data type is not MethylSeq.
        if($daf->{noAutoCreate} ne "yes" && defined($ddfReplicateSets{$key}{VIEWS}{Alignments})
            && !defined($ddfReplicateSets{$key}{VIEWS}{RawSignal})
            && !defined($ddfReplicateSets{$key}{VIEWS}{PlusRawSignal})
            && !defined($ddfReplicateSets{$key}{VIEWS}{MinusRawSignal})
            && ($daf->{dataType} ne 'MethylSeq')) {
            if($daf->{dataType} eq 'ChipSeq' && !defined($daf->{medianFragmentLength})) {
                pushError(\@errors, "Missing medianFragmentLength field; this field is required for dataType '$daf->{dataType}' when RawSignal view is not provided");
            } else {
                # hack for case where they have removed RawSignal view in the DAF
                # - if no (Plus|Minus|)RawSignal is defined, assume RawSignal is required
                if(!defined($daf->{TRACKS}{RawSignal}{order})
                    && !defined($daf->{TRACKS}{PlusRawSignal}{order})
                    && !defined($daf->{TRACKS}{MinusRawSignal}{order}) ) {
                    $daf->{TRACKS}{RawSignal}{order} = ++$maxOrder;
                }
                # Make a list of the PlusRawSignal/MinusRawSignal or RawSignals we are going to have to make
                my @newViews = ();
                push @newViews, "RawSignal" if $daf->{TRACKS}{RawSignal}{order};
                push @newViews, "PlusRawSignal" if $daf->{TRACKS}{PlusRawSignal}{order};
                push @newViews, "MinusRawSignal" if $daf->{TRACKS}{MinusRawSignal}{order};
                foreach my $newView (@newViews) #loop around making them
                {
                    # The new view starts as a copy of the Alignments line and
                    # gets its own view, type and (below) file list.
                    my $alignmentLine = $ddfReplicateSets{$key}{VIEWS}{Alignments};
                    my %line = %{$alignmentLine};
                    $line{view} = $newView;
                    $line{type} = 'wig';
                    $ddfReplicateSets{$key}{VIEWS}{$newView} = \%line;
                    my @unzippedFiles = ();
                    doTime("beginning unzipping replicates files for view [$newView] key=[$key]") if $opt_timing;
                    # NOTE(review): file names are interpolated unquoted into the
                    # shell commands below; names with spaces or shell
                    # metacharacters will not work.
                    for my $file (@{$alignmentLine->{files}}) {
                        # Unzip any zipped files - only works if they are with .gz suffix
                        my ($fbase,$dir,$suf) = fileparse($file, ".gz");
                        if ($suf eq ".gz") {
                            # If the zipped file exists then unzip it (do this each time, in case zip file is updated
                            # This check is also done above at the stage where we are testing the files in the ddf exist
                            if (-s $file) {
                                my $err = system("gunzip -c $file > $dir/$fbase");
                                if ($err) {
                                    die ("File \'$file\' failed gunzip $file to [$dir/$fbase]\n");
                                }
                                HgAutomate::verbose(2, "File \'$file\' gunzipped to \'$fbase\'\n");
                            }
                            if ( ! -s "$dir/$fbase") {
                                die ("Unzipped file \'$fbase\' does not exist (or is empty) for DDF file \'$file\'\n");
                            }
                            # Record the unzipped file where it was actually written.
                            # fileparse gives "./" for a name without a directory, in
                            # which case the bare name is kept as before; otherwise
                            # the directory must be retained or the sort below would
                            # look for the file in the wrong place.
                            push @unzippedFiles, ($dir eq "./" ? $fbase : "$dir$fbase");
                        } else {
                            push @unzippedFiles, $file;
                        }
                    }
                    doTime("done unzipping replicates files") if $opt_timing;
                    # From here on the Alignments line refers to the unzipped files.
                    $alignmentLine->{files} = \@unzippedFiles;
                    # Now we can safely sort these files as none are zipped
                    my $files = join(" ", @{$alignmentLine->{files}});
                    my $tmpFile = $Encode::autoCreatedPrefix . $newView. "$tmpCount.bed"; # add the type of view to the name
                    $tmpCount++;
                    if($opt_skipAutoCreation) {
                        HgAutomate::verbose(2, "Skipping auto-creating view '$newView' for key '$key'\n");
                    } else {
                        HgAutomate::verbose(2, "Auto-creating view '$newView' for key '$key' in file '$tmpFile'\n");
                        doTime("beginning Auto-create of view $newView in file $tmpFile") if $opt_timing;
                        # XXXX gzip before saving to disk?
                        # Pipeline: [extend ranges |] sort | drop track lines
                        #           | [keep one strand |] bedItemOverlapCount
                        my @cmds;
                        my $sortFiles;
                        if(defined($daf->{medianFragmentLength})) {
                            push(@cmds, "/cluster/bin/x86_64/bedExtendRanges $daf->{assembly} $daf->{medianFragmentLength} $files");
                            $sortFiles = " -";
                            # sorting stdin, so have to sort in mem (and control how much mem we use)
                            push @cmds, "sort $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n $sortFiles";
                        } else {
                            $sortFiles = $files;
                            # sort each file in place, controlling mem usage, then do merge sort
                            my @sortList = split(/\s+/, $sortFiles);
                            foreach my $f (@sortList) {
                                my $err = system("sort $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n -o $f $f ");
                                if ($err) {
                                    die ("File \'$f\' failed sort\n");
                                }
                                HgAutomate::verbose(2, "File \'$f\' sorted\n");
                            }
                            # Now do the mergesort in the pipeline
                            push @cmds, "sort -m $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n $sortFiles";
                        }
                        push @cmds, "grep -v -E \"^track\" ";
                        # stranded views keep only the lines whose 6th column is that strand
                        push @cmds, "gawk '\$6 == \"+\" {print}'" if $newView eq "PlusRawSignal";
                        push @cmds, "gawk '\$6 == \"-\" {print}'" if $newView eq "MinusRawSignal";
                        push @cmds, "bedItemOverlapCount $daf->{assembly} stdin";
                        my $safe = SafePipe->new(CMDS => \@cmds, STDOUT => $tmpFile, DEBUG => $opt_verbose - 1);
                        if(my $err = $safe->exec()) {
                            print STDERR "ERROR: failed auto bedItemOverlap creation of bedGraph for $key" . $safe->stderr() . "\n";
                            # don't show end-user pipe error(s)
                            pushError(\@errors, "failed creation of wiggle for '$key'");
                        }
                        doTime("done Auto-create of view $newView") if $opt_timing;
                    }
                    $line{files} = [$tmpFile];
                    push(@ddfLines, \%line);
                } # End foreach newView loop
            }
        }
    } # End replicate sets loop
    doTime("done ddfReplicateSets loop") if $opt_timing;
}
my $compositeTrack = Encode::compositeTrackName($daf);

# A composite track missing from trackDb is deliberately not an error: the
# composite entry can be added when subtracks are first added to trackDb.
# We only record whether it exists, so an entry can be generated below.
my $compositeExists = $db->quickQuery("select count(*) from trackDb where tableName = ?", $compositeTrack);

# Report every user error collected so far in a single message and stop.
if(@errors) {
    die((@errors > 1 ? "Error(s)" : "Error") . ":\n\n" . join("\n\n", @errors) . "\n");
}
# After this point, we don't use @errors and just die immediately.
# Validate files and metadata fields in all ddfLines using controlled
# vocabulary. Create load.ra file for loader and trackDb.ra file for wrangler.
doTime("beginning out files") if $opt_timing;
if($opt_skipOutput) {
    # Output is discarded, but the handles must still be open for the prints below.
    open(LOADER_RA, '>>', '/dev/null') || die "SYS ERROR: Can't write '/dev/null'; error: $!\n";
    open(TRACK_RA, '>>', '/dev/null') || die "SYS ERROR: Can't write '/dev/null'; error: $!\n";
    open(README, '>>', '/dev/null') || die "SYS ERROR: Can't write '/dev/null'; error: $!\n";
} else {
    # Three-argument open keeps the mode separate from the path.
    open(LOADER_RA, '>', "$outPath/$Encode::loadFile") || die "SYS ERROR: Can't write \'$outPath/$Encode::loadFile\' file; error: $!\n";
    open(TRACK_RA, '>', "$outPath/$Encode::trackFile") || die "SYS ERROR: Can't write \'$outPath/$Encode::trackFile\' file; error: $!\n";
    open(README, '>', "$outPath/README.txt") || die "SYS ERROR: Can't write '$outPath/README.txt' file; error: $!\n";
}
# Create a composite track entry if the trackDb.ra entry was not found
if(!$opt_skipOutput && !$compositeExists) {
    printCompositeTdbSettings(*TRACK_RA,$daf,%ddfSets);
}
# Emit one load.ra stanza per DDF line, plus a trackDb.ra stanza for every
# view that is loaded as a track and a README.txt entry for download-only
# files and submitted wig data.  Any problem found here is fatal.
#
# XXXX Calculation of priorities still needs work; we currently don't account for multiple experiments in the same DDF.
# It may in fact be too much work to do automatic calculation of priorities (i.e. the wrangler may have to do it manually).
# Base priority: the highest priority already used by a subtrack of this composite (0 if none).
my $priority = $db->quickQuery("select max(priority) from trackDb where settings like '%subTrack $compositeTrack%'") || 0;
# Restart the counter used in error messages; it advances once per entry of
# @ddfLines, which also holds the auto-created lines appended above.
$ddfLineNumber = 1;
foreach my $ddfLine (@ddfLines) {
$ddfLineNumber++;
my $diePrefix = "ERROR on DDF lineNumber $ddfLineNumber:";
my $view = $ddfLine->{view};
my $type = $daf->{TRACKS}{$view}{type} || die "Missing DAF entry for view '$view'\n";
HgAutomate::verbose(2, " View: $view\n");
# A replicate number is needed (and must be positive) only for views that have replicates.
my $replicate;
if($hasReplicates && $daf->{TRACKS}{$view}{hasReplicates}) {
$replicate = $ddfLine->{replicate};
if(defined($replicate) && $replicate > 0) {
} else {
die "$diePrefix invalid or missing replicate value\n";
}
}
# Construct table name from track name and variables
my $tableName = "$compositeTrack$view";
if(defined($replicate)) {
$tableName .= "Rep$replicate";
}
# Labels start from the optional per-view prefixes in the DAF.
if(!defined($daf->{TRACKS}{$view}{shortLabelPrefix})) {
$daf->{TRACKS}{$view}{shortLabelPrefix} = "";
}
my $shortLabel = defined($daf->{TRACKS}{$view}{shortLabelPrefix}) ? $daf->{TRACKS}{$view}{shortLabelPrefix} : "";
my $longLabel = "ENCODE" . (defined($daf->{TRACKS}{$view}{longLabelPrefix}) ? " $daf->{TRACKS}{$view}{longLabelPrefix}" : "");
if(defined($replicate)) {
$longLabel .= " Replicate $replicate";
}
my $subGroups = "view=$view";
my $additional = "\n";
my $pushQDescription = "";
my $species;
if (@variables) {
# %hash holds this line's value for each DAF variable.
my %hash = map { $_ => $ddfLine->{$_} } @variables;
# Append each variable's value (first letter upper case, rest lower case) to the table name.
for my $var (@variables) {
my $val = $hash{$var};
$val = ucfirst(lc($val));
# trailing + => Pos, - => Neg (e.g. H9ES-AFP+)
$val =~ s/\+$/Pos/;
$val =~ s/\-$/Neg/;
$tableName = $tableName . $val;
}
my $shortSuffix = "";
my $longSuffix;
# Abbreviation appended to the short label for these views.
my %shortViewMap = (Peaks => 'Pk', Signal => 'Sig', RawSignal => 'Raw', PlusRawSignal => 'PlusRaw', MinusRawSignal => 'MinusRaw');
# Pick label suffixes and the pushQ description from whichever combination
# of variables this line has; the first matching combination wins.
if($hash{'antibody'} && $hash{'cell'}) {
$pushQDescription = "$hash{'antibody'} in $hash{'cell'}";
$shortSuffix = "$hash{'antibody'} $hash{'cell'}";
$longSuffix = "$hash{'antibody'} in $hash{'cell'} cells";
} elsif($hash{'ripAntibody'} && $hash{'ripTgtProtein'} && $hash{'cell'}) {
$longSuffix = "$hash{'ripTgtProtein'} in $hash{'cell'} cells using $hash{'ripAntibody'}";
$pushQDescription = $longSuffix;
$shortSuffix = "$hash{'ripTgtProtein'} $hash{'cell'} $hash{'ripAntibody'}";
} elsif($hash{'rnaExtract'} && $hash{'localization'} && $hash{'cell'}) {
my $suf = $hash{'mapAlgorithm'} ? "$hash{'mapAlgorithm'}" : "";
$shortSuffix = "$hash{'rnaExtract'} $hash{'cell'} $hash{'localization'} $suf";
$longSuffix = "$hash{'rnaExtract'} in $hash{'cell'} cell $hash{'localization'} using $suf";
$pushQDescription = $longSuffix;
} elsif($hash{'freezeDate'}) {
$shortSuffix = $hash{'freezeDate'};
$longSuffix = $hash{'freezeDate'};
$pushQDescription = $longSuffix;
} elsif ($hash{"species"}) {
# The first pushQDescription value is replaced by the assignment at the end of this branch.
$pushQDescription = "$hash{'species'}";
$shortSuffix = "$hash{'species'}";
$longSuffix = "in $hash{'species'}";
$species = "$hash{'species'}";
$pushQDescription = "$view $daf->{dataType} $longSuffix";
} elsif ($hash{"cell"}) {
$pushQDescription = "$hash{'cell'}";
$shortSuffix = "$hash{'cell'}";
$longSuffix = "in $hash{'cell'} cells";
} else {
warn "Warning: variables undefined for pushQDescription,shortSuffix,longSuffix\n";
}
if(defined($shortViewMap{$view})) {
$shortSuffix .= " " . $shortViewMap{$view};
}
if(defined($replicate)) {
$shortSuffix .= " $replicate";
$pushQDescription .= " Replicate $replicate";
}
if($shortSuffix) {
$shortLabel = $shortLabel ? "$shortLabel ($shortSuffix)" : $shortSuffix;
}
if($longSuffix) {
$longLabel .= " ($longSuffix)";
}
# make the "subGroups" and "additional" fields from all variables
for my $var (sort keys %hash) {
# The var name is over-ridden for antibody and cell, for historical reasons
my $groupVar = $var;
$groupVar = "factor" if $var eq "antibody";
$groupVar = "cellType" if $var eq "cell";
$subGroups .= " $groupVar=$hash{$var}";
$additional = " $var $hash{$var}\n" . $additional;
}
}
# mysql doesn't allow hyphens in table names and our naming convention doesn't allow underbars; to be
# safe, we strip non-alphanumerics.
$tableName =~ s/[^A-Za-z0-9]//g;
die "Table name [$tableName] too long, must be <= 64 chars, got [".length($tableName)."]\n" if length($tableName) > 64;
# Two DDF lines must never produce the same table name.
if($tableNamesUsed{$tableName}++) {
dieTellWrangler("System Error: identical tableName '$tableName' was generated by multiple data sets\n");
}
# Unless reloads are allowed, refuse a table that trackDb already lists.
if(!$opt_allowReloads) {
if($db->quickQuery("select count(*) from trackDb where tableName = ?", $tableName)) {
die "view '$view' has already been loaded as track '$tableName'\nPlease contact your wrangler if you need to reload this data\n";
}
}
# XXXX Move the decision about which views have tracks into the DAF?
# Already this is used in 2 places so made it a function,
# would be better in the DAF except we'd have to go change all the DAFs :(
my $downloadOnly = isDownloadOnly($view, $daf->{grant}, $daf->{lab}, $daf);
# load.ra stanza for this table; the 'species' variable, when present, is written as the assembly.
print LOADER_RA "tablename $tableName\n";
print LOADER_RA "view $view\n";
print LOADER_RA "type $type\n";
if($species) {
print LOADER_RA "assembly $species\n";
} else {
print LOADER_RA "assembly $daf->{assembly}\n";
}
print LOADER_RA "files @{$ddfLine->{files}}\n";
print LOADER_RA "downloadOnly $downloadOnly\n";
print LOADER_RA "pushQDescription $pushQDescription\n";
print LOADER_RA "\n";
# Day, month and year of the restriction date computed from the submission time.
my (undef, undef, undef, $rMDay, $rMon, $rYear) = Encode::restrictionDate($now);
if($downloadOnly || ($type eq "wig" && !grep(/$Encode::autoCreatedPrefix/, @{$ddfLine->{files}}))) {
# adds entries to README.txt for download only files AND wig data (excepting wig data generated by us)
print README "file: $tableName.$type.gz\n";
for my $var (@variables) {
print README "$var: " . $ddfLine->{$var} . "\n";
}
if(defined($replicate)) {
print README "replicate: $replicate\n";
}
print README sprintf("data RESTRICTED UNTIL: %d-%02d-%02d\n", 1900 + $rYear, $rMon + 1, $rMDay);
print README "\n";
}
# trackDb.ra subtrack stanza, only for views that are loaded as tracks.
if(!$downloadOnly) {
print TRACK_RA " track $tableName\n";
print TRACK_RA " release alpha\n";
print TRACK_RA " subTrack $compositeTrack\n";
print TRACK_RA " shortLabel $shortLabel\n";
print TRACK_RA " longLabel $longLabel\n";
print TRACK_RA " subGroups $subGroups\n";
if($type eq 'wig') {
my $placeHolder = Encode::wigMinMaxPlaceHolder($tableName);
print TRACK_RA " type $type $placeHolder\n";
} elsif($type eq 'gtf') { # GTF is converted to and loaded as genePred
print TRACK_RA " type genePred\n";
} elsif($type eq 'tagAlign') { # tagAligns are bed 6 but with column called 'sequence' instead of 'name'
print TRACK_RA " type bed 6\n";
} else {
print TRACK_RA " type $type\n";
}
print TRACK_RA sprintf(" dateSubmitted %04d-%02d-%02d\n", 1900 + $year, $mon + 1, $mday);
print TRACK_RA sprintf(" dateUnrestricted %04d-%02d-%02d\n", 1900 + $rYear, $rMon + 1, $rMDay);
print TRACK_RA sprintf(" dataVersion %s\n", $Encode::dataVersion);
if(defined($ddfLine->{accession}) && length($ddfLine->{accession}) > 0) {
print TRACK_RA sprintf(" accession %s\n",$ddfLine->{accession});
}
# Subtrack priority is the base priority plus the view's order from the DAF.
print TRACK_RA " priority " . ($priority + $daf->{TRACKS}{$view}{order}) . "\n";
# noInherit is necessary b/c composite track will often have a different dummy type setting.
print TRACK_RA " noInherit on\n";
# The "and 0" disables this branch, so every view is currently written as configurable.
if($view eq 'RawSignal' and 0) { # Sorry tim, you will have to list your projects here
print TRACK_RA " configurable off\n";
} else {
print TRACK_RA " configurable on\n";
}
# Type-specific display settings
if($type eq 'wig') {
print TRACK_RA <<END;
spanList first
windowingFunction mean
maxHeightPixels 100:16:16
END
} elsif($type eq 'bed 5 +') {
print TRACK_RA " useScore 1\n";
}
# One "variable value" line per DAF variable, followed by the blank line that ends the stanza.
print TRACK_RA $additional;
}
}
close(LOADER_RA);
close(TRACK_RA);
close(README);
doTime("done out files") if $opt_timing;

# When the submission directory name ends in a number and its parent
# directory name contains "_<instance>", record a summary of the submission
# in that pipeline instance's projects table (row id = the trailing number).
if($submitDir =~ /(\d+)$/) {
    my $id = $1;
    if(dirname($submitDir) =~ /_(.*)/) {
        my $instance = $1;
        # XXXX rubyDb logic s/d probably be moved to Encode.pm
        my $rubyDb = HgDb->new(DB => "encpipeline_$instance");
        # one entry per distinct combination of variable values seen in the DDF
        my @tmp = keys %metadataHash;
        my $count = scalar(@tmp);
        my $metadata = join("; ", @tmp);
        # the count used to be printed as the literal word 'count'
        HgAutomate::verbose(2, "Updating id '$id'; metadata: '$metadata'; count: '$count'\n");
        $rubyDb->execute("update projects set count = ?, metadata = ?, lab = ?, data_type = ?, track = ? where id = ?",
                         $count, $metadata, $daf->{lab}, $daf->{dataType}, $compositeTrack, $id);
    }
}

# Reset the timer base so the final mark reports time since the start of the run.
$time0=$timeStart;
doTime("done. ") if $opt_timing;
exit 0;