src/hg/encode/encodeValidate/doEncodeValidate.pl 1.212

1.212 2010/01/08 22:11:25 braney
check for table existence using tableExists instead of looking in trackDb
Index: src/hg/encode/encodeValidate/doEncodeValidate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v
retrieving revision 1.211
retrieving revision 1.212
diff -b -B -U 1000000 -r1.211 -r1.212
--- src/hg/encode/encodeValidate/doEncodeValidate.pl	7 Jan 2010 23:30:05 -0000	1.211
+++ src/hg/encode/encodeValidate/doEncodeValidate.pl	8 Jan 2010 22:11:25 -0000	1.212
@@ -1,1978 +1,1978 @@
 #!/usr/bin/env perl
 
 # encodeValidate.pl - validate an ENCODE data submission generated by the
 #                       automated submission pipeline
 #
 # Verifies that all files and metadata are present and of correct formats
 # Creates a load file (load.ra) and track configuration (trackDb.ra) for the datasets
 #
 # Returns 0 if validation succeeds
 #
 # Error reporting:
 #
 # We die immediately (with a human readable message) when internal errors are encountered (e.g. file I/O errors or misconfiguration).
 #
 # In order to facilitate debugging of often very large file uploads, we try to accumulate multiple user errors (e.g. DAF, DDF or
 # file syntax errors) before dying with a message that lists all of the errors.
 
 # DO NOT EDIT the /cluster/bin/scripts copy of this file --
 # edit the CVS'ed source at:
 # $Header$
 
 use warnings;
 use strict;
 
 use File::stat;
 use File::Basename;
 use Getopt::Long;
 use English;
 use Carp qw(cluck);
 use Cwd;
 use IO::File;
 
 use lib "/cluster/bin/scripts";
 use Encode;
 use HgAutomate;
 use HgDb;
 use RAFile;
 use SafePipe;
 
 use vars qw/
     $opt_allowReloads
     $opt_configDir
     $opt_fileType
     $opt_justFileDb
     $opt_metaDataOnly
     $opt_outDir
     $opt_quick
     $opt_skipAll
     $opt_skipAutoCreation
     $opt_skipOutput
     $opt_skipValidateFiles
     $opt_skipValidateFastQ
     $opt_validateDaf
     $opt_validateFile
     $opt_sendEmail
     $opt_verbose
     $opt_timing
     /;
 
 # Global variables
 our $submitPath;        # full path of data submission directory
 our $configPath;        # full path of configuration directory
 our $outPath;           # full path of output directory
 our %terms;             # controlled vocabulary, indexed by type and term
 our %tags;              # controlled vocabulary, indexed by tag
 our $quickCount=100;
 our $quickOpt = "";     # option to pass to validateFiles prog
 our $time0 = time;
 our $timeStart = time;
 our %chromInfo;         # chromInfo from assembly for chrom validation
 our $maxBedRows=80_000_000; # number of rows to allow in a bed-type file
 our %tableNamesUsed;
 our ($grants, $fields, $daf);
 our $SORT_BUF = " -S 5G ";
 our $assembly;
 
 sub usage {
     print STDERR <<END;
 usage: encodeValidate.pl submission-type project-submission-dir
 
 submission-type is currently ignored.
 
 Current dafVersion is: $Encode::dafVersion
 
 Creates the following output files: $Encode::loadFile, $Encode::trackFile and README.txt
 
 options:
     -allowReloads       Allow reloads of existing tables
     -configDir=dir      Path of configuration directory, containing
                         metadata .ra files (default: submission-dir/../config)
     -database=assembly  Specify an assembly; necessary only when using -validateFile
     -fileType=type      Used only with the -validateFile option; e.g. narrowPeak
     -justFileDb         Just generate the fileDb.ra file which contains all metadata
     -metaDataOnly       Process DAF/DDF and just update the projects.metadata field;
                         equivalent to -allowReloads -skipAll
     -quick              Validate only the first $quickCount lines of files
     -skipAll            Turn on all "-skip..." options
     -skipAutoCreation   Tells the script to skip creating the auto-created files (e.g. RawSignal, PlusRawSignal, MinusRawSignal);
                         this can save a lot of time when debugging and re-running the script on large projects
     -skipOutput         Don't write the various output files
     -skipValidateFiles  Tells the script to skip the file validation step, to save a lot of time during testing
     -validateDaf        Exit after validating the DAF file (project-submission-dir is the DAF file name).
     -validateFile       Exit after validating the file (project-submission-dir is the file name;
                         requires -fileType option as well)
     -verbose=num        Set verbose level to num (default 1).
     -outDir=dir         Path of output directory, for validation files
                         (default: submission-dir/out)
 END
 exit 1;
 }
 
 sub pushError
 {
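     # Append new error messages to the accumulating error list.
     # Typical usage (illustrative): pushError(\@errors, "some problem");
     # callers later report everything at once, e.g.:
     #     die "Errors:\n" . join("\n", @errors) . "\n" if(@errors);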
     my ($errors, @new) = @_;
     if(@new) {
         push(@{$errors}, @new);
         HgAutomate::verbose(2, "pushing errors:\n\t" . join("\n\t", @new) . "\n");
     }
 }
 
 sub doTime
 # Print the time difference in seconds since the last call to this function, or since the program started.
 {
     my $msg = shift || "";
     my $lines = shift || 0;
     my $time1 = time;
     my $t = $time1-$time0;
     $t = 1 if ($lines>0 and $t<1);
     warn("# $msg : $t secs".($lines>0 ? "  ($lines lines, ".(int($lines/$t))." lines/sec)" : ""));
     $time0 = time;
 }
 
 sub dieTellWrangler
 {
     my ($msg) = @_;
     my $email;
     if($grants->{$daf->{grant}} && $grants->{$daf->{grant}}{wranglerEmail}) {
         $email = $grants->{$daf->{grant}}{wranglerEmail};
     }
     $msg .= "Please contact your wrangler" . (defined($email) ? " at $email" : "") . "\n";
     die $msg;
 }
 
 ############################################################################
 # Validators for DDF columns -- extend when adding new metadata fields
 #
 # validators should return list of errors encountered (empty list means no errors were found).
 #
 # validator callbacks are called thus:
 #
 # validator(value, type, track, daf);
 #
 # value is the value in the DDF column
 # type is the DDF field name
 # track is the track/view value
 # daf is the daf hash
 
 # dispatch table
 our %validators = (
     files => \&validateFiles,
     view => \&validateDatasetName,
     labVersion => \&validateNoValidation,
     softwareVersion => \&validateNoValidation,
     accession => \&validateNoValidation,
     replicate => \&validateNoValidation,
     fragLength => \&validateNoValidation,
     cell => \&validateControlledVocabOrControl,
     antibody => \&validateControlledVocabOrControl,
     ripAntibody => \&validateControlledVocabOrControl,
     treatment => \&validateControlledVocabOrControl,
     protocol => \&validateControlledVocabOrControl,
     restrictionEnzyme => \&validateControlledVocabOrControl,
     default => \&validateControlledVocab,
     );
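 
 # To support a new DDF column, add an entry to the dispatch table above.  For
 # example (illustrative; "readType" is a hypothetical field name):
 #     readType => \&validateControlledVocab,
 # Fields with no entry fall through to the 'default' validator, which checks the
 # value against the controlled vocabulary for that field type.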
 
 # standard validators (required or optional for all projects)
 
 sub validateFiles {
     # Validate array of filenames, ordered by part
     # Check files exist and are of correct data format
     my ($files, $type, $track, $daf) = @_;
     my @newFiles;
     my @errors;
     my $regex = "\`\|\\\|\|\"\|\'";
     doTime("beginning validateFiles") if $opt_timing;
     for my $file (@{$files}) {
         my @list = glob $file;
         if(@list) {
             push(@newFiles, @list);
         } else {
             pushError(\@errors, "File '$file' does not exist (possibly bad glob?)");
         }
     }
     HgAutomate::verbose(3, "     Track: $track    Files: " . join (' ', @newFiles) . "\n");
     return () if $opt_skipValidateFiles;
     for my $file (@newFiles) {
         my ($fbase,$dir,$suf) = fileparse($file, ".gz");
 	# Check if the file has been replaced with an unzipped version
         # This check is also done where we auto create the RawSignal view from the Alignments
         if ($suf eq ".gz" and ! -e $file and -s "$dir/$fbase") {
             $file = "$dir/$fbase";
         }
         if($file =~ /($regex)/) {
             # Do not allow filenames with suspicious characters (because the filename will be used in shell commands).
             pushError(\@errors, "File '$file' has invalid characters; files cannot contain following characters: \"'`|");
         } elsif(!-e $file) {
             pushError(\@errors, "File \'$file\' does not exist");
         } elsif(!(-s $file)) {
             pushError(\@errors, "File \'$file\' is empty");
         } elsif(!(-r $file)) {
             pushError(\@errors, "File \'$file\' is un-readable");
         } else {
             pushError(\@errors, checkDataFormat($daf->{TRACKS}{$track}{type}, $file));
         }
     }
     $files = \@newFiles;
     doTime("done validateFiles") if $opt_timing;
     return @errors;
 }
 
 sub validateDatasetName {
     my ($val) = @_;
     return ();
 }
 
 sub validateDataType {
     my ($val) = @_;
     return ();
 }
 
 sub validateRawDataAcc {
 # No validation
     return ();
 }
 
 sub validateNoValidation {
 # No validation
     return ();
 }
 
 # project-specific validators
 
 sub validateControlledVocabOrControl {
     my ($val, $type) = @_;
     if($type eq 'cell') {
         $type = 'Cell Line';
     } elsif ($type eq 'antibody') {
         $type = 'Antibody';
     }
     return defined($terms{$type}{$val} || $terms{'control'}{$val}) ? () : ("Controlled Vocabulary \'$type\' value \'$val\' is not known");
 }
 
 sub validateControlledVocab {
     my ($val, $type) = @_;
     return defined($terms{$type}{$val}) ? () : ("Controlled Vocabulary \'$type\' value \'$val\' is not known");
 }
 
 ############################################################################
 # Format checkers - check file format for given types; extend when adding new
 # data formats
 #
 # Some of the checkers use regular expressions to validate the syntax of the files.
 # Others pass the first 10 lines to utility loaders; the latter approach has:
 # advantages:
 # 	checks semantics as well as syntax
 # disadvantages:
 # 	only checks the beginning of the file
 # 	some of the loaders tolerate invalid files (but then give incorrect results)
 
 # dispatch table
 our %formatCheckers = (
     wig => \&validateWig,
     bed => \&validateBed,
     bedGraph => \&validateBedGraph,
     bed5FloatScore => \&validateBed,
     genePred => \&validateGene,
     gtf => \&validateGtf,
     tagAlign => \&validateTagAlign,
     pairedTagAlign => \&validatePairedTagAlign,
     narrowPeak => \&validateNarrowPeak,
     broadPeak => \&validateBroadPeak,
     gappedPeak => \&validateGappedPeak,
     fastq => \&validateFastQ,
     csfasta => \&validateCsfasta,
     csqual  => \&validateCsqual,
     rpkm  => \&validateRpkm,
     fasta  => \&validateFasta,
     bowtie  => \&validateBowtie,
     psl  => \&validatePsl,
     SAM => \&validateSAM,
     BAM => \&validateBAM,
     cBiP => \&validateFreepass,  # TODO: this is a dodge, because bed file is for different species, so chrom violations
     );
 
 my $floatRegEx = "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?";
 # my $floatRegEx = "[+-]?(?:\\.\\d+|\\d+(?:\\.\\d+|[eE]{1}?[+-]{1}?\\d+))";  # Tim's attempt
 # my $floatRegEx = "[+-]?(?:\\.\\d+|\\d+(?:\\.\\d+|))";                      # Original
 my %typeMap = (int => "[+-]?\\d+", uint => "\\d+", float => $floatRegEx, string => "\\S+");
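 # e.g. (illustrative values) 'int' matches -5 or +12, 'uint' matches 1234,
 # 'float' matches 0.5 or 1.2e-3, and 'string' matches any whitespace-free token.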
 
 sub listToRegExp
 {
 # Return a regular expression for given list of field specific tests.
 #
 # $validateList is a reference to a list of hashes with: {NAME, REGEX or TYPE}
 # If a line fails this regular expression, you should then call validateWithListUtil with this line
 # and validation list to generate a field specific error message; this is a speedup hack,
 # because we want to avoid calling validateWithListUtil for every line (because validateWithListUtil is really
 # slow).
 #
 # Note that the 'chrom' field is captured, so you should test %chromInfo (e.g. $chromInfo($1))
 # after using the regular expression to verify that the line has a valid chrom.
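 # For example (illustrative), a list of {TYPE => 'chrom'}, {TYPE => 'uint'},
 # {TYPE => 'uint'} fields yields the pattern "^(\S+)\s+\d+\s+\d+$".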
     my ($validateList) = @_;
     my @list;
     for my $validateField (@{$validateList}) {
         my $type = $validateField->{TYPE};
         if(defined($type) && $type eq 'chrom') {
             push(@list, "(\\S+)");
         } else {
             my $regex;
             if($type) {
                 if(!($regex = $typeMap{$type})) {
                     die "PROGRAM ERROR: invalid TYPE: $type\n";
                 }
             } elsif(!($regex = $validateField->{REGEX})) {
                 die "PROGRAM ERROR: invalid type list (missing required REGEX or TYPE)\n";
             }
             push(@list, $regex);
         }
     }
     return "^" . join("\\s+", @list) . "\$";
 }
 
 sub validateWithListUtil
 {
 # Validate $line using a validation list.
 # returns error string or undef if line passes validation
 # This is designed to give better feedback to user; ideally we would load the validation list from the .as files
     my ($line, $validateList) = @_;
     my @list = split(/\s+/, $line);
     my $fieldError = "; saw '" . scalar(@list) . "' fields; expected: '" . @{$validateList} . "'";
     if(@list < @{$validateList}) {
         return "not enough fields" . $fieldError;
     } elsif(@list > @{$validateList}) {
         return "too many fields" . $fieldError;
     } else {
         for my $validateField (@{$validateList}) {
             my $val = shift(@list);
             my $type = $validateField->{TYPE};
             if(defined($type) && $type eq 'chrom') {
                 if(!$chromInfo{$val}) {
                     return "value '$val' for field '$validateField->{NAME}' is an invalid chromosome";
                 }
             } else {
                 my $regex;
                 if($type) {
                     if(!($regex = $typeMap{$type})) {
                         die "PROGRAM ERROR: invalid TYPE: $type\n";
                     }
                 } elsif(!($regex = $validateField->{REGEX})) {
                     die "PROGRAM ERROR: invalid type list (missing required REGEX or TYPE)\n";
                 }
                 if($val !~ /^$regex$/) {
                     my $error = "value '$val' is an invalid value for field '$validateField->{NAME}'";
                     if($type) {
                         $error .= "; must be type '$type'";
                     }
                     return $error;
                 }
             }
         }
     }
     return undef;
 }
 
 sub validateWithList
 {
 # open a file and validate each line with $validateList
 # $name is the caller's subroutine name (used in error and debug messages).
     my ($path, $file, $type, $maxRows, $name, $validateList) = @_;
     my $lineNumber = 0;
     my $fh = Encode::openUtil($file, $path);
     my $regexp = listToRegExp($validateList);
     my $hasChrom = 0;
     for my $rec (@{$validateList}) {
         $hasChrom++ if($rec->{NAME} eq "chrom");
     }
     doTime("beginning validateWithList $name,$type,$maxRows") if $opt_timing;
     while(my $line = <$fh>) {
         chomp $line;
         $lineNumber++;
         return ("Invalid $type file; line $lineNumber in file '$file';\nerror: exceeded maximum number of rows allowed ($maxRows) \nline: $line") if $lineNumber > $maxRows;
         next if($line =~ m/^#/); # allow comment lines, consistent with lineFile and hgLoadBed
         if($line =~ /$regexp/) {
             if($hasChrom) {
                 my $chrom = $1;
                 if(!$chromInfo{$1}) {
                     return ("Invalid $type file; line $lineNumber in file '$file';\nerror: invalid chrom '$chrom';\nline: $line");
                 }
             }
         } else {
             if(my $error = validateWithListUtil($line, $validateList)) {
                 return ("Invalid $type file; line $lineNumber in file '$file' is invalid;\n$error;\nline: $line");
             } else {
                 die "PROGRAM ERROR: inconsistent results from validateWithListUtil\n";
             }
         }
         last if($opt_quick && $lineNumber >= $quickCount);
     }
     $fh->close();
     HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
     doTime("done validateWithList $name,$type,$maxRows",$lineNumber) if $opt_timing;
     return ();
 }
 
 
 sub validateFreepass
 {
     my ($path, $file, $type) = @_;
     doTime("beginning validateFreepass") if $opt_timing;
     my $fh = Encode::openUtil($file, $path);
     #my $lineNumber = 0;
     #while(<$fh>) {
     #    chomp;
     #    $lineNumber++;
     #    last if($opt_quick && $lineNumber >= $quickCount);
     #}
     $fh->close();
     HgAutomate::verbose(2, "File \'$file\' free pass on validation\n");
 
     doTime("done validateFreepass") if $opt_timing;
     return ();
 }
 
 
 sub validateWig
 {
     my ($path, $file, $type) = @_;
     my $filePath = defined($path) ? "$path/$file" : $file;
     doTime("beginning validateWig") if $opt_timing;
 
     HgAutomate::verbose(2, "validateWig($file,$type) -> wigEncode\n");
     my @cmds;
     # wigEncode knows how to handle zipped files so we do not need to special case them.
     push(@cmds, "/cluster/bin/x86_64/wigEncode -noOverlapSpanData $filePath /dev/null /dev/null");
     # This can produce /data/tmp/SafePipe_NNN_.err files
     my $safe = SafePipe->new(CMDS => \@cmds, STDOUT => "/dev/null", DEBUG => $opt_verbose - 1);
     if(my $err = $safe->exec()) {
         my $err = $safe->stderr();
         chomp($err);
         return "File \'$file\' failed wiggle validation: " . $err;
     } else {
         HgAutomate::verbose(2, "File \'$file\' passed wiggle validation\n");
     }
     doTime("done validateWig") if $opt_timing;
     return ();
 }
 
 sub validateBed {
 # Validate each line of a bed 5 or greater file.
     my ($path, $file, $type) = @_;
     my $lineNumber = 0;
     doTime("beginning validateBed") if $opt_timing;
     my $fh = Encode::openUtil($file, $path);
     while(<$fh>) {
         chomp;
         $lineNumber++;
         next if m/^#/; # allow comment lines, consistent with lineFile and hgLoadBed
         my @fields = split /\s+/;
         my $fieldCount = @fields;
         next if(!$fieldCount);
         my $prefix = "Failed bed validation, file '$file'; line $lineNumber:";
         if(/^(track|browser)/) {
             ;
         } elsif($fieldCount < 3) {
             die "$prefix not enough fields; " . scalar(@fields) . " present; at least 3 are required\n";
         } elsif (!$chromInfo{$fields[0]}) {
             die "$prefix field 1 value ($fields[0]) is invalid; not a valid chrom name\n";
         } elsif ($fields[1] !~ /^\d+$/) {
             die "$prefix field 2 value ($fields[1]) is invalid; value must be a positive number\n";
         } elsif ($fields[2] !~ /^\d+$/) {
             die "$prefix field 3 value ($fields[2]) is invalid; value must be a positive number\n";
         } elsif ($fields[2] < $fields[1]) {
             die "$prefix field 3 value ($fields[2]) is less than field 2 value ($fields[1])\n";
         } elsif ($fieldCount > 4 && $fields[4] !~ /^\d+$/ && $fields[4] !~ /^\d+\.\d+$/) {
             die "$prefix field 5 value ($fields[4]) is invalid; value must be a positive number\n";
         } elsif ($fieldCount > 4 && ($fields[4] < 0 || $fields[4] > 1000)) {
             die "$prefix field 5 value ($fields[4]) is invalid; score must be 0-1000\n";
         } elsif ($type eq 'bed5FloatScore' && $fieldCount < 6) {
             die "$prefix field 6 invalid; bed5FloatScore requires 6 fields";
         } elsif ($type eq 'bed5FloatScore' && $fields[5] !~ /^$floatRegEx$/) {
             die "$prefix field 6 value '$fields[5]' is invalid; must be a float\n";
         } else {
             ;
         }
         last if($opt_quick && $lineNumber >= $quickCount);
     }
     $fh->close();
     HgAutomate::verbose(2, "File \'$file\' passed bed validation\n");
     doTime("done validateBed",$lineNumber) if $opt_timing;
     return ();
 }
 
 sub validateBedGraph {
 # Validate each line of a bedGraph file.
     my ($path, $file, $type) = @_;
     my $lineNumber = 0;
     doTime("beginning validateBedGraph") if $opt_timing;
     my $fh = Encode::openUtil($file, $path);
     while(<$fh>) {
         chomp;
         $lineNumber++;
         next if m/^#/; # allow comment lines, consistent with lineFile and hgLoadBed
         my @fields = split /\s+/;
         my $fieldCount = @fields;
         next if(!$fieldCount);
         my $prefix = "Failed bedGraph validation, file '$file'; line $lineNumber:";
         if(/^(track|browser)/) {
             ;
         } elsif($fieldCount != 4) {
             die "$prefix found " . scalar(@fields) . " fields; need 4\n";
         } elsif (!$chromInfo{$fields[0]}) {
             die "$prefix field 1 value ($fields[0]) is invalid; not a valid chrom name\n";
         } elsif ($fields[1] !~ /^\d+$/) {
             die "$prefix field 2 value ($fields[1]) is invalid; value must be a positive number\n";
         } elsif ($fields[2] !~ /^\d+$/) {
             die "$prefix field 3 value ($fields[2]) is invalid; value must be a positive number\n";
         } elsif ($fields[2] < $fields[1]) {
             die "$prefix field 3 value ($fields[2]) is less than field 2 value ($fields[1])\n";
         } elsif ($fields[3] !~ /^$floatRegEx$/) {
             die "$prefix field 4 value '$fields[3]' is invalid; must be a float [$floatRegEx]\n";
         } else {
             ;
         }
         last if($opt_quick && $lineNumber >= $quickCount);
     }
     $fh->close();
     HgAutomate::verbose(2, "File \'$file\' passed bedGraph validation\n");
     doTime("done validateBedGraph", $lineNumber) if $opt_timing;
     return ();
 }
 
 sub validateGtf {
 # validate GTF by converting to genePred and validating that
     my ($path, $file, $type) = @_;
     my $errFile = "$path/doEncodeValidate.gtf.err";
     doTime("beginning validateGtf") if $opt_timing;
     my $filePath = defined($path) ? "$path/$file" : $file;
     my $outFile = "$path/doEncodeValidate.gtf.bed";
     if(Encode::isZipped($filePath)) {
         # XXXX should be modified to handle zipped files.
         die "We don't currently support gzipped gtf files\n";
     }
     HgAutomate::verbose(2, "validateGtf(path=$path,file=$file,type=$type)\n");
     # XXXX Add support for $opt_quick
     my $err = system ( "gtfToGenePred $filePath $outFile >$errFile 2>&1");
     if ($err) {
         print STDERR  "File \'$file\' failed GTF validation\n";
         open(ERR, "$errFile") || die "ERROR: Can't open gtfToGenePred error file \'$errFile\': $!\n";
         my @err = <ERR>;
         die "@err\n";
     }
     unlink $errFile;
     HgAutomate::verbose(2, "File \'$file\' passed gtfToGenePred conversion \n");
     doTime("done validateGtf") if $opt_timing;
     my @res = validateGene(undef,$outFile,$type);
     if (scalar(@res)==0) { # no errors so remove the temp .bed file
         HgAutomate::verbose(2, "File \'$file\' passed gtf gene validation \n");
 	unlink $outFile;
     }
     return @res;
 }
 
 sub validateGene {
     my ($path, $file, $type) = @_;
     my $outFile = "validateGene.out";
     doTime("beginning validateGene") if $opt_timing;
     my $filePath = defined($path) ? "$path/$file" : $file;
     if(Encode::isZipped($filePath)) {
         # XXXX should be modified to handle zipped files.
         die "We don't currently support gzipped gene files\n";
     }
     # XXXX Add support for $opt_quick
     my $err = system (
         "cd $outPath; egrep -v '^track|browser' $filePath | ldHgGene -out=genePred.tab -genePredExt $assembly testTable stdin >$outFile 2>&1");
     if ($err) {
         print STDERR  "File \'$file\' failed GFF validation\n";
         open(ERR, "$outPath/$outFile") || die "ERROR: Can't open GFF validation file \'$outPath/$outFile\': $!\n";
         my @err = <ERR>;
         die "@err\n";
     } else {
         HgAutomate::verbose(2, "File \'$file\' passed GFF validation\n");
     }
     doTime("done validateGene") if $opt_timing;
     return ();
 }
 
 sub validateTagAlign
 {
     my ($path, $file, $type) = @_;
     # validate chroms, chromSize, etc.
     my $paramList = validationSettings("validateFiles","tagAlign",$assembly);
     my $safe = SafePipe->new(CMDS => ["validateFiles $quickOpt $paramList -type=tagAlign $file"]);
     if(my $err = $safe->exec()) {
 	print STDERR  "ERROR: failed validateTagAlign : " . $safe->stderr() . "\n";
 	# don't show end-user pipe error(s)
 	return("failed validateTagAlign for '$file'");
     }
     return ();
 }
 
 sub validatePairedTagAlign
 # This is like tag align but with two additional sequence fields appended; seq1 and seq2
 {
     my ($path, $file, $type) = @_;
     # validate chroms, chromSize, etc.
     my $paramList = validationSettings("validateFiles","pairedTagAlign",$assembly);
     my $safe = SafePipe->new(CMDS => ["validateFiles $paramList $quickOpt -chromDb=$assembly -type=pairedTagAlign $file"]);
     if(my $err = $safe->exec()) {
 	print STDERR  "ERROR: failed validatePairedTagAlign : " . $safe->stderr() . "\n";
 	# don't show end-user pipe error(s)
 	return("failed validatePairedTagAlign for '$file'");
     }
     return ();
 }
 
 sub validateNarrowPeak
 {
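     # narrowPeak is BED6+4: chrom, chromStart, chromEnd, name, score, strand,
     # signalValue, pValue, qValue, peak.
     # Illustrative (made-up) example line:
     # chr1   9356548 9356648 .  0  .  182.1  5.0945  -1  50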
     my ($path, $file, $type) = @_;
     my @list = ({TYPE => "chrom", NAME => "chrom"},
                 {TYPE => "uint", NAME => "chromStart"},
                 {TYPE => "uint", NAME => "chromEnd"},
                 {TYPE => "string", NAME => "name"},
                 {TYPE => "uint", NAME => "score"},
                 {REGEX => "[+-\\.]", NAME => "strand"},
                 {TYPE => "float", NAME => "signalValue"},
                 {TYPE => "float", NAME => "pValue"},
                 {TYPE => "float", NAME => "qValue"},
                 {TYPE => "int", NAME => "peak"});
     return validateWithList($path, $file, $type, $maxBedRows, "validateNarrowPeak", \@list);
 }
 
 sub validateBroadPeak
 {
     my ($path, $file, $type) = @_;
     # validate chroms, chromSize, etc.
     my $paramList = validationSettings("validateFiles","broadPeak",$assembly);
     my $safe = SafePipe->new(CMDS => ["validateFiles $quickOpt $paramList -type=broadPeak $file"]);
     if(my $err = $safe->exec()) {
 	print STDERR  "ERROR: failed validateBroadPeak : " . $safe->stderr() . "\n";
 	# don't show end-user pipe error(s)
 	return("failed validateBroadPeak for '$file'");
     }
     return ();
 }
 
 sub validateGappedPeak
 {
     my ($path, $file, $type) = @_;
     my @list = ({TYPE => "chrom", NAME => "chrom"},
                 {TYPE => "uint", NAME => "chromStart"},
                 {TYPE => "uint", NAME => "chromEnd"},
                 {TYPE => "string", NAME => "name"},
                 {TYPE => "uint", NAME => "score"},
                 {REGEX => "[+-\\.]", NAME => "strand"},
                 {TYPE => "uint", NAME => "thickStart"},
                 {TYPE => "uint", NAME => "thickEnd"},
                 {TYPE => "string", NAME => "itemRgb"},
                 {TYPE => "uint", NAME => "blockCount"},
                 {TYPE => "string", NAME => "blockSizes"},
                 {TYPE => "string", NAME => "blockStarts"},
                 {TYPE => "float", NAME => "signalValue"},
                 {TYPE => "float", NAME => "pValue"},
                 {TYPE => "float", NAME => "qValue"}
                 );
     return validateWithList($path, $file, $type, $maxBedRows, "validateGappedPeak", \@list);
 }
 
 sub validateFastQ
 {
     # Syntax per http://maq.sourceforge.net/fastq.shtml
     # I added '/' in the seqNameRegEx and plusLine even though it wasn't in the spec
     #   because this is what Colin Kingswood (Gingeras project)
     #   is getting in the fastq files from GIS for the GisPet project
     #   and they are being sent on to us
     # Note on "FASTQ Quality scores":-   http://maq.sourceforge.net/qual.shtml
     # Fastq has 2 different semantics for the score field.
     # - fastq produced directly from Solexa has a 'solexa' quality score
     # - fastq defined by Sanger has a 'PHRED' quality score
     # - The 2 URLs above show how to convert between the two
     my ($path, $file, $type) = @_;
     my $paramList = validationSettings("validateFiles","fastq");
     my $safe = SafePipe->new(CMDS => ["validateFiles $quickOpt $paramList -type=fastq $file"]);
     if(my $err = $safe->exec()) {
 	print STDERR  "ERROR: failed validateFastQ : " . $safe->stderr() . "\n";
 	# don't show end-user pipe error(s)
 	return("failed validateFastQ for '$file'");
     }
     return ();
 }
 
 sub validateCsfasta
 {
     # Syntax per http://marketing.appliedbiosystems.com/mk/submit/SOLID_KNOWLEDGE_RD?_JS=T&rd=dm
     # Sample:-
 
     # # Wed Jul 30 15:30:48 2008 /share/apps/corona/bin/filter_fasta.pl --output=/data/results/S0033/S0033_20080723_2/I22_EA/results.01/primary.20080730194737531 --name=S0033_20080723_2_I22_EA_ --tag=F3 --minlength=30 --mask=111111111111111111111111111111 --prefix=T /data/results/S0033/S0033_20080723_2/I22_EA/jobs/postPrimerSetPrimary.1416/rawseq
     # # Cwd: /home/pipeline
     # # Title: S0033_20080723_2_I22_EA_
     # >461_19_90_F3
     # T203033330010111011221200302001
     # >461_19_209_F3
     # T022213002230311203200200322000
 
     # Files from GIS have this header:
     # >920_22_656_F3,1.-152654094.1.35.35.0###,19.43558664.1.35.35.0###
     # T01301010111200210102321210100112312
     my ($path, $file, $type) = @_;
     doTime("beginning validateCsfasta") if $opt_timing;
     HgAutomate::verbose(2, "validateCsfasta($path,$file,$type)\n");
     my $paramList = validationSettings("validateFiles","csfasta");
     my $safe = SafePipe->new(CMDS => ["validateFiles $quickOpt $paramList -type=csfasta $file"]);
     if(my $err = $safe->exec()) {
 	print STDERR  "ERROR: failed validateCsfasta : " . $safe->stderr() . "\n";
 	# don't show end-user pipe error(s)
 	return("failed validateCsfasta for '$file'");
     }
     HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
     doTime("done validateCsfasta") if $opt_timing;
     return ();
 }
 
 sub validateSAM
 {
     my ($path, $file, $type) = @_;
     doTime("beginning validateSAM") if $opt_timing;
     HgAutomate::verbose(2, "validateSAM($path,$file,$type)\n");
     my $paramList = validationSettings("validateFiles","SAM");
     my $safe = SafePipe->new(CMDS => ["validateFiles $quickOpt $paramList -type=SAM $file"]);
     if(my $err = $safe->exec()) {
 	print STDERR  "ERROR: failed validateSAM : " . $safe->stderr() . "\n";
 	# don't show end-user pipe error(s)
 	return("failed validateSAM for '$file'");
     }
     HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
     doTime("done validateSAM") if $opt_timing;
     return ();
 }
 
 sub validateBAM
 {
     my ($path, $file, $type) = @_;
     doTime("beginning validateBAM") if $opt_timing;
     HgAutomate::verbose(2, "validateBAM($path,$file,$type)\n");
     my $paramList = validationSettings("validateFiles","BAM");
     my $safe = SafePipe->new(CMDS => ["validateFiles $quickOpt $paramList -type=BAM $file"]);
     if(my $err = $safe->exec()) {
 	print STDERR  "ERROR: failed validateBAM : " . $safe->stderr() . "\n";
 	# don't show end-user pipe error(s)
 	return("failed validateBAM for '$file'");
     }
     HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
     doTime("done validateBAM") if $opt_timing;
     return ();
 }
 
 sub validateCsqual
 {
     # Syntax per http://marketing.appliedbiosystems.com/mk/submit/SOLID_KNOWLEDGE_RD?_JS=T&rd=dm
     # Sample:-
 
     # # Cwd: /home/pipeline
     # # Title: S0033_20080723_2_I22_EA_
     # >461_19_90_F3
     # 20 10 8 13 8 10 20 7 7 24 15 22 21 14 14 8 11 15 5 20 6 5 8 22 6 24 3 16 7 11
     # >461_19_209_F3
     # 16 8 5 12 20 24 19 8 13 17 11 23 8 24 8 7 17 4 20 8 29 7 3 16 3 4 8 20 17 9
     my ($path, $file, $type) = @_;
     doTime("beginning validateCsqual") if $opt_timing;
     HgAutomate::verbose(2, "validateCsqual($path,$file,$type)\n");
     my $paramList = validationSettings("validateFiles","csqual");
     my $safe = SafePipe->new(CMDS => ["validateFiles $quickOpt $paramList -type=csqual $file"]);
     if(my $err = $safe->exec()) {
 	print STDERR  "ERROR: failed validateCsqual : " . $safe->stderr() . "\n";
 	# don't show end-user pipe error(s)
 	return("failed validateCsqual for '$file'");
     }
     HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
     doTime("done validateCsqual") if $opt_timing;
     return ();
 }
 
 sub validateFasta
 # Wold lab & Helicos have fasta files; no quality, one line per sequence
 # Sample fasta lines are:
 #>HWI-EAS229_75_30DY0AAXX:7:1:0:949/1
 #NGCGGATGTTCTCAGTGTCCACAGCGCAGGTGAAATAAGGGAAGCAGTAGCGACGCCCATCTCCACGCGCAGCGC
 #>HWI-EAS229_75_30DY0AAXX:7:1:0:1739/1
 #NAGCCATCAGGAAAGCAAGGAGGGGGCATTAAAGGACAATCAAGGGGTTTGGAGGAAGGAGCAGGCCGGAGGCAA
 {
     my ($path, $file, $type) = @_;
     doTime("beginning validateFasta") if $opt_timing;
     HgAutomate::verbose(2, "validateFasta($path,$file,$type)\n");
     my $paramList = validationSettings("validateFiles","fasta");
     my $safe = SafePipe->new(CMDS => ["validateFiles $quickOpt $paramList -type=fasta $file"]);
     if(my $err = $safe->exec()) {
 	print STDERR  "ERROR: failed validateFasta : " . $safe->stderr() . "\n";
 	# don't show end-user pipe error(s)
 	return("failed validateFasta for '$file'");
     }
     HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
     doTime("done validateFasta") if $opt_timing;
     return ();
 }
 
 sub validateRpkm
 # Wold lab format, has gene name and 2 floats
 #   Allowing Gene name to be composed of any characters but <tab>
 #
 # Example format 1 (3 cols):-
 # HBG2    0.583   1973.85
 # RPS20   0.523   1910.01
 # RPLP0   1.312   1800.51
 #
 # Example format 2 (7 cols):- (*.accepted.rpkm)
 # ENSG00000003056 chr12   8989051 8989354 2.43    303     M6PR
 # ENSG00000006015 chr19   18560887        18561077        1.10    190     C19orf60
 # ENSG00000008516 chr16   3047223 3047380 0.61    157     MMP25
 #
 # Example format 3 (5 cols): (*.final.rpkm)
 #GID    gene    len_kb  RPKM    multi/all
 # OTTHUMG00000151214      IGLC2   0.722   3579.34 0.84
 # FAR3664 FAR3664 0.200   3216.32 0.94
 # OTTHUMG00000021144      TMSB4X  3.551   2767.52 0.35
 {
     my ($path, $file, $type) = @_;
     doTime("beginning validateRpkm") if $opt_timing;
     my $lineNumber = 0;
     my $fh = Encode::openUtil($file, $path);
     while(<$fh>) {
         chomp;
         $lineNumber++;
         next if m/^#/;
 	my @fields = split /\s+/;
 	my $cols = scalar(@fields);
         die "Failed $type validation, file '$file'; line $lineNumber: line=[$_]\n"
 	    unless $cols == 3 or $cols == 5 or $cols == 7;
 #            unless m/^([^\t]+)\t(\d+\.\d+)\t(\d+\.\d+)$/;
         last if($opt_quick && $lineNumber >= $quickCount);
     }
     $fh->close();
     HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
     doTime("done validateRpkm", $lineNumber) if $opt_timing;
     return ();
 }
 
 sub validateBowtie
 # Unknown format (for download) from the Wold lab.
 # Assume last column is optional
 # Sample lines:-
 # HWI-EAS229_75_30DY0AAXX:7:1:0:1545/1    +       chr1    5983615 NCGTCCATCTCACATCGTCAGGAAAGGGGGAAGCACTGGATGGCTGTGGCCTCACAGGCAGGGAGAGTGGGGTCC     IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII 0       0:G>N
 # HWI-EAS229_75_30DY0AAXX:7:1:0:1591/1      -       uc002fcb.1|22|70699936  45      CTATTTCCACCAAGCAGCCAAGCTCAAGGGAATCGGGGAGTACGTGAACATCCGCACAGGGATGCCCTGCCACTN     IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII     0       0:T>N]
 # HWI-EAS229_75_30DY0AAXX:7:1:0:1766/1    -       chr18   72954304        GCAGCCACCAGAAGCGGGAAGAGGTGAAGACAGAGCCTCCTGCAGAGCTCCCACTCTGCCAACGCCTTGACTTTN     IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII 0       0:G>N,59:T>G
 {
     my ($path, $file, $type) = @_;
     doTime("beginning validateBowtie") if $opt_timing;
     my $lineNumber = 0;
     my $fh = Encode::openUtil($file, $path);
     while(<$fh>) {
         chomp;
         $lineNumber++;
         next if m/^#/; # allow comment lines, consistent with lineFile and hgLoadBed
         die "Failed bowtie validation, file '$file'; line $lineNumber: line=[$_]\n"
 	    unless $_ =~ m/^([A-Za-z0-9:>_,\.\|\/-]+)\t([+-])\t([A-Za-z0-9:>_,\.\|\/-]+)\t(\d+)\t(\w+)\t(\w+)\t(\d+)\t([A-Za-z0-9:>_,\.\|\/-]+)?$/;
         last if($opt_quick && $lineNumber >= $quickCount);
     }
     $fh->close();
     HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
     doTime("done validateBowtie", $lineNumber) if $opt_timing;
     return ();
 }
 
 sub validatePsl
 # PSL format (for download) from Wold lab.
 # EXAMPLE FROM http://genome.ucsc.edu/FAQ/FAQformat#format2
 # This adds 2 columns (sequence,<tab>sequence,) to the standard 21 columns
 # Only the first 21 are validated
 #
 # Sample first 6 lines
 #psLayout version 3
 #
 #match   mis-    rep.    N's     Q gap   Q gap   T gap   T gap   strand  Q               Q       Q       Q       T               T       T       T       block   blockSizes      qStarts  tStarts
 #        match   match           count   bases   count   bases           name            size    start   end     name            size    start   end     count
 #---------------------------------------------------------------------------------------------------------------------------------------------------------------
 #71      3       0       0       0       0       0       0       -       HWI-EAS229_75_30DY0AAXX:4:1:0:743/1     75      1       75      chr2    242951149       184181032       184181106       1  74,      0,      184181032,      agccttttacagcaacacctttacctctgctagatctttctgtagctcgtctgaagccatgggggctgggtcag,     agccttttccagcaacacctttacctcttctagatctttctgtagctcttctgaagccatgggggctgggtcag,
 #72      2       0       0       0       0       0       0       -       HWI-EAS229_75_30DY0AAXX:7:1:0:713/1     75      1       75      chr14   106368585       49540119        49540193        1  74,      0,      49540119,       cgggtgcgggccgagcagttctccgcacctccggtaaaggttcaggaccgggtgatggtctctgcagcagtcag,     ccggtgcgggccgagcagttctccgcacctccggtaaaggtgcaggaccgggtgatggtctctgcagcagtcag,
 {
     my ($path, $file, $type) = @_;
     my $lineNumber = 0;
     doTime("beginning validatePsl") if $opt_timing;
     my $fh = Encode::openUtil($file, $path);
     while(<$fh>) {
         chomp;
         $lineNumber++;
         next if $lineNumber == 1 and m/^psLayout version \d+/; # check first line
         next if $lineNumber == 2 and m/^$/;
         next if $lineNumber == 3 and m/^match/;
         next if $lineNumber == 4 and m/^\s+match/;
         next if $lineNumber == 5 and m/^------/;
         die "Failed $type validation, file '$file'; line $lineNumber: line=[$_]\n"
 	    unless m/^(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t([+-][+-]?)\t([A-Za-z0-9:>\|\/_-]+)\t(\d+)\t(\d+)\t(\d+)\t(\w+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t([0-9,]+)\t([0-9,]+)\t([0-9,]+)/;
         last if($opt_quick && $lineNumber >= $quickCount);
     }
     $fh->close();
     HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
     doTime("done validatePsl", $lineNumber) if $opt_timing;
     return ();
 }
 
 
 ############################################################################
 # Misc subroutines
 
 sub validateDdfField {
     # validate value for type of field
     my ($type, $val, $track, $daf) = @_;
     $type =~ s/ /_/g;
     HgAutomate::verbose(4, "Validating $type: " . (defined($val) ? $val : "") . "\n");
     if($validators{$type}) {
         return $validators{$type}->($val, $type, $track, $daf);
     } else {
         return $validators{'default'}->($val, $type, $track, $daf); # Considers the term controlled vocab
     }
 }
 
 sub checkDataFormat {
     # validate file type
     my ($format, $file) = @_;
     HgAutomate::verbose(3, "Checking data format for $file: $format\n");
     my $type = $format;
     if ($format =~ m/(bed) (\d+)/) {
         $format = $1;
     }
     if ($format =~ m/(bedGraph) (\d+)/) {
         $format = $1;
     }
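     # e.g. (illustrative) a view declared as "bed 6" dispatches to the generic
     # 'bed' checker, while the unmodified type string is still passed along as $type.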
     $formatCheckers{$format} || return "Data format \'$format\' is unknown\n";
     my @errors = $formatCheckers{$format}->($submitPath, $file, $type);
     HgAutomate::verbose(3, "Done checking data format for $file: $format\n");
     return @errors;
 }
 
 sub ddfKey
 {
 # return key for given DDF line (e.g. "antibody=$antibody;cell=$cell" for ChIP-Seq data).
 # The key includes replicate (if applicable) if $includeReplicate is true.
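 # For example (illustrative values): with variables antibody and cell, a DDF line
 # with antibody=CTCF and cell=GM12878 yields "antibody=CTCF;cell=GM12878", or
 # "antibody=CTCF;cell=GM12878;1" when $includeReplicate is set and replicate is 1.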
     my ($fields, $ddfHeader, $daf, $includeReplicate) = @_;
 
     if (defined($daf->{variables})) {
         my $delim = ";";
         my $key = join($delim, map("$_=" . $fields->{$_}, sort @{$daf->{variableArray}}));
         if($includeReplicate && defined($fields->{replicate})) {
             $key .= $delim . $fields->{replicate};
         }
         return $key;
     } else {
 	return undef; # Some dafs have no variables, eg, Sanger Gencode
     }
 }
 
 sub isDownloadOnly {
     my ($view, $grant, $lab, $daf) = @_;
     # Added 'downloadOnly' bool to DAF views so these rules can be explicit rather than hardcoded
     # Don't load any RawData* or Comparative views.
     # Don't load Alignments unless they are from the Gingeras or Wold labs (RNA folks like to see their RNAs).
     # The Riken group has RawData and RawData2 because they have colorspace fasta and quality files.
     # The Wold group has RawData and RawData[2-7].
     # Wold group alignments are called 'Aligns', 'Splices', 'Paired'.
     return ( (($daf->{TRACKS}->{$view}->{downloadOnly} || "") eq 'yes') or ($view =~ m/^RawData\d*$/ or $view eq 'Comparative'
 	or ($view eq 'Alignments' and $grant ne "Gingeras" and $grant ne "Wold"))) ? 1 : 0;
 
 }
 
 sub printCompositeTdbSettings {
 # prints out trackDb.ra settings for the composite track
     local *OUT_FILE = shift;
     my ($daf,%ddfSets) = @_;
 
     my $compositeTrack = Encode::compositeTrackName($daf);
 
     print OUT_FILE "track $compositeTrack\n";
     print OUT_FILE "compositeTrack on\n";
 
     my $setting    = "subGroup1 view Views";
     my $visDefault = "visibilityViewDefaults ";
     # Cycle through to get best view to default labels and to get all views and terms
     for my $view (keys %{$daf->{TRACKS}}) {
         for my $key (keys %ddfSets) {
             if(defined($ddfSets{$key}{VIEWS}{$view})) {
                 my $downloadOnly = isDownloadOnly($view, $daf->{grant}, $daf->{lab}, $daf);
                 if(!$downloadOnly) {
                     $setting = $setting . " " . $view . "=" . $view;
                     $visDefault = $visDefault . " " . $view . "=";
                     if($view eq "Peaks") {
                         $visDefault = $visDefault . "dense";
                     } elsif($view eq "Signal") {
                         $visDefault = $visDefault . "full";
                     } else {
                         $visDefault = $visDefault . "hide";
                     }
                 }
             }
         }
     }
     print OUT_FILE "shortLabel " . $daf->{lab} . " " . $daf->{dataType} . "\n"; # Default to  lab datatype
     print OUT_FILE "longLabel ENCODE " . $daf->{lab} . " " . $daf->{grant} . " " . $daf->{dataType} . "\n";  # Default to lab grant datatype
     my $group =  "regulation"; # default (common case for ENCODE)
     if (defined($daf->{group})) {
         $group = $daf->{group};
     }
     print OUT_FILE "group $group\n";
     print OUT_FILE $setting . "\n"; # "subGroup1 view Views Peaks=Peaks Signal=Signal RawSignal=Raw_Signal\n";
 
     # Need to create N subgroups with M members each
     if (defined($daf->{variables})) {
         my $grpNo = 1;
         my $sortOrder = "sortOrder ";
         my $dimensions = "dimensions";
         my $controlledVocab = "controlledVocabulary encode/cv.ra";
         my %tags = ();
         if (defined($daf->{variables})) {
             my @variables = @{$daf->{variableArray}};
             for my $variable (@variables) {
                 $grpNo++;
                 my $groupVar = $variable;
                 my $cvTypeVar = $variable;
                 # special names for cell and antibody
                 if ($variable eq "cell") {
                     $groupVar = "cellType";
                     $cvTypeVar = "Cell Line";
                 }
                 if ($variable eq "antibody") {
                     $groupVar = "factor";
                     $cvTypeVar = "Antibody";
                 }
                 if($grpNo < 5) {
                     $dimensions .= " dimension" . chr(86 + $grpNo) . "=" . $groupVar;
                 }
                 $sortOrder = "$sortOrder$groupVar=+ ";
                 $controlledVocab = "$controlledVocab $groupVar";
                 $setting = "subGroup$grpNo $groupVar " . ucfirst($groupVar);
                 $setting = "subGroup$grpNo $groupVar " . "Cell_Line" if $variable eq "cell";
                 for my $key (keys %ddfSets) {
                     my @pairs = split(';', $key);
                     for my $pair (@pairs) {
                         my ($var, $term) = split('=', $pair);
                         my $tag = $terms{$cvTypeVar}->{$term}->{'tag'};
                         if ($var eq $variable) {
                             if (!defined($tags{$tag})) {
                                 # suppress dups, requested by Brian
                                 $setting = "$setting $tag=$term";
                                 $tags{$tag} = $term;
                             }
                         }
                     }
                 }
                 print OUT_FILE $setting . "\n";     # e.g. "subGroup2 cellType Cell_Line ...\n"
             }
         }
         $setting = $sortOrder . "view=+";
         print OUT_FILE $dimensions . "\n";         # "dimensions  dimensionX=cellType dimensionY=factor"
         print OUT_FILE $setting . "\n";         # "sortOrder cellType=+ factor=+ view=+\n";
         print OUT_FILE $controlledVocab . "\n"; # "controlledVocabulary encode/cv.ra cellType factor\n";
     }
     print OUT_FILE "dragAndDrop subTracks\n";
     print OUT_FILE $visDefault . "\n";          #"visibilityViewDefaults Peaks=dense Signal=full RawSignal=hide\n";
     print OUT_FILE "priority 0\n";
     print OUT_FILE "type bed 3\n";
     print OUT_FILE "wgEncode 1\n\n";
 }
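 
 # Illustrative (made-up) output from printCompositeTdbSettings for a DAF with
 # cell and antibody variables; placeholder values in angle brackets:
 #   track <compositeTrackName>
 #   compositeTrack on
 #   shortLabel <lab> <dataType>
 #   longLabel ENCODE <lab> <grant> <dataType>
 #   group regulation
 #   subGroup1 view Views Peaks=Peaks Signal=Signal
 #   subGroup2 cellType Cell_Line <tag>=<term> ...
 #   subGroup3 factor Factor <tag>=<term> ...
 #   dimensions dimensionX=cellType dimensionY=factor
 #   sortOrder cellType=+ factor=+ view=+
 #   controlledVocabulary encode/cv.ra cellType factor
 #   dragAndDrop subTracks
 #   visibilityViewDefaults Peaks=dense Signal=full RawSignal=hide
 #   priority 0
 #   type bed 3
 #   wgEncode 1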
 
 sub validationSettings {
     # parse validationSettings: "validationSettings allowReloads;validateFiles.tagAlign:mmCheckOnInN=100,mismatches=3"
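     # For the example above (illustrative), validationSettings("validateFiles","tagAlign",$assembly)
     # would return something like " -mmCheckOnInN=100 -mismatches=3 -genome=/cluster/data/<db>/<db>.2bit -chromDb=<db>",
     # while validationSettings("allowReloads") would return 1.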
     my ($type, $fileType, $genome ) = @_;
     my $chrom=1;
     my $align=1;
 
     if($opt_metaDataOnly) {
         return 0;
     }
     if($daf->{validationSettings} || $opt_validateFile) {
         my @set = $opt_validateFile ? () : split('\;', $daf->{validationSettings});
         if($type eq "validateFiles") {
             my $paramList = "";
             for my $setting (@set) {
                 if($setting =~ /^validateFiles\./) {
                     my @pair = split('\:',$setting,2);
                     my @subTypes = split('\.',$pair[0],2);
                     if($fileType eq $subTypes[1]) {
                         my @params = split('\,',$pair[1]);
                         for my $param (@params) {
                             if ($param eq "ignoreAlignment") {
                                 $align = 0;
                             } elsif ($param eq "ignoreChromLen") {
                                 $chrom = 0;
                             } else {
                                 $paramList .= " -" . $param;
                             }
                         }
                         last;
                         #return $paramList;
                     }
                 }
             }
             if($genome) {
                 if($align) {
                     $paramList .= " -genome=/cluster/data/$genome/$genome.2bit";
                 }
                 if($chrom) {
                     $paramList .= " -chromDb=$genome";
                 }
             }
             if ($paramList ne "") {
                 HgAutomate::verbose(2, "validationSettings $type $fileType params:$paramList\n");
             }
             return $paramList;
         } else {
             for my $setting (@set) {
                 if($setting eq $type) {
                     HgAutomate::verbose(2, "validationSettings $type found\n");
                     return 1;
                 }
             }
         }
     } else  {
         die "Must specify validationSettings in daf\n";
     }
 
     if( scalar(@_) > 1 ) {
 	return "";
     }
     return 0;
 }
 
 ############################################################################
 # Main
 
 my @ddfHeader;		# list of field names on the first line of DDF file
 my %ddfHeader = ();	# convenience hash version of @ddfHeader (maps name to field index)
 my @ddfLines = ();	# each line in DDF (except for fields header); value is a hash; e.g. {files => 'foo.bed', cell => 'HeLa-S3', ...}
 my %ddfSets = ();	# info about DDF entries broken down by ddfKey
 my %ddfReplicateSets = ();	# info about DDF entries broken down by ddfKey (including replicate)
 my $wd = cwd();
 
 my $ok = GetOptions("allowReloads",
                     "configDir=s",
                     "fileType=s",
                     "metaDataOnly",
                     "outDir=s",
                     "quick",
                     "timing",
                     "skipAll",
                     "skipAutoCreation",
                     "justFileDb",
                     "skipOutput",
                     "skipValidateFiles",
                     "skipValidateFastQ",
                     "validateDaf",
                     "validateFile",
                     "sendEmail",
                     "verbose=i",
                     "database=s" => \$assembly
                     );
 usage() if (!$ok);
 $opt_verbose = 1 if (!defined $opt_verbose);
 $opt_sendEmail = 0 if (!defined $opt_sendEmail);
 if($opt_justFileDb) {
     $opt_skipAll = $opt_quick = $opt_allowReloads = 1;
 }
 $quickOpt = " -quick=100 " if defined ($opt_quick);  # use validateFiles to validate 100 lines
 
 if($opt_skipAll) {
     $opt_skipAutoCreation = $opt_skipOutput = $opt_skipValidateFiles = 1;
 }
 
 if($opt_metaDataOnly) {
     $opt_skipAutoCreation = $opt_skipOutput = $opt_skipValidateFiles = 1;
     $opt_allowReloads = 1;
 }
 
 usage() if (scalar(@ARGV) < 2);
 
 # Get command-line args
 my $submitType = $ARGV[0];	# currently not used
 my $submitDir = $ARGV[1];
 
 $ENV{TMPDIR} = $Encode::tempDir;
 
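 # Stand-alone file validation mode.  Illustrative invocation (file and assembly names are made up):
 #   doEncodeValidate.pl -validateFile -fileType=narrowPeak -database=hg18 ignored peaks.narrowPeak
 # (the first positional argument is the submission type, which is currently ignored)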
 if($opt_validateFile) {
     if(!$opt_fileType) {
         die "Error: -fileType argument is required when using -validateFile\n";
     }
     if(!$assembly) {
         die "Error: -database argument is required when using -validateFile\n";
     }
     my $db = HgDb->new(DB => $assembly);
     $db->getChromInfo(\%chromInfo);
     if(my @errors = checkDataFormat($opt_fileType, $submitDir)) {
         die "Invalid file: " . join(", ", @errors) . "\n";
     } else {
         exit(0);
     }
 }
 
 # Determine submission, configuration, and output directory paths
 HgAutomate::verbose(2, "Validating submission in directory \'$submitDir\'\n");
 if ($submitDir =~ /^\/.*/) {
     $submitPath = $submitDir;
 } else {
     $submitPath = "$wd/$submitDir";
 }
 HgAutomate::verbose(4, "Submission directory path: \'$submitPath\'\n");
 
 if (defined $opt_configDir) {
     if ($opt_configDir =~ /^\//) {
         $configPath = $opt_configDir;
     } else {
         $configPath = "$wd/$opt_configDir";
     }
 } else {
     $configPath = "$submitPath/../config"
 }
 if(!(-d $configPath)) {
     die "configPath '$configPath' is invalid; Can't find the config directory\n";
 }
 HgAutomate::verbose(4, "Config directory path: \'$configPath\'\n");
 
 if (defined $opt_outDir) {
     if ($opt_outDir =~ /^\//) {
         $outPath = $opt_outDir;
     } else {
         $outPath = "$wd/$opt_outDir";
     }
 } else {
     $outPath = "$submitPath/out"
 }
 HgAutomate::verbose(4, "Output directory path: '$outPath'; submitPath: '$submitPath'\n");
 
 if(!$opt_validateDaf) {
     # Change dir to submission directory
     if(!chdir($submitPath)) {
         die ("SYS ERR; Can't change to submission directory \'$submitPath\': $OS_ERROR\n");
     }
     HgAutomate::verbose(3, "Creating output in directory \'$outPath\'\n");
     if(!(-d $outPath)) {
         mkdir $outPath || die ("SYS ERR: Can't create out directory \'$outPath\': $OS_ERROR\n");
     }
 }
 
 # labs is now in fact the list of grants (labs are within grants, and are not currently validated).
 $grants = Encode::getGrants($configPath);
 $fields = Encode::getFields($configPath);
 
 if($opt_validateDaf) {
     if(-f $submitDir) {
         Encode::parseDaf($submitDir, $grants, $fields);
     } else {
         Encode::getDaf($submitDir, $grants, $fields);
     }
     print STDERR "DAF is valid\n";
     exit(0);
 }
 
 $daf = Encode::getDaf($submitDir, $grants, $fields);
 $assembly = $daf->{assembly};
 
 my $db = HgDb->new(DB => $daf->{assembly});
 $db->getChromInfo(\%chromInfo);
 
 if($opt_sendEmail) {
     if($grants->{$daf->{grant}} && $grants->{$daf->{grant}}{wranglerEmail}) {
         my $email = $grants->{$daf->{grant}}{wranglerEmail};
         if($email) {
             `echo "dir: $submitPath" | /bin/mail -s "ENCODE data from $daf->{grant}/$daf->{lab} lab has been submitted for validation." $email`;
         }
     }
 }
 
 # Add the variables in the DAF file to the required fields list
 if (defined($daf->{variables})) {
     for my $variable (keys %{$daf->{variableHash}}) {
         $fields->{$variable}{required} = 1;
         $fields->{$variable}{file} = 'ddf';
     }
 }
 
 # make replicate column required when appropriate.
 my $hasReplicates = 0;
 my $maxOrder = 0;
 for my $view (keys %{$daf->{TRACKS}}) {
     $hasReplicates += $daf->{TRACKS}{$view}{hasReplicates};
     if($daf->{TRACKS}{$view}{order} > $maxOrder) {
         $maxOrder = $daf->{TRACKS}{$view}{order}
     }
 }
 
 if($hasReplicates) {
     $fields->{replicate}{required} = 1;
 }
 
 # DAF may contain option to allow Reloads
 if(validationSettings("allowReloads")) {
     $opt_allowReloads = 1;
 }
 if(validationSettings("skipAutoCreation")) {
     $opt_skipAutoCreation = 1;
 }
 if(validationSettings("skipValidateFiles")) {
     $opt_skipValidateFiles = 1;
 }
 if(validationSettings("skipOutput")) {
     $opt_skipAutoCreation = $opt_skipOutput = $opt_skipValidateFiles = 1;
 }
 
 
 # Open dataset descriptor file (DDF)
 my @glob = glob "*.DDF";
 push(@glob, glob "*.ddf");
 my $ddfFile = Encode::newestFile(@glob);
 die "ERROR: Can't find DDF file\n" unless -e $ddfFile;
 my $ddfFileTime = (stat($ddfFile))->ctime;
 my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = gmtime($ddfFileTime);
 
 HgAutomate::verbose(2, "Using newest DDF file \'$ddfFile\'\n");
 my $lines = Encode::readFile($ddfFile);
 
 my $ddfLineNumber = 0;
 # Get header containing column names
 while(@{$lines}) {
     my $line = shift(@{$lines});
     $ddfLineNumber++;
     # remove leading and trailing spaces and newline
     $line =~ s/^\s+//;
     $line =~ s/\s+$//;
     # ignore empty lines and comments
     next if $line =~ /^$/;
     next if $line =~ /^#/;
     if($line !~ /\t/) {
         die "ERROR: The DDF header has no tabs; the DDF is required to be tab delimited\n";
     }
     @ddfHeader = split(/\t/, $line);
     for (my $i=0; $i < @ddfHeader; $i++) {
         $ddfHeader{$ddfHeader[$i]} = $i;
     }
     last;
 }
 
 my @errors = Encode::validateFieldList(\@ddfHeader, $fields, 'ddf');
 if(@errors) {
     die "ERROR in DDF '$ddfFile':\n" . join("\n", @errors) . "\n";
 }
 
 %terms = Encode::getControlledVocab($configPath);
 
 my @variables;
 if (defined($daf->{variables})) {
     @variables = @{$daf->{variableArray}};
 } else {
     # Hubbard Sanger Gencode project has no variables
     @variables = ();
 }
 
 my %metadataHash;
 
 # Process lines in DDF file. Create a list with one entry per line;
 # the entry is field/value hash (fields per @ddfHeader).
 
 while (@{$lines}) {
     my $line = shift(@{$lines});
     $ddfLineNumber++;
     my $errorPrefix = "DDF lineNumber $ddfLineNumber:";
     HgAutomate::verbose(2, "Parsing ddf line $ddfLineNumber\n");
 
     $line =~ s/^\s+//;
     $line =~ s/\s+$//;
     next if $line =~ /^#/;
     next if $line =~ /^$/;
 
     if($line !~ /\t/) {
         pushError(\@errors, "$errorPrefix line has no tabs; the DDF is required to be tab delimited");
         next;
     }
     my $i = 0;
     my %line;
     for my $val (split('\t', $line)) {
         if($ddfHeader[$i] ne "files" && $val =~ / /) {
             $val =~ s/\"/\\"/g if $val =~ /\"/;
             $val = '"' . $val . '"';
         }
         $line{$ddfHeader[$i]} = $val;
         $i++;
     }
     if(my @tmp = Encode::validateValueList(\%line, $fields, 'ddf')) {
         pushError(\@errors, $errorPrefix . "\n" . join("\n", @tmp));
         next;
     }
 
     my $view = $line{view};
     HgAutomate::verbose(2,"Parsing $view\n");
     if($daf->{TRACKS}{$view}) {
         my $files = $line{files};
         if($fields->{replicate}{required}) {
             my $replicate = $line{replicate};
             if($daf->{TRACKS}{$view}{hasReplicates} && (!defined($replicate) || !length($replicate))) {
                 pushError(\@errors, "$errorPrefix missing replicate number for view '$view'");
             }
         }
         my @filenames;
         for(split(',', $files)) {
             # Use glob explicitly so our error messages have the list of files actually used.
             if(my @glob = glob) {
                 push(@filenames, @glob);
             } else {
                 push(@filenames, $_);
             }
         }
         $line{files} = \@filenames;
         my @metadataErrors;
         for my $field (keys %line) {
             push(@metadataErrors, validateDdfField($field, $line{$field}, $view, $daf));
         }
         if(@metadataErrors) {
             pushError(\@errors, @metadataErrors);
         } else {
             # avoid spurious errors by not putting invalid lines into %ddfSets
             # ddfKey returns undef if there are no variables defined
             if (defined(ddfKey(\%line, \%ddfHeader, $daf, 1))) {
                 $ddfSets{ddfKey(\%line, \%ddfHeader, $daf, 0)}{VIEWS}{$view} = \%line;
                 $ddfReplicateSets{ddfKey(\%line, \%ddfHeader, $daf, 1)}{VIEWS}{$view} = \%line;
                 my $str = join(", ", map($line{$_}, sort(@variables)));
                 $metadataHash{$str} = 1;
             }
         }
         push(@ddfLines, \%line);
     } else {
         pushError(\@errors, "$errorPrefix undefined view '$view'");
     }
     HgAutomate::verbose(2, "End of parsing ddf line $ddfLineNumber\n");
 }
 
 my $tmpCount = 1;
 
 if(!@errors) {
     # Look for missing required views and create missing, optional views,
     # but don't bother if we have already encountered errors.
     # Could also look for replicate inconsistency here (e.g. Alignments for replicate 3 but not fastq for replicate 3).
 
     for my $key (keys %ddfSets) {
         for my $view (keys %{$daf->{TRACKS}}) {
             if($daf->{TRACKS}{$view}{required}) {
                 if(!defined($ddfSets{$key}{VIEWS}{$view})) {
                     pushError(\@errors, "view '$view' missing for $key");
                 }
             }
         }
     }
 
     doTime("beginning ddfReplicateSets loop") if $opt_timing;
     for my $key (keys %ddfReplicateSets) {
         # create missing optional views (e.g. ChIP-Seq RawSignal or transcriptome project PlusRawSignal and MinusRawSignal)
         # note this loop assumes these are on a per-replicate basis.
         # Also note that any project (like transcriptome) that doesn't have replicates should also use
         # this for their auto-create signals.
         HgAutomate::verbose(2, "ddfReplicateSets loop key=[$key] aln=[".(defined($ddfReplicateSets{$key}{VIEWS}{Alignments}))."] rawsig=[".(defined($ddfReplicateSets{$key}{VIEWS}{RawSignal}))."]\n");
         if( ( !defined($daf->{noAutoCreate}) || $daf->{noAutoCreate} ne "yes") && defined($ddfReplicateSets{$key}{VIEWS}{Alignments})
         && !defined($ddfReplicateSets{$key}{VIEWS}{RawSignal})
         && !defined($ddfReplicateSets{$key}{VIEWS}{PlusRawSignal})
         && !defined($ddfReplicateSets{$key}{VIEWS}{MinusRawSignal})
         && ($daf->{dataType} ne 'MethylSeq')) {
             # Make a list of the PlusRawSignal/MinusRawSignal or RawSignals we are going to have to make
             my @newViews = ();
             push @newViews, "RawSignal" if $daf->{TRACKS}{RawSignal}{order};
             push @newViews, "PlusRawSignal" if $daf->{TRACKS}{PlusRawSignal}{order};
             push @newViews, "MinusRawSignal" if $daf->{TRACKS}{MinusRawSignal}{order};
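             # Only signal views that the DAF defines (i.e. that carry an order) are candidates for auto-creation.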
 
             foreach my $newView (@newViews) #loop around making them
             {
                 my $alignmentLine = $ddfReplicateSets{$key}{VIEWS}{Alignments};
                 # Check fragLength per replicate (on the Alignments line); die rather than just pushing an error and building anyway
                 if($newView eq "RawSignal") {
                     if(!defined($alignmentLine->{fragLength})) {
                         if(!defined($daf->{medianFragmentLength})) {
                             die ("Missing fragLength field for building $daf->{dataType} '$newView' for replicate $alignmentLine->{replicate}\nThe fragLength is required and is the median fragment length used in generating this replicate.\n");
                         } else { # Let medianFragmentLength stand in for the per-replicate fragLength
                             $alignmentLine->{fragLength} = $daf->{medianFragmentLength};
                         }
                     }
                     if ($alignmentLine->{fragLength} < 0 || $alignmentLine->{fragLength} > 10000) {
                         die ("Invalid fragLength '$alignmentLine->{fragLength}' for building $daf->{dataType} '$newView' for replicate $alignmentLine->{replicate}\nThe fragLength must be between 0 and 10000 and is the median fragment length used in generating this replicate.\n");
                     }
                 }
                 my %line = %{$alignmentLine};
                 $line{view} = $newView;
                 $line{type} = 'wig';
                 $ddfReplicateSets{$key}{VIEWS}{$newView} = \%line;
                 my @unzippedFiles = ();
                 doTime("beginning unzipping replicates files for view [$newView] key=[$key]") if $opt_timing;
                 for my $file (@{$alignmentLine->{files}}) {
                     # Unzip any zipped files - only works if they are with .gz suffix
                     my ($fbase,$dir,$suf) = fileparse($file, ".gz");
                     if ($suf eq ".gz") {
                         # If the zipped file exists then unzip it (do this each time, in case the zip file has been updated).
                         # This check is also done above, at the stage where we test that the files listed in the DDF exist.
                         if (-s $file) {
                             my $err = system("gunzip -c $file > $dir/$fbase");
                             if ($err) {
                                 die ("File \'$file\' failed gunzip to [$dir/$fbase]\n");
                             }
                             HgAutomate::verbose(2, "File \'$file\' gunzipped to \'$fbase\'\n");
                         }
                         if ( ! -s "$dir/$fbase") {
                             die ("Unzipped file \'$fbase\' does not exist (or is empty) for DDF file \'$file\'\n");
                         }
                         push @unzippedFiles, "$dir/$fbase";
                     } else {
                         push @unzippedFiles, $file;
                     }
                 }
                 doTime("done unzipping replicates files") if $opt_timing;
                 $alignmentLine->{files} = \@unzippedFiles;
                 # Now we can safely sort these files as none are zipped
                 my $files = join(" ", @{$alignmentLine->{files}});
                 my $tmpFile = $Encode::autoCreatedPrefix . $newView. "$tmpCount.bed"; # add the type of view to the name
                 $tmpCount++;
                 if($opt_skipAutoCreation) {
                     HgAutomate::verbose(2, "Skipping auto-creating view '$newView' for key '$key'\n");
                 } else {
                     HgAutomate::verbose(2, "Auto-creating view '$newView' for key '$key' in file '$tmpFile'\n");
                     doTime("beginning Auto-create of view $newView in file $tmpFile") if $opt_timing;
 
                     # XXXX gzip before saving to disk?
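                     # Pipeline sketch: optionally extend reads to fragLength (bedExtendRanges), sort by chrom,start,
                     # strip any "track" lines, keep one strand for Plus/MinusRawSignal, and let bedItemOverlapCount
                     # produce the bedGraph written to $tmpFile; e.g. (hypothetically):
                     #   bedExtendRanges hg18 200 rep1.bed | sort -k1,1 -k2,2n | grep -v '^track' | bedItemOverlapCount hg18 stdin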
                     my @cmds;
                     my $sortFiles;
                     if(defined($alignmentLine->{fragLength}) && $alignmentLine->{fragLength} != 0) {
                         push(@cmds, "/cluster/bin/x86_64/bedExtendRanges $daf->{assembly} $alignmentLine->{fragLength} $files");
                         $sortFiles = " -";
                         # sorting stdin, so have to sort in mem (and control how much mem we use)
                         push @cmds, "sort $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n $sortFiles";
                     } else {
                         $sortFiles = $files;
                         # sort each file in place, controlling mem usage, then do merge sort
                         my @sortList = split(/\s+/, $sortFiles);
                         foreach my $f (@sortList) {
                             my $err = system("sort $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n -o $f $f ");
                             if ($err) {
                                 die ("File \'$f\' failed sort\n");
                             }
                             HgAutomate::verbose(2, "File \'$f\' sorted\n");
                         }
                         # Now do the mergesort in the pipeline
                         push @cmds, "sort -m $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n $sortFiles";
                     }
                     push @cmds, "grep -v -E \"^track\" ";
                     push @cmds, "gawk '\$6 == \"+\" {print}'" if $newView eq "PlusRawSignal";
                     push @cmds, "gawk '\$6 == \"-\" {print}'" if $newView eq "MinusRawSignal";
                     push @cmds, "bedItemOverlapCount $daf->{assembly} stdin";
                     my $safe = SafePipe->new(CMDS => \@cmds, STDOUT => $tmpFile, DEBUG => $opt_verbose - 1);
                     if(my $err = $safe->exec()) {
                         print STDERR "ERROR: failed auto bedItemOverlap creation of bedGraph for $key: " . $safe->stderr() . "\n";
                         # don't show end-user pipe error(s)
                         pushError(\@errors, "failed creation of wiggle for '$key'");
                     }
                     doTime("done Auto-create of view $newView") if $opt_timing;
                 }
                 $line{files} = [$tmpFile];
                 push(@ddfLines, \%line);
             }  # End foreach newView loop
         }
     } # End replicate sets loop
     doTime("done ddfReplicateSets loop") if $opt_timing;
 }
 
 my $compositeTrack = Encode::compositeTrackName($daf);
 ### No good reason to make this an error.  Composite entry can be added when subtracks are first added to trackDb.
 #if(!$db->quickQuery("select count(*) from trackDb where tableName = ?", $compositeTrack)) {
 #    pushError(\@errors, "Missing composite track '$compositeTrack'; please contact your data wrangler");
 #}
 my $compositeExists = $db->quickQuery("select count(*) from trackDb where tableName = ?", $compositeTrack);
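 # $compositeExists is consulted below: if the composite track is not yet in trackDb, composite-level settings are written to the trackDb.ra output (printCompositeTdbSettings).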
 
 if(@errors) {
     my $prefix = @errors > 1 ? "Error(s)" : "Error";
     die "$prefix:\n\n" . join("\n\n", @errors) . "\n";
 }
 
 # After this point, we don't use @errors and just die immediately.
 
 # Validate files and metadata fields in all ddfLines using controlled
 # vocabulary.  Create load.ra file for loader and trackDb.ra file for wrangler.
 doTime("beginning out files") if $opt_timing;
 
 if($opt_skipOutput) {
     open(LOADER_RA, ">>/dev/null");
     open(TRACK_RA, ">>/dev/null");
     open(README, ">>/dev/null");
 } else {
     open(LOADER_RA, ">$outPath/$Encode::loadFile") || die "SYS ERROR: Can't write \'$outPath/$Encode::loadFile\' file; error: $!\n";
     open(TRACK_RA, ">$outPath/$Encode::trackFile") || die "SYS ERROR: Can't write \'$outPath/$Encode::trackFile\' file; error: $!\n";
     open(README, ">$outPath/README.txt") || die "SYS ERROR: Can't write '$outPath/README.txt' file; error: $!\n";
 }
 if($opt_justFileDb || !$opt_skipOutput) {
     open(FILE_RA, ">$outPath/$Encode::fileDbFile") || die "SYS ERROR: Can't write '$outPath/$Encode::fileDbFile' file; error: $!\n";
 } else {
     open(FILE_RA, ">>/dev/null");
 }
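 # Output streams: $Encode::loadFile for the loader, $Encode::trackFile for the wrangler, $Encode::fileDbFile for per-file metadata, and README.txt for download notes.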
 
 # Create a composite track entry if the trackDb.ra entry was not found
 if(!$opt_skipOutput && !$compositeExists) {
     printCompositeTdbSettings(*TRACK_RA,$daf,%ddfSets);
 }
 
 # XXXX Calculation of priorities still needs work; we currently don't account for multiple experiments in the same DDF.
 # It may in fact be too much work to do automatic calculation of priorities (i.e. the wrangler may have to do it manually).
 
 my $priority = $db->quickQuery("select max(priority) from trackDb where settings like '%subTrack $compositeTrack%'") || 0;
 $ddfLineNumber = 1;
 
 foreach my $ddfLine (@ddfLines) {
     $ddfLineNumber++;
     my $diePrefix = "ERROR on DDF lineNumber $ddfLineNumber:";
     my $view = $ddfLine->{view};
     my $type = $daf->{TRACKS}{$view}{type} || die "Missing DAF entry for view '$view'\n";
     my $sql = $daf->{TRACKS}{$view}{sql};
     my $metadata = "project=wgEncode grant=$daf->{grant} lab=$daf->{lab} dataType=$daf->{dataType}";
     $metadata .= " cell=$ddfLine->{cell}" if $ddfLine->{cell}; # force some order
     $metadata .= " antibody=$ddfLine->{antibody}" if $ddfLine->{antibody};
     for my $key (keys %{$ddfLine}) {
         my $value = $ddfLine->{$key};
         if($value
         && $key ne 'files'
         && $key ne 'cell'
         && $key ne 'antibody'
         && $key ne 'view'
         && $key ne 'replicate'
         && $key ne 'labVersion'
         && $key ne 'softwareVersion') {
             $metadata .= " $key=$value"; # and the rest
         }
     }
     $metadata .= " view=$view";
     $metadata .= " replicate=$ddfLine->{replicate}" if $ddfLine->{replicate} && $daf->{TRACKS}{$view}{hasReplicates};
     $metadata .= " labVersion=$ddfLine->{labVersion}" if $ddfLine->{labVersion};
     $metadata .= " softwareVersion=$ddfLine->{softwareVersion}" if $ddfLine->{softwareVersion};
     $metadata .= ' dataVersion="' . $Encode::dataVersion .'"';
     if($submitDir =~ /(\d+)$/) {
         $metadata .= " subId=$1";
     }
     $metadata .= " submittedDataVersion=V$daf->{dataVersion}" if(defined($daf->{dataVersion}) && $daf->{dataVersion} > 1);
 
     HgAutomate::verbose(2, "  View: $view\n");
     my $replicate;
     if($hasReplicates && $daf->{TRACKS}{$view}{hasReplicates}) {
         $replicate = $ddfLine->{replicate};
         if(!defined($replicate) || $replicate <= 0) {
             die "$diePrefix invalid or missing replicate value\n";
         }
     }
     # Construct table name from track name and variables
     my $tableName = "$compositeTrack";
     #if($Encode::dafVersion le "1.0") {
         $tableName .= $view;
         if(defined($replicate)) {
             $tableName .= "Rep$replicate";
         }
     #}
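     # Controlled-vocabulary tags for each experimental variable are appended to $tableName further below.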
     if(!defined($daf->{TRACKS}{$view}{shortLabelPrefix})) {
         $daf->{TRACKS}{$view}{shortLabelPrefix} = "";
     }
     my $shortLabel = $daf->{TRACKS}{$view}{shortLabelPrefix};
     my $longLabel = "ENCODE" . (defined($daf->{TRACKS}{$view}{longLabelPrefix}) ? " $daf->{TRACKS}{$view}{longLabelPrefix}" : "");
     if(defined($replicate)) {
         $longLabel .= " Replicate $replicate";
     }
     my $subGroups = "view=$view";
     my $pushQDescription = "";
     my $species;
     my $tier1 = 0;
     if (@variables) {
         my %hash = map { $_ => $ddfLine->{$_} } @variables;
         for my $var (@variables) {
             my $cvTypeVar = $var;
             if ($var eq "antibody") {
                 $cvTypeVar = "Antibody";
             } elsif ($var eq "cell") {
                 $cvTypeVar = "Cell Line";
             }
             if(!defined($terms{$cvTypeVar}->{$hash{$var}})) {
                 $cvTypeVar = "control";
             }
             my $val = $terms{$cvTypeVar}->{$hash{$var}}->{'tag'};
             $val = ucfirst(lc($val));
             if($val ne 'None') {  # Special control term does not show up in the name!
                 # trailing + => Plus, - => Neg (e.g. H9ES-AFP+)
                 $val =~ s/\+$/Pos/;
                 $val =~ s/\-$/Neg/;
                 $tableName = $tableName . $val;
             }
         }
 
         my $shortSuffix = "";
         my $longSuffix;
         my %shortViewMap = (Peaks => 'Pk', Signal => 'Sig', RawSignal => 'Raw', PlusRawSignal => 'PlusRaw', MinusRawSignal => 'MinusRaw');
         if($hash{'antibody'} && $hash{'cell'}) {
             $pushQDescription = "$hash{'antibody'} in $hash{'cell'}";
             $shortSuffix = "$hash{'cell'} $hash{'antibody'}";
             $longSuffix = "$hash{'antibody'} in $hash{'cell'} cells";
         } elsif($hash{'ripAntibody'} && $hash{'ripTgtProtein'} && $hash{'cell'}) {
             $longSuffix = "$hash{'ripTgtProtein'} in $hash{'cell'} cells using $hash{'ripAntibody'}";
             $pushQDescription = $longSuffix;
             $shortSuffix = "$hash{'cell'} $hash{'ripTgtProtein'} $hash{'ripAntibody'}";
         } elsif($hash{'rnaExtract'} && $hash{'localization'} && $hash{'cell'}) {
             $shortSuffix = "$hash{'cell'} $hash{'localization'} $hash{'rnaExtract'}";
             $longSuffix = "$hash{'rnaExtract'} in $hash{'cell'} $hash{'localization'}";
             if ($hash{'mapAlgorithm'}) {
                 $shortSuffix = $shortSuffix . " " . $hash{'mapAlgorithm'};
                 $longSuffix = $longSuffix . " using " . $hash{'mapAlgorithm'};
             }
             $pushQDescription = $longSuffix;
         } elsif($hash{'freezeDate'}) {
             $shortSuffix = $hash{'freezeDate'};
             $longSuffix = $hash{'freezeDate'};
             $pushQDescription = $longSuffix;
         } elsif ($hash{"species"}) {
             $pushQDescription = "$hash{'species'}";
             $shortSuffix = "$hash{'species'}";
             $longSuffix = "in $hash{'species'}";
             $species = "$hash{'species'}";
             $pushQDescription = "$view $daf->{dataType} $longSuffix";
         } elsif ($hash{"cell"}) {
             $pushQDescription = "$hash{'cell'}";
             $shortSuffix = "$hash{'cell'}";
             $longSuffix = "in $hash{'cell'} cells";
             $tier1 = 1 if ($hash{'cell'} eq 'GM12878' || $hash{'cell'} eq 'K562');
         } else {
             warn "Warning: variables undefined for pushQDescription,shortSuffix,longSuffix\n";
         }
         if(defined($shortViewMap{$view})) {
             $shortSuffix .= " " . $shortViewMap{$view};
         }
         if(defined($replicate)) {
             $shortSuffix .= " $replicate";
             $pushQDescription .= " Replicate $replicate";
         }
         if($shortSuffix) {
             $shortLabel = $shortLabel ? "$shortLabel ($shortSuffix)" : $shortSuffix;
         }
         if($longSuffix) {
             $longLabel .= " ($longSuffix)";
         }
         # make the "subGroups" setting from all variables
         for my $var (sort keys %hash) {
             # The var name is over-ridden for antibody and cell, for historical reasons
             my $groupVar = $var;
             my $cvTypeVar = $groupVar;
             # handle inconsistent naming for antibody & cell type
             if ($var eq "antibody") {
                 $groupVar = "factor";
                 $cvTypeVar = "Antibody";
             } elsif ($var eq "cell") {
                 $groupVar = "cellType";
                 $cvTypeVar = "Cell Line";
             }
             if(!defined($terms{$cvTypeVar}->{$hash{$var}})) {
                 $cvTypeVar = "control";
             }
             $subGroups .= " $groupVar=$terms{$cvTypeVar}->{$hash{$var}}->{'tag'}";
         }
         if(defined($replicate) && ($daf->{lab} eq "HudsonAlpha" || $daf->{lab} eq "Uw" || $daf->{lab} eq "Gis")) {
             $subGroups .= " rep=rep$replicate"; # UGLY special casing
         }
     }
     #if($Encode::dafVersion gt "1.0") {
     #    $tableName .= "$view";
     #    if(defined($replicate)) {
     #        $tableName .= "Rep$replicate";
     #    }
     #}
 
     # mysql doesn't allow hyphens in table names and our naming convention doesn't allow underbars; to be
     # safe, we strip non-alphanumerics.
     $tableName =~ s/[^A-Za-z0-9]//g;
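     # Resulting shape: <composite><View>[Rep<N>][<variable tags>], with a V<n> suffix possibly appended below; a purely hypothetical example: wgEncodeSomeLabSignalRep1Gm12878Pol2.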
 
     my (undef, undef, undef, $rMDay, $rMon, $rYear) = Encode::restrictionDate($ddfFileTime); # Use DDF time
     my $dateSubmitted    = sprintf("%04d-%02d-%02d", 1900 + $year, $mon + 1, $mday);
     my $dateUnrestricted = sprintf("%04d-%02d-%02d", 1900 + $rYear, $rMon + 1, $rMDay);
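     # Encode::restrictionDate() derives the release date from the DDF timestamp; dateSubmitted and dateUnrestricted are added to the metadata further down.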
 
 
     # dataVersion means the tableName must be different (append Vn), and the old metadata should be used for dateSubmitted and dateUnrestricted
     if(defined($daf->{dataVersion}) && $daf->{dataVersion} > 1) {
         my $prevTableName = "$tableName";
         # Find old metadata to lookup dateSubmitted and dateUnrestricted
         my $prevTableFound = 0;
         for (my $preVer=$daf->{dataVersion} - 1; $preVer > 1; $preVer--) {
             $prevTableFound = $db->quickQuery("select count(*) from trackDb where tableName = ?", $prevTableName . "V$preVer");
             if($prevTableFound) {
                 $prevTableName .= "V$preVer";
                 last;
             }
         }
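         # If no V<n> predecessor (n >= 2) was found, fall back to the unversioned (original) table name.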
         if($prevTableFound == 0) {
             $prevTableFound = $db->quickQuery("select count(*) from trackDb where tableName = '$prevTableName'");
         }
         if($prevTableFound) {
             my $oldSettings = $db->quickQuery("select settings from trackDb where tableName = '$prevTableName'");
             if( $oldSettings ) {
                 $oldSettings =~ m/metadata (.*?)\n/;    # Is this throwing away all but the contents of the metadata line?
                 my ( $tagRef, $valRef ) = Encode::metadataLineToArrays($1);
                 my @tags = @{$tagRef};
                 my @vals = @{$valRef};
                 my $tix = 0;
                 while($tags[$tix]) {
                     if($tags[$tix] eq "dateUnrestricted") {
                         $dateUnrestricted = $vals[$tix];
                     } elsif($tags[$tix] eq "dateSubmitted") {
                         $metadata .= " dateResubmitted=$dateSubmitted";
                         $dateSubmitted = $vals[$tix];
                     }
                     $tix++;
                 }
             }
         }
         # Now finally complete the real tableName
         $tableName = $tableName . "V" . $daf->{dataVersion};
     }
     # Adding these terms to the metadata was delayed until here so that resubmissions can use the looked-up dates
     $metadata .= " dateSubmitted=$dateSubmitted";
     $metadata .= " dateUnrestricted=$dateUnrestricted";
 
     $tableName =~ s/Utaustin/Uta/;  # Special case for certain transgressors
     if(length($tableName) > 64) {
         $tableName =~ s/Hudsonalpha/Haib/ if length($tableName) > 64; # Special case for certain transgressors
         $tableName =~ s/Sunyalbany/Sunya/ if length($tableName) > 64;
         $tableName =~ s/Alignments/Aln/ if length($tableName) > 64;
         $tableName =~ s/Signal/Sig/ if length($tableName) > 64;
         $tableName =~ s/Control/Ctrl/ if length($tableName) > 64;
         die "Table name [$tableName] too long, must be <= 64 chars, got [".length($tableName)."]\n" if length($tableName) > 64;
     }
 
 
     if($tableNamesUsed{$tableName}++) {
         dieTellWrangler("System Error: identical tableName '$tableName' was generated by multiple data sets\n");
     }
 
     if(!$opt_allowReloads) {
-        if($db->quickQuery("select count(*) from trackDb where tableName = ?", $tableName)) {
+        if ($db->tableExist( $tableName)) {
             die "view '$view' has already been loaded as track '$tableName'\nPlease contact your wrangler if you need to reload this data\n";
         }
     }
 
     # XXXX Move the decision about which views have tracks into the DAF?
     # This is already used in 2 places so it was made a function; it would be
     # better in the DAF, except we'd have to go change all the DAFs :(
     my $downloadOnly = isDownloadOnly($view, $daf->{grant}, $daf->{lab}, $daf);
 
     my $fileType = $type;
     $fileType =~ s/ //g;
     print FILE_RA "    filename $tableName.$fileType.gz\n";
     $metadata .= " composite=$compositeTrack";
 
     if($downloadOnly) {
         my $parentTable = $tableName;
         $parentTable =~ s/RawData/RawSignal/    if $parentTable =~ /RawData/;
         $parentTable =~ s/Alignments/RawSignal/ if $parentTable =~ /Alignments/;
         print FILE_RA "    parentTable $parentTable\n";
         $metadata .= " parentTable=$parentTable";
     } else {
         print FILE_RA "    tableName $tableName\n";
         $metadata .= " tableName=$tableName";
     }
     print FILE_RA "    composite $compositeTrack\n";
     $metadata .= " fileName=$tableName.$fileType.gz";
 
     print LOADER_RA "tablename $tableName\n";
     print LOADER_RA "view $view\n";
     print LOADER_RA "type $type\n";
     if($sql) {
         print LOADER_RA "sql $sql\n";
     }
     if($species) {
         print LOADER_RA "assembly $species\n";
     } else {
         print LOADER_RA "assembly $daf->{assembly}\n";
     }
     print LOADER_RA "files @{$ddfLine->{files}}\n";
     print LOADER_RA "downloadOnly $downloadOnly\n";
     print LOADER_RA "pushQDescription $pushQDescription\n";
     print LOADER_RA "\n";
 
     print FILE_RA sprintf("    metadata %s\n\n", $metadata);
 
     if($downloadOnly || ($type eq "wig" && !grep(/$Encode::autoCreatedPrefix/, @{$ddfLine->{files}}))) {
         # adds entries to README.txt for download only files AND wig data (excepting wig data generated by us)
         print README "file: $tableName.$type.gz\n";
         for my $var (@variables) {
             print README "$var: " . $ddfLine->{$var} . "\n";
         }
         if(defined($replicate)) {
             print README "replicate: $replicate\n";
         }
 
         print README sprintf("data RESTRICTED UNTIL: %d-%02d-%02d\n", 1900 + $rYear, $rMon + 1, $rMDay);
         print README "\n";
     }
     if(!$downloadOnly) {
         print TRACK_RA "    track $tableName\n";
         print TRACK_RA "    release alpha\n";
         if ($tier1 eq 1) {
             # default to only Tier1 subtracks visible.  Wrangler should review if this is
             #   correct for the track
             print TRACK_RA "    subTrack $compositeTrack\n";
         } else {
             print TRACK_RA "    subTrack $compositeTrack off\n";
         }
         print TRACK_RA "    shortLabel $shortLabel\n";
         print TRACK_RA "    longLabel $longLabel\n";
         print TRACK_RA "    subGroups $subGroups\n";
         if($type eq 'wig') {
             my $placeHolder = Encode::wigMinMaxPlaceHolder($tableName);
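             # (The placeholder is presumably replaced with real min/max limits once the wiggle data have been loaded.)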
             print TRACK_RA "    type $type $placeHolder\n";
         } elsif($type eq 'gtf') { # GTF is converted to and loaded as genePred
             print TRACK_RA "    type genePred\n";
         } elsif($type eq 'tagAlign') { # tagAligns are bed 6 but with column called 'sequence' instead of 'name'
             print TRACK_RA "    type bed 6\n";
         } else {
             print TRACK_RA "    type $type\n";
         }
         # Obsolete: now in metadata
         # print TRACK_RA sprintf("    dateSubmitted %04d-%02d-%02d\n", 1900 + $year, $mon + 1, $mday);
         # print TRACK_RA sprintf("    dateUnrestricted %04d-%02d-%02d\n", 1900 + $rYear, $rMon + 1, $rMDay);
         # print TRACK_RA sprintf("    dataVersion %s\n", $Encode::dataVersion);
         if(defined($ddfLine->{accession}) && length($ddfLine->{accession}) > 0) {
             print TRACK_RA sprintf("    accession %s\n",$ddfLine->{accession});
         }
         # color track by color setting for cell type in cv.ra
         if(defined($ddfLine->{cell})) {
             if(defined($terms{'Cell Line'}->{$ddfLine->{cell}}->{'color'})) {
                 print TRACK_RA sprintf("    color %s\n",
                         $terms{'Cell Line'}->{$ddfLine->{cell}}->{'color'});
             }
         }
         # metadata proj=wgEncode lab=Yale cell=GM12878 antiBody=Pol2 labVersion="PeakSeq 1.2 ..." dataVersion="ENCODE Feb 2009 Freeze"
         print TRACK_RA sprintf("    metadata %s\n", $metadata);
         print TRACK_RA "\n";
     }
 }
 close(LOADER_RA);
 close(TRACK_RA);
 close(FILE_RA);
 close(README);
 doTime("done out files") if $opt_timing;
 
 if($submitPath =~ /(\d+)$/) {
     my $id = $1;
     if(dirname($submitPath) =~ /_(\w+)/) {
         my $instance = $1;
         # XXXX rubyDb logic should probably be moved to Encode.pm
         my $rubyDb = HgDb->new(DB => "encpipeline_$instance");
         my @tmp = keys %metadataHash;
         my $count = scalar(@tmp);
         my $metadata = join("; ", @tmp);
         HgAutomate::verbose(2, "Updating id '$id'; metadata: '$metadata'; count: '$count'\n");
         $rubyDb->execute("update projects set count = ?, metadata = ?, db = ?, lab = ?, data_type = ?, track = ? where id = ?",
              $count, $metadata,
              $daf->{assembly}, $daf->{lab}, $daf->{dataType}, $compositeTrack, $id);
     }
 }
 $time0=$timeStart;
 doTime("done. ") if $opt_timing;
 exit 0;