src/hg/encode/encodeValidate/doEncodeValidate.pl 1.170

1.170 2009/03/26 07:01:20 mikep
adding chrom/size checking against hg18
Index: src/hg/encode/encodeValidate/doEncodeValidate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v
retrieving revision 1.169
retrieving revision 1.170
diff -b -B -U 1000000 -r1.169 -r1.170
--- src/hg/encode/encodeValidate/doEncodeValidate.pl	22 Mar 2009 02:37:47 -0000	1.169
+++ src/hg/encode/encodeValidate/doEncodeValidate.pl	26 Mar 2009 07:01:20 -0000	1.170
@@ -1,1780 +1,1782 @@
 #!/usr/bin/env perl
 
 # encodeValidate.pl - validate an ENCODE data submission generated by the
 #                       automated submission pipeline
 #
 # Verifies that all files and metadata are present and of correct formats
 # Creates a load file (load.ra) and track configuration (trackDb.ra) for the datasets
 #
 # Returns 0 if validation succeeds
 #
 # Error reporting:
 #
 # We die immediately (with a human readable message) when internal errors are encountered (e.g. file I/O errors or misconfiguration).
 #
 # In order to facilitate debugging of often very large file uploads, we try to accumulate multiple user errors (e.g. DAF, DAS or
 # file syntax errors) before die'ing with a message with a list of errors.
 
 # DO NOT EDIT the /cluster/bin/scripts copy of this file --
 # edit the CVS'ed source at:
 # $Header$
 
 use warnings;
 use strict;
 
 use File::stat;
 use File::Basename;
 use Getopt::Long;
 use English;
 use Carp qw(cluck);
 use Cwd;
 use IO::File;
 use File::Basename;
-use Data::Dumper; # MJP
 
 use lib "/cluster/bin/scripts";
 use Encode;
 use HgAutomate;
 use HgDb;
 use RAFile;
 use SafePipe;
 
 use vars qw/
     $opt_allowReloads
     $opt_configDir
     $opt_fileType
     $opt_metaDataOnly
     $opt_outDir
     $opt_quick
     $opt_skipAll
     $opt_skipAutoCreation
     $opt_skipOutput
     $opt_skipValidateFiles
     $opt_skipValidateFastQ
     $opt_validateDaf
     $opt_validateFile
     $opt_sendEmail
     $opt_verbose
     $opt_timing
     /;
 
 # Global variables
 our $submitPath;        # full path of data submission directory
 our $configPath;        # full path of configuration directory
 our $outPath;           # full path of output directory
 our %terms;             # controlled vocabulary
 our $quickCount=100;
 our $time0 = time;
 our $timeStart = time;
 our %chromInfo;         # chromInfo from assembly for chrom validation
 our $maxBedRows=80_000_000; # number of rows to allow in a bed-type file
 our %tableNamesUsed;
 our ($grants, $fields, $daf);
 our $SORT_BUF = " -S 5G ";
 
 # Print the command-line help text to STDERR and terminate the program with
 # exit status 1.  The text interpolates $Encode::dafVersion, $Encode::loadFile,
 # $Encode::trackFile and $quickCount.
 sub usage {
     print STDERR <<END;
 usage: encodeValidate.pl submission-type project-submission-dir
 
 submission-type is currently ignored.
 
 Current dafVersion is: $Encode::dafVersion
 
 Creates the following output files: $Encode::loadFile, $Encode::trackFile and README.txt
 
 options:
     -allowReloads       Allow reloads of existing tables
     -configDir=dir      Path of configuration directory, containing
                         metadata .ra files (default: submission-dir/../config)
     -fileType=type	used only with validateFile option; e.g. narrowPeak
     -metaDataOnly       Process DAF/DDF and just update the projects.metadata field;
                         equal to -allowReloads -skipAll
     -quick		Validate only first $quickCount lines of files
     -skipAll            Turn on all "-skip..." options
     -skipAutoCreation   Tells script skip creating the auto-created files (e.g. RawSignal, PlusRawSignal, MinusRawSignal)
                         this can save you a lot of time when you are debugging and re-running the script on large projects
     -skipOutput         Dont write the various output files
     -skipValidateFiles  Tells script skip the file validation step; to save a lot of time during testing
     -validateDaf	exit after validating DAF file (project-submission-dir is the DAF file name).
     -validateFile	exit after validating file (project-submission-dir is the file name;
                         requires -fileType option as well)
     -verbose=num        Set verbose level to num (default 1).
     -outDir=dir         Path of output directory, for validation files
                         (default: submission-dir/out)
 END
 exit 1;
 }
 
 sub pushError
 {
 # Append zero or more error messages to the array referenced by $errors.
 # When at least one message is supplied, the batch is also logged at
 # verbose level 2; a call without messages does nothing.
     my ($errors, @messages) = @_;
     if(@messages) {
         my $report = join("\n\t", @messages);
         push(@{$errors}, @messages);
         HgAutomate::verbose(2, "pushing errors:\n\t" . $report . "\n");
     }
 }
 
 sub doTime
 # Emit (via warn) the number of seconds elapsed since the previous call to this
 # function, or since $time0 was initialised.  When a line count is given the
 # message also reports lines/sec.  Resets $time0 to the current time.
 {
     my ($msg, $lines) = @_;
     $msg ||= "";
     $lines ||= 0;
     my $elapsed = time() - $time0;
     # A rate is only printed when $lines > 0; clamp the interval to 1 second
     # in that case so the division below can never be by zero.
     if($lines > 0 and $elapsed < 1) {
         $elapsed = 1;
     }
     my $rate = "";
     if($lines > 0) {
         $rate = "  ($lines lines, " . (int($lines/$elapsed)) . " lines/sec)";
     }
     warn("# $msg : $elapsed secs" . $rate);
     $time0 = time();
 }
 
 sub dieTellWrangler
 {
 # die with $msg followed by a request to contact the wrangler.  The wrangler's
 # e-mail address is appended when the grant named by $daf->{grant} has a
 # wranglerEmail entry in $grants.
     my ($msg) = @_;
     my $grantInfo = $grants->{$daf->{grant}};
     my $email;
     if($grantInfo && $grantInfo->{wranglerEmail}) {
         $email = $grantInfo->{wranglerEmail};
     }
     my $contact = defined($email) ? " at $email" : "";
     die $msg . "Please contact your wrangler" . $contact . "\n";
 }
 
 ############################################################################
 # Validators for DDF columns -- extend when adding new metadata fields
 #
 # validators should return list of errors encountered (empty list means no errors were found).
 #
 # validator callbacks are called thus:
 #
 # validator(value, track, daf);
 #
 # value is value in DDF column
 # track is track/view value
 # daf is daf hash
 
 # dispatch table
 # Keys are DDF column names; each value is a reference to the validator sub
 # for that column.  Several columns share validateNoValidation, which accepts
 # every value.
 our %validators = (
     files => \&validateFiles,
     view => \&validateDatasetName,
     labVersion => \&validateNoValidation,
     softwareVersion => \&validateNoValidation,
     accession => \&validateNoValidation,
     cell => \&validateCellLine,
     gene => \&validateGeneType,
     promoter => \&validatePromoter,
     antibody => \&validateAntibody,
     rnaExtract => \&validateRnaExtract,
     localization => \&validateLocalization,
     mapAlgorithm => \&validateMapAlgorithm,
     ripAntibody => \&validateRipAntibody,
     ripTgtProtein => \&validateRipTgtProtein,
     fragSize => \&validateFragSize,
     readType => \&validateReadType,
     freezeDate => \&validateFreezeDate,
     replicate => \&validateReplicate,
     species => \&validateSpecies,
     );
 
 # standard validators (required or optional for all projects)
 
 sub validateFiles {
     # Validate an array (reference) of filenames/globs, ordered by part.
     # Each entry is expanded with glob; every resulting file must have a name
     # free of shell-sensitive characters, exist, be non-empty and readable,
     # and pass checkDataFormat() for the track's declared type.
     # Returns the list of errors found (empty when -skipValidateFiles is set).
     my ($files, $track, $daf) = @_;
     my (@expanded, @errors);
     my $regex = "\`\|\\\|\|\"\|\'";
     doTime("beginning validateFiles") if $opt_timing;
     foreach my $pattern (@{$files}) {
         my @matches = glob $pattern;
         if(!@matches) {
             pushError(\@errors, "File '$pattern' does not exist (possibly bad glob?)");
             next;
         }
         push(@expanded, @matches);
     }
     HgAutomate::verbose(3, "     Track: $track    Files: " . join (' ', @expanded) . "\n");
     return () if $opt_skipValidateFiles;
     foreach my $file (@expanded) {
         my ($fbase,$dir,$suf) = fileparse($file, ".gz");
         # Check if the file has been replaced with an unzipped version
         # This check is also done where we auto create the RawSignal view from the Alignments
         # ($file aliases the element of @expanded, so the list itself is updated)
         $file = "$dir/$fbase" if ($suf eq ".gz" and ! -e $file and -s "$dir/$fbase");
         my @found;
         if($file =~ /($regex)/) {
             # Do not allows filenames with suspicious characters (b/c filename will be used in shell commands).
             @found = ("File '$file' has invalid characters; files cannot contain following characters: \"'`|");
         } elsif(!-e $file) {
             @found = ("File \'$file\' does not exist");
         } elsif(!(-s $file)) {
             @found = ("File \'$file\' is empty");
         } elsif(!(-r $file)) {
             @found = ("File \'$file\' is un-readable");
         } else {
             @found = checkDataFormat($daf->{TRACKS}{$track}{type}, $file);
         }
         pushError(\@errors, @found);
     }
     # NOTE(review): this only rebinds the local copy of the reference, so the
     # caller's array is not replaced by the expanded list -- confirm intent.
     $files = \@expanded;
     doTime("done validateFiles") if $opt_timing;
     return @errors;
 }
 
 sub validateDatasetName {
 # Placeholder validator for the view/dataset name: every value is accepted,
 # so the returned error list is always empty.
     my $val = shift;
     return ();
 }
 
 sub validateDataType {
 # Placeholder validator for the data type: every value is accepted, so the
 # returned error list is always empty.
     my $val = shift;
     return ();
 }
 
 sub validateRawDataAcc {
 # No validation: arguments are ignored and an empty error list is returned.
     return ();
 }
 
 sub validateNoValidation {
 # No validation: arguments are ignored and an empty error list is returned.
 # Used in %validators for columns that accept any value.
     return ();
 }
 
 # project-specific validators
 
 sub validateCellLine {
 # A cell line is accepted when it is a key with a defined value in either the
 # 'Cell Line' or the 'control' vocabulary of %terms.
 # Returns an empty list on success, otherwise a one-element list with the error.
 #
 # Each vocabulary is tested with its own defined(): the former
 # defined(A || B) form rejected a term whose value was defined but false.
     my ($val) = @_;
     if(defined($terms{'Cell Line'}{$val}) || defined($terms{'control'}{$val})) {
         return ();
     }
     return ("Cell line \'$val\' is not known");
 }
 
 sub validateRnaExtract {
 # Accepts $val only when it has a defined entry in the 'rnaExtract' vocabulary
 # of %terms; otherwise returns a one-element error list.
     my $val = shift;
     if(defined($terms{'rnaExtract'}{$val})) {
         return ();
     }
     return ("rnaExtract \'$val\' is not known");
 }
 
 sub validateLocalization {
 # Accepts $val only when it has a defined entry in the 'localization'
 # vocabulary of %terms; otherwise returns a one-element error list.
     my $val = shift;
     if(defined($terms{'localization'}{$val})) {
         return ();
     }
     return ("localization \'$val\' is not known");
 }
 
 sub validateMapAlgorithm {
 # Accepts $val only when it has a defined entry in the 'mapAlgorithm'
 # vocabulary of %terms; otherwise returns a one-element error list.
     my $val = shift;
     if(defined($terms{'mapAlgorithm'}{$val})) {
         return ();
     }
     return ("mapAlgorithm \'$val\' is not known");
 }
 
 sub validateRipAntibody {
 # A RIP antibody is accepted when it is a key with a defined value in either
 # the 'ripAntibody' or the 'control' vocabulary of %terms.
 # Returns an empty list on success, otherwise a one-element list with the error.
 #
 # Each vocabulary is tested with its own defined(): the former
 # defined(A || B) form rejected a term whose value was defined but false.
     my ($val) = @_;
     # TODO: Remove Encode::isControlInput after testing
     if(defined($terms{'ripAntibody'}{$val}) || defined($terms{'control'}{$val})) {
         return ();
     }
     return ("ripAntibody \'$val\' is not known");
 }
 
 sub validateRipTgtProtein {
 # Accepts $val only when it has a defined entry in the 'ripTgtProtein'
 # vocabulary of %terms; otherwise returns a one-element error list.
     my $val = shift;
     if(defined($terms{'ripTgtProtein'}{$val})) {
         return ();
     }
     return ("ripTgtProtein \'$val\' is not known");
 }
 
 sub validateFragSize {
 # Accepts $val only when it has a defined entry in the 'fragSize' vocabulary
 # of %terms; otherwise returns a one-element error list.
     my $val = shift;
     if(defined($terms{'fragSize'}{$val})) {
         return ();
     }
     return ("fragSize \'$val\' is not known");
 }
 
 sub validateReadType {
 # Accepts $val only when it has a defined entry in the 'readType' vocabulary
 # of %terms; otherwise returns a one-element error list.
     my $val = shift;
     if(defined($terms{'readType'}{$val})) {
         return ();
     }
     return ("readType \'$val\' is not known");
 }
 
 sub validateGeneType {
 # Accepts $val only when it has a defined entry in the 'Gene Type' vocabulary
 # of %terms; otherwise returns a one-element error list.
     my $val = shift;
     if(defined($terms{'Gene Type'}{$val})) {
         return ();
     }
     return ("Gene type \'$val\' is not known");
 }
 
 sub validatePromoter {
 # Accepts $val only when it has a defined entry in the 'promoter' vocabulary
 # of %terms; otherwise returns a one-element error list.
     my $val = shift;
     if(defined($terms{'promoter'}{$val})) {
         return ();
     }
     return ("promoter \'$val\' is not known");
 }
 
 sub validateAntibody {
 # An antibody is accepted when it has a defined entry in either the
 # 'Antibody' or the 'control' vocabulary of %terms; otherwise a one-element
 # error list is returned.
     my $val = shift;
     my $known = defined($terms{'Antibody'}{$val}) || defined($terms{'control'}{$val});
     return $known ? () : ("Antibody \'$val\' is not known");
 }
 
 sub validateFreezeDate {
 # Accepts $val only when it has a defined entry in the 'freezeDate'
 # vocabulary of %terms; otherwise returns a one-element error list.
     my $val = shift;
     if(defined($terms{'freezeDate'}{$val})) {
         return ();
     }
     return ("freezeDate \'$val\' is not known");
 }
 
 sub validateReplicate {
 # No validation: arguments are ignored and an empty error list is returned.
     return ();
 }
 
 
 sub validateSpecies {
 # Accepts $val only when it has a defined entry in the 'species' vocabulary
 # of %terms; otherwise returns a one-element error list.
     my $val = shift;
     if(defined($terms{'species'}{$val})) {
         return ();
     }
     return ("species \'$val\' is not known");
 }
 ############################################################################
 # Format checkers - check file format for given types; extend when adding new
 # data formats
 #
 # Some of the checkers use regular expressions to validate syntax of the files.
 # Others pass the first 10 lines to utility loaders; the latter approach has:
 # advantages:
 # 	checks semantics as well as syntax
 # disadvantages:
 # 	only checks the beginning of the file
 # 	some of the loaders tolerate invalid files (but give incorrect results)
 
 # dispatch table
 # Keys are data format (type) names; each value is a reference to the sub that
 # checks a file of that format.  Note that 'bed' and 'bed5FloatScore' share
 # validateBed, and 'genePred' is handled by validateGene.
 our %formatCheckers = (
     wig => \&validateWig,
     bed => \&validateBed,
     bedGraph => \&validateBedGraph,
     bed5FloatScore => \&validateBed,
     genePred => \&validateGene,
     gtf => \&validateGtf,
     tagAlign => \&validateTagAlign,
     pairedTagAlign => \&validatePairedTagAlign,
     narrowPeak => \&validateNarrowPeak,
     broadPeak => \&validateBroadPeak,
     gappedPeak => \&validateGappedPeak,
     fastq => \&validateFastQ,
     csfasta => \&validateCsfasta,
     csqual  => \&validateCsqual,
     rpkm  => \&validateRpkm,
     fasta  => \&validateFasta,
     bowtie  => \&validateBowtie,
     psl  => \&validatePsl,
     cBiP => \&validateFreepass,  # TODO: this is a dodge, because bed file is for different species, so chrom violations
     );
 
 # Regular expression (as a string, un-anchored) for a floating point number with
 # an optional sign and optional exponent.  Note that the exponent part is a
 # capturing group, which matters to code that relies on $1 after matching.
 my $floatRegEx = "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?";
 # my $floatRegEx = "[+-]?(?:\\.\\d+|\\d+(?:\\.\\d+|[eE]{1}?[+-]{1}?\\d+))";  # Tim's attempt
 # my $floatRegEx = "[+-]?(?:\\.\\d+|\\d+(?:\\.\\d+|))";                      # Original
 # Maps the TYPE names used in validation lists to un-anchored regular expressions.
 my %typeMap = (int => "[+-]?\\d+", uint => "\\d+", float => $floatRegEx, string => "\\S+");
 
 sub listToRegExp
 {
 # Build one anchored regular expression (returned as a string) that matches a
 # whole line whose whitespace-separated fields satisfy the given field tests.
 #
 # $validateList is a reference to a list of hashes with: {NAME, REGEX or TYPE}
 # TYPE is looked up in %typeMap; REGEX is used verbatim.  dies with a
 # "PROGRAM ERROR" message when a field has an unknown TYPE or neither key.
 #
 # This is a speedup hack: only when a line fails this expression should the
 # (really slow) validateWithListUtil be called with the line and the same
 # validation list to generate a field specific error message.
 #
 # Note that a field of TYPE 'chrom' is captured, so you should test %chromInfo
 # (e.g. $chromInfo{$1}) after a successful match to verify the chrom is valid.
     my ($validateList) = @_;
     my @parts;
     foreach my $field (@{$validateList}) {
         my $type = $field->{TYPE};
         if(defined($type) && $type eq 'chrom') {
             push(@parts, "(\\S+)");
             next;
         }
         my $regex = $type ? $typeMap{$type} : $field->{REGEX};
         if(!$regex) {
             die "PROGRAM ERROR: invalid TYPE: $type\n" if $type;
             die "PROGRAM ERROR: invalid type list (missing required REGEX or TYPE)\n";
         }
         push(@parts, $regex);
     }
     return "^" . join("\\s+", @parts) . "\$";
 }
 
 sub validateWithListUtil
 {
 # Validate $line using a validation list.
 # returns error string or undef if line passes validation
 # This is designed to give better feedback to user; ideally we would load the validation list from the .as files
 #
 # $validateList is a reference to a list of hashes with {NAME, REGEX or TYPE};
 # TYPE 'chrom' is checked against %chromInfo, other TYPEs via %typeMap.
 # dies with a "PROGRAM ERROR" message when a field has an unknown TYPE or
 # neither a TYPE nor a REGEX.
     my ($line, $validateList) = @_;
     my @list = split(/\s+/, $line);
     my $fieldError = "; saw '" . scalar(@list) . "' fields; expected: '" . @{$validateList} . "'";
     if(@list < @{$validateList}) {
         return "not enough fields" . $fieldError;
     } elsif(@list > @{$validateList}) {
         return "too many fields" . $fieldError;
     } else {
         for my $validateField (@{$validateList}) {
             my $val = shift(@list);
             my $type = $validateField->{TYPE};
             if(defined($type) && $type eq 'chrom') {
                 if(!$chromInfo{$val}) {
                     return "value '$val' for field '$validateField->{NAME}' is an invalid chromosome";
                 }
             } else {
                 my $regex;
                 if($type) {
                     if(!($regex = $typeMap{$type})) {
                         die "PROGRAM ERROR: invalid TYPE: $type\n";
                     }
                 } elsif(!($regex = $validateField->{REGEX})) {
                     die "PROGRAM ERROR: invalid type list (missing required REGEX or TYPE)\n";
                 }
                 if($val !~ /^$regex$/) {
                     my $error = "value '$val' is an invalid value for field '$validateField->{NAME}'";
                     if($type) {
                         $error .= "; must be type '$type'";
                     }
                     return $error;
                 }
             }
         }
     }
     # split() silently discards trailing whitespace, but the anchored expression
     # from listToRegExp() does not tolerate it.  Report it here so a trailing
     # tab/space or a DOS carriage return is a user error rather than an
     # "inconsistent results" program error in the caller.
     (my $text = $line) =~ s/\n\z//;
     if($text =~ /\s\z/) {
         return "line ends with whitespace (e.g. a trailing tab, space or DOS carriage return)";
     }
     # callers test the result in scalar context, so an explicit undef is kept
     return undef;
 }
 
 sub validateWithList
 {
 # Open a file and validate each line against $validateList.
 # $name is the caller's subroutine name (used in timing messages).
 # Lines starting with '#' are skipped.  Returns an empty list when every
 # checked line is valid, otherwise a one-element list describing the first
 # problem (too many rows, unknown chrom, or a field level error).
 # With -quick only the first $quickCount lines are examined.
     my ($path, $file, $type, $maxRows, $name, $validateList) = @_;
     my $lineNumber = 0;
     my $fh = Encode::openUtil($file, $path);
     my $regexp = listToRegExp($validateList);
     # count of fields named "chrom"; when non-zero, $1 of a matching line is
     # looked up in %chromInfo
     my $hasChrom = grep { $_->{NAME} eq "chrom" } @{$validateList};
     doTime("beginning validateWithList $name,$type,$maxRows") if $opt_timing;
     while(my $line = <$fh>) {
         chomp $line;
         $lineNumber++;
         my $where = "Invalid $type file; line $lineNumber in file '$file'";
         if($lineNumber > $maxRows) {
             return ("$where;\nerror: exceeded maximum number of rows allowed ($maxRows) \nline: $line");
         }
         next if($line =~ m/^#/); # allow comment lines, consistent with lineFile and hgLoadBed
         if($line =~ /$regexp/) {
             my $chrom = $1;
             if($hasChrom && !$chromInfo{$chrom}) {
                 return ("$where;\nerror: invalid chrom '$chrom';\nline: $line");
             }
         } else {
             # the fast check failed; ask the slow checker which field is wrong
             my $error = validateWithListUtil($line, $validateList);
             die "PROGRAM ERROR: inconsistent results from validateWithListUtil\n" unless $error;
             return ("$where is invalid;\n$error;\nline: $line");
         }
         last if($opt_quick && $lineNumber >= $quickCount);
     }
     $fh->close();
     HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
     doTime("done validateWithList $name,$type,$maxRows",$lineNumber) if $opt_timing;
     return ();
 }
 
 
 sub validateFreepass
 {
 # Accept a file without inspecting its contents: the file is only opened
 # (through Encode::openUtil) and closed again.  Always returns an empty
 # error list.
     my ($path, $file, $type) = @_;
     if($opt_timing) {
         doTime("beginning validateFreepass");
     }
     my $handle = Encode::openUtil($file, $path);
     $handle->close();
     HgAutomate::verbose(2, "File \'$file\' free pass on validation\n");
     if($opt_timing) {
         doTime("done validateFreepass");
     }
     return ();
 }
 
 
 sub validateWig
 {
 # Validate a wiggle file by running wigEncode on it (output discarded) through
 # SafePipe.  Returns an error string containing wigEncode's stderr on failure,
 # otherwise an empty list.
     my ($path, $file, $type) = @_;
     my $filePath = $file;
     $filePath = "$path/$file" if defined($path);
     doTime("beginning validateWig") if $opt_timing;
     HgAutomate::verbose(2, "validateWig($file,$type) -> wigEncode\n");
     # wigEncode knows how to handle zipped files so we do not need to special case them.
     my $wigEncode = "/cluster/bin/x86_64/wigEncode -noOverlapSpanData $filePath /dev/null /dev/null";
     # This can produce /data/tmp/SafePipe_NNN_.err files
     my $safe = SafePipe->new(CMDS => [$wigEncode], STDOUT => "/dev/null", DEBUG => $opt_verbose - 1);
     if($safe->exec()) {
         my $stderr = $safe->stderr();
         chomp($stderr);
         return "File \'$file\' failed wiggle validation: " . $stderr;
     }
     HgAutomate::verbose(2, "File \'$file\' passed wiggle validation\n");
     doTime("done validateWig") if $opt_timing;
     return ();
 }
 
 sub validateBed {
 # Validate each line of a bed 5 or greater file.
 #
 # $path and $file are handed to Encode::openUtil; $type is the format name
 # ('bed5FloatScore' additionally requires a sixth, float, field).
 # Lines starting with '#', empty lines and track/browser lines are accepted.
 # dies with a message naming the file, line and field on the first invalid
 # line; returns an empty list when every checked line is valid.
 # With -quick only the first $quickCount lines are examined.
     my ($path, $file, $type) = @_;
     my $lineNumber = 0;
     doTime("beginning validateBed") if $opt_timing;
     my $fh = Encode::openUtil($file, $path);
     while(<$fh>) {
         chomp;
         $lineNumber++;
         next if m/^#/; # allow comment lines, consistent with lineFile and hgLoadBed
         my @fields = split /\s+/;
         my $fieldCount = @fields;
         next if(!$fieldCount);
         my $prefix = "Failed bed validation, file '$file'; line $lineNumber:";
         if(/^(track|browser)/) {
             ;
         } elsif($fieldCount < 5) {
             die "$prefix not enough fields; " . scalar(@fields) . " present; at least 5 are required\n";
         } elsif (!$chromInfo{$fields[0]}) {
             die "$prefix field 1 value ($fields[0]) is invalid; not a valid chrom name\n";
         } elsif ($fields[1] !~ /^\d+$/) {
             die "$prefix field 2 value ($fields[1]) is invalid; value must be a positive number\n";
         } elsif ($fields[2] !~ /^\d+$/) {
             die "$prefix field 3 value ($fields[2]) is invalid; value must be a positive number\n";
         } elsif ($fields[2] < $fields[1]) {
             die "$prefix field 3 value ($fields[2]) is less than field 2 value ($fields[1])\n";
         } elsif ($fields[4] !~ /^\d+$/ && $fields[4] !~ /^\d+\.\d+$/) {
             die "$prefix field 5 value ($fields[4]) is invalid; value must be a positive number\n";
         } elsif ($fields[4] < 0 || $fields[4] > 1000) {
             die "$prefix field 5 value ($fields[4]) is invalid; score must be 0-1000\n";
         } elsif ($type eq 'bed5FloatScore' && $fieldCount < 6) {
             # trailing newline keeps perl from appending the script file/line
             # to this user-facing message, like every other message here
             die "$prefix field 6 invalid; bed5FloatScore requires 6 fields\n";
         } elsif ($type eq 'bed5FloatScore' && $fields[5] !~ /^$floatRegEx$/) {
             die "$prefix field 6 value '$fields[5]' is invalid; must be a float\n";
         } else {
             ;
         }
         last if($opt_quick && $lineNumber >= $quickCount);
     }
     $fh->close();
     HgAutomate::verbose(2, "File \'$file\' passed bed validation\n");
     doTime("done validateBed",$lineNumber) if $opt_timing;
     return ();
 }
 
 sub validateBedGraph {
 # Validate each line of a bedGraph file: exactly 4 fields, a chrom known to
 # %chromInfo, numeric start/end with end >= start, and a float value.
 # Comment lines, empty lines and track/browser lines are accepted.
 # dies on the first invalid line; returns an empty list otherwise.
 # With -quick only the first $quickCount lines are examined.
     my ($path, $file, $type) = @_;
     doTime("beginning validateBedGraph") if $opt_timing;
     my $fh = Encode::openUtil($file, $path);
     my $lineNumber = 0;
     while(my $line = <$fh>) {
         chomp($line);
         $lineNumber++;
         next if($line =~ m/^#/); # allow comment lines, consistent with lineFile and hgLoadBed
         my @fields = split(/\s+/, $line);
         next unless(@fields);
         my ($chrom, $start, $end, $value) = @fields;
         my $problem;
         if($line =~ /^(track|browser)/) {
             # header lines are not checked (but still count towards -quick)
         } elsif(@fields != 4) {
             $problem = "found " . scalar(@fields) . " fields; need 4";
         } elsif(!$chromInfo{$chrom}) {
             $problem = "field 1 value ($chrom) is invalid; not a valid chrom name";
         } elsif($start !~ /^\d+$/) {
             $problem = "field 2 value ($start) is invalid; value must be a positive number";
         } elsif($end !~ /^\d+$/) {
             $problem = "field 3 value ($end) is invalid; value must be a positive number";
         } elsif($end < $start) {
             $problem = "field 3 value ($end) is less than field 2 value ($start)";
         } elsif($value !~ /^$floatRegEx$/) {
             $problem = "field 4 value '$value' is invalid; must be a float [$floatRegEx]";
         }
         if(defined($problem)) {
             die "Failed bedGraph validation, file '$file'; line $lineNumber:" . " $problem\n";
         }
         last if($opt_quick && $lineNumber >= $quickCount);
     }
     $fh->close();
     HgAutomate::verbose(2, "File \'$file\' passed bedGraph validation\n");
     doTime("done validateBedGraph", $lineNumber) if $opt_timing;
     return ();
 }
 
 sub validateGtf {
 # validate GTF by converting to genePred and validating that
 #
 # Runs gtfToGenePred on the file, writing doEncodeValidate.gtf.bed and
 # doEncodeValidate.gtf.err next to it, then passes the converted file to
 # validateGene.  dies when the file is gzipped or the conversion fails (the
 # captured gtfToGenePred output becomes the message).  Returns whatever
 # validateGene returns; the converted file is removed only when that list is
 # empty.
     my ($path, $file, $type) = @_;
     # $path may be undefined (validateGene below is called that way); fall back
     # to the absolute current directory so the temporary files are not created
     # under "/" and remain reachable after validateGene changes directory.
     my $dir = defined($path) ? $path : getcwd();
     my $errFile = "$dir/doEncodeValidate.gtf.err";
     doTime("beginning validateGtf") if $opt_timing;
     my $filePath = defined($path) ? "$path/$file" : $file;
     my $outFile = "$dir/doEncodeValidate.gtf.bed";
     if(Encode::isZipped($filePath)) {
         # XXXX should be modified to handle zipped files.
         die "We don't currently support gzipped gtf files\n";
     }
     HgAutomate::verbose(2, "validateGtf(path=$dir,file=$file,type=$type)\n");
     # XXXX Add support for $opt_quick
     my $err = system ( "gtfToGenePred $filePath $outFile >$errFile 2>&1");
     if ($err) {
         print STDERR  "File \'$file\' failed GTF validation\n";
         open(my $errFh, '<', $errFile) or die "ERROR: Can't open gtfToGenePred error file \'$errFile\': $!\n";
         my @err = <$errFh>;
         close($errFh);
         die "@err\n";
     }
     unlink $errFile;
     HgAutomate::verbose(2, "File \'$file\' passed gtfToGenePred conversion \n");
     doTime("done validateGtf") if $opt_timing;
     my @res = validateGene(undef,$outFile,$type);
     if (scalar(@res)==0) { # no errors so remove the temp .bed file
         HgAutomate::verbose(2, "File \'$file\' passed gtf gene validation \n");
         unlink $outFile;
     }
     return @res;
 }
 
 sub validateGene {
 # Validate a gene (GFF/genePred style) file by loading it with ldHgGene into a
 # scratch table definition; the command runs inside $outPath and its output is
 # captured in validateGene.out there.
 # dies when the file is gzipped or ldHgGene fails (the captured output becomes
 # the message); returns an empty list on success.
     my ($path, $file, $type) = @_;
     my $outFile = "validateGene.out";
     doTime("beginning validateGene") if $opt_timing;
     my $filePath = defined($path) ? "$path/$file" : $file;
     if(Encode::isZipped($filePath)) {
         # XXXX should be modified to handle zipped files.
         die "We don't currently supporte gzipped gene files\n";
     }
     # XXXX Add support for $opt_quick
     # The alternation is parenthesised so that only lines *starting* with
     # "track" or "browser" are filtered out; '^track|browser' would also drop
     # any data line that merely contains the word "browser".
     my $err = system (
         "cd $outPath; egrep -v '^(track|browser)' $filePath | ldHgGene -out=genePred.tab -genePredExt hg18 testTable stdin >$outFile 2>&1");
     if ($err) {
         print STDERR  "File \'$file\' failed GFF validation\n";
         open(my $errFh, '<', "$outPath/$outFile") or die "ERROR: Can't open GFF validation file \'$outPath/$outFile\': $!\n";
         my @err = <$errFh>;
         close($errFh);
         die "@err\n";
     } else {
         HgAutomate::verbose(2, "File \'$file\' passed GFF validation\n");
     }
     doTime("done validateGene") if $opt_timing;
     return ();
 }
 
 sub validateTagAlign
 {
 # Check a tagAlign file by running the external validateFiles command on $file
 # through SafePipe.  $path and $type are accepted but not used here.
 # Returns a one-element error list when the command fails (its stderr is
 # printed to STDERR only), otherwise an empty list.
     my ($path, $file, $type) = @_;
-    my $safe = SafePipe->new(CMDS => ["validateFiles -type=tagAlign $file"]);
+    # validate chroms, chromSize, etc. Assume hg18 like elsewhere
+    my $safe = SafePipe->new(CMDS => ["validateFiles -chromDb=hg18 -type=tagAlign $file"]);
     if(my $err = $safe->exec()) {
 	print STDERR  "ERROR: failed validateTagAlign : " . $safe->stderr() . "\n";
 	# don't show end-user pipe error(s)
 	return("failed validateTagAlign for '$file'");
     }
     return ();
 }
 
 sub validatePairedTagAlign
 # This is like tag align but with two additional sequence fields appended; seq1 and seq2
 {
 # Runs the external validateFiles command on $file through SafePipe; $path and
 # $type are accepted but not used here.  Returns a one-element error list when
 # the command fails (its stderr is printed to STDERR only), otherwise an empty
 # list.
     my ($path, $file, $type) = @_;
-    my $safe = SafePipe->new(CMDS => ["validateFiles -type=pairedTagAlign $file"]);
+    # validate chroms, chromSize, etc. Assume hg18 like elsewhere
+    my $safe = SafePipe->new(CMDS => ["validateFiles -chromDb=hg18 -type=pairedTagAlign $file"]);
     if(my $err = $safe->exec()) {
 	print STDERR  "ERROR: failed validatePairedTagAlign : " . $safe->stderr() . "\n";
 	# don't show end-user pipe error(s)
 	return("failed validatePairedTagAlign for '$file'");
     }
     return ();
 }
 
 sub validateNarrowPeak
 {
 # Validate a narrowPeak file: ten whitespace separated fields per line, checked
 # by validateWithList against the field list below (at most $maxBedRows rows).
 # Returns the error list produced by validateWithList.
     my ($path, $file, $type) = @_;
     my @list = ({TYPE => "chrom", NAME => "chrom"},
                 {TYPE => "uint", NAME => "chromStart"},
                 {TYPE => "uint", NAME => "chromEnd"},
                 {TYPE => "string", NAME => "name"},
                 {TYPE => "uint", NAME => "score"},
                 # '-' is listed first so it is a literal: the former class
                 # "[+-\\.]" was the range '+' through '.', which also accepted ','
                 {REGEX => "[-+.]", NAME => "strand"},
                 {TYPE => "float", NAME => "signalValue"},
                 {TYPE => "float", NAME => "pValue"},
                 {TYPE => "float", NAME => "qValue"},
                 {TYPE => "int", NAME => "peak"});
     return validateWithList($path, $file, $type, $maxBedRows, "validateNarrowPeak", \@list);
 }
 
 sub validateBroadPeak
 {
 # Check a broadPeak file by running the external validateFiles command on $file
 # through SafePipe.  $path and $type are accepted but not used here.
 # Returns a one-element error list when the command fails (its stderr is
 # printed to STDERR only), otherwise an empty list.
     my ($path, $file, $type) = @_;
-    my $safe = SafePipe->new(CMDS => ["validateFiles -type=broadPeak $file"]);
+    # validate chroms, chromSize, etc. Assume hg18 like elsewhere
+    my $safe = SafePipe->new(CMDS => ["validateFiles -chromDb=hg18 -type=broadPeak $file"]);
     if(my $err = $safe->exec()) {
 	print STDERR  "ERROR: failed validateBroadPeak : " . $safe->stderr() . "\n";
 	# don't show end-user pipe error(s)
 	return("failed validateBroadPeak for '$file'");
     }
     return ();
 }
 
 sub validateGappedPeak
 {
 # Validate a gappedPeak file: fifteen whitespace separated fields per line,
 # checked by validateWithList against the field list below (at most
 # $maxBedRows rows).  Returns the error list produced by validateWithList.
     my ($path, $file, $type) = @_;
     my @list = ({TYPE => "chrom", NAME => "chrom"},
                 {TYPE => "uint", NAME => "chromStart"},
                 {TYPE => "uint", NAME => "chromEnd"},
                 {TYPE => "string", NAME => "name"},
                 {TYPE => "uint", NAME => "score"},
                 # '-' is listed first so it is a literal: the former class
                 # "[+-\\.]" was the range '+' through '.', which also accepted ','
                 {REGEX => "[-+.]", NAME => "strand"},
                 {TYPE => "uint", NAME => "thickStart"},
                 {TYPE => "uint", NAME => "thickEnd"},
                 {TYPE => "string", NAME => "itemRgb"},
                 {TYPE => "uint", NAME => "blockCount"},
                 {TYPE => "string", NAME => "blockSizes"},
                 {TYPE => "string", NAME => "blockStarts"},
                 {TYPE => "float", NAME => "signalValue"},
                 {TYPE => "float", NAME => "pValue"},
                 {TYPE => "float", NAME => "qValue"}
                 );
     return validateWithList($path, $file, $type, $maxBedRows, "validateGappedPeak", \@list);
 }
 
 sub validateFastQ
 {
     # Check a fastq file by running the external validateFiles command on
     # $file through SafePipe ($path and $type are not used here).
     # Returns a one-element error list when the command fails (its stderr is
     # printed to STDERR only), otherwise an empty list.
     #
     # Syntax per http://maq.sourceforge.net/fastq.shtml
     # I added '/' in the seqNameRegEx and plusLine even though it wasnt in the spec
     #   because this is what Colin Kingswood (Gingeras project)
     #   is getting in the fastq files from GIS for the GisPet project
     #   and they are being sent on to us
     # Note on "FASTQ Quality scores":-   http://maq.sourceforge.net/qual.shtml
     # Fastq has 2 different semantics for the score field.
     # - fastq produced directly from Solexa has a 'solexa' quality score
     # - fastq defined by Sanger has a 'PHRED' quality score
     # - The 2 urls above show how to convert between both
     my ($path, $file, $type) = @_;
     my @cmds = ("validateFiles -type=fastq $file");
     my $safe = SafePipe->new(CMDS => \@cmds);
     my $status = $safe->exec();
     if($status) {
         print STDERR  "ERROR: failed validateFastQ : " . $safe->stderr() . "\n";
         # don't show end-user pipe error(s)
         return("failed validateFastQ for '$file'");
     }
     return ();
 }
 
 sub validateCsfasta
 {
     # Check that a csfasta file alternates header and sequence lines
     # ('#' comment lines are skipped).  Returns a one-element error list for
     # the first line that does not match what is expected, otherwise an empty
     # list.  With -quick only the first $quickCount lines are examined.
     #
     # Syntax per http://marketing.appliedbiosystems.com/mk/submit/SOLID_KNOWLEDGE_RD?_JS=T&rd=dm
     # Sample:-
     #
     # # Wed Jul 30 15:30:48 2008 /share/apps/corona/bin/filter_fasta.pl --output=/data/results/S0033/S0033_20080723_2/I22_EA/results.01/primary.20080730194737531 --name=S0033_20080723_2_I22_EA_ --tag=F3 --minlength=30 --mask=111111111111111111111111111111 --prefix=T /data/results/S0033/S0033_20080723_2/I22_EA/jobs/postPrimerSetPrimary.1416/rawseq
     # # Cwd: /home/pipeline
     # # Title: S0033_20080723_2_I22_EA_
     # >461_19_90_F3
     # T203033330010111011221200302001
     # >461_19_209_F3
     # T022213002230311203200200322000
     #
     # Files from GIS have this header:
     # >920_22_656_F3,1.-152654094.1.35.35.0###,19.43558664.1.35.35.0###
     # T01301010111200210102321210100112312
     my ($path, $file, $type) = @_;
     doTime("beginning validateCsfasta") if $opt_timing;
     my $fh = Encode::openUtil($file, $path);
     # two-state machine: which pattern the current line must match, and which
     # state follows a successful match
     my %machine = (header => {REGEX => "^>\\d+_\\d+_\\d+_\.\\d+.*", NEXT => 'seq'},
                    seq => {REGEX => "^[GT]\\d+", NEXT => 'header'},
                    );
     my $state = 'header';
     my $line = 0;
     my $lastHeader;
     while(my $text = <$fh>) {
         chomp($text);
         $line++;
         next if($text =~ m/^#/);
         my $regex = $machine{$state}{REGEX};
         if($text =~ /^${regex}$/) {
             # NOTE(review): the header pattern has no capture group, so $1 is
             # not set by this match; the value is never read.
             $lastHeader = $1 if($state eq 'header');
             $state = $machine{$state}{NEXT};
         } else {
             my $errorPrefix = "Invalid $type file; line $line in file '$file' is invalid [validateCsfasta]";
             return("$errorPrefix (expecting $state):\nline: $text");
         }
         last if($opt_quick && $line >= $quickCount);
     }
     $fh->close();
     HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
     doTime("done validateCsfasta", $line) if $opt_timing;
     return ();
 }
 
 sub validateCsqual
 {
     # Check that a csqual file alternates header and quality lines
     # ('#' comment lines are skipped).  Returns a one-element error list for
     # the first line that does not match what is expected, otherwise an empty
     # list.  With -quick only the first $quickCount lines are examined.
     #
     # Syntax per http://marketing.appliedbiosystems.com/mk/submit/SOLID_KNOWLEDGE_RD?_JS=T&rd=dm
     # Sample:-
     #
     # # Cwd: /home/pipeline
     # # Title: S0033_20080723_2_I22_EA_
     # >461_19_90_F3
     # 20 10 8 13 8 10 20 7 7 24 15 22 21 14 14 8 11 15 5 20 6 5 8 22 6 24 3 16 7 11
     # >461_19_209_F3
     # 16 8 5 12 20 24 19 8 13 17 11 23 8 24 8 7 17 4 20 8 29 7 3 16 3 4 8 20 17 9
     my ($path, $file, $type) = @_;
     doTime("beginning validateCsqual") if $opt_timing;
     my $fh = Encode::openUtil($file, $path);
     # two-state machine: which pattern the current line must match, and which
     # state follows a successful match
     my %machine = (header => {REGEX => "^>\\d+_\\d+_\\d+_\.\\d+", NEXT => 'qual'},
                    qual => {REGEX => "^(\\d+ )+", NEXT => 'header'},
                    );
     my $state = 'header';
     my $line = 0;
     my $lastHeader;
     while(my $text = <$fh>) {
         chomp($text);
         $line++;
         next if($text =~ m/^#/);
         my $regex = $machine{$state}{REGEX};
         if($text =~ /^${regex}$/) {
             # NOTE(review): the header pattern has no capture group, so $1 is
             # not set by this match; the value is never read.
             $lastHeader = $1 if($state eq 'header');
             $state = $machine{$state}{NEXT};
         } else {
             my $errorPrefix = "Invalid $type file; line $line in file '$file' is invalid [validateCsqual]";
             return("$errorPrefix (expecting $state) [regex=$regex]:\nline: [$text]");
         }
         last if($opt_quick && $line >= $quickCount);
     }
     $fh->close();
     HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
     doTime("done validateCsqual", $line) if $opt_timing;
     return ();
 }
 
 sub validateFasta
 # Validate a Wold lab fasta file (like fastq format, but without quality lines).
 # Lines must strictly alternate between a '>' name line and one sequence line;
 # unlike the colorspace validators, '#' comment lines are not skipped here.
 # Sample fasta lines are:
 #>HWI-EAS229_75_30DY0AAXX:7:1:0:949/1
 #NGCGGATGTTCTCAGTGTCCACAGCGCAGGTGAAATAAGGGAAGCAGTAGCGACGCCCATCTCCACGCGCAGCGC
 #>HWI-EAS229_75_30DY0AAXX:7:1:0:1739/1
 #NAGCCATCAGGAAAGCAAGGAGGGGGCATTAAAGGACAATCAAGGGGTTTGGAGGAAGGAGCAGGCCGGAGGCAA
 #
 # Args:    $path, $file - handed to Encode::openUtil to open the file
 #          $type        - format name; only used in messages
 # Returns: the empty list on success (or when $opt_skipValidateFastQ is set),
 #          otherwise a one-element list with the message for the first bad line.
 {
     my ($path, $file, $type) = @_;
     HgAutomate::verbose(2, "validateFasta($path,$file,$type)\n");
     return () if $opt_skipValidateFastQ;
     # Timing starts only once we know the file will really be read, so that each
     # "beginning" entry is paired with a "done" entry (it used to be logged twice).
     doTime("beginning validateFasta") if $opt_timing;
     my $fh = Encode::openUtil($file, $path);
     my $line = 0;
     my $state = 'firstLine';
     my $seqNameRegEx = "[A-Za-z0-9_.:/-]+";
     my $seqRegEx = "[A-Za-z\n\.~]+";
     # Two-state machine: name line -> sequence line -> name line ...
     my $states = {firstLine => {REGEX => ">($seqNameRegEx)", NEXT => 'seqLine'},
                   seqLine => {REGEX => $seqRegEx, NEXT => 'firstLine'}};
     while(<$fh>) {
         chomp;
         $line++;
         my $regex = $states->{$state}{REGEX};
         if(/^${regex}$/) {
             $state = $states->{$state}{NEXT};
         } else {
             # Build the message first, then release the handle before returning
             # (the early return previously left the file open).
             my $errorPrefix = "Invalid $type file; line $line in file '$file' is invalid [validateFasta]";
             my $message = "$errorPrefix (expecting $state):\nline: $_";
             $fh->close();
             return($message);
         }
         last if($opt_quick && $line >= $quickCount);
     }
     $fh->close();
     HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
     doTime("done validateFasta", $line) if $opt_timing;
     return ();
 }
 
 sub validateRpkm
 # Validate a Wold lab RPKM file.
 # Only the shape of each line is checked: split on whitespace, every
 # non-comment line must have 3, 5 or 7 columns. Column contents are not checked.
 #
 # Example format 1 (3 cols):-
 # HBG2    0.583   1973.85
 # RPS20   0.523   1910.01
 # RPLP0   1.312   1800.51
 #
 # Example format 2 (7 cols):- (*.accepted.rpkm)
 # ENSG00000003056 chr12   8989051 8989354 2.43    303     M6PR
 # ENSG00000006015 chr19   18560887        18561077        1.10    190     C19orf60
 # ENSG00000008516 chr16   3047223 3047380 0.61    157     MMP25
 #
 # Example format 3 (5 cols): (*.final.rpkm)
 #GID    gene    len_kb  RPKM    multi/all
 # OTTHUMG00000151214      IGLC2   0.722   3579.34 0.84
 # FAR3664 FAR3664 0.200   3216.32 0.94
 # OTTHUMG00000021144      TMSB4X  3.551   2767.52 0.35
 #
 # Dies on the first line with an unexpected column count; returns the empty
 # list otherwise. With $opt_quick only the first $quickCount lines are read.
 {
     my ($path, $file, $type) = @_;
     doTime("beginning validateRpkm") if $opt_timing;
     # Column counts of the three accepted layouts
     my %allowedWidth = map { $_ => 1 } (3, 5, 7);
     my $fh = Encode::openUtil($file, $path);
     my $lineNumber = 0;
     while(<$fh>) {
         chomp;
         $lineNumber++;
         next if m/^#/;
         my @columns = split /\s+/;
         my $width = scalar(@columns);
         if(!$allowedWidth{$width}) {
             die "Failed $type validation, file '$file'; line $lineNumber: line=[$_]\n";
         }
         last if($opt_quick && $lineNumber >= $quickCount);
     }
     $fh->close();
     HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
     doTime("done validateRpkm", $lineNumber) if $opt_timing;
     return ();
 }
 
 sub validateBowtie
 # Unknown format (for download) from Wold lab.
 # Each non-comment line must have seven tab-separated columns followed by a
 # tab and an optional eighth column (the mismatch list in the samples below).
 # Sample lines:-
 # HWI-EAS229_75_30DY0AAXX:7:1:0:1545/1    +       chr1    5983615 NCGTCCATCTCACATCGTCAGGAAAGGGGGAAGCACTGGATGGCTGTGGCCTCACAGGCAGGGAGAGTGGGGTCC     IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII 0       0:G>N
 # HWI-EAS229_75_30DY0AAXX:7:1:0:1591/1      -       uc002fcb.1|22|70699936  45      CTATTTCCACCAAGCAGCCAAGCTCAAGGGAATCGGGGAGTACGTGAACATCCGCACAGGGATGCCCTGCCACTN     IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII     0       0:T>N]
 # HWI-EAS229_75_30DY0AAXX:7:1:0:1766/1    -       chr18   72954304        GCAGCCACCAGAAGCGGGAAGAGGTGAAGACAGAGCCTCCTGCAGAGCTCCCACTCTGCCAACGCCTTGACTTTN     IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII 0       0:G>N,59:T>G
 #
 # Dies on the first line that does not match; returns the empty list otherwise.
 # With $opt_quick only the first $quickCount lines are read.
 {
     my ($path, $file, $type) = @_;
     # A second, mislabelled doTime("beginning validateBedGraph") call used to
     # follow here; it was a copy/paste leftover and has been dropped.
     doTime("beginning validateBowtie") if $opt_timing;
     my $lineNumber = 0;
     my $fh = Encode::openUtil($file, $path);
     while(<$fh>) {
         chomp;
         $lineNumber++;
         next if m/^#/; # allow comment lines, consistent with lineFile and hgLoadBed
         # Columns: read name, strand, target name, offset, sequence, qualities,
         # a number, then the optional last column.
         die "Failed bowtie validation, file '$file'; line $lineNumber: line=[$_]\n" 
 	    unless $_ =~ m/^([A-Za-z0-9:>_,\.\|\/-]+)\t([+-])\t([A-Za-z0-9:>_,\.\|\/-]+)\t(\d+)\t(\w+)\t(\w+)\t(\d+)\t([A-Za-z0-9:>_,\.\|\/-]+)?$/;
         last if($opt_quick && $lineNumber >= $quickCount);
     }
     $fh->close();
     HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
     doTime("done validateBowtie", $lineNumber) if $opt_timing;
     return ();
 }
 
 sub validatePsl
 # PSL format (for download) from Wold lab.
 # EXAMPLE FROM http://genome.ucsc.edu/FAQ/FAQformat#format2
 # Wold lab files carry 2 extra columns (sequence,<tab>sequence,) after the
 # standard 21 columns; only the first 21 are validated.
 # The five psLayout header lines are accepted only at line numbers 1-5.
 #
 # Sample first 6 lines
 #psLayout version 3
 #
 #match   mis-    rep.    N's     Q gap   Q gap   T gap   T gap   strand  Q               Q       Q       Q       T               T       T       T       block   blockSizes      qStarts  tStarts
 #        match   match           count   bases   count   bases           name            size    start   end     name            size    start   end     count
 #---------------------------------------------------------------------------------------------------------------------------------------------------------------
 #71      3       0       0       0       0       0       0       -       HWI-EAS229_75_30DY0AAXX:4:1:0:743/1     75      1       75      chr2    242951149       184181032       184181106       1  74,      0,      184181032,      agccttttacagcaacacctttacctctgctagatctttctgtagctcgtctgaagccatgggggctgggtcag,     agccttttccagcaacacctttacctcttctagatctttctgtagctcttctgaagccatgggggctgggtcag,
 #72      2       0       0       0       0       0       0       -       HWI-EAS229_75_30DY0AAXX:7:1:0:713/1     75      1       75      chr14   106368585       49540119        49540193        1  74,      0,      49540119,       cgggtgcgggccgagcagttctccgcacctccggtaaaggttcaggaccgggtgatggtctctgcagcagtcag,     ccggtgcgggccgagcagttctccgcacctccggtaaaggtgcaggaccgggtgatggtctctgcagcagtcag,
 #
 # Dies on the first non-header line that is not a PSL record; returns the
 # empty list otherwise. With $opt_quick only the first $quickCount lines are read.
 {
     my ($path, $file, $type) = @_;
     # What each of the optional header lines looks like, keyed by line number
     my %headerAt = (
         1 => qr/^psLayout version \d+/,
         2 => qr/^$/,
         3 => qr/^match/,
         4 => qr/^\s+match/,
         5 => qr/^------/,
     );
     my $lineNumber = 0;
     doTime("beginning validatePsl") if $opt_timing;
     my $fh = Encode::openUtil($file, $path);
     while(<$fh>) {
         chomp;
         $lineNumber++;
         my $header = $headerAt{$lineNumber};
         next if defined($header) and $_ =~ $header;
         if($_ !~ m/^(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t([+-][+-]?)\t([A-Za-z0-9:>\|\/_-]+)\t(\d+)\t(\d+)\t(\d+)\t(\w+)\t(\d+)\t(\d+)\t(\d+)\t(\d+)\t([0-9,]+)\t([0-9,]+)\t([0-9,]+)/) {
             die "Failed $type validation, file '$file'; line $lineNumber: line=[$_]\n";
         }
         last if($opt_quick && $lineNumber >= $quickCount);
     }
     $fh->close();
     HgAutomate::verbose(2, "File \'$file\' passed $type validation\n");
     doTime("done validatePsl", $lineNumber) if $opt_timing;
     return ();
 }
 
 
 ############################################################################
 # Misc subroutines
 
 sub validateDdfField {
     # Run the validator registered in %validators for one DDF field.
     # Spaces in the field name are turned into underscores before the lookup.
     # Returns whatever the validator returns; dies when no validator exists.
     my ($type, $val, $track, $daf) = @_;
     $type =~ s/ /_/g;
     my $shown = defined($val) ? $val : "";
     HgAutomate::verbose(4, "Validating $type: " . $shown . "\n");
     my $validator = $validators{$type}
         or die "Validator for type '$type' is missing";
     return $validator->($val, $track, $daf);
 }
 
 sub checkDataFormat {
     # Validate a data file against its declared format.
     # $format may carry a column count ("bed 5", "bedGraph 4"): the checker is
     # looked up in %formatCheckers by the bare format name, but it is handed the
     # full original string as its type argument.
     # Returns an "unknown format" message when no checker is registered,
     # otherwise whatever the checker returns (evaluated in the caller's context).
     my ($format, $file) = @_;
     HgAutomate::verbose(3, "Checking data format for $file: $format\n");
     my $type = $format;
     if ($format =~ m/(bed) (\d+)/) {
         $format = $1;
     }
     if ($format =~ m/(bedGraph) (\d+)/) {
         $format = $1;
     }
     my $checker = $formatCheckers{$format}
         or return "Data format \'$format\' is unknown\n";
     # The "Done" message used to sit after the return statement and was never
     # printed. Capture the result first, keeping the caller's context so list
     # and scalar callers see exactly what the checker would have given them.
     my @errors;
     if (wantarray) {
         @errors = $checker->($submitPath, $file, $type);
     } else {
         $errors[0] = $checker->($submitPath, $file, $type);
     }
     HgAutomate::verbose(3, "Done checking data format for $file: $format\n");
     return wantarray ? @errors : $errors[0];
 }
 
 sub ddfKey
 {
 # Build the key for a DDF line from the DAF variables, sorted by name
 # (e.g. "antibody=$antibody;cell=$cell" for ChIP-Seq data).
 # When $includeReplicate is true and the line has a replicate value, that
 # value alone (without a "replicate=" prefix) is appended as a final element.
 # Returns undef when the DAF declares no variables (eg, Sanger Gencode).
     my ($fields, $ddfHeader, $daf, $includeReplicate) = @_;
 
     return undef unless defined($daf->{variables});
 
     my $separator = ";";
     my @pairs;
     for my $name (sort @{$daf->{variableArray}}) {
         push(@pairs, "$name=" . $fields->{$name});
     }
     my $key = join($separator, @pairs);
     my $wantReplicate = $includeReplicate && defined($fields->{replicate});
     $key .= $separator . $fields->{replicate} if $wantReplicate;
     return $key;
 }
 
 sub isDownloadOnly {
     # Returns 1 when the view is download-only (not loaded as a track), else 0.
     # A view is download-only when any of these holds:
     #   - the DAF view has 'downloadOnly yes' (explicit rule)
     #   - it is a RawData view (RawData, RawData2, ...) or the Comparative view
     #   - it is 'Alignments' and the grant is neither Gingeras nor Wold
     #     (RNA folks like to see their RNAs)
     # Riken group have RawData and RawData2 because they have colorspace fasta and quality files
     # Wold group have RawData, RawData[2-7]
     # Wold group alignments are called 'Aligns', 'Splices', 'Paired'
     # $lab is accepted but not consulted.
     my ($view, $grant, $lab, $daf) = @_;
 
     my $explicit = $daf->{TRACKS}->{$view}->{downloadOnly} || "";
     return 1 if $explicit eq 'yes';
     return 1 if $view =~ m/^RawData\d*$/;
     return 1 if $view eq 'Comparative';
     if ($view eq 'Alignments') {
         return 1 if $grant ne "Gingeras" && $grant ne "Wold";
     }
     return 0;
 }
 
 sub printCompositeTdbSettings {
 # Prints out trackDb.ra settings for the composite track.
 #
 # Args:  a filehandle glob (e.g. *TRACK_RA) to print to,
 #        $daf      - the parsed DAF,
 #        %ddfSets  - DDF entries keyed by ddfKey ("var=term;var=term").
 #
 # Every loadable view that appears in at least one DDF set becomes a member of
 # the 'view' subgroup, and every DAF variable becomes a further subgroup whose
 # members are the terms found in the DDF keys. Each view and each term is
 # listed once, however many DDF sets use it.
     local *OUT_FILE = shift;
     my ($daf,%ddfSets) = @_;
 
     my $compositeTrack = Encode::compositeTrackName($daf);
 
     print OUT_FILE "track\t\t$compositeTrack\n";
     print OUT_FILE "compositeTrack\ton\n";
 
     my $setting    = "subGroup1\tview Views";
     my $visDefault = "visibilityViewDefaults\t";
     # Collect the views that are in use and loadable, with their default visibility
     for my $view (keys %{$daf->{TRACKS}}) {
         # Only views used by at least one DDF set are listed; the check is done
         # once per view so a view shared by several sets is not repeated.
         next unless grep { defined($ddfSets{$_}{VIEWS}{$view}) } keys %ddfSets;
         next if isDownloadOnly($view, $daf->{grant}, $daf->{lab}, $daf);
         my $visibility = $view eq "Peaks"  ? "dense"
                        : $view eq "Signal" ? "full"
                        :                     "hide";
         $setting    .= " " . $view . "=" . $view;
         $visDefault .= " " . $view . "=" . $visibility;
     }
     print OUT_FILE "shortLabel\t" . $daf->{lab} . " " . $daf->{dataType} . "\n"; # Default to  lab datatype
     print OUT_FILE "longLabel\tENCODE " . $daf->{lab} . " " . $daf->{grant} . " " . $daf->{dataType} . "\n";  # Default to lab grant datatype
     print OUT_FILE "group\t\tregulation\n";   # This is just a guess.  Buyer beware
     print OUT_FILE $setting . "\n"; # "subGroup1\tview Views Peaks=Peaks Signal=Signal RawSignal=Raw_Signal\n";
 
     # Need to create N subgroups with M members each
     if (defined($daf->{variables})) {
         my $grpNo = 1;           # subGroup1 is the view group; variables start at 2
         my $sortOrder = "sortOrder\t";
         my $dimensions = "dimensions";
         my $controlledVocab = "controlledVocabulary\tencode/cv.ra";
         for my $variable (@{$daf->{variableArray}}) {
             $grpNo++;
             # trackDb uses its own names for two of the DAF variables
             my $groupVar = $variable;
             $groupVar = "factor" if $variable eq "antibody";
             $groupVar = "cellType" if $variable eq "cell";
             if($grpNo < 5) {
                 # groups 2,3,4 map to dimensionX, dimensionY, dimensionZ
                 $dimensions .= "\tdimension" . chr(86 + $grpNo) . "=" . $groupVar;
             }
             $sortOrder = "$sortOrder$groupVar=+ ";
             $controlledVocab = "$controlledVocab $groupVar";
             my $groupLabel = $variable eq "cell" ? "Cell_Line" : ucfirst($groupVar);
             $setting = "subGroup$grpNo\t$groupVar " . $groupLabel;
             # Terms for this variable, taken from the DDF keys; %seenTerm keeps a
             # term shared by several DDF sets from being listed more than once.
             my %seenTerm;
             for my $key (keys %ddfSets) {
                 for my $pair (split(';', $key)) {
                     my ($var, $term) = split('=', $pair);
                     next unless $var eq $variable;
                     next if $seenTerm{$term}++;
                     $setting = "$setting $term=$term";
                 }
             }
             print OUT_FILE $setting . "\n";     # "subGroup2\cellTyle Cell_Line ???\n;
         }
         $setting = $sortOrder . "view=+";
         print OUT_FILE $dimensions . "\n";         # "dimensions  dimensionX=cellType dimensionY=factor"
         print OUT_FILE $setting . "\n";         # "sortOrder\tcellType=+ factor=+ view=+\n";
         print OUT_FILE $controlledVocab . "\n"; # "controlledVocabulary\tencode/cv.ra cellType factor\n";
     }
     print OUT_FILE "dragAndDrop\tsubTracks\n";
     print OUT_FILE $visDefault . "\n";          #"visibilityViewDefaults\tPeaks=dense Signal=full RawSignal=hide\n";
     print OUT_FILE "priority\t0\n";
     print OUT_FILE "type\t\tbed 3\n";
     print OUT_FILE "wgEncode\t1\n\n";
 }
 
 ############################################################################
 # Main
 
 # Start-up: script-wide state, command-line options and arguments, and the
 # single-file validation shortcut.
 my $now = time();
 my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($now);
 my @ddfHeader;               # list of field names on the first line of DDF file
 my %ddfHeader = ();          # convenience hash version of @ddfHeader (maps name to field index)
 my @ddfLines = ();           # each line in DDF (except for fields header); value is a hash; e.g. {files => 'foo.bed', cell => 'HeLa-S3', ...}
 my %ddfSets = ();            # info about DDF entries broken down by ddfKey
 my %ddfReplicateSets = ();   # info about DDF entries broken down by ddfKey (including replicate)
 my $wd = cwd();
 
 # Option values land in the matching $opt_* package variables
 my $ok = GetOptions(qw(
     allowReloads
     configDir=s
     fileType=s
     metaDataOnly
     outDir=s
     quick
     timing
     skipAll
     skipAutoCreation
     skipOutput
     skipValidateFiles
     skipValidateFastQ
     validateDaf
     validateFile
     sendEmail
     verbose=i
 ));
 usage() unless $ok;
 $opt_verbose = 1 unless defined $opt_verbose;
 $opt_sendEmail = 0 unless defined $opt_sendEmail;
 
 # -skipAll and -metaDataOnly both switch off auto-creation, output and file
 # validation; -metaDataOnly additionally permits reloads.
 if($opt_skipAll || $opt_metaDataOnly) {
     $opt_skipAutoCreation = $opt_skipOutput = $opt_skipValidateFiles = 1;
 }
 $opt_allowReloads = 1 if $opt_metaDataOnly;
 
 usage() if (scalar(@ARGV) < 2);
 
 # Get command-line args; $submitType is currently not used
 my ($submitType, $submitDir) = @ARGV[0, 1];
 
 $ENV{TMPDIR} = $Encode::tempDir;
 
 # Single-file mode (-validateFile with -fileType): the second argument is the
 # file to check; validate just that file and exit.
 if($opt_validateFile && $opt_fileType) {
     # kludgy, but we need chromInfo populated to validate files, so we assume we are using hg18
     my $hg18 = HgDb->new(DB => 'hg18');
     $hg18->getChromInfo(\%chromInfo);
     my @formatErrors = checkDataFormat($opt_fileType, $submitDir);
     die "Invalid file: " . join(", ", @formatErrors) . "\n" if @formatErrors;
     exit(0);
 }
 
 # Determine submission, configuration, and output directory paths.
 # Each of the three paths is taken as given when absolute, and is otherwise
 # resolved against the directory the script was started in ($wd).
 HgAutomate::verbose(2, "Validating submission in directory \'$submitDir\'\n");
 if ($submitDir =~ /^\/.*/) {
     $submitPath = $submitDir;
 } else {
     $submitPath = "$wd/$submitDir";
 }
 HgAutomate::verbose(4, "Submission directory path: \'$submitPath\'\n");
 
 # Config directory: -configDir if given, else the 'config' sibling of the submission dir
 if (defined $opt_configDir) {
     if ($opt_configDir =~ /^\//) {
         $configPath = $opt_configDir;
     } else {
         $configPath = "$wd/$opt_configDir";
     }
 } else {
     $configPath = "$submitPath/../config"
 }
 if(!(-d $configPath)) {
     die "configPath '$configPath' is invalid; Can't find the config directory\n";
 }
 HgAutomate::verbose(4, "Config directory path: \'$configPath\'\n");
 
 # Output directory: -outDir if given, else 'out' inside the submission dir
 if (defined $opt_outDir) {
     if ($opt_outDir =~ /^\//) {
         $outPath = $opt_outDir;
     } else {
         $outPath = "$wd/$opt_outDir";
     }
 } else {
     $outPath = "$submitPath/out"
 }
 HgAutomate::verbose(4, "Output directory path: '$outPath'; submitPath: '$submitPath'\n");
 
 if(!$opt_validateDaf) {
     # Change dir to submission directory
     if(!chdir($submitPath)) {
         die ("SYS ERR; Can't change to submission directory \'$submitPath\': $OS_ERROR\n");
     }
     HgAutomate::verbose(3, "Creating output in directory \'$outPath\'\n");
     if(!(-d $outPath)) {
         # The parentheses matter: "mkdir $outPath || die ..." parses as
         # mkdir($outPath || die ...), so a failed mkdir was never reported.
         mkdir($outPath) || die ("SYS ERR: Can't create out directory \'$outPath\': $OS_ERROR\n");
     }
 }
 
 # Load the grant and field definitions, then the DAF for this submission.
 # labs is now in fact the list of grants (labs are w/n grants, and are not currently validated).
 $grants = Encode::getGrants($configPath);
 $fields = Encode::getFields($configPath);
 
 # -validateDaf: only check the DAF (given as a file or as a directory), then exit
 if($opt_validateDaf) {
     if(-f $submitDir) {
         Encode::parseDaf($submitDir, $grants, $fields);
     } else {
         Encode::getDaf($submitDir, $grants, $fields);
     }
     print STDERR "DAF is valid\n";
     exit(0);
 }
 
 $daf = Encode::getDaf($submitDir, $grants, $fields);
 
 # Chromosome info comes from the assembly named in the DAF
 my $db = HgDb->new(DB => $daf->{assembly});
 $db->getChromInfo(\%chromInfo);
 
 # -sendEmail: tell the grant's wrangler that a submission is being validated.
 # Sending is best effort; a failure to start the mailer is only logged.
 if($opt_sendEmail) {
     my $email = $grants->{$daf->{grant}} && $grants->{$daf->{grant}}{wranglerEmail};
     if($email) {
         # Run /bin/mail directly (list-form pipe open) instead of through a shell
         # command string, so grant, lab and path values cannot be interpreted
         # by the shell. Splitting on whitespace keeps the old behaviour of
         # allowing several space-separated addresses.
         my @recipients = split(' ', $email);
         my $subject = "ENCODE data from $daf->{grant}/$daf->{lab} lab has been submitted for validation.";
         if(open(my $mail, '|-', '/bin/mail', '-s', $subject, @recipients)) {
             print $mail "dir: $submitPath\n";
             close($mail);
         } else {
             HgAutomate::verbose(2, "Unable to run /bin/mail to notify '$email': $OS_ERROR\n");
         }
     }
 }
 
 # Add the variables in the DAF file to the required fields list
 if (defined($daf->{variables})) {
     for my $variable (keys %{$daf->{variableHash}}) {
         $fields->{$variable}{required} = 1;
         $fields->{$variable}{file} = 'ddf';
     }
 }
 
 # make replicate column required when appropriate.
 # $hasReplicates ends up non-zero when any view has replicates; $maxOrder is
 # the largest view 'order' value in the DAF.
 my $hasReplicates = 0;
 my $maxOrder = 0;
 for my $view (keys %{$daf->{TRACKS}}) {
     $hasReplicates += $daf->{TRACKS}{$view}{hasReplicates};
     if($daf->{TRACKS}{$view}{order} > $maxOrder) {
         $maxOrder = $daf->{TRACKS}{$view}{order}
     }
 }
 
 if($hasReplicates) {
     $fields->{replicate}{required} = 1;
 }
 
 # Open dataset descriptor file (DDF): the newest *.DDF / *.ddf file in the
 # current directory is read into memory.
 my @glob = (glob("*.DDF"), glob("*.ddf"));
 my $ddfFile = Encode::newestFile(@glob);
 HgAutomate::verbose(2, "Using newest DDF file \'$ddfFile\'\n");
 my $lines = Encode::readFile($ddfFile);
 
 my $ddfLineNumber = 0;
 # Get header containing column names: the first line that is neither blank
 # nor a comment. Lines are consumed from @$lines as we go.
 while(@{$lines}) {
     my $headerLine = shift(@{$lines});
     $ddfLineNumber++;
     # remove leading and trailing spaces and newline
     $headerLine =~ s/^\s+//;
     $headerLine =~ s/\s+$//;
     # ignore empty lines and comments
     next if $headerLine eq '' || $headerLine =~ /^#/;
     die "ERROR: The DDF header has no tabs; the DDF is required to be tab delimited\n"
         unless $headerLine =~ /\t/;
     @ddfHeader = split(/\t/, $headerLine);
     # map each column name to its index
     @ddfHeader{@ddfHeader} = (0 .. $#ddfHeader);
     last;
 }
 
 my @errors = Encode::validateFieldList(\@ddfHeader, $fields, 'ddf');
 die "ERROR in DDF '$ddfFile':\n" . join("\n", @errors) . "\n" if @errors;
 
 %terms = Encode::getControlledVocab($configPath);
 
 # DAF variables; empty when the DAF has none (Hubbard Sanger Gencode project)
 my @variables = defined($daf->{variables}) ? @{$daf->{variableArray}} : ();
 
 my %metadataHash;
 
 # Process the remaining lines of the DDF file. Each valid line becomes a
 # field/value hash (fields per @ddfHeader) that is appended to @ddfLines and,
 # when the DAF has variables, registered in %ddfSets and %ddfReplicateSets.
 # User errors are accumulated in @errors rather than dying immediately.
 
 while (@{$lines}) {
     my $text = shift(@{$lines});
     $ddfLineNumber++;
     my $errorPrefix = "DDF lineNumber $ddfLineNumber:";
     HgAutomate::verbose(2, "Parsing ddf line $ddfLineNumber\n");
 
     # trim, then skip comments and blank lines
     $text =~ s/^\s+//;
     $text =~ s/\s+$//;
     next if $text =~ /^#/ || $text eq '';
 
     unless($text =~ /\t/) {
         pushError(\@errors, "$errorPrefix line has no tabs; the DDF is required to be tab delimited");
         next;
     }
 
     # pair each value with the header name at the same position
     my %line;
     my @values = split('\t', $text);
     for my $column (0 .. $#values) {
         $line{$ddfHeader[$column]} = $values[$column];
     }
 
     my @valueErrors = Encode::validateValueList(\%line, $fields, 'ddf');
     if(@valueErrors) {
         pushError(\@errors, $errorPrefix . "\n" . join("\n", @valueErrors));
         next;
     }
 
     my $view = $line{view};
     HgAutomate::verbose(2,"Parsing $view\n");
     if(!$daf->{TRACKS}{$view}) {
         pushError(\@errors, "$errorPrefix undefined view '$view'");
     } else {
         my $files = $line{files};
         if($fields->{replicate}{required} && $daf->{TRACKS}{$view}{hasReplicates}) {
             my $replicate = $line{replicate};
             unless(defined($replicate) && length($replicate)) {
                 pushError(\@errors, "$errorPrefix missing replicate number for view '$view'");
             }
         }
 
         # Expand the comma separated file list.
         # Use glob explicitly so our error messages have the list of files actually used;
         # a pattern that matches nothing is kept as written.
         my @filenames;
         for my $pattern (split(',', $files)) {
             my @matches = glob($pattern);
             push(@filenames, @matches ? @matches : $pattern);
         }
         $line{files} = \@filenames;
 
         # run the per-field validators over every field of the line
         my @metadataErrors;
         for my $field (keys %line) {
             my @fieldErrors = validateDdfField($field, $line{$field}, $view, $daf);
             push(@metadataErrors, @fieldErrors);
         }
 
         if(@metadataErrors) {
             pushError(\@errors, @metadataErrors);
         } elsif(defined(my $replicateKey = ddfKey(\%line, \%ddfHeader, $daf, 1))) {
             # avoid spurious errors by not putting invalid lines into %ddfSets;
             # ddfKey returns undef if there are no variables defined
             my $setKey = ddfKey(\%line, \%ddfHeader, $daf, 0);
             $ddfSets{$setKey}{VIEWS}{$view} = \%line;
             $ddfReplicateSets{$replicateKey}{VIEWS}{$view} = \%line;
             my $str = join(", ", map($line{$_}, sort(@variables)));
             $metadataHash{$str} = 1;
         }
         push(@ddfLines, \%line);
     }
     HgAutomate::verbose(2, "End of parsing ddf line $ddfLineNumber\n");
 }
 
 # Counter that gives every auto-created signal file a distinct name.
 my $tmpCount = 1;
 
 # Unless errors were already found: report required views that are missing from
 # any DDF set, then auto-create RawSignal / PlusRawSignal / MinusRawSignal views
 # from the Alignments files of each replicate set that provides none of them.
 if(!@errors) {
     # Look for missing required views and create missing, optional views, but
     # but don't bother if we have already encountered errors.
     # Could also look for replicate inconsistency here (e.g. Alignments for replicate 3 but not fastq for replicate 3).
 
     for my $key (keys %ddfSets) {
         for my $view (keys %{$daf->{TRACKS}}) {
             if($daf->{TRACKS}{$view}{required}) {
                 if(!defined($ddfSets{$key}{VIEWS}{$view})) {
                     pushError(\@errors, "view '$view' missing for $key");
                 }
             }
         }
     }
 
     doTime("beginning ddfReplicateSets loop") if $opt_timing;
     for my $key (keys %ddfReplicateSets) {
         # create missing optional views (e.g. ChIP-Seq RawSignal or transcriptome project PlusRawSignal and MinusRawSignal)
 	# note this loop assumes these are on a per replicate basis.
 	# Also note that any project (like transcriptome) that doesnt have replicates should also use
 	# this for their auto-create signals.
 	HgAutomate::verbose(2, "ddfReplicateSets loop key=[$key] aln=[".(defined($ddfReplicateSets{$key}{VIEWS}{Alignments}))."] rawsig=[".(defined($ddfReplicateSets{$key}{VIEWS}{RawSignal}))."]\n");
         # Auto-create only when: not disabled in the DAF, Alignments were supplied,
         # no raw signal view of any kind was supplied, and the data is not MethylSeq.
         # NOTE(review): 'ne' on $daf->{noAutoCreate} presumably warns when the DAF
         # omits that setting -- confirm the DAF parser supplies a default.
         if($daf->{noAutoCreate} ne "yes" && defined($ddfReplicateSets{$key}{VIEWS}{Alignments})
 		&& !defined($ddfReplicateSets{$key}{VIEWS}{RawSignal})
 		&& !defined($ddfReplicateSets{$key}{VIEWS}{PlusRawSignal})
 		&& !defined($ddfReplicateSets{$key}{VIEWS}{MinusRawSignal})
 		&& ($daf->{dataType} ne 'MethylSeq')) {
             if($daf->{dataType} eq 'ChipSeq' && !defined($daf->{medianFragmentLength})) {
                 pushError(\@errors, "Missing medianFragmentLength field; this field is required for dataType '$daf->{dataType}' when RawSignal view is not provided");
             } else {
                 # hack for case where they have removed RawSignal view in the DAF
 		# - if no (Plus|Minus|)RawSignal is defined, assume RawSignal is required
                 if(!defined($daf->{TRACKS}{RawSignal}{order})
 			&& !defined($daf->{TRACKS}{PlusRawSignal}{order})
 			&& !defined($daf->{TRACKS}{MinusRawSignal}{order}) ) {
                     $daf->{TRACKS}{RawSignal}{order} = ++$maxOrder;
                 }
 		# Make a list of the PlusRawSignal/MinusRawSignal or RawSignals we are going to have to make
 		my @newViews = ();
 		push @newViews, "RawSignal" if $daf->{TRACKS}{RawSignal}{order};
 		push @newViews, "PlusRawSignal" if $daf->{TRACKS}{PlusRawSignal}{order};
 		push @newViews, "MinusRawSignal" if $daf->{TRACKS}{MinusRawSignal}{order};
 
 		foreach my $newView (@newViews) #loop around making them
 		{
                 # The new view starts as a copy of the Alignments DDF line, with
                 # its view and type replaced; its file list is set further below.
                 my $alignmentLine = $ddfReplicateSets{$key}{VIEWS}{Alignments};
                 my %line = %{$alignmentLine};
                 $line{view} = $newView;
                 $line{type} = 'wig';
                 $ddfReplicateSets{$key}{VIEWS}{$newView} = \%line;
                 my @unzippedFiles = ();
                 doTime("beginning unzipping replicates files for view [$newView] key=[$key]") if $opt_timing;
                 for my $file (@{$alignmentLine->{files}}) {
                     # Unzip any zipped files - only works if they are with .gz suffix
                     my ($fbase,$dir,$suf) = fileparse($file, ".gz");
                     if ($suf eq ".gz") {
                         # If the zipped file exists then unzip it (do this each time, in case zip file is updated
                         # This check is also done above at the stage where we are testing the files in the ddf exist
                         if (-s $file) {
                             # NOTE(review): $file, $dir and $fbase go into a shell command
                             # unquoted -- confirm DDF file names cannot contain shell
                             # metacharacters or spaces.
                             my $err = system("gunzip -c $file > $dir/$fbase");
                             if ($err) {
                                 die ("File \'$file\' failed gunzip $file to [$dir/$fbase]\n");
                             }
                             HgAutomate::verbose(2, "File \'$file\' gunzipped to \'$fbase\'\n");
                         }
                         if ( ! -s "$dir/$fbase") {
                             die ("Unzipped file \'$fbase\' does not exist (or is empty) for DDF file \'$file\'\n");
                         }
                         # NOTE(review): only the base name is recorded, although the
                         # file was written to "$dir/$fbase" -- confirm DDF files always
                         # live in the submission directory itself.
                         push @unzippedFiles, $fbase;
                     } else {
                         push @unzippedFiles, $file;
                     }
                 }
                 doTime("done unzipping replicates files") if $opt_timing;
                 # From here on the Alignments line itself refers to the unzipped files
                 $alignmentLine->{files} = \@unzippedFiles;
                 # Now we can safely sort these files as none are zipped
                 my $files = join(" ", @{$alignmentLine->{files}});
                 my $tmpFile = $Encode::autoCreatedPrefix . $newView. "$tmpCount.bed"; # add the type of view to the name
                 $tmpCount++;
                 if($opt_skipAutoCreation) {
                     HgAutomate::verbose(2, "Skipping auto-creating view '$newView' for key '$key'\n");
                   } else {
                       HgAutomate::verbose(2, "Auto-creating view '$newView' for key '$key' in file '$tmpFile'\n");
                         doTime("beginning Auto-create of view $newView in file $tmpFile") if $opt_timing;
                         # XXXX gzip before saving to disk?
                         # Build a pipeline: sorted alignments -> drop track lines ->
                         # (optional strand filter) -> bedItemOverlapCount, written to $tmpFile.
                         my @cmds;
                         my $sortFiles;
                         if(defined($daf->{medianFragmentLength})) {
                             push(@cmds, "/cluster/bin/x86_64/bedExtendRanges $daf->{assembly} $daf->{medianFragmentLength} $files");
                             $sortFiles = " -";
 			    # sorting stdin, so have to sort in mem (and control how much mem we use)
 			    push @cmds, "sort $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n $sortFiles";
                         } else {
                             $sortFiles = $files;
 			    # sort each file in place, controlling mem usage, then do merge sort
 			    my @sortList = split(/\s+/, $sortFiles);
 			    foreach my $f (@sortList) {
 				my $err = system("sort $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n -o $f $f ");
 				if ($err) {
 				    die ("File \'$f\' failed sort\n");
 				}
 				HgAutomate::verbose(2, "File \'$f\' sorted\n");
 			    }
 			    # Now do the mergesort in the pipeline
 			    push @cmds, "sort -m $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n $sortFiles";
                         }
 			push @cmds, "grep -v -E \"^track\" ";
 			# strand-specific views keep only rows whose 6th column is that strand
 			push @cmds, "gawk '\$6 == \"+\" {print}'" if $newView eq "PlusRawSignal";
 			push @cmds, "gawk '\$6 == \"-\" {print}'" if $newView eq "MinusRawSignal";
                         push @cmds, "bedItemOverlapCount $daf->{assembly} stdin";
                         my $safe = SafePipe->new(CMDS => \@cmds, STDOUT => $tmpFile, DEBUG => $opt_verbose - 1);
                         if(my $err = $safe->exec()) {
                             print STDERR  "ERROR: failed auto bedItemOverlap creation of bedGraph for $key" . $safe->stderr() . "\n";
                             # don't show end-user pipe error(s)
                             pushError(\@errors, "failed creation of wiggle for '$key'");
                         }
                         doTime("done Auto-create of view $newView") if $opt_timing;
                     }
                 # The new view's only file is the auto-created one (recorded even
                 # when creation was skipped with -skipAutoCreation).
                 $line{files} = [$tmpFile];
                 push(@ddfLines, \%line);
 	    }  # End foreach newView loop
             }
         }
     } # End replicate sets loop
     doTime("done ddfReplicateSets loop") if $opt_timing;
 }
 
 # Work out the composite track for this DAF and note whether trackDb already
 # has an entry for it.  A missing composite used to be reported as an error
 # ("Missing composite track ...; please contact your data wrangler"), but
 # there is no good reason for that: the composite entry can be added when
 # the subtracks are first added to trackDb.
 my $compositeTrack = Encode::compositeTrackName($daf);
 my $compositeExists = $db->quickQuery("select count(*) from trackDb where tableName = ?", $compositeTrack);
 
 # Report all accumulated user errors in one message and stop.
 if(scalar(@errors) > 0) {
     my $prefix = "Error";
     $prefix = "Error(s)" if scalar(@errors) > 1;
     my $report = join("\n\n", @errors);
     die "$prefix:\n\n$report\n";
 }
 
 # After this point, we don't use @errors and just die immediately.
 
 # Validate files and metadata fields in all ddfLines using controlled
 # vocabulary.  Create load.ra file for loader and trackDb.ra file for wrangler.
 doTime("beginning out files") if $opt_timing;
 
 # Open the three output streams (loader .ra, trackDb .ra and README).  With
 # $opt_skipOutput set, everything is sent to /dev/null so that the code below
 # can print unconditionally.  Three-argument open keeps the mode separate from
 # the path, and every open is checked: a failure here is a system error.
 if($opt_skipOutput) {
     open(LOADER_RA, '>>', '/dev/null') || die "SYS ERROR: Can't write \'/dev/null\'; error: $!\n";
     open(TRACK_RA, '>>', '/dev/null') || die "SYS ERROR: Can't write \'/dev/null\'; error: $!\n";
     open(README, '>>', '/dev/null') || die "SYS ERROR: Can't write \'/dev/null\'; error: $!\n";
 } else {
     open(LOADER_RA, '>', "$outPath/$Encode::loadFile") || die "SYS ERROR: Can't write \'$outPath/$Encode::loadFile\' file; error: $!\n";
     open(TRACK_RA, '>', "$outPath/$Encode::trackFile") || die "SYS ERROR: Can't write \'$outPath/$Encode::trackFile\' file; error: $!\n";
     open(README, '>', "$outPath/README.txt") || die "SYS ERROR: Can't write \'$outPath/README.txt\' file; error: $!\n";
 }
 
 # Create a composite track entry if the trackDb.ra entry was not found
 if(!$opt_skipOutput && !$compositeExists) {
     printCompositeTdbSettings(*TRACK_RA,$daf,%ddfSets);
 }
 
 # XXXX Calculation of priorities still needs work; we currently don't account for multiple experiments in the same DDF.
 # It may in fact be too much work to do automatic calculation of priorities (i.e. the wrangler may have to do it manually).
 
 # Highest priority among existing trackDb rows whose settings mention this
 # composite as their subTrack; 0 when the query yields nothing.  The LIKE
 # pattern is passed as a bound parameter instead of being spliced into the SQL.
 my $priority = $db->quickQuery("select max(priority) from trackDb where settings like ?", "%subTrack $compositeTrack%") || 0;
 # Reset the DDF line counter.
 $ddfLineNumber = 1;
 
 # For every DDF line: derive the table name and labels, then write a stanza to
 # LOADER_RA and, where applicable, an entry to README and a subtrack stanza to
 # TRACK_RA.  Problems found here are fatal (die) rather than accumulated.
 foreach my $ddfLine (@ddfLines) {
     $ddfLineNumber++;
     my $diePrefix = "ERROR on DDF lineNumber $ddfLineNumber:";
     my $view = $ddfLine->{view};
     my $type = $daf->{TRACKS}{$view}{type} || die "Missing DAF entry for view '$view'\n";
 
     HgAutomate::verbose(2, "  View: $view\n");
 
     # A replicate number is only required when both the submission and this
     # view are flagged as having replicates.
     my $replicate;
     if($hasReplicates && $daf->{TRACKS}{$view}{hasReplicates}) {
         $replicate = $ddfLine->{replicate};
         unless(defined($replicate) && $replicate > 0) {
             die "$diePrefix invalid or missing replicate value\n";
         }
     }
 
     # Construct table name from track name and variables
     my $tableName = "$compositeTrack$view";
     if(defined($replicate)) {
         $tableName .= "Rep$replicate";
     }
 
     # Default the short label prefix in the DAF itself, then use it directly.
     if(!defined($daf->{TRACKS}{$view}{shortLabelPrefix})) {
         $daf->{TRACKS}{$view}{shortLabelPrefix} = "";
     }
     my $shortLabel = $daf->{TRACKS}{$view}{shortLabelPrefix};
     my $longLabel = "ENCODE" . (defined($daf->{TRACKS}{$view}{longLabelPrefix}) ? " $daf->{TRACKS}{$view}{longLabelPrefix}" : "");
     if(defined($replicate)) {
         $longLabel .= " Replicate $replicate";
     }
 
     my $subGroups = "view=$view";
     my $additional = "\n";
     my $pushQDescription = "";
     my $species;
     if (@variables) {
         my %hash = map { $_ => $ddfLine->{$_} } @variables;
 
         # Append each variable's value (in @variables order) to the table name.
         for my $var (@variables) {
             my $val = $hash{$var};
             $val = ucfirst(lc($val));
             # trailing + => Pos, - => Neg (e.g. H9ES-AFP+)
             $val =~ s/\+$/Pos/;
             $val =~ s/\-$/Neg/;
             $tableName = $tableName . $val;
         }
 
         # Pick label suffixes and the pushQ description from whichever
         # combination of variables is present; the first matching case wins.
         my $shortSuffix = "";
         my $longSuffix;
         my %shortViewMap = (Peaks => 'Pk', Signal => 'Sig', RawSignal => 'Raw', PlusRawSignal => 'PlusRaw', MinusRawSignal => 'MinusRaw');
         if($hash{'antibody'} && $hash{'cell'}) {
             $pushQDescription = "$hash{'antibody'} in $hash{'cell'}";
             $shortSuffix = "$hash{'antibody'} $hash{'cell'}";
             $longSuffix = "$hash{'antibody'} in $hash{'cell'} cells";
         } elsif($hash{'ripAntibody'} && $hash{'ripTgtProtein'} && $hash{'cell'}) {
             $longSuffix = "$hash{'ripTgtProtein'} in $hash{'cell'} cells using $hash{'ripAntibody'}";
             $pushQDescription = $longSuffix;
             $shortSuffix = "$hash{'ripTgtProtein'} $hash{'cell'} $hash{'ripAntibody'}";
         } elsif($hash{'rnaExtract'} && $hash{'localization'} && $hash{'cell'}) {
             # mapAlgorithm is optional: mention it only when supplied, so the
             # labels never end in a dangling "using " or a trailing space.
             $shortSuffix = "$hash{'rnaExtract'} $hash{'cell'} $hash{'localization'}";
             $longSuffix = "$hash{'rnaExtract'} in $hash{'cell'} cell $hash{'localization'}";
             if($hash{'mapAlgorithm'}) {
                 $shortSuffix .= " $hash{'mapAlgorithm'}";
                 $longSuffix .= " using $hash{'mapAlgorithm'}";
             }
             $pushQDescription = $longSuffix;
         } elsif($hash{'freezeDate'}) {
             $shortSuffix = $hash{'freezeDate'};
             $longSuffix = $hash{'freezeDate'};
             $pushQDescription = $longSuffix;
         } elsif ($hash{"species"}) {
             $shortSuffix = "$hash{'species'}";
             $longSuffix = "in $hash{'species'}";
             # the species value is also written as the assembly in load.ra below
             $species = "$hash{'species'}";
             $pushQDescription = "$view $daf->{dataType} $longSuffix";
         } elsif ($hash{"cell"}) {
             $pushQDescription = "$hash{'cell'}";
             $shortSuffix = "$hash{'cell'}";
             $longSuffix = "in $hash{'cell'} cells";
         } else {
             warn "Warning: variables undefined for pushQDescription,shortSuffix,longSuffix\n";
         }
         if(defined($shortViewMap{$view})) {
             $shortSuffix .= " " . $shortViewMap{$view};
         }
         if(defined($replicate)) {
             $shortSuffix .= " $replicate";
             $pushQDescription .= " Replicate $replicate";
         }
         if($shortSuffix) {
             $shortLabel = $shortLabel ? "$shortLabel ($shortSuffix)" : $shortSuffix;
         }
         if($longSuffix) {
             $longLabel .= " ($longSuffix)";
         }
 
         # make the "subGroups" and "additional" fields from all variables
         for my $var (sort keys %hash) {
             # The var name is over-ridden for antibody and cell, for historical reasons
             my $groupVar = $var;
             $groupVar = "factor" if $var eq "antibody";
             $groupVar = "cellType" if $var eq "cell";
             $subGroups .= " $groupVar=$hash{$var}";
             # each setting is prepended, ahead of the trailing blank line
             $additional = "    $var $hash{$var}\n" . $additional;
         }
     }
 
     # mysql doesn't allow hyphens in table names and our naming convention doesn't allow underbars; to be
     # safe, we strip non-alphanumerics.
     $tableName =~ s/[^A-Za-z0-9]//g;
 
     die "Table name [$tableName] too long, must be <= 64 chars, got [".length($tableName)."]\n" if length($tableName) > 64;
 
     if($tableNamesUsed{$tableName}++) {
         dieTellWrangler("System Error: identical tableName '$tableName' was generated by multiple data sets\n");
     }
 
     if(!$opt_allowReloads) {
         if($db->quickQuery("select count(*) from trackDb where tableName = ?", $tableName)) {
             die "view '$view' has already been loaded as track '$tableName'\nPlease contact your wrangler if you need to reload this data\n";
         }
     }
 
     # XXXX Move the decision about which views have tracks into the DAF?
     # Already this is used in 2 places so made it a function,
     # would be better in the DAF except we'd have to go change all the DAFs :(
     my $downloadOnly = isDownloadOnly($view, $daf->{grant}, $daf->{lab}, $daf);
 
     # load.ra stanza for the loader
     print LOADER_RA "tablename $tableName\n";
     print LOADER_RA "view $view\n";
     print LOADER_RA "type $type\n";
     if($species) {
         print LOADER_RA "assembly $species\n";
     } else {
         print LOADER_RA "assembly $daf->{assembly}\n";
     }
     print LOADER_RA "files @{$ddfLine->{files}}\n";
     print LOADER_RA "downloadOnly $downloadOnly\n";
     print LOADER_RA "pushQDescription $pushQDescription\n";
     print LOADER_RA "\n";
 
     my (undef, undef, undef, $rMDay, $rMon, $rYear) = Encode::restrictionDate($now);
 
     # \Q..\E makes the auto-created prefix match literally, not as a regex.
     if($downloadOnly || ($type eq "wig" && !grep(/\Q$Encode::autoCreatedPrefix\E/, @{$ddfLine->{files}}))) {
         # adds entries to README.txt for download only files AND wig data (excepting wig data generated by us)
         print README "file: $tableName.$type.gz\n";
         for my $var (@variables) {
             print README "$var: " . $ddfLine->{$var} . "\n";
         }
         if(defined($replicate)) {
             print README "replicate: $replicate\n";
         }
 
         print README sprintf("data RESTRICTED UNTIL: %d-%02d-%02d\n", 1900 + $rYear, $rMon + 1, $rMDay);
         print README "\n";
     }
 
     # trackDb.ra subtrack stanza (not written for download-only views)
     if(!$downloadOnly) {
         print TRACK_RA "    track $tableName\n";
         print TRACK_RA "    release alpha\n";
         print TRACK_RA "    subTrack $compositeTrack\n";
         print TRACK_RA "    shortLabel $shortLabel\n";
         print TRACK_RA "    longLabel $longLabel\n";
         print TRACK_RA "    subGroups $subGroups\n";
         if($type eq 'wig') {
             my $placeHolder = Encode::wigMinMaxPlaceHolder($tableName);
             print TRACK_RA "    type $type $placeHolder\n";
         } elsif($type eq 'gtf') { # GTF is converted to and loaded as genePred
             print TRACK_RA "    type genePred\n";
         } elsif($type eq 'tagAlign') { # tagAligns are bed 6 but with column called 'sequence' instead of 'name'
             print TRACK_RA "    type bed 6\n";
         } else {
             print TRACK_RA "    type $type\n";
         }
         print TRACK_RA sprintf("    dateSubmitted %04d-%02d-%02d\n", 1900 + $year, $mon + 1, $mday);
         print TRACK_RA sprintf("    dateUnrestricted %04d-%02d-%02d\n", 1900 + $rYear, $rMon + 1, $rMDay);
         print TRACK_RA sprintf("    dataVersion %s\n", $Encode::dataVersion);
         if(defined($ddfLine->{accession}) && length($ddfLine->{accession}) > 0) {
             print TRACK_RA sprintf("    accession %s\n",$ddfLine->{accession});
         }
         print TRACK_RA "    priority " . ($priority + $daf->{TRACKS}{$view}{order}) . "\n";
         # noInherit is necessary b/c composite track will often have a different dummy type setting.
         print TRACK_RA "    noInherit on\n";
         # The "and 0" deliberately disables this branch until specific projects are listed.
         if($view eq 'RawSignal' and 0) { # Sorry tim, you will have to list your projects here
             print TRACK_RA "    configurable off\n";
         } else {
             print TRACK_RA "    configurable on\n";
         }
         if($type eq 'wig') {
             print TRACK_RA "    spanList first\n";
             print TRACK_RA "    windowingFunction mean\n";
             print TRACK_RA "    maxHeightPixels 100:16:16\n";
         } elsif($type eq 'bed 5 +') {
             print TRACK_RA "    useScore 1\n";
         }
         print TRACK_RA $additional;
     }
 }
 # Buffered write errors (e.g. a full disk) are only reported when the handle
 # is closed, so a failed close is treated as a fatal system error instead of
 # silently leaving a truncated output file behind.
 close(LOADER_RA) || die "SYS ERROR: Can't close loader .ra output; error: $!\n";
 close(TRACK_RA) || die "SYS ERROR: Can't close trackDb .ra output; error: $!\n";
 close(README) || die "SYS ERROR: Can't close README output; error: $!\n";
 doTime("done out files") if $opt_timing;
 
 # When the submission directory ends in a numeric id and its parent directory
 # name contains "_<instance>", update that project's row in the
 # encpipeline_<instance> database with a summary of this submission.
 if($submitDir =~ /(\d+)$/) {
     my $id = $1;
     if(dirname($submitDir) =~ /_(.*)/) {
         my $instance = $1;
         # XXXX rubyDb logic s/d probably be moved to Encode.pm
         my $rubyDb = HgDb->new(DB => "encpipeline_$instance");
         # Sort the keys so the stored metadata string is the same from run to run.
         my @tmp = sort keys %metadataHash;
         my $count = scalar(@tmp);
         my $metadata = join("; ", @tmp);
         HgAutomate::verbose(2, "Updating id '$id'; metadata: '$metadata'; count: '$count'\n");
         $rubyDb->execute("update projects set count = ?, metadata = ?, lab = ?, data_type = ?, track = ? where id = ?",
                          $count, $metadata, $daf->{lab}, $daf->{dataType}, $compositeTrack, $id);
     }
 }
 # NOTE(review): presumably resets the timer base so the final doTime reports
 # time elapsed since the start of the run -- confirm against doTime.
 $time0=$timeStart;
 doTime("done. ") if $opt_timing;
 exit 0;