src/hg/encode/encodeValidate/doEncodeValidate.pl 1.177

1.177 2009/04/16 19:03:30 tdreszer
Support a DDF-level fragLength when building RawSignal from Alignments. Now creates a metadata setting for trackDb and a new fileDb.ra file.
Index: src/hg/encode/encodeValidate/doEncodeValidate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v
retrieving revision 1.176
retrieving revision 1.177
diff -b -B -U 4 -r1.176 -r1.177
--- src/hg/encode/encodeValidate/doEncodeValidate.pl	7 Apr 2009 18:21:30 -0000	1.176
+++ src/hg/encode/encodeValidate/doEncodeValidate.pl	16 Apr 2009 19:03:30 -0000	1.177
@@ -41,8 +41,9 @@
 use vars qw/
     $opt_allowReloads
     $opt_configDir
     $opt_fileType
+    $opt_justFileDb
     $opt_metaDataOnly
     $opt_outDir
     $opt_quick
     $opt_skipAll
@@ -86,8 +87,9 @@
     -allowReloads       Allow reloads of existing tables
     -configDir=dir      Path of configuration directory, containing
                         metadata .ra files (default: submission-dir/../config)
     -fileType=type	used only with validateFile option; e.g. narrowPeak
+    -justFileDb         Just generate the fileDb.ra file which contains all metadata
     -metaDataOnly       Process DAF/DDF and just update the projects.metadata field;
                         equal to -allowReloads -skipAll
     -quick		Validate only first $quickCount lines of files
     -skipAll            Turn on all "-skip..." options
@@ -1079,9 +1082,8 @@
 ############################################################################
 # Main
 
 my $now = time();
-my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($now);
 my @ddfHeader;		# list of field names on the first line of DDF file
 my %ddfHeader = ();	# convenience hash version of @ddfHeader (maps name to field index)
 my @ddfLines = ();	# each line in DDF (except for fields header); value is a hash; e.g. {files => 'foo.bed', cell => 'HeLa-S3', ...}
 my %ddfSets = ();	# info about DDF entries broken down by ddfKey
@@ -1096,8 +1098,9 @@
                     "quick",
                     "timing",
                     "skipAll",
                     "skipAutoCreation",
+                    "justFileDb",
                     "skipOutput",
                     "skipValidateFiles",
                     "skipValidateFastQ",
                     "validateDaf",
@@ -1107,8 +1110,11 @@
                     );
 usage() if (!$ok);
 $opt_verbose = 1 if (!defined $opt_verbose);
 $opt_sendEmail = 0 if (!defined $opt_sendEmail);
+if($opt_justFileDb) {
+   $opt_skipAll = $opt_quick = $opt_allowReloads =1;
+}
 $quickOpt = " -quick " if defined ($opt_quick);
 
 if($opt_skipAll) {
     $opt_skipAutoCreation = $opt_skipOutput = $opt_skipValidateFiles = 1;
@@ -1236,8 +1242,11 @@
 # Open dataset descriptor file (DDF)
 my @glob = glob "*.DDF";
 push(@glob, glob "*.ddf");
 my $ddfFile = Encode::newestFile(@glob);
+my $fileTime = (stat($ddfFile))->ctime;
+my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = gmtime($fileTime);
+
 HgAutomate::verbose(2, "Using newest DDF file \'$ddfFile\'\n");
 my $lines = Encode::readFile($ddfFile);
 
 my $ddfLineNumber = 0;
@@ -1298,8 +1307,12 @@
     }
     my $i = 0;
     my %line;
     for my $val (split('\t', $line)) {
+        if($ddfHeader[$i] ne "files" && $val =~ / /) {
+            $val =~ s/\"/\\"/g if $val =~ /\"/;
+            $val = '"' . $val . '"';
+        }
         $line{$ddfHeader[$i]} = $val;
         $i++;
     }
     if(my @tmp = Encode::validateValueList(\%line, $fields, 'ddf')) {
@@ -1378,11 +1391,8 @@
 		&& !defined($ddfReplicateSets{$key}{VIEWS}{RawSignal})
 		&& !defined($ddfReplicateSets{$key}{VIEWS}{PlusRawSignal})
 		&& !defined($ddfReplicateSets{$key}{VIEWS}{MinusRawSignal})
 		&& ($daf->{dataType} ne 'MethylSeq')) {
-            if($daf->{dataType} eq 'ChipSeq' && !defined($daf->{medianFragmentLength})) {
-                pushError(\@errors, "Missing medianFragmentLength field; this field is required for dataType '$daf->{dataType}' when RawSignal view is not provided");
-            } else {
                 # hack for case where they have removed RawSignal view in the DAF
 		# - if no (Plus|Minus|)RawSignal is defined, assume RawSignal is required
                 if(!defined($daf->{TRACKS}{RawSignal}{order})
 			&& !defined($daf->{TRACKS}{PlusRawSignal}{order})
@@ -1397,8 +1407,21 @@
 
 		foreach my $newView (@newViews) #loop around making them
 		{
                 my $alignmentLine = $ddfReplicateSets{$key}{VIEWS}{Alignments};
+                # Check for a per-replicate fragLength (in the Alignments line); die rather than just pushing an error and building anyway
+                if($newView eq "RawSignal") {
+                    if(!defined($alignmentLine->{fragLength})) {
+                        if(!defined($daf->{medianFragmentLength})) {
+                            die (\@errors, "Missing fragLength field for building $daf->{dataType} '$newView' for replicate $alignmentLine->{replicate}\nThe fragLength is required and is the median fragment length used in generating this replicate.\n");
+                        } else { # Letting medianFragmentLength stand in for per-replicate fragLength
+                            $alignmentLine->{fragLength} = $daf->{medianFragmentLength};
+                        }
+                    }
+                    if ($alignmentLine->{fragLength} < 0 || $alignmentLine->{fragLength} > 10000) {
+                        die (\@errors, "Missing or invalid fragLength field for building $daf->{dataType} '$newView' for replicate $alignmentLine->{replicate}\nThe fragLength is required and is the median fragment length used in generating this replicate.\n");
+                    }
+                }
                 my %line = %{$alignmentLine};
                 $line{view} = $newView;
                 $line{type} = 'wig';
                 $ddfReplicateSets{$key}{VIEWS}{$newView} = \%line;
@@ -1435,13 +1458,14 @@
                     HgAutomate::verbose(2, "Skipping auto-creating view '$newView' for key '$key'\n");
                   } else {
                       HgAutomate::verbose(2, "Auto-creating view '$newView' for key '$key' in file '$tmpFile'\n");
                         doTime("beginning Auto-create of view $newView in file $tmpFile") if $opt_timing;
+
                         # XXXX gzip before saving to disk?
                         my @cmds;
                         my $sortFiles;
-                        if(defined($daf->{medianFragmentLength})) {
-                            push(@cmds, "/cluster/bin/x86_64/bedExtendRanges $daf->{assembly} $daf->{medianFragmentLength} $files");
+                    if(defined($alignmentLine->{fragLength})) {
+                        push(@cmds, "/cluster/bin/x86_64/bedExtendRanges $daf->{assembly} $alignmentLine->{fragLength} $files");
                             $sortFiles = " -";
 			    # sorting stdin, so have to sort in mem (and control how much mem we use)
 			    push @cmds, "sort $SORT_BUF -T $Encode::tempDir -k1,1 -k2,2n $sortFiles";
                         } else {
@@ -1473,9 +1497,8 @@
                 $line{files} = [$tmpFile];
                 push(@ddfLines, \%line);
 	    }  # End foreach newView loop
             }
-        }
     } # End replicate sets loop
     doTime("done ddfReplicateSets loop") if $opt_timing;
 }
 
@@ -1505,8 +1528,13 @@
     open(LOADER_RA, ">$outPath/$Encode::loadFile") || die "SYS ERROR: Can't write \'$outPath/$Encode::loadFile\' file; error: $!\n";
     open(TRACK_RA, ">$outPath/$Encode::trackFile") || die "SYS ERROR: Can't write \'$outPath/$Encode::trackFile\' file; error: $!\n";
     open(README, ">$outPath/README.txt") || die "SYS ERROR: Can't write '$outPath/READEME.txt' file; error: $!\n";
 }
+if($opt_justFileDb || !$opt_skipOutput) {
+    open(FILE_RA, ">$outPath/$Encode::fileDbFile") || die "SYS ERROR: Can't write '$outPath/$Encode::fileDbFile' file; error: $!\n";
+} else {
+    open(FILE_RA, ">>/dev/null");
+}
 
 # Create a composite track entry if the trackDb.ra entry was not found
 if(!$opt_skipOutput && !$compositeExists) {
     printCompositeTdbSettings(*TRACK_RA,$daf,%ddfSets);
@@ -1522,8 +1550,35 @@
     $ddfLineNumber++;
     my $diePrefix = "ERROR on DDF lineNumber $ddfLineNumber:";
     my $view = $ddfLine->{view};
     my $type = $daf->{TRACKS}{$view}{type} || die "Missing DAF entry for view '$view'\n";
+    my $metadata = "project=wgEncode grant=$daf->{grant} lab=$daf->{lab} dataType=$daf->{dataType}";
+    $metadata .= " cell=$ddfLine->{cell}" if $ddfLine->{cell}; # force some order
+    $metadata .= " antibody=$ddfLine->{antibody}" if $ddfLine->{antibody};
+    for my $key (keys %{$ddfLine}) {
+        my $value = $ddfLine->{$key};
+        if($value
+        && $key ne 'files'
+        && $key ne 'cell'
+        && $key ne 'antibody'
+        && $key ne 'view'
+        && $key ne 'replicate'
+        && $key ne 'labVersion'
+        && $key ne 'softwareVersion') {
+            $metadata .= " $key=$value"; # and the rest
+        }
+    }
+    $metadata .= " view=$view";
+    $metadata .= " replicate=$ddfLine->{replicate}" if $ddfLine->{replicate} && $daf->{TRACKS}{$view}{hasReplicates};
+    $metadata .= " labVersion=$ddfLine->{labVersion}" if $ddfLine->{labVersion};
+    $metadata .= " softwareVersion=$ddfLine->{softwareVersion}" if $ddfLine->{softwareVersion};
+    $metadata .= ' dataVersion="' . $Encode::dataVersion .'"';
+    if($submitDir =~ /(\d+)$/) {
+        $metadata .= " subId=$1";
+    }
+    my (undef, undef, undef, $rMDay, $rMon, $rYear) = Encode::restrictionDate($now);
+    $metadata .= sprintf(" dateSubmitted=%04d-%02d-%02d", 1900 + $year, $mon + 1, $mday);
+    $metadata .= sprintf(" dateUnrestricted=%04d-%02d-%02d", 1900 + $rYear, $rMon + 1, $rMDay);
 
     HgAutomate::verbose(2, "  View: $view\n");
     my $replicate;
     if($hasReplicates && $daf->{TRACKS}{$view}{hasReplicates}) {
@@ -1533,12 +1588,15 @@
             die "$diePrefix invalid or missing replicate value\n";
         }
     }
     # Construct table name from track name and variables
-    my $tableName = "$compositeTrack$view";
+    my $tableName = "$compositeTrack";
+    if($Encode::dafVersion le "1.0") {
+        $tableName .= $view;
     if(defined($replicate)) {
         $tableName .= "Rep$replicate";
     }
+    }
     if(!defined($daf->{TRACKS}{$view}{shortLabelPrefix})) {
         $daf->{TRACKS}{$view}{shortLabelPrefix} = "";
     }
     my $shortLabel = defined($daf->{TRACKS}{$view}{shortLabelPrefix}) ? $daf->{TRACKS}{$view}{shortLabelPrefix} : "";
@@ -1546,9 +1604,9 @@
     if(defined($replicate)) {
         $longLabel .= " Replicate $replicate";
     }
     my $subGroups = "view=$view";
-    my $additional = "\n";
+    my $additional = "";
     my $pushQDescription = "";
     my $species;
     my $tier1 = 0;
     if (@variables) {
@@ -1566,9 +1624,9 @@
         my $longSuffix;
         my %shortViewMap = (Peaks => 'Pk', Signal => 'Sig', RawSignal => 'Raw', PlusRawSignal => 'PlusRaw', MinusRawSignal => 'MinusRaw');
         if($hash{'antibody'} && $hash{'cell'}) {
             $pushQDescription = "$hash{'antibody'} in $hash{'cell'}";
-            $shortSuffix = "$hash{'antibody'} $hash{'cell'}";
+            $shortSuffix = "$hash{'cell'} $hash{'antibody'}";
             $longSuffix = "$hash{'antibody'} in $hash{'cell'} cells";
         } elsif($hash{'ripAntibody'} && $hash{'ripTgtProtein'} && $hash{'cell'}) {
             $longSuffix = "$hash{'ripTgtProtein'} in $hash{'cell'} cells using $hash{'ripAntibody'}";
             $pushQDescription = $longSuffix;
@@ -1618,8 +1676,14 @@
             $subGroups .= " $groupVar=$hash{$var}";
             $additional = "    $var $hash{$var}\n" . $additional;
         }
     }
+    if($Encode::dafVersion gt "1.0") {
+        $tableName .= "$view";
+        if(defined($replicate)) {
+            $tableName .= "Rep$replicate";
+        }
+    }
 
     # mysql doesn't allow hyphens in table names and our naming convention doesn't allow underbars; to be
     # safe, we strip non-alphanumerics.
     $tableName =~ s/[^A-Za-z0-9]//g;
@@ -1640,8 +1704,26 @@
     # Already this is used in 2 places so made it a function, 
     # would be better in the DAF except we'd have to go change all the DAFs :(
     my $downloadOnly = isDownloadOnly($view, $daf->{grant}, $daf->{lab}, $daf);
 
+    print FILE_RA "    filename $tableName.$type.gz\n";
+    $metadata .= " composite=$compositeTrack";
+
+    if($downloadOnly) {
+        my $parentTable = $tableName;
+        $parentTable =~ s/RawData/RawSignal/    if $parentTable =~ /RawData/;
+        $parentTable =~ s/Alignments/RawSignal/ if $parentTable =~ /Alignments/;
+        print FILE_RA "    parentTable $parentTable\n";
+        $metadata .= " parentTable=$parentTable";
+    } else {
+        print FILE_RA "    tableName $tableName\n";
+        $metadata .= " tableName=$tableName";
+    }
+    print FILE_RA "    composite $compositeTrack\n";
+    my $fileType = $type;
+    $fileType =~ s/ //g;
+    $metadata .= " fileName=$tableName.$fileType.gz";
+
     print LOADER_RA "tablename $tableName\n";
     print LOADER_RA "view $view\n";
     print LOADER_RA "type $type\n";
     if($species) {
@@ -1653,9 +1735,9 @@
     print LOADER_RA "downloadOnly $downloadOnly\n";
     print LOADER_RA "pushQDescription $pushQDescription\n";
     print LOADER_RA "\n";
 
-    my (undef, undef, undef, $rMDay, $rMon, $rYear) = Encode::restrictionDate($now);
+    print FILE_RA sprintf("    metadata %s\n\n", $metadata);
 
     if($downloadOnly || ($type eq "wig" && !grep(/$Encode::autoCreatedPrefix/, @{$ddfLine->{files}}))) {
         # adds entries to README.txt for download only files AND wig data (excepting wig data generated by us)
         print README "file: $tableName.$type.gz\n";
@@ -1697,32 +1779,17 @@
         print TRACK_RA sprintf("    dataVersion %s\n", $Encode::dataVersion);
         if(defined($ddfLine->{accession}) && length($ddfLine->{accession}) > 0) {
             print TRACK_RA sprintf("    accession %s\n",$ddfLine->{accession});
         }
-        print TRACK_RA "    priority " . ($priority + $daf->{TRACKS}{$view}{order}) . "\n";
-        # noInherit is necessary b/c composite track will often have a different dummy type setting.
-        print TRACK_RA "    noInherit on\n";
-        # removing default individual subtrack configurability, for performance reasons
-        # add back as needed, by wrangler discretion
-        #if($view eq 'RawSignal' and 0) { # Sorry tim, you will have to list your projects here
-            #print TRACK_RA "    configurable off\n";
-        #} else {
-            #print TRACK_RA "    configurable on\n";
-        #}
-        if($type eq 'wig') {
-            print TRACK_RA <<END;
-    spanList first
-    windowingFunction mean
-    maxHeightPixels 100:16:16
-END
-	} elsif($type eq 'bed 5 +') {
-		print TRACK_RA "    useScore 1\n";
-	}
         print TRACK_RA $additional;
+        # e.g.: metadata project=wgEncode lab=Yale cell=GM12878 antibody=Pol2 labVersion="PeakSeq 1.2 ..." dataVersion="ENCODE Feb 2009 Freeze"
+        print TRACK_RA sprintf("    metadata %s\n", $metadata);
+        print TRACK_RA "\n";
     }
 }
 close(LOADER_RA);
 close(TRACK_RA);
+close(FILE_RA);
 close(README);
 doTime("done out files") if $opt_timing;
 
 if($submitDir =~ /(\d+)$/) {