src/hg/encode/encodeValidate/doEncodeValidate.pl 1.202

1.202 2009/11/30 23:13:40 kate
1. Remove hg18 hardcoding 2. Use newly added 'tag' field from CV instead of 'term'
Index: src/hg/encode/encodeValidate/doEncodeValidate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v
retrieving revision 1.201
retrieving revision 1.202
diff -b -B -U 4 -r1.201 -r1.202
--- src/hg/encode/encodeValidate/doEncodeValidate.pl	30 Nov 2009 19:34:05 -0000	1.201
+++ src/hg/encode/encodeValidate/doEncodeValidate.pl	30 Nov 2009 23:13:40 -0000	1.202
@@ -61,9 +61,10 @@
 # Global variables
 our $submitPath;        # full path of data submission directory
 our $configPath;        # full path of configuration directory
 our $outPath;           # full path of output directory
-our %terms;             # controlled vocabulary
+our %terms;             # controlled vocabulary, indexed by type, then term (entries hold fields such as 'tag')
+our %tags;              # controlled vocabulary, indexed by tag
 our $quickCount=100;
 our $quickOpt = "";     # option to pass to validateFiles prog
 our $time0 = time;
 our $timeStart = time;
@@ -71,9 +72,9 @@
 our $maxBedRows=80_000_000; # number of rows to allow in a bed-type file
 our %tableNamesUsed;
 our ($grants, $fields, $daf);
 our $SORT_BUF = " -S 5G ";
-our $assembly = "hg18";
+our $assembly;              # assembly db name; undef until assigned (Perl has no NULL; a bareword fails under strict)
 
 sub usage {
     print STDERR <<END;
 usage: encodeValidate.pl submission-type project-submission-dir
@@ -645,9 +646,9 @@
         die "We don't currently supporte gzipped gene files\n";
     }
     # XXXX Add support for $opt_quick
     my $err = system (
-        "cd $outPath; egrep -v '^track|browser' $filePath | ldHgGene -out=genePred.tab -genePredExt hg18 testTable stdin >$outFile 2>&1");
+        "cd $outPath; egrep -v '^track|browser' $filePath | ldHgGene -out=genePred.tab -genePredExt $assembly testTable stdin >$outFile 2>&1");
     if ($err) {
         print STDERR  "File \'$file\' failed GFF validation\n";
         open(ERR, "$outPath/$outFile") || die "ERROR: Can't open GFF validation file \'$outPath/$outFile\': $!\n";
         my @err = <ERR>;
@@ -661,9 +662,9 @@
 
 sub validateTagAlign
 {
     my ($path, $file, $type) = @_;
-    # validate chroms, chromSize, etc. Assume hg18 like elsewhere
+    # validate chroms, chromSize, etc.
     my $paramList = validationSettings("validateFiles","tagAlign",$assembly);
     my $safe = SafePipe->new(CMDS => ["validateFiles $quickOpt $paramList -type=tagAlign $file"]);
     if(my $err = $safe->exec()) {
 	print STDERR  "ERROR: failed validateTagAlign : " . $safe->stderr() . "\n";
@@ -676,11 +677,11 @@
 sub validatePairedTagAlign
 # This is like tag align but with two additional sequence fields appended; seq1 and seq2
 {
     my ($path, $file, $type) = @_;
-    # validate chroms, chromSize, etc. Assume hg18 like elsewhere
+    # validate chroms, chromSize, etc.
     my $paramList = validationSettings("validateFiles","pairedTagAlign",$assembly);
-    my $safe = SafePipe->new(CMDS => ["validateFiles $paramList $quickOpt -chromDb=hg18 -type=pairedTagAlign $file"]);
+    my $safe = SafePipe->new(CMDS => ["validateFiles $paramList $quickOpt -chromDb=$assembly -type=pairedTagAlign $file"]);
     if(my $err = $safe->exec()) {
 	print STDERR  "ERROR: failed validatePairedTagAlign : " . $safe->stderr() . "\n";
 	# don't show end-user pipe error(s)
 	return("failed validatePairedTagAlign for '$file'");
@@ -706,9 +707,9 @@
 
 sub validateBroadPeak
 {
     my ($path, $file, $type) = @_;
-    # validate chroms, chromSize, etc. Assume hg18 like elsewhere
+    # validate chroms, chromSize, etc.
     my $paramList = validationSettings("validateFiles","broadPeak",$assembly);
     my $safe = SafePipe->new(CMDS => ["validateFiles $quickOpt $paramList -type=broadPeak $file"]);
     if(my $err = $safe->exec()) {
 	print STDERR  "ERROR: failed validateBroadPeak : " . $safe->stderr() . "\n";
@@ -1056,15 +1057,24 @@
         my $grpNo = 1;
         my $sortOrder = "sortOrder ";
         my $dimensions = "dimensions";
         my $controlledVocab = "controlledVocabulary encode/cv.ra";
+        my %tags = ();
         if (defined($daf->{variables})) {
             my @variables = @{$daf->{variableArray}};
             for my $variable (@variables) {
                 $grpNo++;
                 my $groupVar = $variable;
-	            $groupVar = "factor" if $variable eq "antibody";
-                $groupVar = "cellType" if $variable eq "cell";
+                my $cvTypeVar = $variable;
+                # special names for cell and antibody
+                if ($variable eq "cell") {
+                    $groupVar = "cellType";
+                    $cvTypeVar = "Cell Line";
+                }
+                if ($variable eq "antibody") {
+                    $groupVar = "factor";
+                    $cvTypeVar = "Antibody";
+                }
                 if($grpNo < 5) {
                     $dimensions .= " dimension" . chr(86 + $grpNo) . "=" . $groupVar;
                 }
                 $sortOrder = "$sortOrder$groupVar=+ ";
@@ -1074,10 +1084,15 @@
                 for my $key (keys %ddfSets) {
                     my @pairs = split(';', $key);
                     for my $pair (@pairs) {
                         my ($var, $term) = split('=', $pair);
-                        if($var eq $variable) {
-                            $setting = "$setting $term=$term";
+                        my $tag = $terms{$cvTypeVar}->{$term}->{'tag'};
+                        if ($var eq $variable) {
+                            if (!defined($tags{$tag})) {
+                                # suppress dups, requested by Brian
+                                $setting = "$setting $tag=$term";
+                                $tags{$tag} = $term;
+                            }
                         }
                     }
                 }
                 print OUT_FILE $setting . "\n";     # "subGroup2\cellTyle Cell_Line ???\n;
@@ -1212,10 +1227,9 @@
 
 $ENV{TMPDIR} = $Encode::tempDir;
 
 if($opt_validateFile && $opt_fileType) {
-    # kludgy, but we need chromInfo populated to validate files, so we assume we are using hg18
-    my $db = HgDb->new(DB => 'hg18');
+    my $db = HgDb->new(DB => $assembly);
     $db->getChromInfo(\%chromInfo);
     if(my @errors = checkDataFormat($opt_fileType, $submitDir)) {
         die "Invalid file: " . join(", ", @errors) . "\n";
     } else {
@@ -1691,9 +1705,8 @@
     if(defined($replicate)) {
         $longLabel .= " Replicate $replicate";
     }
     my $subGroups = "view=$view";
-    my $additional = "";
     my $pushQDescription = "";
     my $species;
     my $tier1 = 0;
     if (@variables) {
@@ -1756,16 +1769,23 @@
         }
         if($longSuffix) {
             $longLabel .= " ($longSuffix)";
         }
-    	# make the "subGroups" and "additional" fields from all variables
+    	# make the "subGroups" setting from all variables
 	   for my $var (sort keys %hash) {
             # The var name is over-ridden for antibody and cell, for historical reasons
             my $groupVar = $var;
-	    $groupVar = "factor" if $var eq "antibody";
-	    $groupVar = "cellType" if $var eq "cell";
-            $subGroups .= " $groupVar=$hash{$var}";
-            $additional = "    $var $hash{$var}\n" . $additional;
+            my $cvTypeVar = $groupVar;
+            # handle inconsistent naming for antibody & cell type
+	    if ($var eq "antibody") {
+                $groupVar = "factor";
+                $cvTypeVar = "Antibody";
+            }
+	    if ($var eq "cell") {
+                $groupVar = "cellType";
+                $cvTypeVar = "Cell Line";
+            }
+            $subGroups .= " $groupVar=$terms{$cvTypeVar}->{$hash{$var}}->{'tag'}";
         }
     }
     #if($Encode::dafVersion gt "1.0") {
     #    $tableName .= "$view";
@@ -1917,9 +1937,8 @@
         # Obsolete: now in metadata
         # print TRACK_RA sprintf("    dateSubmitted %04d-%02d-%02d\n", 1900 + $year, $mon + 1, $mday);
         # print TRACK_RA sprintf("    dateUnrestricted %04d-%02d-%02d\n", 1900 + $rYear, $rMon + 1, $rMDay);
         # print TRACK_RA sprintf("    dataVersion %s\n", $Encode::dataVersion);
-        # print TRACK_RA $additional;
         if(defined($ddfLine->{accession}) && length($ddfLine->{accession}) > 0) {
             print TRACK_RA sprintf("    accession %s\n",$ddfLine->{accession});
         }
         # color track by color setting for cell type in cv.ra