src/hg/encode/encodeValidate/doEncodeValidate.pl 1.208

1.208 2009/12/15 20:25:31 tdreszer
Simplified controlled vocab validation, tableNames based on cv tags not terms, tag 'None' not included in name, special case replicate subGroup
Index: src/hg/encode/encodeValidate/doEncodeValidate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v
retrieving revision 1.207
retrieving revision 1.208
diff -b -B -U 4 -r1.207 -r1.208
--- src/hg/encode/encodeValidate/doEncodeValidate.pl	9 Dec 2009 19:58:17 -0000	1.207
+++ src/hg/encode/encodeValidate/doEncodeValidate.pl	15 Dec 2009 20:25:31 -0000	1.208
@@ -161,34 +161,24 @@
     view => \&validateDatasetName,
     labVersion => \&validateNoValidation,
     softwareVersion => \&validateNoValidation,
     accession => \&validateNoValidation,
-    cell => \&validateCellLine,
-    gene => \&validateGeneType,
-    promoter => \&validatePromoter,
-    antibody => \&validateAntibody,
-    rnaExtract => \&validateRnaExtract,
-    localization => \&validateLocalization,
-    mapAlgorithm => \&validateMapAlgorithm,
-    ripAntibody => \&validateRipAntibody,
-    ripTgtProtein => \&validateRipTgtProtein,
-    fragSize => \&validateFragSize,
-    readType => \&validateReadType,
-    freezeDate => \&validateFreezeDate,
-    replicate => \&validateReplicate,
-    species => \&validateSpecies,
+    replicate => \&validateNoValidation,
     fragLength => \&validateNoValidation,
-    treatment => \&validateNoValidation,
-    protocol => \&validateNoValidation,
-    restrictionEnzyme => \&validateNoValidation,
+    cell => \&validateControlledVocabOrControl,
+    antibody => \&validateControlledVocabOrControl,
+    ripAntibody => \&validateControlledVocabOrControl,
+    treatment => \&validateControlledVocabOrControl,
+    protocol => \&validateControlledVocabOrControl,
+    default => \&validateControlledVocab,
     );
 
 # standard validators (required or optional for all projects)
 
 sub validateFiles {
     # Validate array of filenames, ordered by part
     # Check files exist and are of correct data format
-    my ($files, $track, $daf) = @_;
+    my ($files, $type, $track, $daf) = @_;
     my @newFiles;
     my @errors;
     my $regex = "\`\|\\\|\|\"\|\'";
     doTime("beginning validateFiles") if $opt_timing;
@@ -248,83 +238,23 @@
 }
 
 # project-specific validators
 
-sub validateCellLine {
-    my ($val) = @_;
-    return defined($terms{'Cell Line'}{$val} || $terms{'control'}{$val}) ? () : ("Cell line \'$val\' is not known");
-}
-
-sub validateRnaExtract {
-    my ($val) = @_;
-    return defined($terms{'rnaExtract'}{$val}) ? () : ("rnaExtract \'$val\' is not known");
-}
-
-sub validateLocalization {
-    my ($val) = @_;
-    return defined($terms{'localization'}{$val}) ? () : ("localization \'$val\' is not known");
-}
-
-sub validateMapAlgorithm {
-    my ($val) = @_;
-    return defined($terms{'mapAlgorithm'}{$val}) ? () : ("mapAlgorithm \'$val\' is not known");
-}
-
-sub validateRipAntibody {
-    my ($val) = @_;
-    # TODO: Remove Encode::isControlInput after testing
-    # return defined(lc($val) eq 'input' || lc($val) eq 'control' || $terms{'ripAntibody'}{$val}) ? () : ("ripAntibody \'$val\' is not known");
-    return defined($terms{'ripAntibody'}{$val} || $terms{'control'}{$val}) ? () : ("ripAntibody \'$val\' is not known");
-}
-
-sub validateRipTgtProtein {
-    my ($val) = @_;
-    return defined($terms{'ripTgtProtein'}{$val}) ? () : ("ripTgtProtein \'$val\' is not known");
-}
-
-sub validateFragSize {
-    my ($val) = @_;
-    return defined($terms{'fragSize'}{$val}) ? () : ("fragSize \'$val\' is not known");
-}
-
-sub validateReadType {
-    my ($val) = @_;
-    return defined($terms{'readType'}{$val}) ? () : ("readType \'$val\' is not known");
-}
-
-sub validateGeneType {
-    my ($val) = @_;
-    return defined($terms{'Gene Type'}{$val}) ? () : ("Gene type \'$val\' is not known");
+sub validateControlledVocabOrControl {  # Accept $val if %terms lists it under CV type $type or under 'control'
+    my ($val, $type) = @_;  # $val: submitted value; $type: field name passed in by the caller
+    if($type eq 'cell') {  # 'cell' is looked up under the 'Cell Line' key of %terms
+        $type = 'Cell Line';
+    } elsif ($type eq 'antibody') {  # 'antibody' is looked up under the 'Antibody' key of %terms
+        $type = 'Antibody';
+    }
+    return defined($terms{$type}{$val} || $terms{'control'}{$val}) ? () : ("Controlled Vocabulary \'$type\' value \'$val\' is not known");  # () if known, else one error string; NOTE(review): defined() wraps the whole || expression, so a defined-but-false entry falls through to the 'control' lookup
 }
 
-sub validatePromoter {
-    my ($val) = @_;
-    return defined($terms{'promoter'}{$val}) ? () : ("promoter \'$val\' is not known");
-}
-
-sub validateAntibody {
-    my ($val) = @_;
-    if(defined($terms{'Antibody'}{$val}) || defined($terms{'control'}{$val})) {
-        return ();
-    } else {
-        return ("Antibody \'$val\' is not known");
-    }
-}
-
-sub validateFreezeDate {
-    my ($val) = @_;
-    return defined($terms{'freezeDate'}{$val}) ? () : ("freezeDate \'$val\' is not known");
-}
-
-sub validateReplicate {
-    return ();
+sub validateControlledVocab {  # Accept $val only if %terms lists it under CV type $type (no 'control' fallback)
+    my ($val, $type) = @_;  # $val: submitted value; $type: used unchanged as the key into %terms
+    return defined($terms{$type}{$val}) ? () : ("Controlled Vocabulary \'$type\' value \'$val\' is not known");  # () when known, else a one-element list holding the error message
 }
 
-
-sub validateSpecies {
-    my ($val) = @_;
-    return defined($terms{'species'}{$val}) ? () : ("species \'$val\' is not known");
-}
 ############################################################################
 # Format checkers - check file format for given types; extend when adding new
 # data formats
 #
@@ -995,11 +925,11 @@
     my ($type, $val, $track, $daf) = @_;
     $type =~ s/ /_/g;
     HgAutomate::verbose(4, "Validating $type: " . (defined($val) ? $val : "") . "\n");
     if($validators{$type}) {
-        return $validators{$type}->($val, $track, $daf);
+        return $validators{$type}->($val, $type, $track, $daf);
     } else {
-        die "Validator for type '$type' is missing";
+        return $validators{'default'}->($val, $type, $track, $daf); # Considers the term controlled vocab
     }
 }
 
 sub checkDataFormat {
@@ -1754,15 +1684,26 @@
     my $tier1 = 0;
     if (@variables) {
         my %hash = map { $_ => $ddfLine->{$_} } @variables;
         for my $var (@variables) {
-            my $val = $hash{$var};
+            my $cvTypeVar = $var;
+            if ($var eq "antibody") {
+                $cvTypeVar = "Antibody";
+            } elsif ($var eq "cell") {
+                $cvTypeVar = "Cell Line";
+            }
+            if(!defined($terms{$cvTypeVar}->{$hash{$var}})) {
+                $cvTypeVar = "control";
+            }
+            my $val = $terms{$cvTypeVar}->{$hash{$var}}->{'tag'};
             $val = ucfirst(lc($val));
+            if($val ne 'None') {  # Special control term does not show up in the name!
             # trailing + => Plus, - => Neg (e.g. H9ES-AFP+)
             $val =~ s/\+$/Pos/;
             $val =~ s/\-$/Neg/;
             $tableName = $tableName . $val;
         }
+        }
 
         my $shortSuffix = "";
         my $longSuffix;
         my %shortViewMap = (Peaks => 'Pk', Signal => 'Sig', RawSignal => 'Raw', PlusRawSignal => 'PlusRaw', MinusRawSignal => 'MinusRaw');
@@ -1821,15 +1762,20 @@
             # handle inconsistent naming for antibody & cell type
 	    if ($var eq "antibody") {
                 $groupVar = "factor";
                 $cvTypeVar = "Antibody";
-            }
-	    if ($var eq "cell") {
+            } elsif ($var eq "cell") {
                 $groupVar = "cellType";
                 $cvTypeVar = "Cell Line";
             }
+            if(!defined($terms{$cvTypeVar}->{$hash{$var}})) {
+                $cvTypeVar = "control";
+            }
             $subGroups .= " $groupVar=$terms{$cvTypeVar}->{$hash{$var}}->{'tag'}";
         }
+        if(defined($replicate) && ($daf->{lab} eq "HudsonAlpha" || $daf->{lab} eq "Uw") || $daf->{lab} eq "Gis") {
+            $subGroups .= " rep=rep$replicate"; # UGLY special casing
+        }
     }
     #if($Encode::dafVersion gt "1.0") {
     #    $tableName .= "$view";
     #    if(defined($replicate)) {