src/hg/encode/encodeValidate/doEncodeValidate.pl 1.202
1.202 2009/11/30 23:13:40 kate
1. Remove hg18 hardcoding 2. Use newly added 'tag' field from CV instead of 'term'
Index: src/hg/encode/encodeValidate/doEncodeValidate.pl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeValidate/doEncodeValidate.pl,v
retrieving revision 1.201
retrieving revision 1.202
diff -b -B -U 4 -r1.201 -r1.202
--- src/hg/encode/encodeValidate/doEncodeValidate.pl 30 Nov 2009 19:34:05 -0000 1.201
+++ src/hg/encode/encodeValidate/doEncodeValidate.pl 30 Nov 2009 23:13:40 -0000 1.202
@@ -61,9 +61,10 @@
# Global variables
our $submitPath; # full path of data submission directory
our $configPath; # full path of configuration directory
our $outPath; # full path of output directory
-our %terms; # controlled vocabulary
+our %terms; # controlled vocabulary, indexed by type and term
+our %tags; # controlled vocabulary, indexed by tag
our $quickCount=100;
our $quickOpt = ""; # option to pass to validateFiles prog
our $time0 = time;
our $timeStart = time;
@@ -71,9 +72,9 @@
our $maxBedRows=80_000_000; # number of rows to allow in a bed-type file
our %tableNamesUsed;
our ($grants, $fields, $daf);
our $SORT_BUF = " -S 5G ";
-our $assembly = "hg18";
+our $assembly; # genome assembly (database name, e.g. "hg18"); deliberately undef here -- Perl has no NULL, and no default is hardcoded. NOTE(review): presumably assigned elsewhere before use; confirm
sub usage {
print STDERR <<END;
usage: encodeValidate.pl submission-type project-submission-dir
@@ -645,9 +646,9 @@
die "We don't currently supporte gzipped gene files\n";
}
# XXXX Add support for $opt_quick
my $err = system (
- "cd $outPath; egrep -v '^track|browser' $filePath | ldHgGene -out=genePred.tab -genePredExt hg18 testTable stdin >$outFile 2>&1");
+ "cd $outPath; egrep -v '^track|browser' $filePath | ldHgGene -out=genePred.tab -genePredExt $assembly testTable stdin >$outFile 2>&1");
if ($err) {
print STDERR "File \'$file\' failed GFF validation\n";
open(ERR, "$outPath/$outFile") || die "ERROR: Can't open GFF validation file \'$outPath/$outFile\': $!\n";
my @err = <ERR>;
@@ -661,9 +662,9 @@
sub validateTagAlign
{
my ($path, $file, $type) = @_;
- # validate chroms, chromSize, etc. Assume hg18 like elsewhere
+ # validate chroms, chromSize, etc.
my $paramList = validationSettings("validateFiles","tagAlign",$assembly);
my $safe = SafePipe->new(CMDS => ["validateFiles $quickOpt $paramList -type=tagAlign $file"]);
if(my $err = $safe->exec()) {
print STDERR "ERROR: failed validateTagAlign : " . $safe->stderr() . "\n";
@@ -676,11 +677,11 @@
sub validatePairedTagAlign
# This is like tag align but with two additional sequence fields appended; seq1 and seq2
{
my ($path, $file, $type) = @_;
- # validate chroms, chromSize, etc. Assume hg18 like elsewhere
+ # validate chroms, chromSize, etc.
my $paramList = validationSettings("validateFiles","pairedTagAlign",$assembly);
- my $safe = SafePipe->new(CMDS => ["validateFiles $paramList $quickOpt -chromDb=hg18 -type=pairedTagAlign $file"]);
+ my $safe = SafePipe->new(CMDS => ["validateFiles $paramList $quickOpt -chromDb=$assembly -type=pairedTagAlign $file"]);
if(my $err = $safe->exec()) {
print STDERR "ERROR: failed validatePairedTagAlign : " . $safe->stderr() . "\n";
# don't show end-user pipe error(s)
return("failed validatePairedTagAlign for '$file'");
@@ -706,9 +707,9 @@
sub validateBroadPeak
{
my ($path, $file, $type) = @_;
- # validate chroms, chromSize, etc. Assume hg18 like elsewhere
+ # validate chroms, chromSize, etc.
my $paramList = validationSettings("validateFiles","broadPeak",$assembly);
my $safe = SafePipe->new(CMDS => ["validateFiles $quickOpt $paramList -type=broadPeak $file"]);
if(my $err = $safe->exec()) {
print STDERR "ERROR: failed validateBroadPeak : " . $safe->stderr() . "\n";
@@ -1056,15 +1057,24 @@
my $grpNo = 1;
my $sortOrder = "sortOrder ";
my $dimensions = "dimensions";
my $controlledVocab = "controlledVocabulary encode/cv.ra";
+ my %tags = ();
if (defined($daf->{variables})) {
my @variables = @{$daf->{variableArray}};
for my $variable (@variables) {
$grpNo++;
my $groupVar = $variable;
- $groupVar = "factor" if $variable eq "antibody";
- $groupVar = "cellType" if $variable eq "cell";
+ my $cvTypeVar = $variable;
+ # special names for cell and antibody
+ if ($variable eq "cell") {
+ $groupVar = "cellType";
+ $cvTypeVar = "Cell Line";
+ }
+ if ($variable eq "antibody") {
+ $groupVar = "factor";
+ $cvTypeVar = "Antibody";
+ }
if($grpNo < 5) {
$dimensions .= " dimension" . chr(86 + $grpNo) . "=" . $groupVar;
}
$sortOrder = "$sortOrder$groupVar=+ ";
@@ -1074,10 +1084,15 @@
for my $key (keys %ddfSets) {
my @pairs = split(';', $key);
for my $pair (@pairs) {
my ($var, $term) = split('=', $pair);
- if($var eq $variable) {
- $setting = "$setting $term=$term";
+ my $tag = $terms{$cvTypeVar}->{$term}->{'tag'};
+ if ($var eq $variable) {
+ if (!defined($tags{$tag})) {
+ # suppress dups, requested by Brian
+ $setting = "$setting $tag=$term";
+ $tags{$tag} = $term;
+ }
}
}
}
print OUT_FILE $setting . "\n"; # "subGroup2\cellTyle Cell_Line ???\n;
@@ -1212,10 +1227,9 @@
$ENV{TMPDIR} = $Encode::tempDir;
if($opt_validateFile && $opt_fileType) {
- # kludgy, but we need chromInfo populated to validate files, so we assume we are using hg18
- my $db = HgDb->new(DB => 'hg18');
+ my $db = HgDb->new(DB => $assembly);
$db->getChromInfo(\%chromInfo);
if(my @errors = checkDataFormat($opt_fileType, $submitDir)) {
die "Invalid file: " . join(", ", @errors) . "\n";
} else {
@@ -1691,9 +1705,8 @@
if(defined($replicate)) {
$longLabel .= " Replicate $replicate";
}
my $subGroups = "view=$view";
- my $additional = "";
my $pushQDescription = "";
my $species;
my $tier1 = 0;
if (@variables) {
@@ -1756,16 +1769,23 @@
}
if($longSuffix) {
$longLabel .= " ($longSuffix)";
}
- # make the "subGroups" and "additional" fields from all variables
+ # make the "subGroups" setting from all variables
for my $var (sort keys %hash) {
# The var name is over-ridden for antibody and cell, for historical reasons
my $groupVar = $var;
- $groupVar = "factor" if $var eq "antibody";
- $groupVar = "cellType" if $var eq "cell";
- $subGroups .= " $groupVar=$hash{$var}";
- $additional = " $var $hash{$var}\n" . $additional;
+ my $cvTypeVar = $groupVar;
+ # handle inconsistent naming for antibody & cell type
+ if ($var eq "antibody") {
+ $groupVar = "factor";
+ $cvTypeVar = "Antibody";
+ }
+ if ($var eq "cell") {
+ $groupVar = "cellType";
+ $cvTypeVar = "Cell Line";
+ }
+ $subGroups .= " $groupVar=$terms{$cvTypeVar}->{$hash{$var}}->{'tag'}";
}
}
#if($Encode::dafVersion gt "1.0") {
# $tableName .= "$view";
@@ -1917,9 +1937,8 @@
# Obsolete: now in metadata
# print TRACK_RA sprintf(" dateSubmitted %04d-%02d-%02d\n", 1900 + $year, $mon + 1, $mday);
# print TRACK_RA sprintf(" dateUnrestricted %04d-%02d-%02d\n", 1900 + $rYear, $rMon + 1, $rMDay);
# print TRACK_RA sprintf(" dataVersion %s\n", $Encode::dataVersion);
- # print TRACK_RA $additional;
if(defined($ddfLine->{accession}) && length($ddfLine->{accession}) > 0) {
print TRACK_RA sprintf(" accession %s\n",$ddfLine->{accession});
}
# color track by color setting for cell type in cv.ra