src/hg/encode/expTblTools/loadCVTbl 1.2
1.2 2010/06/03 23:30:30 vsmalladi
Updated schema to support types Antibody, protocol, control, treatment
Index: src/hg/encode/expTblTools/loadCVTbl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/expTblTools/loadCVTbl,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/encode/expTblTools/loadCVTbl 3 Jun 2010 00:04:58 -0000 1.1
+++ src/hg/encode/expTblTools/loadCVTbl 3 Jun 2010 23:30:30 -0000 1.2
@@ -1,327 +1,371 @@
#!/usr/bin/env perl
use warnings;
use strict;
use Getopt::Long;
use Cwd;
use lib "/cluster/bin/scripts";
use Encode;
use RAFile;
use HgAutomate;
use HgDb;
use vars qw/
$opt_configDir
$opt_verbose
$opt_instance
/;
sub usage {
print STDERR <<END;
usage: loadCellTbl database table [term]
Populate the a CV table in the given database from the cv.ra file.
-instance=s Use ENCODE pipeline instance s (default prod).
-configDir=dir Path of configuration directory, containing metadata
.ra files (default: /hive/groups/encode/dcc/pipeline/encpipeline_prod/config/)
-verbose=num Set verbose level to num (default 1).
END
exit 1;
}
################################################################################
my $commentedout = <<EOS;
+# cell table
CREATE TABLE cell (
name VARCHAR(255) NOT NULL,
tag VARCHAR(255) NOT NULL,
organism VARCHAR(255) NOT NULL,
- category VARCHAR(20),
+ Category VARCHAR(20),
description TEXT,
+ tissue VARCHAR(255),
vendorName TEXT,
vendorId VARCHAR(255),
orderUrl VARCHAR(255),
karyotype VARCHAR(255),
lineage TEXT,
termId VARCHAR(255),
termUrl VARCHAR(255),
color VARCHAR(255),
sex VARCHAR(20),
tier INT,
protocol VARCHAR(255),
PRIMARY KEY (name)
);
+
+#antibody table
+CREATE TABLE antibody (
+ name VARCHAR(255) NOT NULL,
+ tag VARCHAR(255) NOT NULL,
+ displayName VARCHAR(255) DEFAULT "Not Decided",
+ antibodyDescription TEXT,
+ validation VARCHAR(255),
+ lab VARCHAR(255),
+ targetId VARCHAR(255),
+ targetDescription TEXT,
+ targetUrl VARCHAR(255),
+ geneId VARCHAR(255),
+ vendorName VARCHAR(255),
+ vendorId VARCHAR(255),
+ lots VARCHAR(255),
+ orderUrl VARCHAR(255),
+ PRIMARY KEY (name)
+);
+
+#protocol table
+CREATE TABLE protocol (
+ name VARCHAR(255) NOT NULL,
+ tag VARCHAR(255) NOT NULL,
+ description VARCHAR(255) NOT NULL
+);
+
+#treatment table
+CREATE TABLE treatment (
+ name VARCHAR(255) NOT NULL,
+ tag VARCHAR(255) NOT NULL,
+ description VARCHAR(255),
+ label VARCHAR(255) DEFAULT "Not Decided"
+);
+
+#control table
+CREATE TABLE control (
+ name VARCHAR(255) NOT NULL,
+ tag VARCHAR(255) NOT NULL,
+ description VARCHAR(255)
+);
+
EOS
################################################################################
my %termToColumnMapper = (
"term" => "name",
- "Category" => "category",
+ "Lots" => "lots" # not sure why the term is spelled different need to investigate
);
sub mapTermToColumn {
my $term = shift;
if (exists $termToColumnMapper{$term}) {
return $termToColumnMapper{$term};
} else {
return $term;
}
}
sub mapColumnToTerm {
my $column = shift;
for my $k (keys %termToColumnMapper) {
if ($column eq $termToColumnMapper{$k}) {
return $k;
}
}
return $column;
}
# return a list of warning
sub getDbWarnings {
my $dbHandle = shift;
my @warnings = ();
my $results = $dbHandle->execute("SHOW WARNINGS") or die $dbHandle->errstr;
while (my @row = $results->fetchrow_array()) {
push @warnings, $row[2];
}
return @warnings;
}
# return a list of the fields in a database table
sub getDbFields {
my $dbHandle = shift;
my $tableName = shift;
my @fieldsList = ();
my $cmd = "SHOW COLUMNS FROM $tableName";
my $results = $dbHandle->execute($cmd) or die $dbHandle->errstr;
if ($results) {
while (my @row = $results->fetchrow_array()) {
push @fieldsList, $row[0];
}
} else {
die $dbHandle->errstr;
}
return sort @fieldsList;
}
# returns all the possible fields for the given CV terms
sub getCvFields {
my %records = @_;
my %fieldsHash = ();
for my $k (keys %records) {
my @fields = keys(%{$records{$k}});
for my $f (@fields) {
$fieldsHash{$f} = 0;
}
}
return sort keys %fieldsHash;
}
sub checkUpdateFields {
my $dbHandle = shift;
my $tableName = shift;
my %termInfo = @_;
my $name = $termInfo{"term"};
my @columnNames = ();
my @cvValues = ();
my @dbValues = ();
# get all columns that are in the CV
for my $k (keys %termInfo) {
next if ($k eq "type");
push @columnNames, mapTermToColumn($k);
if (defined $termInfo{$k}) {
push @cvValues, $termInfo{$k};
} else {
push @cvValues, "";
}
}
# get all columns fron the database
my $cmd = "SELECT " . join(",", @columnNames) . " FROM $tableName WHERE name = " . $dbHandle->quote($name);
my $results = $dbHandle->execute($cmd) or die $dbHandle->errstr;
if ($results) {
my @row = $results->fetchrow_array();
for (my $i = 0; $i < scalar(@row); ++$i) {
# check if database has empty column
if (not defined $row[$i] or $row[$i] eq "") {
# check if there is anything in the CV to back fill
if ($cvValues[$i] ne "") {
HgAutomate::verbose(2, "Backfilling field " . $columnNames[$i] . " with " . $cvValues[$i] . " in $name\n");
my $updateCmd = "UPDATE $tableName" .
" SET " . $columnNames[$i] . " = " . $dbHandle->quote($cvValues[$i]) .
" WHERE name = " . $dbHandle->quote($name);
$dbHandle->execute($updateCmd) or die $dbHandle->errstr;
my @warnings = getDbWarnings($dbHandle);
if (scalar(@warnings) > 0) {
print STDERR "Got a warning when executing $cmd.\n";
for my $w (@warnings) {
print STDERR "\t$w\n";
}
}
}
} elsif ($cvValues[$i] ne $row[$i]) { # check to make sure the CV and database tables are equal
print STDERR "WARNING: term $name in $tableName contrains different $columnNames[$i], $cvValues[$i] != $row[$i]\n";
}
}
} else {
die $dbHandle->errstr;
}
}
# add a new term to the database table
sub addNewTerm {
my $dbHandle = shift;
my $tableName = shift;
my %termInfo = @_;
my @columnNames = ();
my @columnValues = ();
# get the defined column and it's value
for my $k (keys %termInfo) {
next if ($k eq "type");
push @columnNames, mapTermToColumn($k);
if (defined $termInfo{$k}) {
push @columnValues, $dbHandle->quote($termInfo{$k});
} else {
push @columnValues, $dbHandle->quote("");
}
}
my $cmd = "INSERT INTO $tableName (" . join(",", @columnNames) . ") VALUES " .
"(" . join(",", @columnValues) . ")";
my $results = $dbHandle->execute($cmd) or die $dbHandle->errstr;
my $warnCount = $dbHandle->quickQuery("SHOW COUNT(*) WARNINGS");
my @warnings = getDbWarnings($dbHandle);
if (scalar(@warnings) > 0) {
print STDERR "Got a warning when executing $cmd.\n";
for my $w (@warnings) {
print STDERR "\t$w\n";
}
}
}
############################################################################
# Main
############################################################################
my $now = time();
my $wd = cwd();
my $ok = GetOptions("instance=s",
"configDir=s",
"verbose=i"
);
# parse options
usage() if (!$ok);
usage() if (scalar(@ARGV) != 2 and scalar(@ARGV) != 3);
# get options or set defaults
if (not defined $opt_instance) {
$opt_instance = "prod";
}
my $configPath;
if (defined $opt_configDir) {
if ($opt_configDir =~ /^\//) {
$configPath = $opt_configDir;
} else {
$configPath = "$wd/$opt_configDir";
}
} else {
$configPath = "/hive/groups/encode/dcc/pipeline/encpipeline_$opt_instance/config/";
}
if(!(-d $configPath)) {
die "configPath '$configPath' is invalid; Can't find the config directory\n";
}
if (not defined $opt_verbose) {
$opt_verbose = 1;
}
HgAutomate::verbose(4, "Config directory path: \'$configPath\'\n");
my $database = $ARGV[0];
my $tableName = $ARGV[1];
my $termName;
if (scalar(@ARGV) == 3) {
$termName = $ARGV[2];
} else {
$termName = $tableName;
}
# read the cv.ra file
my %cvTerms = Encode::getControlledVocab($configPath);
# connect to the database and read the metadata table for the obj
my $dbHandle = HgDb->new(DB => $database);
# get hashes contraining the fields and term names
my %dbFields = map { $_ => 0 } getDbFields($dbHandle, $tableName);
my %cvFields = map { $_ => 0 } getCvFields(%{$cvTerms{$termName}});
delete $cvFields{"type"};
# check if there are any fields in the CV that aren't in the table
my $fatalError = 0;
for my $f (keys %cvFields) {
if (defined $dbFields{mapTermToColumn($f)}) {
} else {
print STDERR "ERROR: Found field $f in CV that is missing from database table.\n";
$fatalError = 1;
}
}
# check if there are any fields in the table that aren't in the CV
for my $f (keys %dbFields) {
if (defined $cvFields{mapColumnToTerm($f)}) {
} else {
print STDERR "WARNING: Found field $f in database table that is missing from CV.\n";
}
}
if ($fatalError) {
exit 10;
}
# read the names present in the database
my %dbNames = ();
my $results = $dbHandle->execute("SELECT name FROM $tableName") or die $dbHandle->errstr;
if ($results) {
while(my @row = $results->fetchrow_array()) {
my $name = $row[0];
$dbNames{$name} = 0;
}
}
# read the names from the CV
my %cvNames = ();
my %terms = %{$cvTerms{$termName}};
for my $t (keys %terms) {
$cvNames{$t} = 0; # store the name for later diff'ing
my %termInfo = %{$terms{$t}};
if (defined $dbNames{$t}) { # if it's already in the table,
HgAutomate::verbose(3, "Term $t already in table $tableName.\n");
checkUpdateFields($dbHandle, $tableName, %termInfo);
} else { # add it
HgAutomate::verbose(2, "Term $t not in table $tableName, adding it.\n");
addNewTerm($dbHandle, $tableName, %termInfo);
}
}
# check for terms in the database that are not in the cv
for my $n (sort keys %dbNames) {
if (defined $cvNames{$n}) { # if in cv
# do nothing
} else { # otherwise give a warning
print STDERR "Term $n found in table $tableName but not in cv.\n";
}
}