src/hg/encode/expTblTools/loadCVTbl 1.1
1.1 2010/06/03 00:04:58 krish
added some tool under dev'ment to deal with exp table
Index: src/hg/encode/expTblTools/loadCVTbl
===================================================================
RCS file: src/hg/encode/expTblTools/loadCVTbl
diff -N src/hg/encode/expTblTools/loadCVTbl
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/encode/expTblTools/loadCVTbl 3 Jun 2010 00:04:58 -0000 1.1
@@ -0,0 +1,327 @@
+#!/usr/bin/env perl
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Cwd;
+
+use lib "/cluster/bin/scripts";
+use Encode;
+use RAFile;
+use HgAutomate;
+use HgDb;
+
+use vars qw/
+ $opt_configDir
+ $opt_verbose
+ $opt_instance
+ /;
+
+sub usage {
+ print STDERR <<END;
+usage: loadCellTbl database table [term]
+ Populate the a CV table in the given database from the cv.ra file.
+
+ -instance=s Use ENCODE pipeline instance s (default prod).
+ -configDir=dir Path of configuration directory, containing metadata
+ .ra files (default: /hive/groups/encode/dcc/pipeline/encpipeline_prod/config/)
+ -verbose=num Set verbose level to num (default 1).
+END
+exit 1;
+}
+
+################################################################################
+
+my $commentedout = <<EOS;
+CREATE TABLE cell (
+ name VARCHAR(255) NOT NULL,
+ tag VARCHAR(255) NOT NULL,
+ organism VARCHAR(255) NOT NULL,
+ category VARCHAR(20),
+ description TEXT,
+ vendorName TEXT,
+ vendorId VARCHAR(255),
+ orderUrl VARCHAR(255),
+ karyotype VARCHAR(255),
+ lineage TEXT,
+ termId VARCHAR(255),
+ termUrl VARCHAR(255),
+ color VARCHAR(255),
+ sex VARCHAR(20),
+ tier INT,
+ protocol VARCHAR(255),
+ PRIMARY KEY (name)
+);
+EOS
+
+################################################################################
+
+my %termToColumnMapper = (
+ "term" => "name",
+ "Category" => "category",
+);
+
+sub mapTermToColumn {
+ my $term = shift;
+ if (exists $termToColumnMapper{$term}) {
+ return $termToColumnMapper{$term};
+ } else {
+ return $term;
+ }
+}
+
+sub mapColumnToTerm {
+ my $column = shift;
+ for my $k (keys %termToColumnMapper) {
+ if ($column eq $termToColumnMapper{$k}) {
+ return $k;
+ }
+ }
+
+ return $column;
+}
+
+# return a list of warning
+sub getDbWarnings {
+ my $dbHandle = shift;
+ my @warnings = ();
+ my $results = $dbHandle->execute("SHOW WARNINGS") or die $dbHandle->errstr;
+ while (my @row = $results->fetchrow_array()) {
+ push @warnings, $row[2];
+ }
+ return @warnings;
+}
+
+# return a list of the fields in a database table
+sub getDbFields {
+ my $dbHandle = shift;
+ my $tableName = shift;
+
+ my @fieldsList = ();
+
+ my $cmd = "SHOW COLUMNS FROM $tableName";
+ my $results = $dbHandle->execute($cmd) or die $dbHandle->errstr;
+ if ($results) {
+ while (my @row = $results->fetchrow_array()) {
+ push @fieldsList, $row[0];
+ }
+ } else {
+ die $dbHandle->errstr;
+ }
+
+ return sort @fieldsList;
+}
+
+# returns all the possible fields for the given CV terms
+sub getCvFields {
+ my %records = @_;
+
+ my %fieldsHash = ();
+
+ for my $k (keys %records) {
+ my @fields = keys(%{$records{$k}});
+ for my $f (@fields) {
+ $fieldsHash{$f} = 0;
+ }
+ }
+
+ return sort keys %fieldsHash;
+}
+
+sub checkUpdateFields {
+ my $dbHandle = shift;
+ my $tableName = shift;
+ my %termInfo = @_;
+
+ my $name = $termInfo{"term"};
+ my @columnNames = ();
+ my @cvValues = ();
+ my @dbValues = ();
+
+ # get all columns that are in the CV
+ for my $k (keys %termInfo) {
+ next if ($k eq "type");
+ push @columnNames, mapTermToColumn($k);
+ if (defined $termInfo{$k}) {
+ push @cvValues, $termInfo{$k};
+ } else {
+ push @cvValues, "";
+ }
+ }
+
+ # get all columns fron the database
+ my $cmd = "SELECT " . join(",", @columnNames) . " FROM $tableName WHERE name = " . $dbHandle->quote($name);
+ my $results = $dbHandle->execute($cmd) or die $dbHandle->errstr;
+ if ($results) {
+ my @row = $results->fetchrow_array();
+ for (my $i = 0; $i < scalar(@row); ++$i) {
+ # check if database has empty column
+ if (not defined $row[$i] or $row[$i] eq "") {
+ # check if there is anything in the CV to back fill
+ if ($cvValues[$i] ne "") {
+ HgAutomate::verbose(2, "Backfilling field " . $columnNames[$i] . " with " . $cvValues[$i] . " in $name\n");
+ my $updateCmd = "UPDATE $tableName" .
+ " SET " . $columnNames[$i] . " = " . $dbHandle->quote($cvValues[$i]) .
+ " WHERE name = " . $dbHandle->quote($name);
+ $dbHandle->execute($updateCmd) or die $dbHandle->errstr;
+ my @warnings = getDbWarnings($dbHandle);
+ if (scalar(@warnings) > 0) {
+ print STDERR "Got a warning when executing $cmd.\n";
+ for my $w (@warnings) {
+ print STDERR "\t$w\n";
+ }
+ }
+ }
+ } elsif ($cvValues[$i] ne $row[$i]) { # check to make sure the CV and database tables are equal
+ print STDERR "WARNING: term $name in $tableName contrains different $columnNames[$i], $cvValues[$i] != $row[$i]\n";
+ }
+ }
+ } else {
+ die $dbHandle->errstr;
+ }
+}
+
+# add a new term to the database table
+sub addNewTerm {
+ my $dbHandle = shift;
+ my $tableName = shift;
+ my %termInfo = @_;
+
+ my @columnNames = ();
+ my @columnValues = ();
+
+ # get the defined column and it's value
+ for my $k (keys %termInfo) {
+ next if ($k eq "type");
+ push @columnNames, mapTermToColumn($k);
+ if (defined $termInfo{$k}) {
+ push @columnValues, $dbHandle->quote($termInfo{$k});
+ } else {
+ push @columnValues, $dbHandle->quote("");
+ }
+ }
+
+ my $cmd = "INSERT INTO $tableName (" . join(",", @columnNames) . ") VALUES " .
+ "(" . join(",", @columnValues) . ")";
+ my $results = $dbHandle->execute($cmd) or die $dbHandle->errstr;
+ my $warnCount = $dbHandle->quickQuery("SHOW COUNT(*) WARNINGS");
+ my @warnings = getDbWarnings($dbHandle);
+ if (scalar(@warnings) > 0) {
+ print STDERR "Got a warning when executing $cmd.\n";
+ for my $w (@warnings) {
+ print STDERR "\t$w\n";
+ }
+ }
+}
+
+############################################################################
+# Main
+############################################################################
+
+my $now = time();
+my $wd = cwd();
+my $ok = GetOptions("instance=s",
+ "configDir=s",
+ "verbose=i"
+ );
+# parse options
+usage() if (!$ok);
+usage() if (scalar(@ARGV) != 2 and scalar(@ARGV) != 3);
+# get options or set defaults
+if (not defined $opt_instance) {
+ $opt_instance = "prod";
+}
+my $configPath;
+if (defined $opt_configDir) {
+ if ($opt_configDir =~ /^\//) {
+ $configPath = $opt_configDir;
+ } else {
+ $configPath = "$wd/$opt_configDir";
+ }
+} else {
+ $configPath = "/hive/groups/encode/dcc/pipeline/encpipeline_$opt_instance/config/";
+}
+if(!(-d $configPath)) {
+ die "configPath '$configPath' is invalid; Can't find the config directory\n";
+}
+if (not defined $opt_verbose) {
+ $opt_verbose = 1;
+}
+HgAutomate::verbose(4, "Config directory path: \'$configPath\'\n");
+
+my $database = $ARGV[0];
+my $tableName = $ARGV[1];
+my $termName;
+if (scalar(@ARGV) == 3) {
+ $termName = $ARGV[2];
+} else {
+ $termName = $tableName;
+}
+
+# read the cv.ra file
+my %cvTerms = Encode::getControlledVocab($configPath);
+
+# connect to the database and read the metadata table for the obj
+my $dbHandle = HgDb->new(DB => $database);
+
+# get hashes contraining the fields and term names
+my %dbFields = map { $_ => 0 } getDbFields($dbHandle, $tableName);
+my %cvFields = map { $_ => 0 } getCvFields(%{$cvTerms{$termName}});
+delete $cvFields{"type"};
+
+# check if there are any fields in the CV that aren't in the table
+my $fatalError = 0;
+for my $f (keys %cvFields) {
+ if (defined $dbFields{mapTermToColumn($f)}) {
+ } else {
+ print STDERR "ERROR: Found field $f in CV that is missing from database table.\n";
+ $fatalError = 1;
+ }
+}
+
+# check if there are any fields in the table that aren't in the CV
+for my $f (keys %dbFields) {
+ if (defined $cvFields{mapColumnToTerm($f)}) {
+ } else {
+ print STDERR "WARNING: Found field $f in database table that is missing from CV.\n";
+ }
+}
+
+if ($fatalError) {
+ exit 10;
+}
+
+# read the names present in the database
+my %dbNames = ();
+my $results = $dbHandle->execute("SELECT name FROM $tableName") or die $dbHandle->errstr;
+if ($results) {
+ while(my @row = $results->fetchrow_array()) {
+ my $name = $row[0];
+ $dbNames{$name} = 0;
+ }
+}
+
+# read the names from the CV
+my %cvNames = ();
+my %terms = %{$cvTerms{$termName}};
+for my $t (keys %terms) {
+ $cvNames{$t} = 0; # store the name for later diff'ing
+ my %termInfo = %{$terms{$t}};
+ if (defined $dbNames{$t}) { # if it's already in the table,
+ HgAutomate::verbose(3, "Term $t already in table $tableName.\n");
+ checkUpdateFields($dbHandle, $tableName, %termInfo);
+ } else { # add it
+ HgAutomate::verbose(2, "Term $t not in table $tableName, adding it.\n");
+ addNewTerm($dbHandle, $tableName, %termInfo);
+ }
+}
+
+# check for terms in the database that are not in the cv
+for my $n (sort keys %dbNames) {
+ if (defined $cvNames{$n}) { # if in cv
+ # do nothing
+ } else { # otherwise give a warning
+ print STDERR "Term $n found in table $tableName but not in cv.\n";
+ }
+}