src/hg/encode/expTblTools/loadCVTbl 1.1

1.1 2010/06/03 00:04:58 krish
added some tool under dev'ment to deal with exp table
Index: src/hg/encode/expTblTools/loadCVTbl
===================================================================
RCS file: src/hg/encode/expTblTools/loadCVTbl
diff -N src/hg/encode/expTblTools/loadCVTbl
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/encode/expTblTools/loadCVTbl	3 Jun 2010 00:04:58 -0000	1.1
@@ -0,0 +1,327 @@
+#!/usr/bin/env perl
+
+use warnings;
+use strict;
+
+use Getopt::Long;
+use Cwd;
+
+use lib "/cluster/bin/scripts";
+use Encode;
+use RAFile;
+use HgAutomate;
+use HgDb;
+
+use vars qw/
+    $opt_configDir
+    $opt_verbose
+    $opt_instance
+    /;
+
+sub usage {
+    print STDERR <<END;
+usage: loadCellTbl database table [term]
+    Populate the a CV table in the given database from the cv.ra file.    
+
+    -instance=s         Use ENCODE pipeline instance s (default prod).
+    -configDir=dir      Path of configuration directory, containing metadata
+                        .ra files (default: /hive/groups/encode/dcc/pipeline/encpipeline_prod/config/)
+    -verbose=num        Set verbose level to num (default 1).
+END
+exit 1;
+}
+
+################################################################################
+
+my $commentedout = <<EOS;
+CREATE TABLE cell (
+    name VARCHAR(255) NOT NULL,
+    tag VARCHAR(255) NOT NULL,
+    organism VARCHAR(255) NOT NULL,
+    category VARCHAR(20),
+    description TEXT,
+    vendorName TEXT,
+    vendorId VARCHAR(255),
+    orderUrl VARCHAR(255),
+    karyotype VARCHAR(255),
+    lineage TEXT,
+    termId VARCHAR(255),
+    termUrl VARCHAR(255),
+    color VARCHAR(255),
+    sex VARCHAR(20),
+    tier INT,
+    protocol VARCHAR(255),
+    PRIMARY KEY (name)
+);
+EOS
+
+################################################################################
+
+my %termToColumnMapper = (
+    "term" => "name",
+    "Category" => "category",
+);
+
+sub mapTermToColumn { 
+    my $term = shift;
+    if (exists $termToColumnMapper{$term}) {
+        return $termToColumnMapper{$term};
+    } else {
+        return $term;
+    }
+}
+
+sub mapColumnToTerm {
+    my $column = shift;
+    for my $k (keys %termToColumnMapper) {
+        if ($column eq $termToColumnMapper{$k}) {
+            return $k;
+        }
+    }
+
+    return $column;
+}
+
+# return a list of warning
+sub getDbWarnings {
+    my $dbHandle = shift;
+    my @warnings = ();
+    my $results = $dbHandle->execute("SHOW WARNINGS") or die $dbHandle->errstr;
+    while (my @row = $results->fetchrow_array()) {
+        push @warnings, $row[2]; 
+    }
+    return @warnings;
+}
+
+# return a list of the fields in a database table
+sub getDbFields {
+    my $dbHandle = shift;
+    my $tableName = shift;
+
+    my @fieldsList = ();
+
+    my $cmd = "SHOW COLUMNS FROM $tableName";
+    my $results = $dbHandle->execute($cmd) or die $dbHandle->errstr;
+    if ($results) {
+        while (my @row = $results->fetchrow_array()) {
+            push @fieldsList, $row[0]; 
+        }
+    } else {
+        die $dbHandle->errstr;
+    }
+
+    return sort @fieldsList;
+}
+
+# returns all the possible fields for the given CV terms
+sub getCvFields {
+    my %records = @_;
+
+    my %fieldsHash = ();
+
+    for my $k (keys %records) {
+        my @fields = keys(%{$records{$k}});
+        for my $f (@fields) {
+            $fieldsHash{$f} = 0;
+        }
+    }
+    
+    return sort keys %fieldsHash;
+}
+
+sub checkUpdateFields {
+    my $dbHandle = shift;
+    my $tableName = shift;
+    my %termInfo = @_;
+
+    my $name = $termInfo{"term"};
+    my @columnNames = ();
+    my @cvValues = ();
+    my @dbValues = ();
+
+    # get all columns that are in the CV
+    for my $k (keys %termInfo) {
+        next if ($k eq "type");
+        push @columnNames,  mapTermToColumn($k);
+        if (defined $termInfo{$k}) {
+            push @cvValues, $termInfo{$k};
+        } else {
+            push @cvValues, "";
+        }
+    }
+
+    # get all columns fron the database
+    my $cmd = "SELECT " . join(",", @columnNames) . " FROM $tableName WHERE name = " . $dbHandle->quote($name);
+    my $results = $dbHandle->execute($cmd) or die $dbHandle->errstr;
+    if ($results) {
+        my @row = $results->fetchrow_array();
+        for (my $i = 0; $i < scalar(@row); ++$i) {
+            # check if database has empty column
+            if (not defined $row[$i] or $row[$i] eq "") {
+                # check if there is anything in the CV to back fill
+                if ($cvValues[$i] ne "") {
+                    HgAutomate::verbose(2, "Backfilling field " . $columnNames[$i] . " with " . $cvValues[$i] . " in $name\n");
+                    my $updateCmd = "UPDATE $tableName" .
+                                    " SET " . $columnNames[$i] . " = " . $dbHandle->quote($cvValues[$i]) .
+                                    " WHERE name = " . $dbHandle->quote($name);
+                    $dbHandle->execute($updateCmd) or die $dbHandle->errstr;
+                    my @warnings = getDbWarnings($dbHandle);
+                    if (scalar(@warnings) > 0) {
+                        print STDERR "Got a warning when executing $cmd.\n";
+                        for my $w (@warnings) {
+                            print STDERR "\t$w\n";
+                        }
+                    }
+                }
+            } elsif ($cvValues[$i] ne $row[$i]) {    # check to make sure the CV and database tables are equal
+                print STDERR "WARNING: term $name in $tableName contrains different $columnNames[$i], $cvValues[$i] != $row[$i]\n";
+            }
+        }
+    } else {
+        die $dbHandle->errstr;
+    }
+}
+
+# add a new term to the database table
+sub addNewTerm {
+    my $dbHandle = shift;
+    my $tableName = shift;
+    my %termInfo = @_;
+
+    my @columnNames = ();
+    my @columnValues = ();
+
+    # get the defined column and it's value
+    for my $k (keys %termInfo) {
+        next if ($k eq "type");
+        push @columnNames,  mapTermToColumn($k);
+        if (defined $termInfo{$k}) {
+            push @columnValues, $dbHandle->quote($termInfo{$k});
+        } else {
+            push @columnValues, $dbHandle->quote("");
+        }
+    }
+
+    my $cmd = "INSERT INTO $tableName (" . join(",", @columnNames)  . ") VALUES " .
+                                     "(" . join(",", @columnValues) . ")";
+    my $results = $dbHandle->execute($cmd) or die $dbHandle->errstr;
+    my $warnCount = $dbHandle->quickQuery("SHOW COUNT(*) WARNINGS");
+    my @warnings = getDbWarnings($dbHandle);
+    if (scalar(@warnings) > 0) {
+        print STDERR "Got a warning when executing $cmd.\n";
+        for my $w (@warnings) {
+            print STDERR "\t$w\n";
+        }
+    }
+}
+
+############################################################################
+# Main
+############################################################################
+
+my $now = time();
+my $wd = cwd();
+my $ok = GetOptions("instance=s",
+                    "configDir=s",
+                    "verbose=i"
+                    );
+# parse options
+usage() if (!$ok);
+usage() if (scalar(@ARGV) != 2 and scalar(@ARGV) != 3);
+# get options or set defaults
+if (not defined $opt_instance) {
+    $opt_instance = "prod";
+}
+my $configPath;
+if (defined $opt_configDir) {
+    if ($opt_configDir =~ /^\//) {
+        $configPath = $opt_configDir;
+    } else {
+        $configPath = "$wd/$opt_configDir";
+    }
+} else {
+    $configPath = "/hive/groups/encode/dcc/pipeline/encpipeline_$opt_instance/config/";
+}
+if(!(-d $configPath)) {
+    die "configPath '$configPath' is invalid; Can't find the config directory\n";
+}
+if (not defined $opt_verbose) {
+    $opt_verbose = 1;
+}
+HgAutomate::verbose(4, "Config directory path: \'$configPath\'\n");
+
+my $database  = $ARGV[0];
+my $tableName = $ARGV[1];
+my $termName;
+if (scalar(@ARGV) == 3) {
+    $termName = $ARGV[2];
+} else {
+    $termName = $tableName;
+}
+
+# read the cv.ra file
+my %cvTerms = Encode::getControlledVocab($configPath);
+
+# connect to the database and read the metadata table for the obj
+my $dbHandle = HgDb->new(DB => $database);
+
+# get hashes contraining the fields and term names
+my %dbFields = map { $_ => 0 } getDbFields($dbHandle, $tableName);
+my %cvFields = map { $_ => 0 } getCvFields(%{$cvTerms{$termName}});
+delete $cvFields{"type"};
+
+# check if there are any fields in the CV that aren't in the table
+my $fatalError = 0;
+for my $f (keys %cvFields) {
+    if (defined $dbFields{mapTermToColumn($f)}) {
+    } else {
+        print STDERR "ERROR: Found field $f in CV that is missing from database table.\n";
+        $fatalError = 1;
+    }
+}
+
+# check if there are any fields in the table that aren't in the CV
+for my $f (keys %dbFields) {
+    if (defined $cvFields{mapColumnToTerm($f)}) {
+    } else {
+        print STDERR "WARNING: Found field $f in database table that is missing from CV.\n";
+    }
+}
+
+if ($fatalError) {
+    exit 10;
+}
+
+# read the names present in the database
+my %dbNames = ();
+my $results = $dbHandle->execute("SELECT name FROM $tableName") or die $dbHandle->errstr;
+if ($results) {
+    while(my @row = $results->fetchrow_array()) {
+        my $name = $row[0];
+        $dbNames{$name} = 0;
+    }
+}
+
+# read the names from the CV
+my %cvNames = ();
+my %terms = %{$cvTerms{$termName}};
+for my $t (keys %terms) {
+    $cvNames{$t} = 0;   # store the name for later diff'ing
+    my %termInfo = %{$terms{$t}};
+    if (defined $dbNames{$t}) {     # if it's already in the table, 
+        HgAutomate::verbose(3, "Term $t already in table $tableName.\n");
+        checkUpdateFields($dbHandle, $tableName, %termInfo);
+    } else {        # add it
+        HgAutomate::verbose(2, "Term $t not in table $tableName, adding it.\n");
+        addNewTerm($dbHandle, $tableName, %termInfo);
+    }
+}
+
+# check for terms in the database that are not in the cv
+for my $n (sort keys %dbNames) {
+    if (defined $cvNames{$n}) {     # if in cv
+        # do nothing
+    } else {        # otherwise give a warning
+        print STDERR "Term $n found in table $tableName but not in cv.\n";
+    }
+}