src/hg/encode/expTblTools/loadCVTbl 1.2

1.2 2010/06/03 23:30:30 vsmalladi
Updated schema to support types Antibody, protocol, control, treatment
Index: src/hg/encode/expTblTools/loadCVTbl
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/expTblTools/loadCVTbl,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/encode/expTblTools/loadCVTbl	3 Jun 2010 00:04:58 -0000	1.1
+++ src/hg/encode/expTblTools/loadCVTbl	3 Jun 2010 23:30:30 -0000	1.2
@@ -1,327 +1,371 @@
 #!/usr/bin/env perl
 
 use warnings;
 use strict;
 
 use Getopt::Long;
 use Cwd;
 
 use lib "/cluster/bin/scripts";
 use Encode;
 use RAFile;
 use HgAutomate;
 use HgDb;
 
 use vars qw/
     $opt_configDir
     $opt_verbose
     $opt_instance
     /;
 
 sub usage {
     print STDERR <<END;
 usage: loadCellTbl database table [term]
     Populate the a CV table in the given database from the cv.ra file.    
 
     -instance=s         Use ENCODE pipeline instance s (default prod).
     -configDir=dir      Path of configuration directory, containing metadata
                         .ra files (default: /hive/groups/encode/dcc/pipeline/encpipeline_prod/config/)
     -verbose=num        Set verbose level to num (default 1).
 END
 exit 1;
 }
 
 ################################################################################
 
 my $commentedout = <<EOS;
+# cell table
 CREATE TABLE cell (
     name VARCHAR(255) NOT NULL,
     tag VARCHAR(255) NOT NULL,
     organism VARCHAR(255) NOT NULL,
-    category VARCHAR(20),
+    Category VARCHAR(20),
     description TEXT,
+    tissue VARCHAR(255),
     vendorName TEXT,
     vendorId VARCHAR(255),
     orderUrl VARCHAR(255),
     karyotype VARCHAR(255),
     lineage TEXT,
     termId VARCHAR(255),
     termUrl VARCHAR(255),
     color VARCHAR(255),
     sex VARCHAR(20),
     tier INT,
     protocol VARCHAR(255),
     PRIMARY KEY (name)
 );
+
+#antibody table
+CREATE TABLE antibody (
+    name VARCHAR(255) NOT NULL,
+    tag VARCHAR(255) NOT NULL,
+    displayName VARCHAR(255) DEFAULT "Not Decided",
+    antibodyDescription TEXT,
+    validation VARCHAR(255),
+    lab VARCHAR(255),
+    targetId VARCHAR(255),
+    targetDescription TEXT,
+    targetUrl VARCHAR(255),
+    geneId VARCHAR(255),
+    vendorName VARCHAR(255),
+    vendorId VARCHAR(255),
+    lots VARCHAR(255),
+    orderUrl VARCHAR(255),
+    PRIMARY KEY (name)
+);
+
+#protocol table
+CREATE TABLE protocol (
+    name VARCHAR(255) NOT NULL,
+    tag VARCHAR(255) NOT NULL,
+    description VARCHAR(255) NOT NULL
+);
+
+#treatment table
+CREATE TABLE treatment (
+    name VARCHAR(255) NOT NULL,
+    tag VARCHAR(255) NOT NULL,
+    description VARCHAR(255),
+    label VARCHAR(255) DEFAULT "Not Decided"
+);
+
+#control table
+CREATE TABLE control (
+    name VARCHAR(255) NOT NULL,
+    tag VARCHAR(255) NOT NULL,
+    description VARCHAR(255)
+);
+
 EOS
 
 ################################################################################
 
 my %termToColumnMapper = (
     "term" => "name",
-    "Category" => "category",
+    "Lots" => "lots" # not sure why the term is spelled different need to investigate
 );
 
 sub mapTermToColumn { 
     my $term = shift;
     if (exists $termToColumnMapper{$term}) {
         return $termToColumnMapper{$term};
     } else {
         return $term;
     }
 }
 
 sub mapColumnToTerm {
     my $column = shift;
     for my $k (keys %termToColumnMapper) {
         if ($column eq $termToColumnMapper{$k}) {
             return $k;
         }
     }
 
     return $column;
 }
 
 # return a list of warning
 sub getDbWarnings {
     my $dbHandle = shift;
     my @warnings = ();
     my $results = $dbHandle->execute("SHOW WARNINGS") or die $dbHandle->errstr;
     while (my @row = $results->fetchrow_array()) {
         push @warnings, $row[2]; 
     }
     return @warnings;
 }
 
 # return a list of the fields in a database table
 sub getDbFields {
     my $dbHandle = shift;
     my $tableName = shift;
 
     my @fieldsList = ();
 
     my $cmd = "SHOW COLUMNS FROM $tableName";
     my $results = $dbHandle->execute($cmd) or die $dbHandle->errstr;
     if ($results) {
         while (my @row = $results->fetchrow_array()) {
             push @fieldsList, $row[0]; 
         }
     } else {
         die $dbHandle->errstr;
     }
 
     return sort @fieldsList;
 }
 
 # returns all the possible fields for the given CV terms
 sub getCvFields {
     my %records = @_;
 
     my %fieldsHash = ();
 
     for my $k (keys %records) {
         my @fields = keys(%{$records{$k}});
         for my $f (@fields) {
             $fieldsHash{$f} = 0;
         }
     }
     
     return sort keys %fieldsHash;
 }
 
 sub checkUpdateFields {
     my $dbHandle = shift;
     my $tableName = shift;
     my %termInfo = @_;
 
     my $name = $termInfo{"term"};
     my @columnNames = ();
     my @cvValues = ();
     my @dbValues = ();
 
     # get all columns that are in the CV
     for my $k (keys %termInfo) {
         next if ($k eq "type");
         push @columnNames,  mapTermToColumn($k);
         if (defined $termInfo{$k}) {
             push @cvValues, $termInfo{$k};
         } else {
             push @cvValues, "";
         }
     }
 
     # get all columns fron the database
     my $cmd = "SELECT " . join(",", @columnNames) . " FROM $tableName WHERE name = " . $dbHandle->quote($name);
     my $results = $dbHandle->execute($cmd) or die $dbHandle->errstr;
     if ($results) {
         my @row = $results->fetchrow_array();
         for (my $i = 0; $i < scalar(@row); ++$i) {
             # check if database has empty column
             if (not defined $row[$i] or $row[$i] eq "") {
                 # check if there is anything in the CV to back fill
                 if ($cvValues[$i] ne "") {
                     HgAutomate::verbose(2, "Backfilling field " . $columnNames[$i] . " with " . $cvValues[$i] . " in $name\n");
                     my $updateCmd = "UPDATE $tableName" .
                                     " SET " . $columnNames[$i] . " = " . $dbHandle->quote($cvValues[$i]) .
                                     " WHERE name = " . $dbHandle->quote($name);
                     $dbHandle->execute($updateCmd) or die $dbHandle->errstr;
                     my @warnings = getDbWarnings($dbHandle);
                     if (scalar(@warnings) > 0) {
                         print STDERR "Got a warning when executing $cmd.\n";
                         for my $w (@warnings) {
                             print STDERR "\t$w\n";
                         }
                     }
                 }
             } elsif ($cvValues[$i] ne $row[$i]) {    # check to make sure the CV and database tables are equal
                 print STDERR "WARNING: term $name in $tableName contrains different $columnNames[$i], $cvValues[$i] != $row[$i]\n";
             }
         }
     } else {
         die $dbHandle->errstr;
     }
 }
 
 # add a new term to the database table
 sub addNewTerm {
     my $dbHandle = shift;
     my $tableName = shift;
     my %termInfo = @_;
 
     my @columnNames = ();
     my @columnValues = ();
 
     # get the defined column and it's value
     for my $k (keys %termInfo) {
         next if ($k eq "type");
         push @columnNames,  mapTermToColumn($k);
         if (defined $termInfo{$k}) {
             push @columnValues, $dbHandle->quote($termInfo{$k});
         } else {
             push @columnValues, $dbHandle->quote("");
         }
     }
 
     my $cmd = "INSERT INTO $tableName (" . join(",", @columnNames)  . ") VALUES " .
                                      "(" . join(",", @columnValues) . ")";
     my $results = $dbHandle->execute($cmd) or die $dbHandle->errstr;
     my $warnCount = $dbHandle->quickQuery("SHOW COUNT(*) WARNINGS");
     my @warnings = getDbWarnings($dbHandle);
     if (scalar(@warnings) > 0) {
         print STDERR "Got a warning when executing $cmd.\n";
         for my $w (@warnings) {
             print STDERR "\t$w\n";
         }
     }
 }
 
 ############################################################################
 # Main
 ############################################################################
 
 my $now = time();
 my $wd = cwd();
 my $ok = GetOptions("instance=s",
                     "configDir=s",
                     "verbose=i"
                     );
 # parse options
 usage() if (!$ok);
 usage() if (scalar(@ARGV) != 2 and scalar(@ARGV) != 3);
 # get options or set defaults
 if (not defined $opt_instance) {
     $opt_instance = "prod";
 }
 my $configPath;
 if (defined $opt_configDir) {
     if ($opt_configDir =~ /^\//) {
         $configPath = $opt_configDir;
     } else {
         $configPath = "$wd/$opt_configDir";
     }
 } else {
     $configPath = "/hive/groups/encode/dcc/pipeline/encpipeline_$opt_instance/config/";
 }
 if(!(-d $configPath)) {
     die "configPath '$configPath' is invalid; Can't find the config directory\n";
 }
 if (not defined $opt_verbose) {
     $opt_verbose = 1;
 }
 HgAutomate::verbose(4, "Config directory path: \'$configPath\'\n");
 
 my $database  = $ARGV[0];
 my $tableName = $ARGV[1];
 my $termName;
 if (scalar(@ARGV) == 3) {
     $termName = $ARGV[2];
 } else {
     $termName = $tableName;
 }
 
 # read the cv.ra file
 my %cvTerms = Encode::getControlledVocab($configPath);
 
 # connect to the database and read the metadata table for the obj
 my $dbHandle = HgDb->new(DB => $database);
 
 # get hashes contraining the fields and term names
 my %dbFields = map { $_ => 0 } getDbFields($dbHandle, $tableName);
 my %cvFields = map { $_ => 0 } getCvFields(%{$cvTerms{$termName}});
 delete $cvFields{"type"};
 
 # check if there are any fields in the CV that aren't in the table
 my $fatalError = 0;
 for my $f (keys %cvFields) {
     if (defined $dbFields{mapTermToColumn($f)}) {
     } else {
         print STDERR "ERROR: Found field $f in CV that is missing from database table.\n";
         $fatalError = 1;
     }
 }
 
 # check if there are any fields in the table that aren't in the CV
 for my $f (keys %dbFields) {
     if (defined $cvFields{mapColumnToTerm($f)}) {
     } else {
         print STDERR "WARNING: Found field $f in database table that is missing from CV.\n";
     }
 }
 
 if ($fatalError) {
     exit 10;
 }
 
 # read the names present in the database
 my %dbNames = ();
 my $results = $dbHandle->execute("SELECT name FROM $tableName") or die $dbHandle->errstr;
 if ($results) {
     while(my @row = $results->fetchrow_array()) {
         my $name = $row[0];
         $dbNames{$name} = 0;
     }
 }
 
 # read the names from the CV
 my %cvNames = ();
 my %terms = %{$cvTerms{$termName}};
 for my $t (keys %terms) {
     $cvNames{$t} = 0;   # store the name for later diff'ing
     my %termInfo = %{$terms{$t}};
     if (defined $dbNames{$t}) {     # if it's already in the table, 
         HgAutomate::verbose(3, "Term $t already in table $tableName.\n");
         checkUpdateFields($dbHandle, $tableName, %termInfo);
     } else {        # add it
         HgAutomate::verbose(2, "Term $t not in table $tableName, adding it.\n");
         addNewTerm($dbHandle, $tableName, %termInfo);
     }
 }
 
 # check for terms in the database that are not in the cv
 for my $n (sort keys %dbNames) {
     if (defined $cvNames{$n}) {     # if in cv
         # do nothing
     } else {        # otherwise give a warning
         print STDERR "Term $n found in table $tableName but not in cv.\n";
     }
 }