src/hg/encode/encodeMkGeoPkg/encodeMkGeoPkg 1.3

1.3 2010/05/13 08:25:11 krish
more general characteristics system
Index: src/hg/encode/encodeMkGeoPkg/encodeMkGeoPkg
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/encode/encodeMkGeoPkg/encodeMkGeoPkg,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 4 -r1.2 -r1.3
--- src/hg/encode/encodeMkGeoPkg/encodeMkGeoPkg	18 Apr 2010 09:37:14 -0000	1.2
+++ src/hg/encode/encodeMkGeoPkg/encodeMkGeoPkg	13 May 2010 08:25:11 -0000	1.3
@@ -11,79 +11,348 @@
 use RAFile;
 use HgAutomate;
 use HgDb;
 
+# maps metadata terms to the terms used in cv.ra
+my %expVarCVMapper = (
+    "cell" => "Cell Line",
+    "antibody" => "Antibody",
+    "treatment" => "treatment",
+    "control" => "control",
+);
+
+# for each metadata term, what fields do we want to output
+my %expVarDetails = (
+    "cell"      => ["organism", "description", "karyotype", "lineage", "sex"],
+    "antibody"  => ["antibodyDescription", "targetDescription", "vendorName"],
+    "treatment" => ["description"],
+    "control"   => ["description"],
+);
+
+# translates cv.ra fields into english labels
+my %detailsCleaner = (
+    "antibodyDescription" => "description",
+    "targetDescription"   => "target description",
+    "vendorName"          => "vendor name",
+);
+
+# takes dirty metadata and outputs normalized form
+my %dataTypeCleaner = (
+    "ChIPseq"   => "ChIP-seq",
+    "ChipSeq"   => "ChIP-seq",
+    "DNaseSeq"  => "DNase-seq",
+    "FAIREseq"  => "FAIRE-seq",
+);
+
+# given a normalized datatype (see above), give the molecle involved
+my %dataTypeMolecule = (
+    "ChIP-seq"  => "genomic DNA",
+    "DNase-seq" => "genomic DNA",
+    "FAIRE-seq" => "genomic DNA",
+);
+
+# given a normalized datatype, give the library strategy
+my %dataTypeLibraryStrategy = (
+    "ChIP-seq"  => "ChIP-Seq",
+);
+
+# given a normalized datatype, give the library source
+my %dataTypeLibrarySource = (
+    "ChIP-seq"  => "genomic",
+);
+
+# given a normalized datatype, give the library selection method
+my %dataTypeLibrarySelection = (
+    "ChIP-seq"  => "ChIP",
+);
+
 use vars qw/
     $opt_configDir
     $opt_verbose
+    $opt_instance
     /;
 
 sub usage {
     print STDERR <<END;
-usage: encodeMkGeoPkg database table
-
+usage: encodeMkGeoPkg database composite
+    -instance=s         Use ENCODE pipeline instance s (default prod).
+    -configDir=dir      Path of configuration directory, containing metadata
+                        .ra files (default: /hive/groups/encode/dcc/pipeline/encpipeline_prod/config/)
     -verbose=num        Set verbose level to num (default 1).
 END
 exit 1;
 }
 
 ############################################################################
+# Function
+
+sub getObjMetadata {
+    my ($db, $obj) = @_;
+    my $results = $db->execute("SELECT var, val FROM metaDb WHERE obj = '$obj'");
+    my %metadata = ();
+    if($results) {
+        while(my @row = $results->fetchrow_array()) {
+            my $key = $row[0];
+            my $value = $row[1];
+            $metadata{$key} =  $value;
+            #print "\t$key = $value\n";
+        }
+    }
+
+    return %metadata;
+}
+
+sub getObjsMatching {
+    my ($db, $var, $val) = @_;
+    # get the list of objects matching var=val
+    my $results = $db->execute("SELECT obj FROM metaDb WHERE var = \"$var\" and val = \"$val\"");
+    my @objects = ();
+    if($results) {
+        while(my @row = $results->fetchrow_array()) {
+            my $obj = $row[0];
+            push @objects, $obj;
+        }
+    }
+
+    return @objects;
+}
+
+sub nextTuple(\@\@) {
+    my $current = $_[0];
+    my $last = $_[1];
+    
+    my $i = scalar(@{$current}) - 1;
+    while ($i >= 0 and @{$current}[$i] == @{$last}[$i] - 1) {
+        @{$current}[$i] = 0;
+        --$i;
+    }
+    if ($i < 0) {
+        return 0
+    } else {
+        ++@{$current}[$i];
+        return 1;
+    }
+}
+
+sub getSameMetadata {
+    my $field = shift;
+    my @metadataList = @_;
+
+    my %metadata;
+
+    %metadata = %{$metadataList[0]};
+    my $firstValue = $metadata{$field};
+    for (my $i = 1; $i < scalar(@metadataList); ++$i) {
+        %metadata = %{$metadataList[$i]};
+        die "$firstValue != $metadata{$field}" unless $firstValue eq $metadata{$field};
+    }
+    return $firstValue;
+}
+
+############################################################################
 # Main
 
 my $now = time();
 my $wd = cwd();
-my $ok = GetOptions("configDir=s",
+my $ok = GetOptions("instance=s",
+                    "configDir=s",
                     "verbose=i"
                     );
+# parse options
 usage() if (!$ok);
 usage() if (scalar(@ARGV) < 2);
-$opt_verbose = 1 if (!defined $opt_verbose);
+# get options or set defaults
+if (not defined $opt_instance) {
+    $opt_instance = "prod";
+}
 my $configPath;
 if (defined $opt_configDir) {
     if ($opt_configDir =~ /^\//) {
         $configPath = $opt_configDir;
     } else {
         $configPath = "$wd/$opt_configDir";
     }
 } else {
-    $configPath = "/usr/local/apache/cgi-bin/encode/";
+    $configPath = "/hive/groups/encode/dcc/pipeline/encpipeline_$opt_instance/config/";
 }
 if(!(-d $configPath)) {
     die "configPath '$configPath' is invalid; Can't find the config directory\n";
 }
+if (not defined $opt_verbose) {
+    $opt_verbose = 1;
+}
 HgAutomate::verbose(4, "Config directory path: \'$configPath\'\n");
 
 my $database  = $ARGV[0];
-my $tableName = $ARGV[1];
+my $compositeName = $ARGV[1];
 
-# connect to the database and read the metadata table for the obj
-my $db = HgDb->new(DB => $database);
-
-my $results = $db->execute("SELECT var, val FROM $database.mdb WHERE obj = '$tableName'");
-my %metadata = ();
-if($results) {
-    while(my @row = $results->fetchrow_array()) {
-        my $key = $row[0];
-        my $value = $row[1];
-        $metadata{$key} =  $value;
-    }
-}
+# some counters we use
+my $i;
+my $j;
+my $c;
+my $f;
+my $o;
 
 # read the cv.ra file
-my %terms = Encode::getControlledVocab($configPath);
-my %cellLines = %{$terms{"Cell Line"}};
+my %cvTerms = Encode::getControlledVocab($configPath);
 
-my $title;
-my $organism;
+# connect to the database and read the metadata table for the obj
+my $dbHandle = HgDb->new(DB => $database);
+
+# read the experimental variables from expVars.ra
+my @expVars = Encode::getExpVars($configPath, $compositeName);
+my $numDim = scalar(@expVars);
+
+# read all the possible experiment variables
+#my @expSpace;
+#my @currentTuple;
+#my @lastTuple;
+#for($i = 0; $i < $numDim; ++$i) {
+#    push @expSpace, [];
+#    push @currentTuple, 0;
+#    my $var = $expVars[$i];
+#    my $results = $dbHandle->execute("SELECT val FROM metaDb WHERE obj IN (SELECT obj FROM metaDb WHERE var = \"composite\" and val = \"$compositeName\") AND var = \"$var\" GROUP BY val;");
+#    if($results) {
+#        while(my @row = $results->fetchrow_array()) {
+#            my $val = $row[0];
+#            push @{$expSpace[$i]}, $val;
+#        }
+#    }
+#    push @{$expSpace[$i]}, "";
+#    push @lastTuple, scalar(@{$expSpace[$i]});
+#}
+
+#$, = "\t";
+#print @currentTuple, "\n";
+#while(nextTuple(@currentTuple, @lastTuple)) {
+#    print @currentTuple, "\n";
+#}
+#print @lastTuple, "\n";
+#exit;
+
+# get the list of objects of this composite and store it in a set (hash)
+my %objsSet = map { $_ => 0 } getObjsMatching($dbHandle, "composite", $compositeName);
+my @objects = keys %objsSet;
+
+print STDERR "Number of objects in composite track: ", scalar(keys %objsSet), "\n";
+print STDERR "Number of experimental variables: ", scalar(@expVars), "\n";
+for $i (@expVars) {
+    print STDERR "\t", $i, "\n";
+}
 
-if(defined $metadata{"lab"} and defined $metadata{"cell"} and defined $metadata{"antibody"}) {
-    my $lab = $metadata{"lab"};
-    my $cell = $metadata{"cell"};
-    my $antibody = $metadata{"antibody"};
-    $title = "ENCODE $lab $cell $antibody";
+# while there are objects left
+while(scalar(keys %objsSet) > 0) {
+    # get the 1st object
+    my $firstObject = (keys(%objsSet))[0];
+    my @currentObjects = ($firstObject);
+    delete $objsSet{$firstObject};
+
+    my %metadata;
+    # get the experimental variables for that object
+    %metadata = getObjMetadata($dbHandle, $firstObject);
+    my @currentMetadata = {%metadata};
+    my @currentExpVars;
+    print $firstObject, "\n";
+    for $i (@expVars) {
+        defined $metadata{$i} or die "can't find field for experimental variable " . $i;
+        push @currentExpVars, $metadata{$i};
+    }
 
+    #for($i = 0; $i < scalar(@expVars); ++$i) {
+    #    print $expVars[$i], ": ", $currentExpVars[$i], "\t";
+    #}
+    #print "\n";
+    
+    # get other objs with the same metadata
+    for $o (keys %objsSet) {
+        %metadata = getObjMetadata($dbHandle, $o);
+        my $match = 1;
+        for($i = 0; $i < scalar(@expVars); ++$i) {
+            defined $metadata{$expVars[$i]} or die "can't find field for experimental variable " . $i;
+            if ($metadata{$expVars[$i]} ne $currentExpVars[$i]) {
+                $match = 0;
+            }
+        }
+        if ($match) {
+            push @currentObjects, $o;
+            push @currentMetadata, {%metadata};
+            delete $objsSet{$o};
+        }
+    }
+    #for $o (@currentObjects) {
+    #    print "\t", $o, "\n";
+    #}
+    #for my $objectName (@currentObjects) {
+    #    my %metadata = getObjMetadata($dbHandle, $objectName);
+    #    print $objectName, "\n";
+    #    for my $k (keys %metadata) {
+    #        print "\t", $k, " = ", $metadata{$k}, "\n";
+    #    }
+    #}
+
+    my $grant = getSameMetadata("grant", @currentMetadata);
+    my $sample_identifier = $grant . "_" . join "_", @currentExpVars;
+    my $sample_title = $sample_identifier;
+
+    # read the organism for this cell line
+    my $cell = getSameMetadata("cell", @currentMetadata);
+    my %cellLines = %{$cvTerms{"Cell Line"}};
     my %cellLineInfo = %{$cellLines{$cell}};
-    $organism = $cellLineInfo{"organism"};
-}
+    my $organism = $cellLineInfo{"organism"};
+    my $provider = $cellLineInfo{"vendorName"};
+    my $growthProtocolUrl = "http://genome.ucsc.edu/ENCODE/protocols/cell/" . $cellLineInfo{"protocol"};
+    my $growthProtocol = "Cells were grown according to the approved ENCODE cell culture protocols: $growthProtocolUrl";
+
+    my $dataType = $dataTypeCleaner{getSameMetadata("dataType", @currentMetadata)};
+    my $molecule = $dataTypeMolecule{$dataType};
+
+    my $libraryStrategy = $dataTypeLibraryStrategy{$dataType};
+    my $ibrarySource    = $dataTypeLibrarySource{$dataType};
+    my $ibrarySelection = $dataTypeLibrarySelection{$dataType};
+
+    # generate the POST
+    print "^SAMPLE=$sample_identifier" . "\n";
+    print "!Sample_title = $sample_title" . "\n";
+    print "!Sample_source_name = $cell" . "\n";
+    print "!Sample_organism = $organism" . "\n";
+    # iterate over exp vars
+    for ($i = 0; $i < scalar(@expVars); ++$i) {
+        my $e = getSameMetadata($expVars[$i], @currentMetadata);
+        # output the base exp var value
+        print "!Sample_characteristics = $expVars[$i]: $e" . "\n";
+        # if the currnet exp var is none
+        if ($currentExpVars[$i] eq "None" or $currentExpVars[$i] eq "none") {
+            # do nothing more
+        } else {
+            # otherwise output the fields given by %expVarDetails from the cv.ra
+            my %cvType = %{$cvTerms{$expVarCVMapper{$expVars[$i]}}};
+            my %cvData = %{$cvType{$currentExpVars[$i]}};
+            for $j (@{$expVarDetails{$expVars[$i]}}) {
+                # if there's a better label for this term use it
+                if (defined $detailsCleaner{$j}) {
+                    print "!Sample_characteristics = $expVars[$i] $detailsCleaner{$j}: ",  $cvData{$j}, "\n";
+                } else {
+                    print "!Sample_characteristics = $expVars[$i] $j: ",  $cvData{$j}, "\n";
+                }
+            }
+        }
+    }
+    print "!Sample_biomaterial_provider = $provider" . "\n";
+    #for $i (@currentMetadata) {
+    #    my %metadata = %{$i};
+    #    if ($metadata{"objType"} eq "file") {
+    #        my $file = $metadata{"fileName"};
+    #        print "!Sample_supplementary_file = $file" . "\n";
+    #    }
+    #}
+    print "!Sample_growth_protocol = $growthProtocol" . "\n";
+    print "!Sample_molecule = $molecule" . "\n";
+    print "!Sample_extract_protocol = [required]" . "\n";
+    print "!Sample_data_processing = [required; list_of_values]" . "\n";
+    print "!Sample_library_strategy = $libraryStrategy" . "\n";
+    print "!Sample_library_source = $ibrarySource" . "\n";
+    print "!Sample_library_selection = $ibrarySelection" . "\n";
+    print "!Sample_instrument_model = [required]" . "\n";
+    print "\n";
 
-print $title, "\n";
-print $organism, "\n";
+    exit;
+}