src/hg/archaeStuff/scripts/extract-genome-info 1.3

1.3 2009/03/25 21:51:26 pchan
add options to retrieve data from centraldb
Index: src/hg/archaeStuff/scripts/extract-genome-info
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/archaeStuff/scripts/extract-genome-info,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/hg/archaeStuff/scripts/extract-genome-info	23 Jul 2008 03:59:49 -0000	1.2
+++ src/hg/archaeStuff/scripts/extract-genome-info	25 Mar 2009 21:51:26 -0000	1.3
@@ -1,55 +1,223 @@
 #! /usr/bin/perl -w
 
-# extract-genome-info <field names> > <output file>
+# extract-genome-info [options] > <output file>
 
 use strict;
+use Getopt::Long;
 use archaeBrowser::Utils;
 use archaeBrowser::Constant;
 use archaeBrowser::Organism;
 use archaeBrowser::GenomeInfo;
 
 if (scalar(@ARGV) <= 0) {
-    die "Usage: extract-genome-info <field names> > <output file>\n";
+    die "Usage: extract-genome-info [options] > <output file>\n",
+        "Options:\n",
+        "--file   list of space-delimited fields in Genome-info-db to be included\n",
+        "--db     list of space-delimited fields in centraldb to be included\n",
+        "--dbonly limit output to genomes included in browser database\n\n";
 }
 
+#options
+our $opt_file = "";
+our $opt_db = "";
+our $opt_dbonly = 0;
+
+Getopt::Long::GetOptions("file=s", "db=s", "dbonly");
+
 # Global constants
 our $global_constants = archaeBrowser::Constant->new;
 
 # Utilities and error handling
 our $utils = archaeBrowser::Utils->new;
 
 # Organisms
 our $organism;
 our %organism_AR;
 our %org_abbr_AR;
 our %domain_ct_AR;
 our @org_list;
 
 # Module-sharing variable hash
 our %global_vars = (global_constants => $global_constants,
                     utils => $utils,
                     organism => \$organism,
                     organism_AR => \%organism_AR,
                     org_abbr_AR => \%org_abbr_AR,
                     domain_ct_AR => \%domain_ct_AR
                    );
 
 &Read_genome_info($global_constants->genome_info(), \%global_vars);       # default genome information
 &Read_genome_info($global_constants->genome_info_euk(), \%global_vars);   # manual changes to genome info
 &Read_genome_info($global_constants->genome_info_vir(), \%global_vars);   # manual changes to genome info
 &Read_genome_info($global_constants->genome_info_mods(), \%global_vars);  # manual changes to genome info
 
-my $field = "";
-my $ct = 0;
-print join("\t", @ARGV) . "\n";
-foreach my $abbr (keys %organism_AR) {
+my $db_genomes = &read_centraldb();
+my $file_genomes = &read_genome_info($db_genomes);
+&write_output($db_genomes, $file_genomes);
+
+exit;
+
+# end main
+
+
+sub read_centraldb
+{
+    my %genomes = ();
+    my $genome = {};
+    my $col = 0;
+    my @cols = ();
+    my @headers = ();
+    my $name_col = 0;
+    my $cmd = "select ";
+    my @fields = split(/ /, $opt_db);
+    
+    if (scalar(@fields) > 0)
+    {
+        for (my $i = 0; $i < scalar(@fields); $i++)
+        {
+            if ($i > 0) { $cmd .= ","; }
+            if ($fields[$i] =~ /^clade$/i) { $cmd .= "b.clade"; }
+            else { $cmd .= "a.$fields[$i]"; }
+        }
+        if ($opt_db !~ /name/) { $cmd .= ",a.name"; }
+        $cmd .= " from dbDb a, genomeClade b where a.genome = b.genome;";
+        my $result = `hgsql "centraldb" -e "$cmd"`;
+        my @results = split(/\n/, $result);
+
+        for (my $line = 0; $line < scalar(@results); $line++)
+        {
+            chomp($line);
+            if ($line == 0)
+            {
+                @headers = split(/\t/, $results[$line]);
+                for (my $i = 0; $i < scalar(@headers); $i++)
+                {
+                    if ($headers[$i] eq "name")
+                    {
+                        $name_col = $i;
+                        last;
+                    }
+                }
+            }
+            else
+            {
+                @cols = split(/\t/, $results[$line]);
+                $genome = {};
+                for ($col = 0; $col < scalar(@cols); $col++)
+                {
+                    $genome->{$headers[$col]} = $cols[$col];
+                }
+                $genomes{$cols[$name_col]} = $genome;
+            }
+        }
+    }
+    
+    return \%genomes;
+}
+
+sub read_genome_info
+{
+    my ($db_genomes) = @_;
+    my %genomes = ();
+    my $genome = {};
+    my $field = "";
+    my $dummy = "";
+    my @fields = split(/ /, $opt_file);
+    if (scalar(@fields) > 0)
+    {
+        foreach my $abbr (keys %organism_AR)
+        {
+            if (($opt_dbonly && defined($db_genomes->{$organism_AR{$abbr}->{db_name}})) || !$opt_dbonly)
+            {
+                $genome = {};
+                foreach $field (@fields)
+                {
+                    if ($field eq "tax_ID")
+                    {
+                        ($genome->{$field}, $dummy) = split(/:/, $organism_AR{$abbr}->{$field});
+                    }
+                    elsif (defined $organism_AR{$abbr}->{$field})
+                    {
+                        $genome->{$field} = $organism_AR{$abbr}->{$field};
+                    }
+                    elsif ($field eq "tax")
+                    {
+                        $genome->{tax} = $organism_AR{$abbr}->{tax_ID};
+                    }
+                    else
+                    {
+                        $genome->{$field} = "";
+                    }
+                }
+                if (defined $organism_AR{$abbr}->{db_name})
+                {
+                    $genomes{$organism_AR{$abbr}->{db_name}} = $genome;
+                }
+                else
+                {
+                    $genomes{$organism_AR{$abbr}->{org_name}} = $genome;
+                }
+            }
+        }
+    }
+    return \%genomes;
+}
+
+sub write_output
+{
+    my ($db_genomes, $file_genomes) = @_;
+    my $ct = 0;
+    my $col = "";
+    
+    my @db_fields = split(/ /, $opt_db);
+    foreach $col (@db_fields)
+    {
+        if ($ct > 0) { print STDOUT "\t"; }
+        print STDOUT $col;
+        $ct++;
+    }
+    my @file_fields = split(/ /, $opt_file);
+    foreach $col (@file_fields)
+    {
+        if ($ct > 0) { print STDOUT "\t"; }
+        print STDOUT $col;
+        $ct++;
+    }
+    print STDOUT "\n";
+    
+    if (scalar(@db_fields) > 0)
+    {
+        foreach my $org (sort keys %$db_genomes)
+        {
     $ct = 0;
-    foreach $field (@ARGV) {
-        if ($ct > 0) { print "\t"; }
-        if (defined $organism_AR{$abbr}->{$field}) {
-            print $organism_AR{$abbr}->{$field};
+            foreach $col (@db_fields)
+            {
+                if ($ct > 0) { print STDOUT "\t"; }
+                print STDOUT $db_genomes->{$org}->{$col};
+                $ct++;
         }
+            foreach $col (@file_fields)
+            {
+                if ($ct > 0) { print STDOUT "\t"; }
+                if (!defined $file_genomes->{$org}->{$col}) { print STDERR "$org\t $col\n"; }
+                print STDOUT $file_genomes->{$org}->{$col};
         $ct++;
     }
-    print "\n";
+            print STDOUT "\n";
+        }
+    }
+    else
+    {
+        foreach my $org (sort keys %$file_genomes)
+        {
+            $ct = 0;
+            foreach $col (@file_fields)
+            {
+                if ($ct > 0) { print STDOUT "\t"; }
+                print STDOUT $file_genomes->{$org}->{$col};
+                $ct++;
+            }
+            print STDOUT "\n";
+        }        
+    }
 }
\ No newline at end of file