src/hg/archaeStuff/scripts/extract-genome-info 1.3
1.3 2009/03/25 21:51:26 pchan
add options to retrieve data from centraldb
Index: src/hg/archaeStuff/scripts/extract-genome-info
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/archaeStuff/scripts/extract-genome-info,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 4 -r1.2 -r1.3
--- src/hg/archaeStuff/scripts/extract-genome-info 23 Jul 2008 03:59:49 -0000 1.2
+++ src/hg/archaeStuff/scripts/extract-genome-info 25 Mar 2009 21:51:26 -0000 1.3
@@ -1,18 +1,30 @@
#! /usr/bin/perl -w
-# extract-genome-info <field names> > <output file>
+# extract-genome-info [options] > <output file>
use strict;
+use Getopt::Long;
use archaeBrowser::Utils;
use archaeBrowser::Constant;
use archaeBrowser::Organism;
use archaeBrowser::GenomeInfo;
if (scalar(@ARGV) <= 0) {
- die "Usage: extract-genome-info <field names> > <output file>\n";
+ die "Usage: extract-genome-info [options] > <output file>\n",
+ "Options:\n",
+ "--file list of space-delimited fields in Genome-info-db to be included\n",
+ "--db list of space-delimited fields in centraldb to be included\n",
+ "--dbonly limit output to genomes included in browser database\n\n";
}
+#options
+our $opt_file = "";
+our $opt_db = "";
+our $opt_dbonly = 0;
+
+Getopt::Long::GetOptions("file=s", "db=s", "dbonly");
+
# Global constants
our $global_constants = archaeBrowser::Constant->new;
# Utilities and error handling
@@ -38,18 +50,174 @@
&Read_genome_info($global_constants->genome_info_euk(), \%global_vars); # manual changes to genome info
&Read_genome_info($global_constants->genome_info_vir(), \%global_vars); # manual changes to genome info
&Read_genome_info($global_constants->genome_info_mods(), \%global_vars); # manual changes to genome info
-my $field = "";
-my $ct = 0;
-print join("\t", @ARGV) . "\n";
-foreach my $abbr (keys %organism_AR) {
+my $db_genomes = &read_centraldb();
+my $file_genomes = &read_genome_info($db_genomes);
+&write_output($db_genomes, $file_genomes);
+
+exit;
+
+# end main
+
+
+sub read_centraldb
+{
+ my %genomes = ();
+ my $genome = {};
+ my $col = 0;
+ my @cols = ();
+ my @headers = ();
+ my $name_col = 0;
+ my $cmd = "select ";
+ my @fields = split(/ /, $opt_db);
+
+ if (scalar(@fields) > 0)
+ {
+ for (my $i = 0; $i < scalar(@fields); $i++)
+ {
+ if ($i > 0) { $cmd .= ","; }
+ if ($fields[$i] =~ /^clade$/i) { $cmd .= "b.clade"; }
+ else { $cmd .= "a.$fields[$i]"; }
+ }
+ if ($opt_db !~ /name/) { $cmd .= ",a.name"; }
+ $cmd .= " from dbDb a, genomeClade b where a.genome = b.genome;";
+ my $result = `hgsql "centraldb" -e "$cmd"`;
+ my @results = split(/\n/, $result);
+
+ for (my $line = 0; $line < scalar(@results); $line++)
+ {
+ chomp($line);
+ if ($line == 0)
+ {
+ @headers = split(/\t/, $results[$line]);
+ for (my $i = 0; $i < scalar(@headers); $i++)
+ {
+ if ($headers[$i] eq "name")
+ {
+ $name_col = $i;
+ last;
+ }
+ }
+ }
+ else
+ {
+ @cols = split(/\t/, $results[$line]);
+ $genome = {};
+ for ($col = 0; $col < scalar(@cols); $col++)
+ {
+ $genome->{$headers[$col]} = $cols[$col];
+ }
+ $genomes{$cols[$name_col]} = $genome;
+ }
+ }
+ }
+
+ return \%genomes;
+}
+
+sub read_genome_info
+{
+ my ($db_genomes) = @_;
+ my %genomes = ();
+ my $genome = {};
+ my $field = "";
+ my $dummy = "";
+ my @fields = split(/ /, $opt_file);
+ if (scalar(@fields) > 0)
+ {
+ foreach my $abbr (keys %organism_AR)
+ {
+ if (($opt_dbonly && defined($db_genomes->{$organism_AR{$abbr}->{db_name}})) || !$opt_dbonly)
+ {
+ $genome = {};
+ foreach $field (@fields)
+ {
+ if ($field eq "tax_ID")
+ {
+ ($genome->{$field}, $dummy) = split(/:/, $organism_AR{$abbr}->{$field});
+ }
+ elsif (defined $organism_AR{$abbr}->{$field})
+ {
+ $genome->{$field} = $organism_AR{$abbr}->{$field};
+ }
+ elsif ($field eq "tax")
+ {
+ $genome->{tax} = $organism_AR{$abbr}->{tax_ID};
+ }
+ else
+ {
+ $genome->{$field} = "";
+ }
+ }
+ if (defined $organism_AR{$abbr}->{db_name})
+ {
+ $genomes{$organism_AR{$abbr}->{db_name}} = $genome;
+ }
+ else
+ {
+ $genomes{$organism_AR{$abbr}->{org_name}} = $genome;
+ }
+ }
+ }
+ }
+ return \%genomes;
+}
+
+sub write_output
+{
+ my ($db_genomes, $file_genomes) = @_;
+ my $ct = 0;
+ my $col = "";
+
+ my @db_fields = split(/ /, $opt_db);
+ foreach $col (@db_fields)
+ {
+ if ($ct > 0) { print STDOUT "\t"; }
+ print STDOUT $col;
+ $ct++;
+ }
+ my @file_fields = split(/ /, $opt_file);
+ foreach $col (@file_fields)
+ {
+ if ($ct > 0) { print STDOUT "\t"; }
+ print STDOUT $col;
+ $ct++;
+ }
+ print STDOUT "\n";
+
+ if (scalar(@db_fields) > 0)
+ {
+ foreach my $org (sort keys %$db_genomes)
+ {
$ct = 0;
- foreach $field (@ARGV) {
- if ($ct > 0) { print "\t"; }
- if (defined $organism_AR{$abbr}->{$field}) {
- print $organism_AR{$abbr}->{$field};
+ foreach $col (@db_fields)
+ {
+ if ($ct > 0) { print STDOUT "\t"; }
+ print STDOUT $db_genomes->{$org}->{$col};
+ $ct++;
}
+ foreach $col (@file_fields)
+ {
+ if ($ct > 0) { print STDOUT "\t"; }
+ if (!defined $file_genomes->{$org}->{$col}) { print STDERR "$org\t $col\n"; }
+ print STDOUT $file_genomes->{$org}->{$col};
$ct++;
}
- print "\n";
+ print STDOUT "\n";
+ }
+ }
+ else
+ {
+ foreach my $org (sort keys %$file_genomes)
+ {
+ $ct = 0;
+ foreach $col (@file_fields)
+ {
+ if ($ct > 0) { print STDOUT "\t"; }
+ print STDOUT $file_genomes->{$org}->{$col};
+ $ct++;
+ }
+ print STDOUT "\n";
+ }
+ }
}
\ No newline at end of file