src/hg/archaeStuff/scripts/make-genome-info 1.19
1.19 2010/02/11 01:22:07 pchan
fix duplicate dbname problem introduced by similar substrain names
Index: src/hg/archaeStuff/scripts/make-genome-info
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/archaeStuff/scripts/make-genome-info,v
retrieving revision 1.18
retrieving revision 1.19
diff -b -B -U 1000000 -r1.18 -r1.19
--- src/hg/archaeStuff/scripts/make-genome-info 10 Aug 2009 04:20:14 -0000 1.18
+++ src/hg/archaeStuff/scripts/make-genome-info 11 Feb 2010 01:22:07 -0000 1.19
@@ -1,759 +1,773 @@
#! /usr/bin/perl -w
## make-genome-info
## Todd Lowe October 2005
##
## A Perl script adapted from setting up scans of all prokaryotic genomes for tRNAs,
## now used to collect genome info to set up prokaryotic browsers in one program with
## the companion script "make-browser"
## Sends genome info to STDOUT, but "Genome-info-db" is the suggested output file name
## Warnings and error messages sent STDERR
## Known to not work correctly with draft genomes which have incomplete file sets at NCBI
##
## Warning -- globals are used throughout from sheer laziness, but program is
## reasonably well tested
# CVS version tag
# $Id$
use Getopt::Std;
# Constants
# tRNAscan-SE run options for various domains
$prog_options{"Archaea"} = "-o# -f# -m# -F# -HyA";
$prog_options{"Bacteria"} = "-o# -f# -m# -F# -HyB";
$prog_options{"Eukaryota"} = "-o# -f# -m# -F# -Hy";
$prog_options{"Viruses"} = "-o# -f# -m# -F# -HyB";
# TIGR CMR pages info
my $tigr_genome_list_link = "http://cmr.tigr.org/tigr-scripts/CMR/shared/Genomes.cgi";
our $tigr_id_page = "TIGR_id_page.html";
# JGI genome list
my $jgi_genome_list_link = "http://img.jgi.doe.gov/cgi-bin/pub/main.cgi?section=TaxonList&mainPageStats=1";
#my $jgi_genome_list_link = "http://img.jgi.doe.gov/cgi-bin/pub/main.cgi?section=TaxonList&page=taxonListAlpha&mainPageStats=1";
# old JGI info link: "http://img.jgi.doe.gov/cgi-bin/pub/main.cgi?page=taxonListAlpha";
our $jgi_id_page = "JGI_id_page.html";
our $swissProt_species_list_link = "http://www.expasy.ch/cgi-bin/speclist";
our $sp_id_page = "SP_id_page.html";
our $stringDb_list_link = "http://string32.embl.de/";
our $stringDb_id_page = "stringDB_id_page.html";
our @Blat_ID = ();
# Starting priority number for arranging species in the browser (will increment for each new species)
$Domain_ct{"Archaea"} = 200;
$Domain_ct{"Bacteria"} = 500;
$Domain_ct{"Eukaryota"} = 100;
$Domain_ct{"Viruses"} = 2000;
$opt_k = '';
&getopts('k:');
$search_key = $opt_k;
if ($#ARGV < 0) {
die "\nUsage: make-genome-info [-k <key>] <genomes directory>\n\n",
" Outputs basic info on all completed prokaryotes from local mirror of NCBI \n",
" genome directory (ftp://ftp.ncbi.nih.gov/genomes/Bacteria/)\n\n",
" Hint: An availabe local mirror is /projects/lowelab/db/genomes/Bacteria\n",
" Default usage: make-genome-info /projects/lowelab/db/genomes/Bacteria\n\n";
}
$genomes_dir = shift(@ARGV);
opendir(GB_GENOMES,$genomes_dir) ||
die "Could not read $genomes_dir\n\n";
@all_orgs = readdir(GB_GENOMES);
splice(@all_orgs,0,2); # delete ".", "..","README","accession" entries
# Get current list of TIGR CMR genomes
#system("lynx -source $tigr_genome_list_link |grep 'GenomePage' > $tigr_id_page");
print STDERR "\nGrabbing current copy of JGI species IDs\n";
# Get current list of JGI IMG genomes
if (!-r $jgi_id_page) {
system("lynx -source \"$jgi_genome_list_link\" > $jgi_id_page");
}
# Get current list of SwissProt species names
if (!-r $sp_id_page) {
print STDERR "\nGrabbing current copy of Swissprot species list names\n";
system("lynx -dump -nolist -width=300 \"$swissProt_species_list_link\" > $sp_id_page");
}
# Get current list of StringDB IDs
system("lynx -source \"$stringDb_list_link\" > $stringDb_id_page");
print "## This file is generated automatically with 'make-genome-info' Perl script\n",
"## Do NOT edit this file -- edit a copy (i.e. Genome-info-mods) which only has fields that you\n",
"## have hand-edited. Manual changes to Genome-info-mods will always override changes to this file\n",
"## in 'make-browser' script\n\n";
$org_name = '';
$species_ct = 0;
# Loop through all the species directories found
foreach $org_name (sort @all_orgs) {
#Skip non-directories and non-organism directories
if ((!-d "$genomes_dir/$org_name") ||
($org_name !~ /$search_key/))
{
next;
}
if ($org_name =~ /^(TEMP|CLUSTER)/) {
next;
}
# Get Organism domain info
opendir(ORG_DIR,"$genomes_dir/$org_name/") ||
die "Unable to open $genomes_dir/$org_name/ to read seq files\n";
my @all_seq_files = readdir(ORG_DIR);
local $first_gbk = '';
for (my $filect=0; $filect <= $#all_seq_files; $filect++)
{
if ($all_seq_files[$filect] =~ /\.gbk/) {
$first_gbk = $all_seq_files[$filect];
last;
}
}
$full_tax_list = `head -20 "$genomes_dir/$org_name/$first_gbk" | grep -A2 "ORGANISM" | grep -v ORGANISM | grep -v REFERENCE`;
$full_tax_list =~ s/\n/ /g;
$full_tax_list =~ s/[ ]+/ /g;
$full_tax_list =~ s/^[ ]*//g;
($org_domain,@rest) = split(/; /,$full_tax_list);
&Set_org_abbrev;
&Read_report_files;
&Read_summary_files;
&Read_ncbi_summary_txt;
if (!$tax_id) {
&Get_tax_id_from_gbk;
}
&Create_new_seqfile_names;
&Get_GC_content;
# &Get_TIGR_Genome_ID;
&Get_SwissProt_ID;
&Get_JGI_Genome_ID;
&Get_StringDb_ID;
if ($org_domain eq "Archaea") {
$Domain_ct{$org_domain}+= 2;
}
else {
$Domain_ct{$org_domain}+= 1;
}
&Output_entry;
# Uncomment this if you want to see the genome names scroll by
# print STDERR "\nCreated entry for $org_abbr\n";
$species_ct++;
print STDERR ".";
}
print STDERR "\nCompleted outputing genome information for $species_ct species\n";
#############
## End Main #
#############
sub Set_org_abbrev {
$org_name_str = $org_name;
$org_name_str =~ s/_/ /g;
($org_genus,$org_species,@rest) = split(/_/,$org_name);
$org_species =~ s/-//g;
$org_genus =~ s/-//g;
# Get search string for last word in full name for later matching duties
$org_str_tail = $rest[$#rest];
if (!defined($org_str_tail)) {
$org_str_tail = '';
}
# Shorten strain name for $org_dbname
$strain_info = uc(join("_",@rest));
$strain_info =~ s/ATCC_/ATCC/;
$strain_info =~ s/DSM_/DSM/;
$strain_info =~ s/SEROVAR//;
$strain_info =~ s/BIOVAR//;
$strain_info =~ s/-/_/;
$strain_info =~ s/\./_/;
$strain_str = '';
if ($strain_info ne '') {
$strain_str = substr($strain_info,0,12);
}
if ($org_domain eq "Viruses") {
$org_search_string = $org_name;
}
else {
$org_search_string = $org_genus." ".$org_species;
}
$org_abbr = join("_",substr($org_genus,0,4),substr($org_species,0,4),@rest);
$org_dbname = join("",
lc(substr($org_genus,0,4)),
ucfirst(substr($org_species,0,4)),
"_",
$strain_str);
$org_dbname =~ s/__/_/g;
if (substr($org_dbname,length($org_dbname)-1,1) eq '_') {
chop($org_dbname);
}
if (defined($dbnames{$org_dbname})) {
$dbnames{$org_dbname}++;
- $org_dbname =~ s/$strain_str/$strain_info/;
+# $org_dbname =~ s/$strain_str/$strain_info/;
+ if (length($strain_info) > 22)
+ {
+ $strain_str = substr($strain_info,0,22);
+ }
+ else
+ {
+ $strain_str = $strain_info;
+ }
+ $org_dbname = join("",
+ lc(substr($org_genus,0,4)),
+ ucfirst(substr($org_species,0,4)),
+ "_",
+ $strain_str);
+
$org_dbname =~ s/__/_/g;
}
else {
$dbnames{$org_dbname} = 1;
}
}
# Read *.rpt files in each organism's directory to collect more stats
sub Read_report_files
{
$genome_id_list = 0;
$seq_acc = '';
$full_tax_name = '';
$orig_tax_name = '';
$tax_id = 0;
$last_genome_id = 0;
$gene_count = '';
$seq_len = '';
$pub_id = 0;
$report_info = `cat $genomes_dir/$org_name/[AN][ZC]*.rpt`;
@report_lines = split(/\n/,$report_info);
foreach $line (@report_lines) {
if ($line =~ /^Taxname:\s(.+)$/) {
$temp_tax_name = $1;
if (($temp_tax_name =~ /phage/) &&
($org_domain ne "Viruses"))
{
last;
}
$full_tax_name = $1;
$orig_tax_name = $1;
}
elsif ($line =~ /^Taxid:\s(\d+)/) {
$tax_id = $1;
}
elsif ($line =~ /^Publications:\s+(\d+)/) {
$pub_id = $1;
}
elsif ($line =~ /^Genome_ID = (\d+)/) {
$genome_id_list .= $1." ";
$last_genome_id = $1;
}
elsif ($line =~ /^DNA\s+length\s=\s+(\d+)/) {
$seq_len .= $1." ";
}
elsif ($line =~ /^Accession: (\S+)\.\d+/) {
$seq_acc .= $1." ";
}
elsif ($line =~ /^Gene count:\s+(\d+)/) {
$gene_count .= $1." ";
}
}
chop($seq_acc);
}
## Read info in NCBI Genomes /Bacteria directory with summary info for prok orgs
sub Read_summary_files {
if ($full_tax_name eq '') {
$full_tax_name = $org_search_string;
$full_tax_name =~ s/_/ /g;
}
$domain_letter = "X";
$tax_id_2 = 0;
$full_tax_name_2 = '';
$seq_source = '';
$genome_mb = 0;
$seq_acc_2 = '';
$seq_date = "-";
# $org_domain = "Unknown";
if (!-r "$genomes_dir/Complete.txt") {
print STDERR "Could not find Complete.txt for this genome -- skipping summary file read\n";
return 0;
}
$summary_info = `grep -i "$full_tax_name" $genomes_dir/Complete.txt`;
$summary_info .= `grep -i "$full_tax_name" $genomes_dir/InProgress.txt`;
@summary_lines = split(/\n/,$summary_info);
$seq_acc_2 = 'Unknown';
$line_ct=0;
if (defined($summary_lines[$line_ct]) &&
(($summary_lines[$line_ct] =~ /^(\S)\t(\d+)\t(\S[^\t]*)\t([^\t]+)\t(\S+)\t\S+\,(N\S+)\t(\S+)/) ||
($summary_lines[$line_ct] =~ /^(\S)\t(\d+)\t(\S[^\t]*)\t([^\t]*)\t(\S+)\t(N\S+)/) ||
($summary_lines[$line_ct] =~ /^(\S)\t(\d+)\t(\S[^\t]*)\t([^\t]*)\t(\S+)/))) {
$domain_letter = $1;
$tax_id_2 = $2;
$full_tax_name_2 = $3;
$seq_source = $4;
$genome_mb = $5;
$seq_acc_2 = $6;
$seq_date = $7;
while (($line_ct < $#summary_lines) && ($seq_acc !~ /$seq_acc_2/))
{
$line_ct++;
if (($summary_lines[$line_ct] =~ /^(\S)\t(\d+)\t(\S[^\t]*)\t([^\t]*)\t(\S+)\t\S+\,(N\S+)\t(\S+)/) ||
($summary_lines[$line_ct] =~ /^(\S)\t(\d+)\t(\S[^\t]*)\t([^\t]*)\t(\S+)\t(N\S+)/) ||
($summary_lines[$line_ct] =~ /^(\S)\t(\d+)\t(\S[^\t]*)\t([^\t]*)\t(\S+)/)) {
$domain_letter = $1;
$tax_id_2 = $2;
$full_tax_name_2 = $3;
$seq_source = $4;
$genome_mb = $5;
$seq_acc_2 = $6;
$seq_date = $7;
}
}
if ($tax_id == 0) {
$tax_id = $tax_id_2;
}
if ($domain_letter eq "A") {
$org_domain = "Archaea";
}
elsif ($domain_letter eq "B") {
$org_domain = "Bacteria";
}
elsif ($domain_letter eq "E") {
$org_domain = "Eukaryota";
}
elsif ($domain_letter eq "V") {
$org_domain = "Viruses";
}
# Pick the longer taxonomy name which hopefully has strain specification in it
if (length($full_tax_name_2) > length($full_tax_name)) {
$full_tax_name = $full_tax_name_2;
}
if ($seq_acc !~ /$seq_acc_2/) {
print STDERR "\nWARNING: Seqence accession numbers ($seq_acc -> $seq_acc_2) do not match for $full_tax_name between *.rpt and Complete.txt\n";
}
if (!defined($seq_date)) {
$seq_date = "-";
}
if ($tax_id != $tax_id_2) {
print STDERR "\nWARNING: Tax ids do not match ($tax_id -> $tax_id_2) between *.rpt and Complete.txt for $org_name\n";
}
}
else {
print STDERR "\nUnable to find $full_tax_name in Complete.txt or InProgress.txt files\n";
}
}
## Read NCBI genome summary.txt to obtain GenBank accession numbers
sub Read_ncbi_summary_txt {
%genbank_acc_2 = ();
$genbank_acc = '';
$gb_acc = '';
$seq_acc_2 = '';
$tax_id_2 = 0;
@refseq_acc = split(/ /, $seq_acc);
if (!-r "$genomes_dir/summary.txt") {
print STDERR "Could not find summary.txt file for this genome -- skipping\n";
return 0;
}
$summary_info = `grep -i "$full_tax_name" $genomes_dir/summary.txt`;
@summary_lines = split(/\n/,$summary_info);
if (defined $summary_lines[0]) {
foreach $summary_line (@summary_lines)
{
if (($summary_line =~ /^(\S+)\t(\S+)\t(\d+)\t(\d+)\t(\d+)\t(\S[^\t]*)\t(\S[^\t]*)\t(\S[^\t]*)\t(\S[^\t]*)/) ||
($summary_line =~ /^(\S+)\t(\S+)\t(\d+)\t(\d+)\t(\d+)\t([^\t]*)[\t]*([^\t]*)[\t]*([^\t]*)[\t]*([^\t]*)/))
{
$seq_acc_2 = $1;
$gb_acc = $2;
$tax_id_2 = $4;
if ($seq_acc_2 =~ /./) {
$seq_acc_2 = substr($seq_acc_2, 0, index($seq_acc_2, '.'));
}
if ($gb_acc =~ /./) {
$gb_acc = substr($gb_acc, 0, index($gb_acc, '.'));
}
if ($seq_acc !~ /$seq_acc_2/) {
print STDERR "\nWARNING: Seqence accession numbers ($seq_acc -> $seq_acc_2) do not match for $full_tax_name between *.rpt and summary.txt\n";
}
if ($tax_id != $tax_id_2) {
print STDERR "\nWARNING: Tax ids do not match ($tax_id -> $tax_id_2) between *.rpt and summary.txt for $org_name\n";
}
$genbank_acc_2{$seq_acc_2} = $gb_acc;
}
}
foreach $acc (@refseq_acc) {
if (defined $genbank_acc_2{$acc}) {
$genbank_acc .= $genbank_acc_2{$acc} . " ";
}
}
}
else {
print STDERR "\nUnable to find $full_tax_name in summary.txt files\n";
}
}
sub Get_tax_id_from_gbk
{
my $tax_id_line = `grep taxon: $genomes_dir/$org_name/$first_gbk`;
if ($tax_id_line =~ /db_xref=\"taxon\:(\d+)\"/) {
$tax_id = $1;
print STDERR "Found tax ID in Genbank file: $tax_id\n";
}
}
sub Create_new_seqfile_names {
my (@all_seq_files, $seq_file, $cur_gene_ct);
$blat_DNA_ID = 0;
$blat_prot_ID = 0;
$new_name = '';
$name_map = '';
@gen_seqs = split(/ /,$seq_acc);
if ($#gen_seqs < 0)
{
opendir(ORG_DIR,"$genomes_dir/$org_name") ||
die "Unable to open $genomes_dir/$org_name to read seq files\n";
@all_seq_files = readdir(ORG_DIR);
foreach $seq_file (@all_seq_files) {
if ($seq_file =~ /^(N\S_\S+)\.gbk/) {
push (@gen_seqs,$1);
$seq_acc .= $1." ";
$cur_gene_ct = int(`grep ' CDS ' $genomes_dir/$org_name/$seq_file | wc -l | awk '{print \$1}'`);
$gene_count .= $cur_gene_ct." ";
$cur_seq_len = int(`head -1 $genomes_dir/$org_name/$seq_file | awk '{print \$3}'`);
$seq_len .= $cur_seq_len." ";
}
}
}
$short_name = $org_name;
$short_name =~ s/_/ /g;
$orig_tax_name =~ s/(\W)/\\$1/g;
for $seq (@gen_seqs)
{
$seq_def_tmp = `head "$genomes_dir/$org_name/$seq.gbk" | grep -A2 "^DEFINITION"`;
$seq_def_tmp =~ s/\n/ /g;
$seq_def_tmp =~ s/[ ]+/ /g;
($seq_def,@rest) = split(/ACCESSION/,$seq_def_tmp);
$seq_def_simplified = $seq_def;
$seq_def_simplified =~ s/(\W)/\\$1/g;
if (($seq_def =~ /DEFINITION\s+$full_tax_name\s*(.*)\,/) ||
($seq_def_simplified =~ /DEFINITION\s+$orig_tax_name\s*(.*)\,/) ||
($seq_def_simplified =~ /DEFINITION\s+$short_name\s*(.*)\,/) ||
($seq_def =~ /DEFINITION\s+$org_genus\s+$org_species\s+.*$org_str_tail\s*(.*)\,/) ||
($seq_def =~ /DEFINITION\s+$org_genus\s+$org_species\s+(.*)\,/) ||
($seq_def_simplified =~ /DEFINITION\s+$org_genus\s+$org_species\s+.*$org_str_tail\s*(.*)\,/))
{
$new_name = $1;
$new_name =~ s/chromosomal /chrom/g;
$new_name =~ s/chromosome /chr/g;
$new_name =~ s/ /_/g;
if (($new_name eq '') ||
($new_name eq ')'))
{
$new_name = "chr";
}
$name_map .= "$new_name ";
}
else {
print STDERR "\nWarning: Could not grep ($orig_tax_name) OR ($short_name) from $seq_def_simplified\n";
if ($org_domain eq "Viruses") {
$name_map .= "chr ";
}
else {
$name_map .= "$seq ";
}
}
# Get full taxonomy from genbank sequence
$full_tax_list = `head -20 "$genomes_dir/$org_name/$seq.gbk" | grep -A2 "ORGANISM" | grep -v ORGANISM | grep -v REFERENCE`;
$full_tax_list =~ s/\n/ /g;
$full_tax_list =~ s/[ ]+/ /g;
$full_tax_list =~ s/^[ ]*//g;
if ($org_domain eq "Unknown") {
($org_domain,@rest) = split(/; /,$full_tax_list);
$Domain_ct{$org_domain}+= 1;
}
}
if ($org_domain eq "Viruses") {
$db_src = "VIRUS";
}
elsif ($org_domain eq "Eukaryota") {
$db_src = "EUKARYA";
}
else {
$db_src = "NCBI";
}
# Pick a unique BLAT port number based on the NC number for the RefSeq sequence
# If no RefSeq seqs available or not a standard NC number, start with a high port number
if (!defined($gen_seqs[0]) || ($gen_seqs[0] !~ /NC/)) {
$cand_ID = 60000;
}
else {
# Pick last 4 digits of NC refseq number
$cand_ID = int(substr($gen_seqs[0],5,4));
# Port numbers less than 5000 seem to have conflicts, so bump up to higher ports
if ($cand_ID < 5000) {
$cand_ID += 20000;
}
}
while (defined($Blat_ID[$cand_ID])) {
$cand_ID++;
}
$blat_DNA_ID = $cand_ID;
$Blat_ID[$cand_ID] = 1;
$blat_prot_ID = $blat_DNA_ID+1;
while (defined($Blat_ID[$blat_prot_ID])) {
$blat_prot_ID++;
}
$Blat_ID[$blat_prot_ID] = 1;
}
sub Get_GC_content
{
$seq_gc = '';
@gen_seqs = split(/ /,$seq_acc);
for $seq (@gen_seqs) {
$cur_gc = `/projects/lowelab/share/bin/i386/seq2stat -f "$genomes_dir/$org_name/$seq.gbk" | grep "GC content" | awk '{print \$3}'`;
chop($cur_gc);
$seq_gc .= $cur_gc." ";
}
}
sub Get_TIGR_Genome_ID
{
$tigr_genome_id = '';
my $genus = '';
my $species = '';
my @rest;
my $strain = '';
my $org_name_str = $full_tax_name;
$org_name_str =~ s/-//g;
($genus,$species,@rest) = split(/[ .']/,$org_name_str);
if ($#rest >= 0) {
$strain = pop(@rest);
}
else {
$strain = ' ';
}
my $genome_id_lines = `grep -i "$full_tax_name" $tigr_id_page`;
if ($genome_id_lines eq '') {
$genome_id_lines = `grep -i $genus $tigr_id_page | grep -i $species | grep -i "$strain"`;
}
@genome_id_list = split(/\n/,$genome_id_lines);
foreach my $line (@genome_id_list) {
if ($line =~ /org=(\S+)\".+$genus/) {
$tigr_genome_id = $1;
}
}
if ($tigr_genome_id eq '') {
print STDERR "\nCould not find TIGR genome ID for $full_tax_name\n";
}
}
sub Get_SwissProt_ID {
my $tax_name = $org_search_string;
$tax_name =~ s/_/ /g;
($genus,$species,@rest) = split(/[ .']/,$tax_name);
if ($#rest >= 0) {
$strain = pop(@rest);
}
else {
$strain = '';
}
my $sp_id_lines = `egrep -i "($tax_name|$genus|$species)" $sp_id_page`;
$sp_species_id = '';
my $sp_id_ct = 0;
@sp_id_list = split(/\n/,$sp_id_lines);
foreach my $line (@sp_id_list) {
if (($line =~ /^(\S+)\s+(\S)\s+($tax_id)\:\s+\S\=(\S+)/) ||
($line =~ /^(\S+)\s+(\S)\s+(\d+)\:\s+\S\=$genus.+$species.+$strain/) ||
($line =~ /^(\S+)\s+(\S)\s+($tax_id)\:\s+\S\=.+$species.+$strain/))
{
$sp_species_id .= $1." ";
$sp_id_ct++;
}
}
if ($sp_id_ct == 0) {
foreach my $line (@sp_id_list) {
if ($line =~ /^(\S+)\s+(\S)\s+(\S+)\:\s+\S\=$tax_name/) {
$sp_species_id .= $1." ";
$sp_id_ct++;
}
}
}
if ($sp_id_ct == 0) {
print STDERR "\nCould not find a SwissProt species ID for $tax_name $strain\n";
}
elsif ($sp_id_ct > 5) {
print STDERR "\nWarning: Many ($sp_id_ct) SwissProt species IDs for $tax_name $strain\n";
}
}
sub Get_JGI_Genome_ID
{
$jgi_genome_id = 0;
my $genus = '';
my $species = '';
my @rest;
my $strain = '';
my $org_name_str = $full_tax_name;
$org_name_str =~ s/-//g;
($genus,$species,@rest) = split(/[ .']/,$org_name_str);
if ($#rest >= 0) {
$strain = pop(@rest);
}
else {
$strain = ' ';
}
my $genome_id_lines = `grep -i "$full_tax_name" $jgi_id_page`;
if ($genome_id_lines eq '') {
$genome_id_lines = `grep -i $genus $jgi_id_page | grep -i $species | grep -i "$strain"`;
}
@genome_id_list = split(/\n/,$genome_id_lines);
foreach my $line (@genome_id_list) {
if ($line =~ /taxon_oid=(\d+).+$genus/) {
$jgi_genome_id = $1;
last;
}
}
if ($jgi_genome_id == 0) {
print STDERR "\nCould not find JGI genome ID for $full_tax_name\n";
}
}
sub Get_StringDb_ID
{
$stringDb_id = 0;
my $genus = '';
my $species = '';
my @rest;
my $strain = '';
my $org_name_str = $full_tax_name;
$org_name_str =~ s/-//g;
($genus,$species,@rest) = split(/[ .\']/,$org_name_str);
if ($#rest >= 0) {
$strain = pop(@rest);
}
else {
$strain = ' ';
}
my $genome_id_lines = `grep -i "$full_tax_name" $stringDb_id_page`;
if ($genome_id_lines eq '') {
$genome_id_lines = `grep -i $genus $stringDb_id_page | grep -i $species | grep -i "$strain"`;
}
if ($genome_id_lines eq '') {
$genome_id_lines = `grep -i $genus $stringDb_id_page | grep -i $species`;
}
@genome_id_list = split(/\n/,$genome_id_lines);
foreach my $line (@genome_id_list) {
if ($line =~ /value=\'(\d+)\'\>$genus $species/)
{
$stringDb_id = $1;
last;
}
}
if (!$stringDb_id) {
print STDERR "\nCould not find String DB ID for $full_tax_name\n";
}
}
sub Output_entry {
print
"Org_abr: $org_abbr\n",
"Org_name: $org_name_str\n",
"Menu_name: $org_name_str\n",
"Domain: $org_domain\n",
"DB_name: $org_dbname\n",
"DB_src: $db_src\n",
"Tax_name: $full_tax_name\n",
"BLAT_IDs: $blat_DNA_ID $blat_prot_ID\n",
"Tax_ID: $tax_id: $full_tax_list\n",
# "TIGR_ID: $tigr_genome_id\n",
"TIGR_ID: 0\n",
"JGI_ID: $jgi_genome_id\n",
"SP_ID: $sp_species_id\n",
"String_ID: $stringDb_id\n",
"Dom_ct: $Domain_ct{$org_domain}\n",
"Gen_ID: $genome_id_list\n",
"Seq_acc: $seq_acc\n",
"Name_chg: $name_map\n",
"Gb_acc: $genbank_acc\n",
"Seq_size: $seq_len\n",
"Gene_ct: $gene_count\n",
"Seq_GC: $seq_gc\n",
"Pub_id: $pub_id\n",
"Rel_date: $seq_date\n",
"Seq_src: $seq_source\n",
"Web_seq: ftp://ftp.ncbi.nih.gov/genomes/Bacteria/$org_name/\n",
"Web_db: http://www.ncbi.nlm.nih.gov/genomes/framik.cgi?db=Genome&gi=$last_genome_id\n",
"Trna_opt: $prog_options{$org_domain}\n",
"Trna_abr: $org_abbr"."-tRNAs\n",
"Gen_seq: $genomes_dir"."/".$org_name."/".$org_abbr.".fa\n",
"Prot_seq: $genomes_dir"."/".$org_name."/".$org_abbr."-prot.fa\n",
"Feat_seq: $genomes_dir"."/".$org_name."/".$org_abbr."-feat.fa\n\n";
}