src/hg/archaeStuff/scripts/make-genome-info 1.18

1.18 2009/08/10 04:20:14 pchan
reverse changes to r 1.16
Index: src/hg/archaeStuff/scripts/make-genome-info
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/archaeStuff/scripts/make-genome-info,v
retrieving revision 1.17
retrieving revision 1.18
diff -b -B -U 1000000 -r1.17 -r1.18
--- src/hg/archaeStuff/scripts/make-genome-info	10 Aug 2009 03:12:10 -0000	1.17
+++ src/hg/archaeStuff/scripts/make-genome-info	10 Aug 2009 04:20:14 -0000	1.18
@@ -1,759 +1,759 @@
 #! /usr/bin/perl -w
 
 ## make-genome-info
 ## Todd Lowe  October 2005
 ##
 ## A Perl script adapted from setting up scans of all prokaryotic genomes for tRNAs,
 ## now used to collect genome info to set up prokaryotic browsers in one program with 
 ## the companion script "make-browser"
 ## Sends genome info to STDOUT, but "Genome-info-db" is the suggested output file name
 ## Warnings and error messages sent STDERR
 ## Known to not work correctly with draft genomes which have incomplete file sets at NCBI
 ##
 ## Warning -- globals are used throughout from sheer laziness, but program is 
 ##   reasonably well tested
 
 # CVS version tag
 # $Id$
 
 use Getopt::Std;
 
 # Constants
 
 # tRNAscan-SE run options for various domains
 $prog_options{"Archaea"}  = "-o# -f# -m# -F# -HyA";
 $prog_options{"Bacteria"} = "-o# -f# -m# -F# -HyB";
 $prog_options{"Eukaryota"} =  "-o# -f# -m# -F# -Hy";
 $prog_options{"Viruses"} =  "-o# -f# -m# -F# -HyB";
 
 # TIGR CMR pages info
 my $tigr_genome_list_link = "http://cmr.tigr.org/tigr-scripts/CMR/shared/Genomes.cgi";
 our $tigr_id_page = "TIGR_id_page.html";
 
 # JGI genome list
 my $jgi_genome_list_link = "http://img.jgi.doe.gov/cgi-bin/pub/main.cgi?section=TaxonList&mainPageStats=1";
 #my $jgi_genome_list_link = "http://img.jgi.doe.gov/cgi-bin/pub/main.cgi?section=TaxonList&page=taxonListAlpha&mainPageStats=1";
 # old JGI info link: "http://img.jgi.doe.gov/cgi-bin/pub/main.cgi?page=taxonListAlpha";
 our $jgi_id_page = "JGI_id_page.html";
 
 our $swissProt_species_list_link = "http://www.expasy.ch/cgi-bin/speclist";
 our $sp_id_page = "SP_id_page.html";
 our $stringDb_list_link = "http://string32.embl.de/";
 our $stringDb_id_page = "stringDB_id_page.html";
 
 our @Blat_ID = ();
 
 # Starting priority number for arranging species in the browser (will increment for each new species)
 $Domain_ct{"Archaea"} = 200;
 $Domain_ct{"Bacteria"} = 500;  
 $Domain_ct{"Eukaryota"} = 100;
 $Domain_ct{"Viruses"} = 2000;
 
 $opt_k = '';
 &getopts('k:');
 
 $search_key = $opt_k;
 
 if ($#ARGV < 0) {
     die "\nUsage: make-genome-info  [-k <key>] <genomes directory>\n\n",
     "  Outputs basic info on all completed prokaryotes from local mirror of NCBI \n",
     "         genome directory (ftp://ftp.ncbi.nih.gov/genomes/Bacteria/)\n\n",
     "     Hint: An availabe local mirror is /projects/lowelab/db/genomes/Bacteria\n",
     "     Default usage: make-genome-info /projects/lowelab/db/genomes/Bacteria\n\n"; 
 }				
 
 $genomes_dir = shift(@ARGV);
 
 opendir(GB_GENOMES,$genomes_dir) || 
     die "Could not read $genomes_dir\n\n";
 
 @all_orgs = readdir(GB_GENOMES);
 splice(@all_orgs,0,2);   # delete ".", "..","README","accession" entries 
 
 # Get current list of TIGR CMR genomes 
 #system("lynx -source $tigr_genome_list_link |grep 'GenomePage' > $tigr_id_page");
 
 print STDERR "\nGrabbing current copy of JGI species IDs\n";
 # Get current list of JGI IMG genomes 
 if (!-r $jgi_id_page) {
     system("lynx -source \"$jgi_genome_list_link\" > $jgi_id_page");
 }
 
 # Get current list of SwissProt species names
 if (!-r $sp_id_page) {
     print STDERR "\nGrabbing current copy of Swissprot species list names\n";
     system("lynx -dump -nolist -width=300 \"$swissProt_species_list_link\" > $sp_id_page");
 }
 
 # Get current list of StringDB IDs
 system("lynx -source \"$stringDb_list_link\" > $stringDb_id_page");
 
 print "## This file is generated automatically with 'make-genome-info' Perl script\n",
     "## Do NOT edit this file -- edit a copy (i.e. Genome-info-mods) which only has fields that you\n",
     "## have hand-edited.  Manual changes to Genome-info-mods will always override changes to this file\n",
     "## in 'make-browser' script\n\n";
 
 $org_name = '';
 $species_ct = 0;
 
 # Loop through all the species directories found
 foreach $org_name (sort @all_orgs) {
 
     #Skip non-directories and non-organism directories
     
     if ((!-d "$genomes_dir/$org_name") ||
 	($org_name !~ /$search_key/))
     { 
 	next; 
     }
     if ($org_name =~ /^(TEMP|CLUSTER)/) {
 	next;
     }
 
     # Get Organism domain info
     opendir(ORG_DIR,"$genomes_dir/$org_name/") ||
 	die "Unable to open $genomes_dir/$org_name/ to read seq files\n";
     my @all_seq_files = readdir(ORG_DIR);
     local $first_gbk = '';
     for (my $filect=0; $filect <= $#all_seq_files; $filect++) 
     {
 	if ($all_seq_files[$filect] =~ /\.gbk/) {
 	    $first_gbk = $all_seq_files[$filect];
 	    last;
 	}
     }
     $full_tax_list = `head -20 "$genomes_dir/$org_name/$first_gbk" | grep -A2 "ORGANISM" | grep -v ORGANISM | grep -v REFERENCE`;
     $full_tax_list =~ s/\n/ /g;
     $full_tax_list =~ s/[ ]+/ /g;
     $full_tax_list =~ s/^[ ]*//g;
     ($org_domain,@rest) = split(/; /,$full_tax_list);    
 
     &Set_org_abbrev;
     &Read_report_files;
     &Read_summary_files;
     &Read_ncbi_summary_txt;
     if (!$tax_id) {
 	&Get_tax_id_from_gbk;
     }
     &Create_new_seqfile_names;
     &Get_GC_content;
 #    &Get_TIGR_Genome_ID;
     &Get_SwissProt_ID;
     &Get_JGI_Genome_ID;
     &Get_StringDb_ID;
 
     if ($org_domain eq "Archaea") {
 	$Domain_ct{$org_domain}+= 2;
     }
     else {
 	$Domain_ct{$org_domain}+= 1;
     }
     
     &Output_entry;
     
 #    Uncomment this if you want to see the genome names scroll by
 #    print STDERR "\nCreated entry for $org_abbr\n";
     $species_ct++;    
     print STDERR ".";
 }
 
 print STDERR "\nCompleted outputing genome information for $species_ct species\n";
 
 #############
 ## End Main #
 #############
 
 
 sub Set_org_abbrev {
     
     $org_name_str = $org_name;
     $org_name_str =~ s/_/ /g;
     
     ($org_genus,$org_species,@rest) = split(/_/,$org_name);
     $org_species =~ s/-//g;
     $org_genus   =~ s/-//g;
 
     # Get search string for last word in full name for later matching duties
     $org_str_tail = $rest[$#rest];
     if (!defined($org_str_tail)) {
 	$org_str_tail = '';
     }
 
     # Shorten strain name for $org_dbname
     $strain_info = uc(join("_",@rest));
     $strain_info =~ s/ATCC_/ATCC/;
     $strain_info =~ s/DSM_/DSM/;
     $strain_info =~ s/SEROVAR//;
     $strain_info =~ s/BIOVAR//;
     $strain_info =~ s/-/_/;
     $strain_info =~ s/\./_/;
     
     $strain_str = '';
     if ($strain_info ne '') {
 	$strain_str = substr($strain_info,0,12);
     }
 
     if ($org_domain eq "Viruses") {
 	$org_search_string = $org_name;
     }
     else {
 	$org_search_string = $org_genus." ".$org_species;
     }
     $org_abbr = join("_",substr($org_genus,0,4),substr($org_species,0,4),@rest);
     $org_dbname = join("",
 		       lc(substr($org_genus,0,4)),
 		       ucfirst(substr($org_species,0,4)),
 		       "_",
 		       $strain_str);
 
     $org_dbname =~ s/__/_/g;
     if (substr($org_dbname,length($org_dbname)-1,1) eq '_') {
 	chop($org_dbname);
     }
     
     if (defined($dbnames{$org_dbname})) {
 	$dbnames{$org_dbname}++;
 	$org_dbname =~ s/$strain_str/$strain_info/;
 	$org_dbname =~ s/__/_/g;
     }
     else {
 	$dbnames{$org_dbname} = 1;
     }
 }
 
 
 # Read *.rpt files in each organism's directory to collect more stats
 
 sub Read_report_files 
 {
 
     $genome_id_list = 0;
     $seq_acc = '';
     $full_tax_name = '';
     $orig_tax_name = '';
     $tax_id = 0;
     $last_genome_id = 0;
     $gene_count = '';
     $seq_len = '';
     $pub_id = 0;
 
-    $report_info = `cat $genomes_dir/$org_name/[ACN][PLZC]*.rpt`;
+    $report_info = `cat $genomes_dir/$org_name/[AN][ZC]*.rpt`;
     @report_lines = split(/\n/,$report_info);
     foreach $line (@report_lines) {
 	if ($line =~ /^Taxname:\s(.+)$/) {
 	    $temp_tax_name = $1;
 	    if (($temp_tax_name =~ /phage/) && 
 		($org_domain ne "Viruses"))
 	    {
 		last;
 	    }
 	    $full_tax_name = $1;
 	    $orig_tax_name = $1;
 	}
 	elsif ($line =~ /^Taxid:\s(\d+)/) {
 	    $tax_id = $1;
 	}
 	elsif ($line =~ /^Publications:\s+(\d+)/) {
 	    $pub_id = $1;
 	}
 	elsif ($line =~ /^Genome_ID = (\d+)/) {
 	    $genome_id_list .= $1." ";
 	    $last_genome_id = $1;
 	}
 	elsif ($line =~ /^DNA\s+length\s=\s+(\d+)/) {
 	    $seq_len .= $1." ";
 	}
 	elsif ($line =~ /^Accession: (\S+)\.\d+/) {
 	    $seq_acc .= $1." ";
 	}
 	elsif ($line =~ /^Gene count:\s+(\d+)/) {
 	    $gene_count .= $1." ";
 	}
     }
     chop($seq_acc);
 }
 
 
 ## Read info in NCBI Genomes /Bacteria directory with summary info for prok orgs
 
 sub Read_summary_files {
  
     if ($full_tax_name eq '') {
 	$full_tax_name = $org_search_string;
 	$full_tax_name =~ s/_/ /g;
     }
 
     $domain_letter   = "X";
     $tax_id_2        = 0;
     $full_tax_name_2 = '';
     $seq_source      = '';
     $genome_mb       = 0;
     $seq_acc_2       = '';
     $seq_date        = "-";
 #    $org_domain = "Unknown";
 
  
     if (!-r "$genomes_dir/Complete.txt") {
 	print STDERR "Could not find Complete.txt for this genome -- skipping summary file read\n";
 	return 0;
     }
     $summary_info =  `grep -i "$full_tax_name" $genomes_dir/Complete.txt`; 
     $summary_info .= `grep -i "$full_tax_name" $genomes_dir/InProgress.txt`; 
     
     @summary_lines = split(/\n/,$summary_info);
 
     $seq_acc_2 = 'Unknown';
     $line_ct=0;
     if (defined($summary_lines[$line_ct]) &&
 	(($summary_lines[$line_ct] =~ /^(\S)\t(\d+)\t(\S[^\t]*)\t([^\t]+)\t(\S+)\t\S+\,(N\S+)\t(\S+)/) ||
 	 ($summary_lines[$line_ct] =~ /^(\S)\t(\d+)\t(\S[^\t]*)\t([^\t]*)\t(\S+)\t(N\S+)/) ||
 	 ($summary_lines[$line_ct] =~ /^(\S)\t(\d+)\t(\S[^\t]*)\t([^\t]*)\t(\S+)/))) {
 	$domain_letter   = $1;
 	$tax_id_2        = $2;
 	$full_tax_name_2 = $3;
 	$seq_source      = $4;
 	$genome_mb       = $5;
 	$seq_acc_2       = $6;
 	$seq_date        = $7;
  
 	while (($line_ct < $#summary_lines) && ($seq_acc !~ /$seq_acc_2/)) 
 	{
 	    $line_ct++;
 	    if (($summary_lines[$line_ct] =~ /^(\S)\t(\d+)\t(\S[^\t]*)\t([^\t]*)\t(\S+)\t\S+\,(N\S+)\t(\S+)/) ||
 		($summary_lines[$line_ct] =~ /^(\S)\t(\d+)\t(\S[^\t]*)\t([^\t]*)\t(\S+)\t(N\S+)/) ||
 		($summary_lines[$line_ct] =~ /^(\S)\t(\d+)\t(\S[^\t]*)\t([^\t]*)\t(\S+)/)) {
 		$domain_letter   = $1;
 		$tax_id_2        = $2;
 		$full_tax_name_2 = $3;
 		$seq_source      = $4;
 		$genome_mb       = $5;
 		$seq_acc_2       = $6;
 		$seq_date        = $7;
 	    }
 	}
 	if ($tax_id == 0) {
 	    $tax_id = $tax_id_2;
 	}
         if ($domain_letter eq "A") {
 	    $org_domain = "Archaea";
 	}
 	elsif ($domain_letter eq "B") {
 	    $org_domain = "Bacteria";
 	}
 	elsif ($domain_letter eq "E") {
 	    $org_domain = "Eukaryota";
 	}
 	elsif ($domain_letter eq "V") {
 	    $org_domain = "Viruses";
 	}
 	
 	# Pick the longer taxonomy name which hopefully has strain specification in it
 	if (length($full_tax_name_2) >  length($full_tax_name)) {
 	    $full_tax_name = $full_tax_name_2;
 	}
 	if ($seq_acc !~ /$seq_acc_2/) {
 	    print STDERR "\nWARNING: Seqence accession numbers ($seq_acc -> $seq_acc_2) do not match for $full_tax_name between *.rpt and Complete.txt\n";
 	}
 	if (!defined($seq_date)) {
 	    $seq_date = "-";
 	}
 	if ($tax_id != $tax_id_2) {
 	    print STDERR "\nWARNING: Tax ids do not match ($tax_id -> $tax_id_2) between *.rpt and Complete.txt for $org_name\n";
 	}
 
     }
     else {
 	print STDERR "\nUnable to find $full_tax_name in Complete.txt or InProgress.txt files\n";
     }
     
 }
 
 ## Read NCBI genome summary.txt to obtain GenBank accession numbers
 sub Read_ncbi_summary_txt {
     
     %genbank_acc_2 = ();
     $genbank_acc = '';
     $gb_acc = '';
     $seq_acc_2 = '';
     $tax_id_2 = 0;
     @refseq_acc = split(/ /, $seq_acc);
     
 
     if (!-r "$genomes_dir/summary.txt") {
 	print STDERR "Could not find summary.txt file for this genome -- skipping\n";
 	return 0;
     }
 
     $summary_info =  `grep -i "$full_tax_name" $genomes_dir/summary.txt`; 
     @summary_lines = split(/\n/,$summary_info);
 
     if (defined $summary_lines[0]) {
         foreach $summary_line (@summary_lines) 
 	{
             if (($summary_line =~ /^(\S+)\t(\S+)\t(\d+)\t(\d+)\t(\d+)\t(\S[^\t]*)\t(\S[^\t]*)\t(\S[^\t]*)\t(\S[^\t]*)/) ||
 		($summary_line =~ /^(\S+)\t(\S+)\t(\d+)\t(\d+)\t(\d+)\t([^\t]*)[\t]*([^\t]*)[\t]*([^\t]*)[\t]*([^\t]*)/))
 	    {
                 $seq_acc_2 = $1;
                 $gb_acc = $2;
                 $tax_id_2 = $4;
             
                 if ($seq_acc_2 =~ /./) {
                     $seq_acc_2 = substr($seq_acc_2, 0, index($seq_acc_2, '.'));
                 }
                 if ($gb_acc =~ /./) {
                     $gb_acc = substr($gb_acc, 0, index($gb_acc, '.'));
                 }
 
                 if ($seq_acc !~ /$seq_acc_2/) {
                     print STDERR "\nWARNING: Seqence accession numbers ($seq_acc -> $seq_acc_2) do not match for $full_tax_name between *.rpt and summary.txt\n";
                 }
                 if ($tax_id != $tax_id_2) {
                     print STDERR "\nWARNING: Tax ids do not match ($tax_id -> $tax_id_2) between *.rpt and summary.txt for $org_name\n";
                 }
     
                 $genbank_acc_2{$seq_acc_2} = $gb_acc;
             }                  
         }
         foreach $acc (@refseq_acc) {
             if (defined $genbank_acc_2{$acc}) {
                 $genbank_acc .= $genbank_acc_2{$acc} . " ";
             }
         }
     }
     else {
 	print STDERR "\nUnable to find $full_tax_name in summary.txt files\n";
     }
  }
 
 sub Get_tax_id_from_gbk 
 {
     my $tax_id_line = `grep taxon: $genomes_dir/$org_name/$first_gbk`;
     if ($tax_id_line =~ /db_xref=\"taxon\:(\d+)\"/) {
 	$tax_id = $1;
 	print STDERR "Found tax ID in Genbank file: $tax_id\n";
     }
 }
 
 sub Create_new_seqfile_names {
     
     my (@all_seq_files, $seq_file, $cur_gene_ct);
     $blat_DNA_ID = 0;
     $blat_prot_ID = 0;
     $new_name = '';
     $name_map = '';
     @gen_seqs = split(/ /,$seq_acc);
     
     if ($#gen_seqs < 0) 
     {
 	opendir(ORG_DIR,"$genomes_dir/$org_name") ||
 	    die "Unable to open $genomes_dir/$org_name to read seq files\n";
 	@all_seq_files = readdir(ORG_DIR);
 	foreach $seq_file (@all_seq_files) {
 	    if ($seq_file =~ /^(N\S_\S+)\.gbk/) {
 		push (@gen_seqs,$1);
 		$seq_acc .= $1." ";
 		$cur_gene_ct = int(`grep ' CDS ' $genomes_dir/$org_name/$seq_file | wc -l |  awk '{print \$1}'`);
 		$gene_count .= $cur_gene_ct." ";
 		$cur_seq_len = int(`head -1 $genomes_dir/$org_name/$seq_file | awk '{print \$3}'`);
 		$seq_len .= $cur_seq_len." ";
 	    }
 	}
     }
     $short_name = $org_name;
     $short_name =~ s/_/ /g;
     $orig_tax_name =~ s/(\W)/\\$1/g;
 
     for $seq (@gen_seqs) 
     {
 	$seq_def_tmp = `head "$genomes_dir/$org_name/$seq.gbk" | grep -A2 "^DEFINITION"`;
 	$seq_def_tmp =~ s/\n/ /g;
 	$seq_def_tmp =~ s/[ ]+/ /g;
 	($seq_def,@rest) = split(/ACCESSION/,$seq_def_tmp);
 
 	$seq_def_simplified = $seq_def;
 	$seq_def_simplified =~ s/(\W)/\\$1/g;
 	if (($seq_def =~ /DEFINITION\s+$full_tax_name\s*(.*)\,/) ||
 	    ($seq_def_simplified =~ /DEFINITION\s+$orig_tax_name\s*(.*)\,/) ||
 	    ($seq_def_simplified =~ /DEFINITION\s+$short_name\s*(.*)\,/) ||
 	    ($seq_def =~ /DEFINITION\s+$org_genus\s+$org_species\s+.*$org_str_tail\s*(.*)\,/) ||
 	    ($seq_def =~ /DEFINITION\s+$org_genus\s+$org_species\s+(.*)\,/) ||
 	    ($seq_def_simplified =~ /DEFINITION\s+$org_genus\s+$org_species\s+.*$org_str_tail\s*(.*)\,/))
 	{
 	    $new_name = $1;
 	    $new_name =~ s/chromosomal /chrom/g;
 	    $new_name =~ s/chromosome /chr/g;
 	    $new_name =~ s/ /_/g;
 	    if (($new_name eq '') ||
 		($new_name eq ')'))
 	    {
 		$new_name = "chr";
 	    }
 	    $name_map .= "$new_name ";
 	    
 	}
 	else {
 	    print STDERR "\nWarning: Could not grep ($orig_tax_name) OR ($short_name) from $seq_def_simplified\n";
 
 	    if ($org_domain eq "Viruses") {
 		$name_map .= "chr ";
 	    }
 	    else {
 		$name_map .= "$seq ";
 	    }
 	}
 	# Get full taxonomy from genbank sequence
     
 	$full_tax_list = `head -20 "$genomes_dir/$org_name/$seq.gbk" | grep -A2 "ORGANISM" | grep -v ORGANISM | grep -v REFERENCE`;
 	$full_tax_list =~ s/\n/ /g;
 	$full_tax_list =~ s/[ ]+/ /g;
 	$full_tax_list =~ s/^[ ]*//g;
 
 	if ($org_domain eq "Unknown") {
 	    ($org_domain,@rest) = split(/; /,$full_tax_list);
 	    $Domain_ct{$org_domain}+= 1;
 	}
     }
 
     if ($org_domain eq "Viruses") {
 	$db_src = "VIRUS";
     }
     elsif ($org_domain eq "Eukaryota") {
 	$db_src = "EUKARYA";
     }
     else {
 	$db_src = "NCBI";
     }
 
     # Pick a unique BLAT port number based on the NC number for the RefSeq sequence
     # If no RefSeq seqs available or not a standard NC number, start with a high port number
     if (!defined($gen_seqs[0]) || ($gen_seqs[0] !~ /NC/)) {
 	$cand_ID = 60000;
     }
     else {
 	# Pick last 4 digits of NC refseq number
 	$cand_ID = int(substr($gen_seqs[0],5,4));
 	# Port numbers less than 5000 seem to have conflicts, so bump up to higher ports
 	if ($cand_ID < 5000) {
 	    $cand_ID += 20000;
 	}
     }
     while (defined($Blat_ID[$cand_ID])) {
 	$cand_ID++;
     }
     $blat_DNA_ID = $cand_ID;
     $Blat_ID[$cand_ID] = 1;
     $blat_prot_ID = $blat_DNA_ID+1;
     
     while (defined($Blat_ID[$blat_prot_ID])) {
 	$blat_prot_ID++;
     }
     $Blat_ID[$blat_prot_ID] = 1;
 }
 
 sub Get_GC_content 
 {
     $seq_gc = '';
     
     @gen_seqs = split(/ /,$seq_acc);
     
     for $seq (@gen_seqs) {
 	$cur_gc = `/projects/lowelab/share/bin/i386/seq2stat -f "$genomes_dir/$org_name/$seq.gbk" | grep "GC content" | awk '{print \$3}'`; 
 	chop($cur_gc);
 	$seq_gc .= $cur_gc." ";
     }
 }
 
 sub Get_TIGR_Genome_ID
 {
 
     $tigr_genome_id = '';
     my $genus = '';
     my $species = '';
     my @rest;
     my $strain = '';
     my $org_name_str = $full_tax_name;
     $org_name_str =~ s/-//g;
     ($genus,$species,@rest) = split(/[ .']/,$org_name_str);
 
     if ($#rest >= 0) {
 	$strain = pop(@rest);
     }
     else {
 	$strain = ' ';
     }
 
     my $genome_id_lines = `grep -i "$full_tax_name" $tigr_id_page`;
     if ($genome_id_lines eq '') {
 	$genome_id_lines = `grep -i $genus $tigr_id_page | grep -i $species | grep -i "$strain"`;
     }
     
     @genome_id_list = split(/\n/,$genome_id_lines);
     foreach my $line (@genome_id_list) {
 	if ($line =~ /org=(\S+)\".+$genus/) {
 	    $tigr_genome_id = $1;
 	}
     }
     
     if ($tigr_genome_id eq '') {
 	print STDERR "\nCould not find TIGR genome ID for $full_tax_name\n";
     }
 }
 
 sub Get_SwissProt_ID {
 
     my $tax_name = $org_search_string;
     $tax_name =~ s/_/ /g;
 
     ($genus,$species,@rest) = split(/[ .']/,$tax_name);
 
     if ($#rest >= 0) {
  	$strain = pop(@rest);
     }
     else {
 	$strain = '';
     }
 
     my $sp_id_lines = `egrep -i "($tax_name|$genus|$species)" $sp_id_page`;
 
     $sp_species_id = '';
     my $sp_id_ct = 0;
     
     @sp_id_list = split(/\n/,$sp_id_lines);
     foreach my $line (@sp_id_list) {
 	if (($line =~ /^(\S+)\s+(\S)\s+($tax_id)\:\s+\S\=(\S+)/) ||
 	     ($line =~ /^(\S+)\s+(\S)\s+(\d+)\:\s+\S\=$genus.+$species.+$strain/) ||
 	     ($line =~ /^(\S+)\s+(\S)\s+($tax_id)\:\s+\S\=.+$species.+$strain/))
 	    {
 	    $sp_species_id .= $1." ";
 	    $sp_id_ct++;
 	}
     }
     if ($sp_id_ct == 0) {
 	foreach my $line (@sp_id_list) {
 	    if ($line =~ /^(\S+)\s+(\S)\s+(\S+)\:\s+\S\=$tax_name/) {
 		$sp_species_id .= $1." ";
 		$sp_id_ct++;
 	    }
 	}
     }
     
     if ($sp_id_ct == 0) {
 	print STDERR "\nCould not find a SwissProt species ID for $tax_name $strain\n";
     }
     elsif ($sp_id_ct > 5) {
 	print STDERR "\nWarning: Many ($sp_id_ct) SwissProt species IDs for $tax_name $strain\n";
     }    
 }
 
 sub Get_JGI_Genome_ID
 {
 
     $jgi_genome_id = 0;
     my $genus = '';
     my $species = '';
     my @rest;
     my $strain = '';
     my $org_name_str = $full_tax_name;
     $org_name_str =~ s/-//g;
     ($genus,$species,@rest) = split(/[ .']/,$org_name_str);
 
     if ($#rest >= 0) {
  	$strain = pop(@rest);
     }
     else {
 	$strain = ' ';
     }
 
     my $genome_id_lines = `grep -i "$full_tax_name" $jgi_id_page`;
     if ($genome_id_lines eq '') {
 	$genome_id_lines = `grep -i $genus $jgi_id_page | grep -i $species | grep -i "$strain"`;
     }
     
     @genome_id_list = split(/\n/,$genome_id_lines);
     foreach my $line (@genome_id_list) {
 	if ($line =~ /taxon_oid=(\d+).+$genus/) {
 	    $jgi_genome_id = $1;
             last;
 	}
     }
     
     if ($jgi_genome_id == 0) {
 	print STDERR "\nCould not find JGI genome ID for $full_tax_name\n";
     }
 }
 
 sub Get_StringDb_ID
 {
     $stringDb_id = 0;
     my $genus = '';
     my $species = '';
     my @rest;
     my $strain = '';
     my $org_name_str = $full_tax_name;
     $org_name_str =~ s/-//g;
     ($genus,$species,@rest) = split(/[ .\']/,$org_name_str);
     
     if ($#rest >= 0) {
  	$strain = pop(@rest);
     }
     else {
 	$strain = ' ';
     }
     
     my $genome_id_lines = `grep -i "$full_tax_name" $stringDb_id_page`;
     if ($genome_id_lines eq '') {
 	$genome_id_lines = `grep -i $genus $stringDb_id_page | grep -i $species | grep -i "$strain"`;
     }
     if ($genome_id_lines eq '') {
 	$genome_id_lines = `grep -i $genus $stringDb_id_page | grep -i $species`;
     }
     
     @genome_id_list = split(/\n/,$genome_id_lines);
     foreach my $line (@genome_id_list) {
 	if ($line =~ /value=\'(\d+)\'\>$genus $species/) 
 	{
 	    $stringDb_id = $1;
             last;
 	}
     }
     if (!$stringDb_id) {
 	print STDERR "\nCould not find String DB ID for $full_tax_name\n";
     }
 }
 
 sub Output_entry {
     print 
 	"Org_abr:  $org_abbr\n",
 	"Org_name: $org_name_str\n",
 	"Menu_name: $org_name_str\n",
 	"Domain:   $org_domain\n",
 	"DB_name:  $org_dbname\n",
 	"DB_src:   $db_src\n",
 	"Tax_name: $full_tax_name\n",
 	"BLAT_IDs: $blat_DNA_ID $blat_prot_ID\n",
 	"Tax_ID:   $tax_id: $full_tax_list\n",
 #	"TIGR_ID:  $tigr_genome_id\n",
 	"TIGR_ID:  0\n",
 	"JGI_ID:   $jgi_genome_id\n",
 	"SP_ID:    $sp_species_id\n",
 	"String_ID: $stringDb_id\n",
 	"Dom_ct:   $Domain_ct{$org_domain}\n",
 	"Gen_ID:   $genome_id_list\n",
 	"Seq_acc:  $seq_acc\n",
 	"Name_chg: $name_map\n",
         "Gb_acc:   $genbank_acc\n",
 	"Seq_size: $seq_len\n",
 	"Gene_ct:  $gene_count\n",
 	"Seq_GC:   $seq_gc\n",
 	"Pub_id:   $pub_id\n",
 	"Rel_date: $seq_date\n",
 	"Seq_src:  $seq_source\n",
 	"Web_seq:  ftp://ftp.ncbi.nih.gov/genomes/Bacteria/$org_name/\n",
 	"Web_db:   http://www.ncbi.nlm.nih.gov/genomes/framik.cgi?db=Genome&gi=$last_genome_id\n",
 	"Trna_opt: $prog_options{$org_domain}\n",
 	"Trna_abr: $org_abbr"."-tRNAs\n",
 	"Gen_seq:  $genomes_dir"."/".$org_name."/".$org_abbr.".fa\n",
 	"Prot_seq: $genomes_dir"."/".$org_name."/".$org_abbr."-prot.fa\n",
 	"Feat_seq: $genomes_dir"."/".$org_name."/".$org_abbr."-feat.fa\n\n";
 
 }