e78b1f8b7121149606b00890d327b78c06cd2522 hiram Fri Feb 25 11:01:10 2022 -0800 cleaner determination of genBankAccession no redmine diff --git src/hg/utils/automation/prepConfig.pl src/hg/utils/automation/prepConfig.pl index 8be29a4..3803823 100755 --- src/hg/utils/automation/prepConfig.pl +++ src/hg/utils/automation/prepConfig.pl @@ -1,181 +1,183 @@ #!/usr/bin/env perl use strict; use warnings; my $argc = scalar(@ARGV); sub usage() { printf STDERR "usage: prepConfig.pl <db> <clade> <trackDbDir> ./refseq/*_assembly_report.txt\n\n"; printf STDERR "also expects to find photoReference.txt file in this directory\n"; printf STDERR " with two lines:\n"; printf STDERR "photoCreditURL xxx\n"; printf STDERR "photoCreditName xxx\n\n"; printf STDERR "must specify a <db> UCSC database id\n"; printf STDERR "where clade is one of:\n"; print STDERR `hgsql -N -e 'select name from clade order by priority;' hgcentraltest | xargs echo | fold -s -w 60 | sed -e 's/^/# /;'`; printf STDERR "and <trackDbDir> is a directory in trackDb/ directory\n"; printf STDERR "for example: birds bats chicken etc...\n"; printf STDERR "existing directories:\n"; print STDERR `(cd ~/kent/src/hg/makeDb/trackDb; ls -ogrt) | grep "^d" | awk '{print \$NF}' | sort | xargs echo | fold -s -w 70 | sed -e 's/^/# /;'`; } if ($argc != 4) { usage; exit 255; } if ( ! -s "photoReference.txt" ) { printf STDERR "#################################################\n"; printf STDERR "ERROR: must have file photoReference.txt present\n"; printf STDERR "#################################################\n"; usage; exit 255; } my $db = shift; my $clade = shift; my $trackDbDir = shift; my $asmReport = shift; my $mitoChr2Acc = $asmReport; $mitoChr2Acc =~ s/_assembly_report.txt//; $mitoChr2Acc .= "_assembly_structure/non-nuclear/assembled_chromosomes/chr2acc"; my @monthNumber = qw( Zero Jan. Feb. Mar. Apr. May Jun. Jul. Aug. Sep. Oct. Nov. Dec. ); my $asmDate = "notFound"; my $asmName = "notFound"; my $sciName = "notFound"; my $commonName = "notFound"; my $mySqlCommonName = "notFound"; my $submitter = "notFound"; my $genBankAccessionID = "notFound"; my $taxId = "notFound"; my $ncbiBioProject = "notFound"; my $ncbiBioSample = "notFound"; my $ncbiGenomeId = "notFound"; my $ncbiAssemblyId = "notFound"; my $genomeCladePriority = "notFound"; open (FH, "<$asmReport") or die "can not read $asmReport"; while (my $line = <FH>) { chomp $line; $line =~ s/ //g; if ($line =~ m/^#\s+Assembly name:/i) { $asmName = $line; $asmName =~ s/.*y name:\s+//i; $ncbiAssemblyId = `wget -O /dev/stdout "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=assembly&term=$asmName" 2> /dev/null | grep "<Id>" | head -1 | sed -e 's/[<>Id/]//g;'`; chomp $ncbiAssemblyId; } elsif ($line =~ m/^#\s+Organism name:/) { if ($line =~ m/\s+\(/) { $commonName = $line; $commonName =~ s/.*\(//; $commonName =~ s/\)//; $commonName = ucfirst($commonName); $mySqlCommonName = $commonName; $mySqlCommonName =~ s/'/_/g; # effective escape ' characters for MySQL } $sciName = $line; $sciName =~ s/.*m name:\s+//; $sciName =~ s/\s+\(.*//; } elsif ($line =~ m/^#\s+Date:/) { $asmDate = $line; $asmDate =~ s/.*Date:\s+//; my ($year, $month, $day) = split('-', $asmDate); $asmDate = sprintf("%s %s", $monthNumber[$month], $year); } elsif ($line =~ m/^#\s+Submitter:/) { $submitter = $line; $submitter =~ s/.*Submitter:\s+//; } elsif ($line =~ m/^#\s+RefSeq assembly accession:/i) { + if ($genBankAccessionID =~ m/notFound/) { $genBankAccessionID = $line; $genBankAccessionID =~ s/.*ccession:\s+//; $genBankAccessionID =~ s/ .*//; + } } elsif ($line =~ m/^#\s+GenBank assembly accession:/i) { if ($genBankAccessionID =~ m/notFound/) { $genBankAccessionID = $line; $genBankAccessionID =~ s/.*ccession:\s+//; $genBankAccessionID =~ s/ .*//; } } elsif ($line =~ m/^#\s+Taxid:/i) { $taxId = $line; $taxId =~ s/.*Taxid:\s+//; } elsif ($line =~ m/^#\s+BioSample:/) { $ncbiBioSample = $line; $ncbiBioSample =~ s/.*BioSample:\s+//; } elsif ($line =~ m/^#\s+BioProject:/) { $ncbiBioProject = $line; $ncbiBioProject =~ s/.*BioProject:\s+//; $ncbiGenomeId = `wget -O /dev/stdout "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=genome&term=$ncbiBioProject" 2> /dev/null | grep "<Id>" | sed -e 's/[<>Id/]//g;'`; $ncbiBioProject =~ s/PRJNA//; chomp $ncbiGenomeId; } } close (FH); my $previousOrder = `hgsql -N -e 'select orderKey from dbDb where organism="$mySqlCommonName";' hgcentraltest | wc -l`; chomp $previousOrder; my $orderKey = 0; if ($previousOrder > 0) { $orderKey = `hgsql -N -e 'select min(orderKey) from dbDb where organism="$mySqlCommonName";' hgcentraltest | awk '{printf "%d", \$1-1}'`; } else { $orderKey = `(printf "%s\t%s\n" "$db" "$commonName"; hgsql -N -e 'select name,organism,orderKey from dbDb order by orderKey;' hgcentraltest) | sort -k2 | grep -C 1 "$commonName" | cut -f3 | xargs echo | awk '{printf "%d", \$1+(\$2-\$1)/2}'`; } chomp $orderKey; my $mitoAcc = "notFound"; if ( -s $mitoChr2Acc ) { $mitoAcc = `tail -1 $mitoChr2Acc | awk '{print \$NF}'`; chomp $mitoAcc; } else { printf STDERR "# going to need a mitoAcc ?\n"; } # do not need a genomeCladePriority if it already exists for this organism my $genomeCladeExists = `hgsql -N -e 'select genome from genomeClade where genome="$mySqlCommonName";' hgcentraltest | wc -l`; chomp $genomeCladeExists; # if does not exist, use the most common value for this clade if ( $genomeCladeExists != 1 ) { $genomeCladePriority = `hgsql -N -e 'select priority from genomeClade where clade="$clade";' hgcentraltest | sort | uniq -c | sort -rn | head -1 | awk '{print \$NF}'`; chomp $genomeCladePriority; } if ($ncbiGenomeId eq "notFound") { my $searchTerm = $sciName; $searchTerm =~ s/ /+/g; $ncbiGenomeId = `wget -O /dev/stdout "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=genome&term=$searchTerm" 2> /dev/null | grep "<Id>" | sed -e 's/[<>Id/]//g;'`; chomp $ncbiGenomeId; } printf "# config parameters for makeGenomeDb.pl:\n"; printf "db %s\n", $db; printf "clade %s\n", $clade; if ( $genomeCladeExists != 1 ) { printf "genomeCladePriority %d\n", $genomeCladePriority; } printf "scientificName %s\n", $sciName; printf "commonName %s\n", ucfirst($commonName); printf "assemblyDate %s\n", $asmDate; printf "assemblyLabel %s\n", $submitter; printf "assemblyShortLabel %s\n", $asmName; printf "orderKey %s\n", $orderKey; if ( -s $mitoChr2Acc ) { printf "# mitochondrial sequence included in refseq release\n"; printf "# mitoAcc %s\n", $mitoAcc; printf "mitoAcc none\n"; } else { printf "mitoAcc %s\n", $mitoAcc; } printf "fastaFiles /hive/data/genomes/$db/ucsc/*.fa.gz\n"; printf "agpFiles /hive/data/genomes/$db/ucsc/*.agp\n"; printf "# qualFiles none\n"; printf "dbDbSpeciesDir %s\n", $trackDbDir; print `cat photoReference.txt`; # printf "photoCreditURL xxx\n"; # printf "photoCreditName xxx\n"; printf "ncbiGenomeId %s\n", $ncbiGenomeId; printf "ncbiAssemblyId %s\n", $ncbiAssemblyId; printf "ncbiAssemblyName %s\n", $asmName; printf "ncbiBioProject %s\n", $ncbiBioProject; printf "ncbiBioSample %s\n", $ncbiBioSample; printf "genBankAccessionID %s\n", $genBankAccessionID; printf "taxId %s\n", $taxId;