b313555b67a1aff99cd940e4bca28677b35b608d hiram Mon May 9 15:58:39 2022 -0700 fixup note about how to search, better include almost all GenArk assemblies no redmine diff --git src/hg/gar/garTable.pl src/hg/gar/garTable.pl index c080f52..962b4b5 100755 --- src/hg/gar/garTable.pl +++ src/hg/gar/garTable.pl @@ -258,32 +258,36 @@ if (defined($sciName{$id})) { die "ERROR: duplicate asmId sciName: $id '$sci' '$sciName{$id}'"; } $sciName{$id} = $sci; $comName{$id} = $com; ++$hubComNames; } close (FH); printf STDERR "# common names from GenArk hubs: %s\n", commify($hubComNames); my $genArkCatchup = 0; ### catch up for any missed genArk hubs # accession<tab>assembly<tab>scientific name<tab>common name<tab>taxonId foreach my $asmIdKey (sort keys %genArkAsm) { - next if (defined($sciName{$asmIdKey})); - next if (defined($comName{$asmIdKey})); + if (defined($sciName{$asmIdKey})) { + next; + } + if (defined($comName{$asmIdKey})) { + next; + } my @a = split('\t', $genArkAsm{$asmIdKey}); $a[2] =~ s/_/ /g; $sciName{$asmIdKey} = $a[2]; $comName{$asmIdKey} = $a[3]; ++$genArkCatchup; } printf STDERR "# catch up %d genArk hubs from hgdownload list\n", $genArkCatchup; my %metaInfo; # key is asmId, value is a tsv string of numbers for: # asmSize asmContigCount n50Size n50Count n50ContigSize # n50ContigCount n50ScaffoldSize n50ScaffoldCount my %newMetaInfo; # accumulate new metaInfo to add to end of this file if ( -s "../accumulateMetaInfo.tsv" ) { @@ -343,33 +347,35 @@ # newer GCA assembly # GCF assembly # newer GCF assembly my $ncbiTotalAssemblies = 0; my $ncbiUniqueSpecies = 0; my $highestSpeciesCount = 0; my $highestCountName = ""; my $countingGoodToGo = 0; my $goodToGoReplaced = 0; my $warningNoDate = 0; my %skipPartialGenome; # key is asmId, value is sciName for 'partial' status +my $genomeReports = "/hive/data/outside/ncbi/genomes/reports"; + # obtain scientific name and asmId from assembly_summary files printf STDERR "### reading genbank and refseq assembly_summary\n"; -open (FH, "grep -v '^#' /hive/data/outside/ncbi/genomes/reports/assembly_summary_genbank.txt /hive/data/outside/ncbi/genomes/reports/assembly_summary_refseq.txt|cut -f8,14,20|") or die "can not read the assembly_summary files"; +open (FH, "grep -v '^#' $genomeReports/assembly_summary_genbank.txt $genomeReports/assembly_summary_refseq.txt $genomeReports/assembly_summary_genbank_historical.txt $genomeReports/assembly_summary_refseq_historical.txt|cut -f8,14,20|") or die "can not read the assembly_summary files"; while (my $line = <FH>) { chomp $line; # asmId will be derived from the ftpPath my ($sName, $fullPartial, $asmId) = split('\t', $line); # most interesting, skipping these partial assemblies ends up with more # in the final result ? $asmId =~ s#.*/##; next if ($asmId =~ m/^na$/); # something wrong with these two next if ($asmId =~ m/GCA_900609255.1|GCA_900609265.1/); if ($fullPartial =~ m/partial/i) { # skip partial assemblies $skipPartialGenome{$asmId} = $sName; next; } ++$sciNameCount{$sName}; @@ -538,52 +544,60 @@ my $iucnSpeciesUsed = 0; # a unique count of IUCN sciName my $sciNameDisplayLimit = 5; # do not display more than 5 instances my $checkedAsmIds = 0; my $acceptedAsmIds = 0; my $notInGoodToGo = 0; my %alreadyDone; # key is asmId, value = 1 indicates already done my %underSized; # key is clade, value is count of underSized my $NA = "/hive/data/outside/ncbi/genomes/reports/newAsm"; my %genArkClade; # key is asmId, value is clade my $shouldBeGenArk = 0; my $shouldBeUcsc = 0; +my %usedGenArk; # key is asmId, value is count seen in *.today files foreach my $clade (@clades) { my $cladeCount = 0; my $goodToGoCount = 0; my $refSeq = "${NA}/all.refseq.${clade}.today"; my $genBank = "${NA}/all.genbank.${clade}.today"; - open (FH, "sort ${refSeq} ${genBank}|") or die "can not read $refSeq or $genBank"; + my $asmHubTsv = "~/kent/src/hg/makeDb/doc/${clade}AsmHub/${clade}.orderList.tsv"; + open (FH, "cut -f1 ${refSeq} ${genBank} ${asmHubTsv}|sort -u|") or die "can not read $refSeq or $genBank or $asmHubTsv"; while (my $asmId = <FH>) { chomp $asmId; $asmId =~ s/\.]/_/g; $asmId =~ s/\:/_/g; $asmId =~ s/\%/_/g; $asmId =~ s/\+/_/g; $asmId =~ s/\//_/g; $asmId =~ s/\#/_/g; # $asmId =~ s/[.:%+/#]/_/g; $asmId =~ s/[()]//g; $asmId =~ s/__/_/g; ++$shouldBeGenArk if (defined($genArkAsm{$asmId})); ++$shouldBeUcsc if (defined($rrGcaGcfList{$asmId})); - next if (defined ($skipPartialGenome{$asmId})); - next if (defined ($asmSuppressed{$asmId})); + if (defined ($skipPartialGenome{$asmId})) { + printf STDERR "# skipping genArk %s because skipPartialGenome\n", $asmId if (defined($genArkAsm{$asmId})); + next; + } + if (defined ($asmSuppressed{$asmId})) { + printf STDERR "# skipping genArk %s because asmSuppressed\n", $asmId if (defined($genArkAsm{$asmId})); + next; + } next if (defined ($alreadyDone{$asmId})); # something wrong with these two # GCA_900609255.1_Draft_mitochondrial_genome_of_wild_rice_W1683 # GCA_900609265.1_Draft_mitochondrial_genome_of_wild_rice_W1679 next if ($asmId =~ m/GCA_900609255.1|GCA_900609265.1/); # verify this asmId will pass the asmSize limit if (defined($metaInfo{$asmId})) { my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$asmId}); # if asmSize is below the minimum, don't use it if ($asmSize < $minimalGenomeSize{$clade}) { printf STDERR "# %s underSized 0 %d %s %s < %s\n", $clade, ++$underSized{$clade}, $asmId, commify($asmSize), commify($minimalGenomeSize{$clade}); printf STDERR "# ACK would be genArk assembly %s\n", $asmId if (defined($genArkAsm{$asmId})); printf STDERR "# ACK would be UCSC RR %s\n", $asmId if (defined($rrGcaGcfList{$asmId})); printf STDERR "# ACK metaInfo: %s '%s'\n", $asmId, $metaInfo{$asmId}; next; @@ -600,30 +614,31 @@ next if ($ncbiSpeciesRecorded{$sciNames{$asmId}} > $sciNameDisplayLimit); $iucnSciName = $sciNames{$asmId}; $iucnSciName = $ncbiToIucnNames{$sciNames{$asmId}} if (defined($ncbiToIucnNames{$sciNames{$asmId}})); ++$iucnSpeciesRecorded{$iucnSciName}; } else { printf STDERR "# no sciName for: '%s' at %s\n", $asmId, commify($goodToGoCount + 1); } $genArkClade{$asmId} = $clade; ++$cladeCount; if (! defined($cladeToGo{$clade})) { my @a; $cladeToGo{$clade} = \@a; } my $cPtr = $cladeToGo{$clade}; push (@$cPtr, $asmId); + $usedGenArk{$asmId} += 1 if (defined($genArkAsm{$asmId})); ++$acceptedAsmIds; ++$goodToGoCount; ++$ncbiSpeciesUsed if ( defined($sciNames{$asmId}) && defined($ncbiSpeciesRecorded{$sciNames{$asmId}}) && (1 == $ncbiSpeciesRecorded{$sciNames{$asmId}}) ); ++$iucnSpeciesUsed if ( defined($iucnSpeciesRecorded{$iucnSciName}) && (1 == $iucnSpeciesRecorded{$iucnSciName})); ++$cladeCounts{$clade}; my $assembliesAvailable = 0; if (defined($sciNames{$asmId})) { if (defined($sciNameCount{$sciNames{$asmId}})) { $assembliesAvailable = $sciNameCount{$sciNames{$asmId}}; $maxDupAsm = $assembliesAvailable if ($assembliesAvailable > $maxDupAsm); } if ($assembliesAvailable > 1) { my $bPtr = $sciNameAsmList{$sciNames{$asmId}}; foreach my $aId (@$bPtr) { next if (defined ($alreadyDone{$aId})); @@ -633,49 +648,58 @@ my ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$aId}); # if asmSize is below the minimum, don't use it if ($asmSize < $minimalGenomeSize{$clade}) { printf STDERR "# %s underSized 1 %d %s %s < %s\n", $clade, ++$underSized{$clade}, $aId, commify($asmSize), commify($minimalGenomeSize{$clade}); printf STDERR "# ACK would be genArk assembly %s\n", $aId if (defined($genArkAsm{$aId})); printf STDERR "# ACK would be UCSC RR %s\n", $aId if (defined($rrGcaGcfList{$aId})); printf STDERR "# ACK metaInfo: %s '%s'\n", $aId, $metaInfo{$aId}; next; } } ++$ncbiSpeciesRecorded{$sciNames{$aId}}; # the defined($sciName{$aId}) indicates it is a GenArk genome # always accept those even if it goes beyond the limit if ( ($ncbiSpeciesRecorded{$sciNames{$aId}} <= $sciNameDisplayLimit) || defined($sciName{$aId}) ) { push (@$cPtr, $aId); + $usedGenArk{$aId} += 1 if (defined($genArkAsm{$aId})); ++$acceptedAsmIds; ++$goodToGoCount; ++$cladeCounts{$clade}; } # under limit count or is GenArk assembly } # if ($aId ne $asmId) } # foreach my $aId (@$bPtr) } # if ($assembliesAvailable > 1) } # if (defined($sciNames{$asmId})) } # while (my $asmId = <FH>) close (FH); printf STDERR "# %s\t%s\tgood to go %s\n", $clade, commify($cladeCount), $goodToGoCount; $totalGoodToGo += $goodToGoCount; } # foreach my $clade (@clades) printf STDERR "# checkedAsmIds: %d, acceptedAsmIds: %d, notInGoodToGo: %d\n", $checkedAsmIds, $acceptedAsmIds, $notInGoodToGo; printf STDERR "# total good to go %s, maxDupAsmCount: %s\n", commify($totalGoodToGo), commify($maxDupAsm);; printf STDERR "# should be GenArk: %s\n", commify($shouldBeGenArk); +printf STDERR "# had originally %s assemblies from GenArk UCSC_GI.assemblyHubList.txt\n", commify($genArkCount); +foreach my $genArkAsmId (sort keys %genArkAsm) { + if (defined($usedGenArk{$genArkAsmId})) { + printf STDERR "# used genArk %s %s\n", $genArkAsmId, $genArkAsm{$genArkAsmId}; + } else { + printf STDERR "# missed genArk %s %s\n", $genArkAsmId, $genArkAsm{$genArkAsmId}; + } +} printf STDERR "# should be UCSC RR %s\n", commify($shouldBeUcsc); printf "<hr>\n"; if ( 1 == 0 ) { printf "<button id='summaryCountsHideShow' onclick='gar.hideTable(\"summaryCounts\")'>[hide]</button>\n"; printf "<table class='sortable borderOne' id='summaryCounts'>\n"; printf "<caption style='text-align:center;'><b>===== summary counts =====</b></caption>\n"; printf "<thead>\n"; printf "<tr>\n"; printf "<th>number of assemblies</th>\n"; printf "<th>category of count</th>\n"; printf "<th>table data in tsv<br>(tab separated value)<br>file format</th>\n"; printf "<th>assembly minimal size<br>to filter out projects that<br>are not whole genomes</th>\n"; printf "</tr>\n"; printf "</thead><tbody>\n"; @@ -904,31 +928,31 @@ # next; # } # if ("n/a" eq $gcxCountry{$asmId}) { # printf STDERR "# country is n/a $asmId\n"; # next; # } my ($p0, $p1, $p2) = split('_', $asmId, 3); my $accessionId = "${p0}_${p1}"; my $gcX = substr($asmId, 0, 3); my $d0 = substr($asmId, 4, 3); my $d1 = substr($asmId, 7, 3); my $d2 = substr($asmId, 10, 3); my $buildDir = "/hive/data/outside/ncbi/genomes/$gcX/$d0/$d1/$d2/$asmId"; my $asmRpt = "$buildDir/${asmId}_assembly_report.txt"; my $asmFna = "$buildDir/${asmId}_genomic.fna.gz"; - my $browserUrl = sprintf("https://genome.ucsc.edu/h/%s", $accessionId); + my $browserUrl = sprintf("/h/%s", $accessionId); my $arkDownload = sprintf("https://hgdownload.soe.ucsc.edu/hubs/%s/%s/%s/%s/%s/", $gcX, $d0, $d1, $d2, $accessionId); my $destDir = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2"; my $chromInfo = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.chromInfo.txt"; my $n50Txt = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.n50.txt"; my $contigN50 = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.contigs.n50.txt"; my $scaffoldN50 = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.scaffolds.n50.txt"; my $asmSize = 0; my $asmContigCount = 0; my $n50Size = 0; my $n50Count = 0; my $n50ContigSize = 0; my $n50ContigCount = 0; my $n50ScaffoldSize = 0; my $n50ScaffoldCount = 0; my $bioSample = ""; @@ -1002,31 +1026,31 @@ if ($iucnStatus ne " ") { $statusColor = $statusColors{$iucnStatus}; ++$statusCounts{$iucnStatus}; } ############# starting a table row ################################# if ($asmId =~ m/^GCF/) { $gcfGcaCounts{'GCF'} += 1; } elsif ($asmId =~ m/GCA/) { $gcfGcaCounts{'GCA'} += 1; } ### If equivalent to UCSC database browser, make reference to RR browser my $ucscDb = ""; $ucscDb = "/" . $rrGcaGcfList{$asmId} if (defined($rrGcaGcfList{$asmId})); if (length($ucscDb)) { - $browserUrl = sprintf("https://genome.ucsc.edu/cgi-bin/hgTracks?db=%s", $rrGcaGcfList{$asmId}); + $browserUrl = sprintf("/cgi-bin/hgTracks?db=%s", $rrGcaGcfList{$asmId}); } printf PC "%d", $asmCountInTable; # start a line output to clade.tableData.tsv ## count number of different scientific names used in this clade table if (!defined($cladeSciNameCounts{$clade})) { my %h; $cladeSciNameCounts{$clade} = \%h; } my $csnPtr = $cladeSciNameCounts{$clade}; $csnPtr->{$sciNames{$asmId}} += 1 if (defined($sciNames{$asmId})); my $rowClass = ""; my $gcaGcfClass = "gca"; $gcaGcfClass = "gcf" if ($asmId =~ m/^GCF/); $gcaGcfClass .= " hasIucn" if (length($iucnLink));