452b689cab327a20bb7ac506eaf89a3f69d23feb hiram Tue Jan 17 08:23:41 2023 -0800 invertebrates has become invertebrate and master.run.list has changed columns no redmine diff --git src/hg/gar/garTable.pl src/hg/gar/garTable.pl index 247d28b..a803556 100755 --- src/hg/gar/garTable.pl +++ src/hg/gar/garTable.pl @@ -80,72 +80,71 @@ ############################################################################### # output a table cell for an N50 measurement sub n50Cell($$$) { my ($size, $count, $fh) = @_; if ($size > 0) { printf "<td style='display:none; text-align:right;' sorttable_customkey='%d'>%s (%s)</td>", $size, gmk($size), commify($count); printf $fh "\t%d (%d)", $size, $count; # output to clade.tableData.txt } else { printf "<td style='display:none;'> </td>"; printf $fh "\tn/a (n/a)"; # output to clade.tableData.txt } } ############################################################################### -my @clades = qw( primates mammals birds fish vertebrate invertebrates plants fungi ); +my @clades = qw( primates mammals birds fish vertebrate invertebrate plants fungi ); # to help weed out some of the noise # key is clade, value is minimal size to count as a whole genome # these are actually pretty low to allow in some alternate haplotype # assemblies that don't seem to be the whole assembly. # The assemblies are also filtered by NCBI status 'full/partial' to only # allow in the 'full' genomes meaning representation of the whole genome my %minimalGenomeSize = ( primates => 1000000000, mammals => 20000000, birds => 200000000, fish => 100000, vertebrate => 4000000, - invertebrates => 10000, + invertebrate => 10000, plants => 100000, fungi => 50000, ); ######################################################################### ## read in list of current GenArk assemblies my %genArkAsm; # key is asmId, value is string with: my %genArkClade; # key is asmId, value is clade my %genArkCladeProfile; # key is clade, value is count my %genArkSciName; # key is asmId, value is sciName my %genArkComName; # key is asmId, value is sciName # accession<tab>assembly<tab>scientific name<tab>common name<tab>taxonId<tab>clade my $genArkCount = 0; printf STDERR "# reading UCSC_GI.assemblyHubList.txt\n"; open (FH, "<UCSC_GI.assemblyHubList.txt") or die "can not read UCSC_GI.assemblyHubList.txt"; while (my $line = <FH>) { next if ($line =~ m/^#/); chomp $line; my ($accession, $assembly, $sciName, $commonName, $taxonId, $gClade) = split('\t', $line); my $asmId = sprintf("%s_%s", $accession, $assembly); die "ERROR: duplicate accession from UCSC_GI.assemblyHubList.txt" if (defined($genArkAsm{$asmId})); $genArkAsm{$asmId} = sprintf("%s\t%s\t%s\t%s\t%s", $accession, $assembly, $sciName, $commonName, $taxonId); $gClade =~ s/\(L\)//; # remove the 'legacy' tag - $gClade = "invertebrates" if ($gClade =~ m/invertebrate/); $genArkClade{$asmId} = $gClade; ++$genArkCladeProfile{$gClade}; ++$genArkCount; } close (FH); # accession assembly scientific name common name taxonId # GCA_000001905.1 Loxafr3.0 Loxodonta africana African savanna elephant (2009 genbank) 9785 printf STDERR "# have %s assemblies from GenArk UCSC_GI.assemblyHubList.txt\n", commify($genArkCount); printf STDERR "# genArk clade profile:\n"; foreach my $gClade (sort keys %genArkCladeProfile) { printf STDERR "# %6d\t%s\n", $genArkCladeProfile{$gClade}, $gClade; } @@ -575,31 +574,31 @@ my $examinedAsmId = 0; my $shouldBeGenArk = 0; my $shouldBeUcsc = 0; my %usedGenArk; # key is asmId, value is count seen in *.today files my $genArkTooManySpecies = 0; my $countingUsedGenArk = 0; my $prevUsedGenArk = 0; foreach my $clade (@clades) { my $cladeCount = 0; my $goodToGoCount = 0; my $refSeq = "${NA}/all.refseq.${clade}.today"; my $genBank = "${NA}/all.genbank.${clade}.today"; my $asmHubTsv = "~/kent/src/hg/makeDb/doc/${clade}AsmHub/${clade}.orderList.tsv"; - if ($clade eq "invertebrates") { + if ($clade eq "invertebrate") { $asmHubTsv = "~/kent/src/hg/makeDb/doc/invertebrateAsmHub/invertebrate.orderList.tsv"; } open (FH, "cut -f1 ${refSeq} ${genBank} ${asmHubTsv}|sort -u|") or die "can not read $refSeq or $genBank or $asmHubTsv"; while (my $asmId = <FH>) { chomp $asmId; ++$examinedAsmId; $asmId =~ s/\.]/_/g; $asmId =~ s/\:/_/g; $asmId =~ s/\%/_/g; $asmId =~ s/\+/_/g; $asmId =~ s/\//_/g; $asmId =~ s/\#/_/g; # $asmId =~ s/[.:%+/#]/_/g; $asmId =~ s/[()]//g; $asmId =~ s/__/_/g; @@ -1091,31 +1090,31 @@ ++$outputGenArkRR if (defined($rrGcaGcfList{$asmId}) || defined($genArkAsm{$asmId})); $outputGenArk{$asmId} = $clade if (defined($genArkAsm{$asmId})); $outputRR{$asmId} = $clade if (defined($rrGcaGcfList{$asmId})); if ($asmId =~ m/^GCF/) { $gcfGcaCounts{'GCF'} += 1; } elsif ($asmId =~ m/GCA/) { $gcfGcaCounts{'GCA'} += 1; } ### If equivalent to UCSC database browser, make reference to RR browser my $ucscDb = ""; $ucscDb = "/" . $rrGcaGcfList{$asmId} if (defined($rrGcaGcfList{$asmId})); if (length($ucscDb)) { $browserUrl = sprintf("/cgi-bin/hgTracks?db=%s", $rrGcaGcfList{$asmId}); } - printf PC "%d", $asmCountInTable; # start a line output to clade.tableData.tsv + printf PC "%s", $browserUrl; # start a line output to clade.tableData.tsv ## count number of different scientific names used in this clade table if (!defined($cladeSciNameCounts{$clade})) { my %h; $cladeSciNameCounts{$clade} = \%h; } my $csnPtr = $cladeSciNameCounts{$clade}; $csnPtr->{$sciNames{$asmId}} += 1 if (defined($sciNames{$asmId})); my $rowClass = ""; my $gcaGcfClass = "gca"; $gcaGcfClass = "gcf" if ($asmId =~ m/^GCF/); $gcaGcfClass .= " hasIucn" if (length($iucnLink)); if (defined($comName{$asmId})) { if (defined($rrGcaGcfList{$asmId})) { $rowClass = " class='ucscDb $gcaGcfClass $clade'"; # present in UCSC db