e56876e545b21501152e96f655141eb3b229cf0c hiram Fri Feb 25 16:19:26 2022 -0800 add in bioSample and bioProject columns and control of IUCN display refs #28930 diff --git src/hg/gar/garTable.pl src/hg/gar/garTable.pl index 627318a..bf34033 100755 --- src/hg/gar/garTable.pl +++ src/hg/gar/garTable.pl @@ -502,30 +502,41 @@ die "ERROR: duplicate asmId data $id '$country' '$gcxCountry{$id}'"; } printf STDERR "# undefined collectDate for $id" if (!defined($collectDate)); printf STDERR "# undefined submitter for $id" if (!defined($submitter)); $country =~ s/"//g; $collectDate =~ s/"//g; $submitter =~ s/"//g; $gcxCountry{$id} = $country; $gcxDate{$id} = $collectDate; $gcxSubmitter{$id} = $submitter; ++$asmIdCount; } close (FH); printf STDERR "# asmId count: %s from asmId.country.date.by.tsv\n", commify($asmIdCount); +my %asmReportData; # key is asmId, value is tsv string for: +# sciName commonName bioSample bioProject taxId asmDate +# obtained from scanning all the assembly report files +open (FH, "<../asmReport.data.tsv") or die "can not read ../asmReport.data.tsv"; +while (my $line = ) { + chomp $line; + my ($id, $rest) = split('\t', $line, 2); + $asmReportData{$id} = $rest; +} +close (FH); + ### This cladeToGo set of lists is the main driver of the table ### An assembly needs to be in this set in order to get into the table my %cladeToGo; # key is clade name, value is array pointer for asmId list my $totalGoodToGo = 0; my %cladeCounts; # key is clade name, value is count of assemblies used my $maxDupAsm = 0; my %ncbiSpeciesRecorded; # key is NCBI sciName, value is count of those my $ncbiSpeciesUsed = 0; # a unique count of NCBI sciName my %iucnSpeciesRecorded; # key is IUCN sciName, value is count of those my $iucnSpeciesUsed = 0; # a unique count of IUCN sciName my $sciNameDisplayLimit = 5; # do not display more than 5 instances my $checkedAsmIds = 0; @@ -728,48 +739,51 @@ printf "
  • \n"; printf "
  • \n"; printf " \n"; printf " \n"; printf "\n"; printf "
    \n"; printf " select assembly type to display\n"; printf "
    \n"; printf "
      \n"; printf "
    • \n"; printf "
    • \n"; printf "
    • \n"; printf "
    • \n"; printf "
    • \n"; +printf "
    • \n"; printf "
    \n"; printf "
    \n"; printf "
    \n"; printf "
    \n"; printf " show/hide columns\n"; printf "
    \n"; printf "
      \n"; printf "
    • \n"; printf "
    • \n"; printf "
    • \n"; printf "
    • \n"; printf "
    • \n"; printf "
    • \n"; printf "
    • \n"; printf "
    • \n"; printf "
    • \n"; printf "
    • \n"; +printf "
    • \n"; +printf "
    • \n"; printf "
    • \n"; printf "
    • \n"; printf "
    \n"; printf "
    \n"; printf "
    \n\n"; printf " \n"; printf " \n\n"; printf "\n"; printf "\n"; printf "

    . . . please wait while page loads . . .

    \n\n"; printf "
    \n"; printf "
    \n\n"; @@ -783,48 +797,52 @@ ## is still not usable until much time later. ############################################################################## printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; +printf "\n"; +printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf "\n"; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf " \n"; printf " \n", $statusColors{"CR"}, $statusColors{"EN"}, $statusColors{"VU"}; printf " \n"; printf " \n"; +printf " \n"; +printf " \n"; printf " \n"; -printf " \n"; +printf " \n"; printf "\n"; printf "\n"; my %equivalentNamesUsed; # key is NCBI sciName, value is IUCN sciName my $pageSectionCount = 0; my %checkDupAsmId; # key is asmId, value is count of times seen my %cladeSciNameCounts; # key is clade, value is number of different # scientific names my %gcfGcaCounts; # key is GCF or GCA, value is count of each my $asmCountInTable = 0; # counting the rows output my %statusCounts; # key is status: CR EN VU, value is count my $totalAssemblySize = 0; # sum of all assembly sizes @@ -899,30 +917,35 @@ my $browserUrl = sprintf("https://genome.ucsc.edu/h/%s", $accessionId); my $arkDownload = sprintf("https://hgdownload.soe.ucsc.edu/hubs/%s/%s/%s/%s/%s/", $gcX, $d0, $d1, $d2, $accessionId); my $destDir = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2"; my $chromInfo = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.chromInfo.txt"; my $n50Txt = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.n50.txt"; my $contigN50 = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.contigs.n50.txt"; my $scaffoldN50 = "/hive/data/outside/ncbi/genomes/sizes/$gcX/$d0/$d1/$d2/${asmId}.scaffolds.n50.txt"; my $asmSize = 0; my $asmContigCount = 0; my $n50Size = 0; my $n50Count = 0; my $n50ContigSize = 0; my $n50ContigCount = 0; my 
$n50ScaffoldSize = 0; my $n50ScaffoldCount = 0; + my $bioSample = ""; + my $bioProject = ""; + if (defined($asmReportData{$asmId})) { + (undef, undef, $bioSample, $bioProject, undef) = split('\t', $asmReportData{$asmId}, 5); + } if (defined($metaInfo{$asmId})) { ($asmSize, $asmContigCount, $n50Size, $n50Count, $n50ContigSize, $n50ContigCount, $n50ScaffoldSize, $n50ScaffoldCount) = split('\t', $metaInfo{$asmId}); } else { my ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks); my $fnaModTime = 0; if (-s "$asmFna") { ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = stat($asmFna); $fnaModTime = $mtime; } my $ciModTime = 0; if ( -s "$chromInfo" ) { ($dev, $ino, $mode, $nlink, $uid, $gid, $rdev, $size, $atime, $mtime, $ctime, $blksize, $blocks) = stat($chromInfo); $ciModTime = $mtime; } if ($fnaModTime > $ciModTime) { @@ -992,30 +1015,31 @@ $browserUrl = sprintf("https://genome.ucsc.edu/cgi-bin/hgTracks?db=%s", $rrGcaGcfList{$asmId}); } printf PC "%d", $asmCountInTable; # start a line output to clade.tableData.tsv ## count number of different scientific names used in this clade table if (!defined($cladeSciNameCounts{$clade})) { my %h; $cladeSciNameCounts{$clade} = \%h; } my $csnPtr = $cladeSciNameCounts{$clade}; $csnPtr->{$sciNames{$asmId}} += 1 if (defined($sciNames{$asmId})); my $rowClass = ""; my $gcaGcfClass = "gca"; $gcaGcfClass = "gcf" if ($asmId =~ m/^GCF/); + $gcaGcfClass .= " hasIucn" if (length($iucnLink)); if (defined($comName{$asmId})) { if (defined($rrGcaGcfList{$asmId})) { $rowClass = " class='ucscDb $gcaGcfClass $clade'"; # present in UCSC db } else { $rowClass = " class='gak $gcaGcfClass $clade'"; # present in GenArk } } else { # can be requested if (defined($rrGcaGcfList{$asmId})) { $rowClass = " class='ucscDb $gcaGcfClass $clade'"; # present in UCSC db } else { $rowClass = " class='gar $gcaGcfClass $clade'"; # available for request } } ### can override CSS 
settings here ### $rowClass = " class='gar' style='display: none;'"; @@ -1123,30 +1147,49 @@ printf PC "\t%s", $asmTaxId{$asmId}; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# tenth column, assembly date ################ if (defined($asmDate{$asmId})) { printf "", $asmDate{$asmId}; printf PC "\t%s", $asmDate{$asmId}; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } + ############# eleventh column, bioSample ################ + if (length($bioSample) && $bioSample !~ m#n/a#) { + printf "", $bioSample, $bioSample; + printf PC "\t%s", $bioSample; # output to clade.tableData.txt + } else { + printf ""; + printf PC "\t%s", "n/a"; # output to clade.tableData.txt + } + + ############# twelveth column, bioProject ################ + if (length($bioProject) && $bioProject !~ m#n/a#) { + printf "", $bioProject, $bioProject; + printf PC "\t%s", $bioProject; # output to clade.tableData.txt + + } else { + printf ""; + printf PC "\t%s", "n/a"; # output to clade.tableData.txt + } + ############# eleventh column, submitter ################ $asmUrl = "https://www.ncbi.nlm.nih.gov/assembly/$accessionId"; if (defined($asmSubmitter{$asmId})) { my $submitterSortKey = lc($asmSubmitter{$asmId}); $submitterSortKey =~ s/ //g; $submitterSortKey =~ s/[^a-z0-9]//ig; printf "", substr($submitterSortKey,0,20), $asmSubmitter{$asmId}; printf PC "\t%s", $asmSubmitter{$asmId}; # output to clade.tableData.txt } else { printf ""; printf PC "\t%s", "n/a"; # output to clade.tableData.txt } ############# twelveth column, clade ################ printf "\n", $clade;
    common nameLinks to an existing assembly browser, Button opens an assembly request form.
    scientific name (count)Links to Google image search. Count shows the number of assemblies available for this organism.
    NCBI assemblyLinks to NCBI resource record.
    assembly
    sizeNumber of nucleotides in the assembly.
    sequence
    countThe number of sequences in this assembly.
    scaffold N50
    length (L50)N50 (L50) length.
    contig N50
    length (L50)N50 (L50) length.
    IUCNLinks to IUCN Red List of Threatened Species (version 2021-3) CR - Critical / EN - Endangered / VU - Vulnerable
    NCBI taxIDLinks to NCBI Taxonomy database.
    assembly
    dateDate submitted to NCBI assembly database.
    BioSampleBioSample ID at NCBI.
    BioProjectBioProject ID at NCBI.
    Assembly submitterPerson or group who submitted to NCBI Assembly database.
    cladeClade of this organism. Note: the invertebrate clade is a catch all category that includes organisms not typically classified as invertebrate
    cladeClade of this organism.
    n/a%sn/a%s %s %sn/a%s