afee0fcf2a7b7de2eb37e1420dc126607373453e hiram Wed May 27 17:31:45 2020 -0700 rework scripts to allow subset hubs to be created from a list of assemblies refs #23734 diff --git src/hg/makeDb/doc/asmHubs/mkAsmStats.pl src/hg/makeDb/doc/asmHubs/mkAsmStats.pl index 85afdfa..881e100 100755 --- src/hg/makeDb/doc/asmHubs/mkAsmStats.pl +++ src/hg/makeDb/doc/asmHubs/mkAsmStats.pl @@ -1,39 +1,40 @@ #!/usr/bin/env perl use strict; use warnings; use File::stat; my $argc = scalar(@ARGV); -if ($argc != 2) { - printf STDERR "mkAsmStats Name asmName\n"; - printf STDERR "e.g.: mkAsmStats Mammals mammals\n"; +if ($argc != 3) { + printf STDERR "mkAsmStats Name asmHubName [two column name list]\n"; + printf STDERR "e.g.: mkAsmStats Mammals mammals mammals.asmId.commonName.tsv\n"; + printf STDERR "the name list is found in \$HOME/kent/src/hg/makeDb/doc/asmHubs/\n"; + printf STDERR "\nthe two columns are 1: asmId (accessionId_assemblyName)\n"; + printf STDERR "column 2: common name for species, columns separated by tab\n"; exit 255; } my $Name = shift; my $asmHubName = shift; +my $commonNameOrder = shift; my $home = $ENV{'HOME'}; my $toolsDir = "$home/kent/src/hg/makeDb/doc/asmHubs"; -my $commonNameList = "$asmHubName.asmId.commonName.tsv"; -my $commonNameOrder = "$asmHubName.commonName.asmId.orderList.tsv"; -my @orderList; # asmId of the assemblies in order from the *.list files -# the order to read the different .list files: -my %betterName; # key is asmId, value is better common name than found in - # assembly_report +my @orderList; # asmId of the assemblies in order from the commonNameOrder file +my %commonName; # key is asmId, value is a common name, perhaps more appropriate + # than found in assembly_report file my $vgpIndex = 0; $vgpIndex = 1 if ($Name =~ m/vgp/i); my $assemblyTotal = 0; # complete list of assemblies in this group my $asmCount = 0; # count of assemblies completed and in the table my $overallNucleotides = 0; my $overallSeqCount = 0; my $overallGapSize = 0; my $overallGapCount = 0; ############################################################################## # from Perl Cookbook Recipe 2.17, print out large numbers with comma delimiters: ############################################################################## sub commify($) { my $text = reverse $_[0]; @@ -240,31 +241,32 @@ my ($buildDir, $asmId) = @_; my $gapBed = "$buildDir/trackData/allGaps/$asmId.allGaps.bed.gz"; my $gapCount = 0; if ( -s "$gapBed" ) { $gapCount = `zcat $gapBed | awk '{print \$3-\$2}' | ave stdin | grep '^count' | awk '{print \$2}'`; } chomp $gapCount; return ($gapCount); } ############################################################################## ### tableContents() ############################################################################## sub tableContents() { - foreach my $asmId (reverse(@orderList)) { + foreach my $asmId (@orderList) { +printf STDERR "# asmId: '%s'\n", $asmId; my ($gcPrefix, $asmAcc, $asmName) = split('_', $asmId, 3); my $accessionId = sprintf("%s_%s", $gcPrefix, $asmAcc); my $accessionDir = substr($asmId, 0 ,3); $accessionDir .= "/" . substr($asmId, 4 ,3); $accessionDir .= "/" . substr($asmId, 7 ,3); $accessionDir .= "/" . substr($asmId, 10 ,3); my $buildDir = "/hive/data/genomes/asmHubs/refseqBuild/$accessionDir/$asmId"; if ($gcPrefix eq "GCA") { $buildDir = "/hive/data/genomes/asmHubs/genbankBuild/$accessionDir/$asmId"; } my $asmReport="$buildDir/download/${asmId}_assembly_report.txt"; if (! -s "$asmReport") { printf STDERR "# no assembly report:\n# %s\n", $asmReport; next; } @@ -315,31 +317,31 @@ $bioSample =~ s/.*:\s+//; } } elsif ($line =~ m/BioProject:/) { if ($bioProject =~ m/notFound/) { ++$itemsFound; $bioProject = $line; $bioProject =~ s/.*:\s+//; } } elsif ($line =~ m/Organism name:/) { if ($sciName =~ m/notFound/) { ++$itemsFound; $commonName = $line; $sciName = $line; $commonName =~ s/.*\(//; $commonName =~ s/\)//; - $commonName = $betterName{$asmId} if (exists($betterName{$asmId})); + $commonName = $commonName{$asmId} if (exists($commonName{$asmId})); $sciName =~ s/.*:\s+//; $sciName =~ s/\s+\(.*//; } } elsif ($line =~ m/Taxid:/) { if ($taxId =~ m/notFound/) { ++$itemsFound; $taxId = $line; $taxId =~ s/.*:\s+//; } } } close (FH); my $hubUrl = "https://hgdownload.soe.ucsc.edu/hubs/$accessionDir/$accessionId"; printf "<tr><td align=right>%d</td>\n", ++$asmCount; # printf "<td align=center><a href='https://genome.ucsc.edu/cgi-bin/hgGateway?hubUrl=%s/hub.txt&genome=%s&position=lastDbPos' target=_blank>%s</a></td>\n", $hubUrl, $accessionId, $commonName; @@ -351,27 +353,27 @@ printf " <td align=right>%s</td>\n", commify($gapCount); printf " <td align=right>%s</td>\n", commify($gapSize); printf " <td align=right>%.2f</td>\n", $maskPerCent; printf "</tr>\n"; } } # sub tableContents() ############################################################################## ### main() ############################################################################## open (FH, "<$toolsDir/${commonNameOrder}") or die "can not read ${commonNameOrder}"; while (my $line = <FH>) { next if ($line =~ m/^#/); chomp $line; - my ($commonName, $asmId) = split('\t', $line); + my ($asmId, $commonName) = split('\t', $line); push @orderList, $asmId; - $betterName{$asmId} = $commonName; + $commonName{$asmId} = $commonName; ++$assemblyTotal; } close (FH); startHtml(); startTable(); tableContents(); endTable(); endHtml();