1aa2bfd8c15486f23f6efeec5f4358f366cfad53 hiram Wed Dec 29 12:45:41 2021 -0800 updated scripts to tolerate database assemblies in the listings no redmine diff --git src/hg/makeDb/doc/asmHubs/mkAsmStats.pl src/hg/makeDb/doc/asmHubs/mkAsmStats.pl index 9f4ddf6..998d16c 100755 --- src/hg/makeDb/doc/asmHubs/mkAsmStats.pl +++ src/hg/makeDb/doc/asmHubs/mkAsmStats.pl @@ -189,98 +189,127 @@ ### end the HTML output ############################################################################## sub endHtml() { &commonHtml::otherHubLinks($vgpIndex, $asmHubName); &commonHtml::htmlFooter($vgpIndex, $asmHubName); } sub asmCounts($) { my ($chromSizes) = @_; my ($sequenceCount, $totalSize) = split('\s+', `ave -col=2 $chromSizes | egrep "^count|^total" | awk '{printf "%d\\n", \$NF}' | xargs echo`); return ($sequenceCount, $totalSize); } -# my ($gapSize) = maskStats($faSizeTxt); sub maskStats($) { my ($faSizeFile) = @_; my $gapSize = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$3}'`; chomp $gapSize; $gapSize =~ s/\(//; my $totalBases = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$1}'`; chomp $totalBases; my $maskedBases = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$9}'`; chomp $maskedBases; my $maskPerCent = 100.0 * $maskedBases / $totalBases; return ($gapSize, $maskPerCent); } # grep "sequences in 1 file" GCA_900324465.2_fAnaTes1.2.faSize.txt # 555641398 bases (3606496 N's 552034902 real 433510637 upper 118524265 lower) in 50 sequences in 1 files sub gapStats($$) { my ($buildDir, $asmId) = @_; my $gapBed = "$buildDir/trackData/allGaps/$asmId.allGaps.bed.gz"; my $gapCount = 0; + if ($asmId !~ m/^GC/) { + $gapBed = "/hive/data/genomes/$asmId/$asmId.N.bed"; if ( -s "$gapBed" ) { + $gapCount = `awk '{print \$3-\$2}' $gapBed | ave stdin | grep '^count' | awk '{print \$2}'`; + } + } elsif ( -s "$gapBed" ) { $gapCount = `zcat $gapBed | awk '{print \$3-\$2}' | ave stdin | grep '^count' | awk '{print \$2}'`; } chomp $gapCount; return ($gapCount); } ############################################################################## ### tableContents() ############################################################################## sub tableContents() { foreach my $asmId (@orderList) { -printf STDERR "# asmId: '%s'\n", $asmId; - my ($gcPrefix, $asmAcc, $asmName) = split('_', $asmId, 3); - my $accessionId = sprintf("%s_%s", $gcPrefix, $asmAcc); - my $accessionDir = substr($asmId, 0 ,3); + my $gcPrefix = "GCx"; + my $asmAcc = "asmAcc"; + my $asmName = "asmName"; + my $accessionId = "GCx_098765432.1"; + my $accessionDir = ""; + my $configRa = "n/a"; + if ($asmId !~ m/^GC/) { + $configRa = "/hive/data/genomes/$asmId/$asmId.config.ra"; + $accessionId = `grep ^genBankAccessionID "${configRa}" | cut -d' ' -f2`; + chomp $accessionId; + $asmName = `grep ^ncbiAssemblyName "${configRa}" | cut -d' ' -f2`; + chomp $asmName; + $accessionDir = substr($accessionId, 0 ,3); + $accessionDir .= "/" . substr($accessionId, 4 ,3); + $accessionDir .= "/" . substr($accessionId, 7 ,3); + $accessionDir .= "/" . substr($accessionId, 10 ,3); + ($gcPrefix, $asmAcc) = split('_', $accessionId, 2); + printf STDERR "# %03d\t%s_%s (%s)\n", 1 + $asmCount, $accessionId, $asmName, $asmId; + } else { + ($gcPrefix, $asmAcc, $asmName) = split('_', $asmId, 3); + $accessionId = sprintf("%s_%s", $gcPrefix, $asmAcc); + $accessionDir = substr($asmId, 0 ,3); $accessionDir .= "/" . substr($asmId, 4 ,3); $accessionDir .= "/" . substr($asmId, 7 ,3); $accessionDir .= "/" . substr($asmId, 10 ,3); + printf STDERR "# %03d\t%s\n", 1 + $asmCount, $asmId; + } + # assume GCF/refseq build my $buildDir = "/hive/data/genomes/asmHubs/refseqBuild/$accessionDir/$asmId"; - if ($gcPrefix eq "GCA") { + if ($gcPrefix eq "GCA") { # this is a GCA/genbank build $buildDir = "/hive/data/genomes/asmHubs/genbankBuild/$accessionDir/$asmId"; } my $asmReport="$buildDir/download/${asmId}_assembly_report.txt"; + my $chromSizes = "${buildDir}/${asmId}.chrom.sizes"; + my $twoBit = "${buildDir}/trackData/addMask/${asmId}.masked.2bit"; + my $faSizeTxt = "${buildDir}/${asmId}.faSize.txt"; + if ($asmId !~ m/^GC/) { # this is a UCSC genome database browser + $buildDir="/hive/data/outside/ncbi/genomes/$accessionDir/${accessionId}_${asmName}"; + $asmReport="$buildDir/${accessionId}_${asmName}_assembly_report.txt"; + $chromSizes = "/hive/data/genomes/$asmId/chrom.sizes"; + $twoBit = "/hive/data/genomes/$asmId/$asmId.2bit"; + $faSizeTxt = "/hive/data/genomes/$asmId/faSize.${asmId}.2bit.txt"; + } if (! -s "$asmReport") { printf STDERR "# no assembly report:\n# %s\n", $asmReport; next; } - my $chromSizes = "${buildDir}/${asmId}.chrom.sizes"; - my $twoBit = "${buildDir}/trackData/addMask/${asmId}.masked.2bit"; if (! -s "$twoBit") { printf STDERR "# no 2bit file:\n# %s\n", $twoBit; next; } - my $trackDb="$buildDir/${asmId}.trackDb.txt"; - next if (! -s "$trackDb"); # assembly build not complete - my $faSizeTxt = "${buildDir}/${asmId}.faSize.txt"; if ( ! -s "$faSizeTxt" ) { printf STDERR "twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt\n"; print `twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt`; } my ($gapSize, $maskPerCent) = maskStats($faSizeTxt); $overallGapSize += $gapSize; my ($seqCount, $totalSize) = asmCounts($chromSizes); $overallSeqCount += $seqCount; -# my $totalSize=`ave -col=2 $chromSizes | grep "^total" | awk '{printf "%d", \$NF}'`; $overallNucleotides += $totalSize; my $gapCount = gapStats($buildDir, $asmId); $overallGapCount += $gapCount; my $sciName = "notFound"; my $commonName = "notFound"; my $bioSample = "notFound"; my $bioProject = "notFound"; my $taxId = "notFound"; my $asmDate = "notFound"; my $itemsFound = 0; open (FH, "<$asmReport") or die "can not read $asmReport"; while (my $line = <FH>) { last if ($itemsFound > 5); chomp $line; $line =~ s/ //g;; @@ -312,35 +341,46 @@ $commonName =~ s/\)//; $commonName = $commonName{$asmId} if (exists($commonName{$asmId})); $sciName =~ s/.*:\s+//; $sciName =~ s/\s+\(.*//; } } elsif ($line =~ m/Taxid:/) { if ($taxId =~ m/notFound/) { ++$itemsFound; $taxId = $line; $taxId =~ s/.*:\s+//; } } } close (FH); my $hubUrl = "https://hgdownload.soe.ucsc.edu/hubs/$accessionDir/$accessionId"; + my $browserName = $commonName; + my $browserUrl = "https://genome.ucsc.edu/h/$accessionId"; + if ($asmId !~ m/^GC/) { + $hubUrl = "https://hgdownload.soe.ucsc.edu/goldenPath/$asmId/bigZips"; + $browserUrl = "https://genome.ucsc.edu/cgi-bin/hgTracks?db=$asmId"; + $browserName = "$commonName ($asmId)"; + } printf "<tr><td align=right>%d</td>\n", ++$asmCount; # printf "<td align=center><a href='https://genome.ucsc.edu/cgi-bin/hgGateway?hubUrl=%s/hub.txt&genome=%s&position=lastDbPos' target=_blank>%s</a></td>\n", $hubUrl, $accessionId, $commonName; - printf "<td align=center><a href='https://genome.ucsc.edu/h/%s' target=_blank>%s</a></td>\n", $accessionId, $commonName; + printf "<td align=center><a href='%s' target=_blank>%s</a></td>\n", $browserUrl, $browserName; printf " <td align=center><a href='%s/' target=_blank>%s</a></td>\n", $hubUrl, $sciName; + if ($asmId !~ m/^GC/) { + printf " <td align=left><a href='https://www.ncbi.nlm.nih.gov/assembly/%s_%s/' target=_blank>%s_%s</a></td>\n", $gcPrefix, $asmAcc, $accessionId, $asmName; + } else { printf " <td align=left><a href='https://www.ncbi.nlm.nih.gov/assembly/%s/' target=_blank>%s</a></td>\n", $accessionId, $asmId; + } printf " <td align=right>%s</td>\n", commify($seqCount); printf " <td align=right>%s</td>\n", commify($totalSize); printf " <td align=right>%s</td>\n", commify($gapCount); printf " <td align=right>%s</td>\n", commify($gapSize); printf " <td align=right>%.2f</td>\n", $maskPerCent; printf "</tr>\n"; } } # sub tableContents() ############################################################################## ### main() ############################################################################## open (FH, "<${orderList}") or die "can not read ${orderList}"; while (my $line = <FH>) {