1aa2bfd8c15486f23f6efeec5f4358f366cfad53 hiram Wed Dec 29 12:45:41 2021 -0800 updated scripts to tolerate database assemblies in the listings no redmine diff --git src/hg/makeDb/doc/asmHubs/trackData.pl src/hg/makeDb/doc/asmHubs/trackData.pl index c717a12..3ac3983 100755 --- src/hg/makeDb/doc/asmHubs/trackData.pl +++ src/hg/makeDb/doc/asmHubs/trackData.pl @@ -53,77 +53,89 @@ my $asmCount = 0; # count of assemblies completed and in the table my $overallNucleotides = 0; my $overallSeqCount = 0; my $overallGapSize = 0; my $overallGapCount = 0; ############################################################################## # from Perl Cookbook Recipe 2.17, print out large numbers with comma delimiters: ############################################################################## sub commify($) { my $text = reverse $_[0]; $text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g; return scalar reverse $text } +# ($itemCount, $percentCover) = bigWigMeasure($trackFile, $genomeSize); +sub bigWigMeasure($$) { + my ($file, $genomeSize) = @_; + my $bigWigInfo = `bigWigInfo "$file" | egrep "basesCovered:|mean:" | awk '{print \$NF}' | xargs echo | sed -e 's/,//g;'`; + chomp $bigWigInfo; + my ($bases, $mean) = split('\s+', $bigWigInfo); + my $itemCount = sprintf ("%.2f", $mean); + my $percentCover = sprintf("%.2f %%", 100.0 * $bases / $genomeSize); + return ($itemCount, $percentCover); +} + +# $percentCover = pcFbFile($trackFb); +sub pcFbFile($) { + my ($trackFb) = @_; + my ($itemBases, undef, undef, $noGapSize, undef) = split('\s+', `cat $trackFb`, 5); + my $percentCover = sprintf("%.2f %%", 100.0 * $itemBases / $noGapSize); + return $percentCover; +} + # ($itemCount, $percentCover) = oneTrackData($asmId, $track, $trackFile, $totalSize, $trackFb, $runDir); # might have a track feature bits file (trackFb), maybe not sub oneTrackData($$$$$$) { my ($asmId, $trackName, $file, $genomeSize, $trackFb, $runDir) = @_; # printf STDERR "# %s\n", $file; my $itemCount = 0; my $percentCover = 0; if (! -s "${file}") { if ($trackName eq "gapOverlap") { if (-s "${runDir}/$asmId.gapOverlap.bed.gz" ) { my $lineCount=`zcat "${runDir}/$asmId.gapOverlap.bed.gz" | head | wc -l`; chomp $lineCount; if (0 == $lineCount) { return("0", "0 %"); } else { return("n/a", "n/a"); } } } elsif ($trackName eq "gap") { return("0", "0 %"); } else { return("n/a", "n/a"); } } else { if ($file =~ m/.bw$/) { - my $bigWigInfo = `bigWigInfo "$file" | egrep "basesCovered:|mean:" | awk '{print \$NF}' | xargs echo | sed -e 's/,//g;'`; - chomp $bigWigInfo; - my ($bases, $mean) = split('\s+', $bigWigInfo); - $percentCover = sprintf("%.2f %%", 100.0 * $bases / $genomeSize); - $itemCount = sprintf ("%.2f", $mean); -# printf STDERR "# bigWigInfo %s %s %s\n", $itemCount, $percentCover, $file; + ($itemCount, $percentCover) = bigWigMeasure($file, $genomeSize); } else { my $bigBedInfo = `bigBedInfo "$file" | egrep "itemCount:|basesCovered:" | awk '{print \$NF}' | xargs echo | sed -e 's/,//g;'`; chomp $bigBedInfo; my ($items, $bases) = split('\s', $bigBedInfo); $itemCount = commify($items); $percentCover = sprintf("%.2f %%", 100.0 * $bases / $genomeSize); -# 56992654 bases of 2616369673 (2.178%) in intersection if ( -s "${trackFb}" ) { - my ($itemBases, undef, undef, $noGapSize, undef) = split('\s+', `cat $trackFb`, 5); - $percentCover = sprintf("%.2f %%", 100.0 * $itemBases / $noGapSize); + $percentCover = pcFbFile($trackFb); } # printf STDERR "# bigBedInfo %s %s %s\n", $itemCount, $percentCover, $file; } } return ($itemCount, $percentCover); -} +} # sub oneTrackData($$$$$$) ############################################################################## ### start the HTML output ############################################################################## sub startHtml() { my $timeStamp = `date "+%F"`; chomp $timeStamp; my $subSetMessage = "subset of $asmHubName only"; if ($asmHubName eq "vertebrate") { $subSetMessage = "subset of other ${asmHubName}s only"; } if ($vgpIndex) { @@ -273,117 +285,149 @@ ### end the HTML output ############################################################################## sub endHtml() { &commonHtml::otherHubLinks($vgpIndex, $asmHubName); &commonHtml::htmlFooter($vgpIndex, $asmHubName); } # sub endHtml() sub asmCounts($) { my ($chromSizes) = @_; my ($sequenceCount, $totalSize) = split('\s+', `ave -col=2 $chromSizes | egrep "^count|^total" | awk '{printf "%d\\n", \$NF}' | xargs echo`); return ($sequenceCount, $totalSize); } -# my ($gapSize, $maskPerCent, $sizeNoGaps) = maskStats($faSizeTxt); sub maskStats($) { my ($faSizeFile) = @_; my $sizeNoGaps = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$4}'`; my $gapSize = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$3}'`; chomp $gapSize; $gapSize =~ s/\(//; my $totalBases = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$1}'`; chomp $totalBases; my $maskedBases = `grep 'sequences in 1 file' $faSizeFile | awk '{print \$9}'`; chomp $maskedBases; my $maskPerCent = 100.0 * $maskedBases / $totalBases; return ($gapSize, $maskPerCent, $sizeNoGaps); } # grep "sequences in 1 file" GCA_900324465.2_fAnaTes1.2.faSize.txt # 555641398 bases (3606496 N's 552034902 real 433510637 upper 118524265 lower) in 50 sequences in 1 files sub gapStats($$) { my ($buildDir, $asmId) = @_; my $gapBed = "$buildDir/trackData/allGaps/$asmId.allGaps.bed.gz"; my $gapCount = 0; + if ($asmId !~ m/^GC/) { + $gapBed = "/hive/data/genomes/$asmId/$asmId.N.bed"; if ( -s "$gapBed" ) { + $gapCount = `awk '{print \$3-\$2}' $gapBed | ave stdin | grep '^count' | awk '{print \$2}'`; + } + } elsif ( -s "$gapBed" ) { $gapCount = `zcat $gapBed | awk '{print \$3-\$2}' | ave stdin | grep '^count' | awk '{print \$2}'`; } chomp $gapCount; return ($gapCount); } ############################################################################## ### tableContents() ############################################################################## sub tableContents() { my $asmCounted = 0; if ($testOutput) { # add extra columns during 'test' output # 0 1 2 3 4 5 6 # 7 8 9 10 11 12 13 # 14 # my @trackList = qw(ncbiRefSeq xenoRefGene augustus ensGene gc5Base gap allGaps assembly rmsk simpleRepeat windowMasker gapOverlap tandemDups cpgIslandExtUnmasked cpgIslandExt); # 0 1 2 3 4 5 6 # 7 8 9 10 # my @trackList = qw(ncbiRefSeq xenoRefGene augustus ensGene gc5Base allGaps assembly rmsk simpleRepeat windowMasker cpgIslandExtUnmasked); splice @trackList, 11, 0, "cpgIslandExt"; splice @trackList, 10, 0, "tandemDups"; splice @trackList, 10, 0, "gapOverlap"; splice @trackList, 5, 0, "gap"; splice @trackList, 1, 0, "ncbiGene"; } foreach my $asmId (@orderList) { + my $gcPrefix = "GCx"; + my $asmAcc = "asmAcc"; + my $asmName = "asmName"; + my $accessionId = "GCx_098765432.1"; + my $accessionDir = ""; + my $configRa = "n/a"; my $tracksCounted = 0; - my ($gcPrefix, $asmAcc, $asmName) = split('_', $asmId, 3); - my $accessionId = sprintf("%s_%s", $gcPrefix, $asmAcc); - my $accessionDir = substr($asmId, 0 ,3); + my $buildDir = "/hive/data/genomes/asmHubs/refseqBuild/$accessionDir/$asmId"; + my $asmReport="$buildDir/download/${asmId}_assembly_report.txt"; + my $chromSizes = "${buildDir}/${asmId}.chrom.sizes"; + my $twoBit = "${buildDir}/trackData/addMask/${asmId}.masked.2bit"; + my $faSizeTxt = "${buildDir}/${asmId}.faSize.txt"; + if ($asmId !~ m/^GC/) { + $configRa = "/hive/data/genomes/$asmId/$asmId.config.ra"; + $accessionId = `grep ^genBankAccessionID "${configRa}" | cut -d' ' -f2`; + chomp $accessionId; + $asmName = `grep ^ncbiAssemblyName "${configRa}" | cut -d' ' -f2`; + chomp $asmName; + $accessionDir = substr($accessionId, 0 ,3); + $accessionDir .= "/" . substr($accessionId, 4 ,3); + $accessionDir .= "/" . substr($accessionId, 7 ,3); + $accessionDir .= "/" . substr($accessionId, 10 ,3); + ($gcPrefix, $asmAcc) = split('_', $accessionId, 2); + $buildDir="/hive/data/outside/ncbi/genomes/$accessionDir/${accessionId}_${asmName}"; + $asmReport="$buildDir/${accessionId}_${asmName}_assembly_report.txt"; + $chromSizes = "/hive/data/genomes/$asmId/chrom.sizes"; + $twoBit = "/hive/data/genomes/$asmId/$asmId.2bit"; + $faSizeTxt = "/hive/data/genomes/$asmId/faSize.${asmId}.2bit.txt"; + } else { + ($gcPrefix, $asmAcc, $asmName) = split('_', $asmId, 3); + $accessionId = sprintf("%s_%s", $gcPrefix, $asmAcc); + $accessionDir = substr($asmId, 0 ,3); $accessionDir .= "/" . substr($asmId, 4 ,3); $accessionDir .= "/" . substr($asmId, 7 ,3); $accessionDir .= "/" . substr($asmId, 10 ,3); - my $buildDir = "/hive/data/genomes/asmHubs/refseqBuild/$accessionDir/$asmId"; + $buildDir = "/hive/data/genomes/asmHubs/refseqBuild/$accessionDir/$asmId"; if ($gcPrefix eq "GCA") { $buildDir = "/hive/data/genomes/asmHubs/genbankBuild/$accessionDir/$asmId"; } - my $trackDb="$buildDir/${asmId}.trackDb.txt"; - next if (! -s "$trackDb"); # assembly build not complete - my $asmReport="$buildDir/download/${asmId}_assembly_report.txt"; + $asmReport="$buildDir/download/${asmId}_assembly_report.txt"; + $chromSizes = "${buildDir}/${asmId}.chrom.sizes"; + $twoBit = "${buildDir}/trackData/addMask/${asmId}.masked.2bit"; + $faSizeTxt = "${buildDir}/${asmId}.faSize.txt"; + } +# my $trackDb="$buildDir/${asmId}.trackDb.txt"; +# next if (! -s "$trackDb"); # assembly build not complete if (! -s "$asmReport") { printf STDERR "# no assembly report:\n# %s\n", $asmReport; next; } - my $chromSizes = "${buildDir}/${asmId}.chrom.sizes"; - my $twoBit = "${buildDir}/trackData/addMask/${asmId}.masked.2bit"; if (! -s "$twoBit") { printf STDERR "# no 2bit file:\n# %s\n", $twoBit; printf "<tr><td align=right>%d</td>\n", ++$asmCount; printf "<td align=center>%s</td>\n", $accessionId; printf "<th colspan=15 align=center>missing masked 2bit file</th>\n"; printf "</tr>\n"; next; } - my $faSizeTxt = "${buildDir}/${asmId}.faSize.txt"; if ( ! -s "$faSizeTxt" ) { printf STDERR "twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt\n"; print `twoBitToFa $twoBit stdout | faSize stdin > $faSizeTxt`; } my ($gapSize, $maskPerCent, $sizeNoGaps) = maskStats($faSizeTxt); $overallGapSize += $gapSize; my ($seqCount, $totalSize) = asmCounts($chromSizes); $overallSeqCount += $seqCount; -# my $totalSize=`ave -col=2 $chromSizes | grep "^total" | awk '{printf "%d", \$NF}'`; $overallNucleotides += $totalSize; my $gapCount = gapStats($buildDir, $asmId); $overallGapCount += $gapCount; my $sciName = "notFound"; my $commonName = "notFound"; my $asmDate = "notFound"; my $itemsFound = 0; open (FH, "<$asmReport") or die "can not read $asmReport"; while (my $line = <FH>) { last if ($itemsFound > 5); chomp $line; $line =~ s/ //g;; $line =~ s/\s+$//g;; if ($line =~ m/Date:/) { if ($asmDate =~ m/notFound/) { @@ -393,100 +437,172 @@ } } elsif ($line =~ m/Organism name:/) { if ($sciName =~ m/notFound/) { ++$itemsFound; $commonName = $line; $sciName = $line; $commonName =~ s/.*\(//; $commonName =~ s/\)//; $commonName = $commonName{$asmId} if (exists($commonName{$asmId})); $sciName =~ s/.*:\s+//; $sciName =~ s/\s+\(.*//; } } } close (FH); - my $hubUrl = "https://genome.ucsc.edu/h/$accessionId"; + my $hubUrl = "https://hgdownload.soe.ucsc.edu/hubs/$accessionDir/$accessionId"; + my $browserName = $commonName; + my $browserUrl = "https://genome.ucsc.edu/h/$accessionId"; + if ($asmId !~ m/^GC/) { + $hubUrl = "https://hgdownload.soe.ucsc.edu/goldenPath/$asmId/bigZips"; + $browserUrl = "https://genome.ucsc.edu/cgi-bin/hgTracks?db=$asmId"; + $browserName = "$commonName ($asmId)"; if ($testOutput) { - $hubUrl = "https://genome-test.gi.ucsc.edu/h/$accessionId"; + $browserUrl = "https://genome-test.gi.ucsc.edu/cgi-bin/hgTracks?db=$asmId"; + $hubUrl = "https://hgdownload-test.gi.ucsc.edu/goldenPath/$asmId/bigZips"; + } + } elsif ($testOutput) { + $browserUrl = "https://genome-test.gi.ucsc.edu/h/$accessionId"; } printf "<tr><td align=right>%d</td>\n", ++$asmCount; - printf "<td align=center><a href='%s' target=_blank>%s<br>%s</a></td>\n", $hubUrl, $commonName, $accessionId; + printf "<td align=center><a href='%s' target=_blank>%s<br>%s</a></td>\n", $browserUrl, $browserName, $accessionId; foreach my $track (@trackList) { my $trackFile = "$buildDir/bbi/$asmId.$track"; my $trackFb = "$buildDir/trackData/$track/fb.$asmId.$track.txt"; my $runDir = "$buildDir/trackData/$track"; my ($itemCount, $percentCover); + my $customKey = ""; + if ($asmId !~ m/^GC/) { + $itemCount = "n/a"; + $percentCover = "n/a"; + if ($track eq "ncbiRefSeq") { + my $refSeqDir=`ls -d /hive/data/genomes/$asmId/bed/ncbiRefSeq.20* | tail -1`; + chomp $refSeqDir; + if ( -d "${refSeqDir}" ) { + my $trackFb = "${refSeqDir}/fb.ncbiRefSeq.$asmId.txt"; + if ( -s "${trackFb}" ) { + $itemCount = `hgsql -N -e 'select count(*) from $track;' $asmId 2> /dev/null`; + chomp $itemCount; + $percentCover = pcFbFile($trackFb); + } + } + } elsif ($track eq "gc5Base") { + my $bwFile = "/gbdb/$asmId/bbi/gc5Base.bw"; + $bwFile = "/gbdb/$asmId/bbi/gc5BaseBw/gc5Base.bw" if (! -s "${bwFile}"); + ($itemCount, $percentCover) = bigWigMeasure($bwFile, $totalSize); + } elsif ($track eq "rmsk") { + my $rmskStats = "/hive/data/genomes/$asmId/bed/repeatMasker/$asmId.rmsk.stats"; + if (! -s "${rmskStats}") { + my $faOut = "/hive/data/genomes/$asmId/bed/repeatMasker/$asmId.sorted.fa.out.gz"; + if ( -s "$faOut") { + my $items = `zgrep -c ^ "$faOut"`; + chomp $items; + $itemCount = commify($items); + my $masked = `grep masked "/hive/data/genomes/$asmId/bed/repeatMasker/faSize.rmsk.txt" | awk '{print \$4}' | sed -e 's/%//;'`; + chomp $masked; + $percentCover = sprintf("%.2f %%", $masked); + open (RS, ">$rmskStats") or die "can now write to $rmskStats"; + printf RS "%s\t%s\n", $itemCount, $percentCover; + close (RS); + } else { + $itemCount = "n/a"; + $percentCover = "n/a"; + } + } else { + ($itemCount, $percentCover) = split('\s+', `cat $rmskStats`); + chomp $percentCover; + $customKey = sprintf("%.2f", $percentCover); + $percentCover = sprintf("%.2f %%", $percentCover); + } + } # elsif ($track eq "rmsk") +########### need to figure which tables can be measured here +# x else x +# $itemCount = `hgsql -N -e 'select count(*) from $track;' $asmId 2> /dev/null`; +# chomp $itemCount; +# if (length($itemCount) < 1) { +# $itemCount = "n/a"; +# $percentCover = "n/a"; +# } else { +# $percentCover = `featureBits $asmId $track 2>&1 | cut -d' ' -f5 | tr -d ')('`; +# chomp $percentCover; +# $customKey = $percentCover; +# $customKey =~ s/[ %]+//; +# } + } else { # working on an assembly hub if ( "$track" eq "gc5Base" ) { $trackFile .= ".bw"; } else { $trackFile .= ".bb"; } - my $customKey = ""; if ( "$track" eq "rmsk") { my $rmskStats = "$buildDir/trackData/repeatMasker/$asmId.rmsk.stats"; if (! -s "${rmskStats}") { my $faOut = "$buildDir/trackData/repeatMasker/$asmId.sorted.fa.out.gz"; if ( -s "$faOut") { - my $items = `zcat "$faOut" | wc -l`; + my $items = `zgrep -c ^ "$faOut"`; chomp $items; $itemCount = commify($items); my $masked = `grep masked "$buildDir/trackData/repeatMasker/faSize.rmsk.txt" | awk '{print \$4}' | sed -e 's/%//;'`; chomp $masked; $percentCover = sprintf("%.2f %%", $masked); open (RS, ">$rmskStats") or die "can now write to $rmskStats"; printf RS "%s\t%s\n", $itemCount, $percentCover; close (RS); } else { $itemCount = "n/a"; $percentCover = "n/a"; } } else { ($itemCount, $percentCover) = split('\s+', `cat $rmskStats`); chomp $percentCover; $customKey = sprintf("%.2f", $percentCover); $percentCover = sprintf("%.2f %%", $percentCover); } } else { # not the rmsk track ($itemCount, $percentCover) = oneTrackData($asmId, $track, $trackFile, $totalSize, $trackFb, $runDir); if (0 == $testOutput) { # if track ncbiRefSeq does not exist, try the ncbiGene track - if ($trackDb eq "ncbiRefSeq" && $itemCount eq "n/a") { + if ($track eq "ncbiRefSeq" && $itemCount eq "n/a") { $runDir = "$buildDir/trackData/ncbiGene"; $trackFile = "$buildDir/bbi/$asmId.$track.bb"; ($itemCount, $percentCover) = oneTrackData($asmId, "ncbiGene", $trackFile, $totalSize, $trackFb, $runDir); } } + } # else not the rmsk track + } # else if ($asmId !~ m/^GC/) if (($percentCover =~ m/%/) || ($percentCover !~ m#n/a#)) { $customKey = $percentCover; $customKey =~ s/[ %]+//; } - } if (length($customKey)) { printf " <td align=right sorttable_customkey='%s'>%s<br>(%s)</td>\n", $customKey, $itemCount, $percentCover; } else { if ($itemCount eq "n/a") { printf " <td align=right>n/a</td>\n"; } else { printf " <td align=right>%s<br>(%s)</td>\n", $itemCount, $percentCover; } } $tracksCounted += 1 if ($itemCount ne "n/a"); } # foreach my $track (@trackList) printf "</tr>\n"; $asmCounted += 1; + if ($asmId =~ m/^GC/) { printf STDERR "# %03d\t%02d tracks\t%s\n", $asmCounted, $tracksCounted, $asmId; + } else { + printf STDERR "# %03d\t%02d tracks\t%s_%s (%s)\n", $asmCounted, $tracksCounted, $accessionId, $asmName, $asmId; + } } } ############################################################################## ### main() ############################################################################## open (FH, "<${orderList}") or die "can not read ${orderList}"; while (my $line = <FH>) { next if ($line =~ m/^#/); chomp $line; my ($asmId, $commonName) = split('\t', $line); push @orderList, $asmId; $commonName{$asmId} = $commonName; ++$assemblyTotal;