77a819d426026e8a6ac3d7965af8f44fbb9a0272 hiram Wed Jan 10 12:42:07 2024 -0800 better manage large genome construction and add in RepeatModeler track refs #29545 diff --git src/hg/utils/automation/asmHubRmskJoinAlign.pl src/hg/utils/automation/asmHubRmskJoinAlign.pl index 03040df..8fbd058 100755 --- src/hg/utils/automation/asmHubRmskJoinAlign.pl +++ src/hg/utils/automation/asmHubRmskJoinAlign.pl @@ -8,32 +8,38 @@ use File::Basename; my $argc = scalar(@ARGV); if ($argc != 2) { printf STDERR "usage: asmHubRmskJoinAlign.pl asmId buildDir > asmId.repeatMasker.html\n"; printf STDERR "where asmId is the assembly identifier,\n"; printf STDERR "expecting to find buildDir/html/asmId.names.tab naming file for this assembly,\n"; printf STDERR "and buildDir/trackData/repeatMasker/asmId.rmsk.class.profile counts of rmsk categories.\n"; exit 255; } my $asmId = shift; my $buildDir = shift; my $namesFile = "$buildDir/html/$asmId.names.tab"; +my $faSizeFile = "$buildDir/trackData/repeatMasker/faSize.rmsk.txt"; my $rmskClassProfile = "$buildDir/trackData/repeatMasker/$asmId.rmsk.class.profile.txt"; my $rmskVersion = "$buildDir/$asmId.repeatMasker.version.txt"; +my $maskingPercent = ""; +if ( -s "${faSizeFile}" ) { + $maskingPercent=`grep -w masked "${faSizeFile}" | cut -d' ' -f1`; + chomp $maskingPercent; +} my $errOut = 0; if ( ! -s $rmskClassProfile ) { printf STDERR "ERROR: can not find rmsk class profile file:\n\t'%s'\n", $rmskClassProfile; $errOut = 255; } if ( ! -s $namesFile ) { printf STDERR "ERROR: can not find rmsk class profile file:\n\t'%s'\n", $rmskClassProfile; $errOut = 255; } if ($errOut) { exit $errOut; } @@ -60,30 +66,41 @@ for interspersed repeats and low complexity DNA sequences. The program outputs a detailed annotation of the repeats that are present in the query sequence (represented by this track), as well as a modified version of the query sequence in which all the annotated repeats have been masked (generally available on the <a href="http://hgdownload.soe.ucsc.edu/downloads.html" target=_blank>Downloads</a> page). RepeatMasker uses the <a href="http://www.girinst.org/repbase/update/index.html" target=_blank>Repbase Update</a> library of repeats from the <a href="http://www.girinst.org/" target=_blank>Genetic Information Research Institute</a> (GIRI). Repbase Update is described in Jurka (2000) in the References section below.</p> _EOF_ ; +if ( length($maskingPercent) ) { + printf "<h2>Percent masking of sequence: %s</h2>\n", $maskingPercent; + my $asmSize=`grep -w bases "${faSizeFile}" | cut -d' ' -f1`; + chomp $asmSize; + my $maskedBases=`grep -w bases "${faSizeFile}" | cut -d' ' -f9`; + chomp $maskedBases; + printf "<p><b>Assembly size:</b> %s bases<br>\n", &AsmHub::commify($asmSize); + printf "<b>Sequence masked:</b> %s bases\n", &AsmHub::commify($maskedBases); + printf "</p>\n"; +} + if ( -s "$rmskVersion" ) { print <<_EOF_ <h2>RepeatMasker and libraries version</h2> <p> <pre> _EOF_ ; print `cat $rmskVersion`; print <<_EOF_ </pre> </p> _EOF_ ;