77a819d426026e8a6ac3d7965af8f44fbb9a0272 hiram Wed Jan 10 12:42:07 2024 -0800 better manage large genome construction and add in RepeatModeler track refs #29545 diff --git src/hg/utils/automation/asmHubRmskJoinAlign.pl src/hg/utils/automation/asmHubRmskJoinAlign.pl index 03040df..8fbd058 100755 --- src/hg/utils/automation/asmHubRmskJoinAlign.pl +++ src/hg/utils/automation/asmHubRmskJoinAlign.pl @@ -8,32 +8,38 @@ use File::Basename; my $argc = scalar(@ARGV); if ($argc != 2) { printf STDERR "usage: asmHubRmskJoinAlign.pl asmId buildDir > asmId.repeatMasker.html\n"; printf STDERR "where asmId is the assembly identifier,\n"; printf STDERR "expecting to find buildDir/html/asmId.names.tab naming file for this assembly,\n"; printf STDERR "and buildDir/trackData/repeatMasker/asmId.rmsk.class.profile counts of rmsk categories.\n"; exit 255; } my $asmId = shift; my $buildDir = shift; my $namesFile = "$buildDir/html/$asmId.names.tab"; +my $faSizeFile = "$buildDir/trackData/repeatMasker/faSize.rmsk.txt"; my $rmskClassProfile = "$buildDir/trackData/repeatMasker/$asmId.rmsk.class.profile.txt"; my $rmskVersion = "$buildDir/$asmId.repeatMasker.version.txt"; +my $maskingPercent = ""; +if ( -s "${faSizeFile}" ) { + $maskingPercent=`grep -w masked "${faSizeFile}" | cut -d' ' -f1`; + chomp $maskingPercent; +} my $errOut = 0; if ( ! -s $rmskClassProfile ) { printf STDERR "ERROR: can not find rmsk class profile file:\n\t'%s'\n", $rmskClassProfile; $errOut = 255; } if ( ! -s $namesFile ) { printf STDERR "ERROR: can not find rmsk class profile file:\n\t'%s'\n", $rmskClassProfile; $errOut = 255; } if ($errOut) { exit $errOut; } @@ -60,30 +66,41 @@ for interspersed repeats and low complexity DNA sequences. The program outputs a detailed annotation of the repeats that are present in the query sequence (represented by this track), as well as a modified version of the query sequence in which all the annotated repeats have been masked (generally available on the Downloads page). RepeatMasker uses the Repbase Update library of repeats from the Genetic Information Research Institute (GIRI). Repbase Update is described in Jurka (2000) in the References section below.

_EOF_ ; +if ( length($maskingPercent) ) { + printf "

Percent masking of sequence: %s

\n", $maskingPercent; + my $asmSize=`grep -w bases "${faSizeFile}" | cut -d' ' -f1`; + chomp $asmSize; + my $maskedBases=`grep -w bases "${faSizeFile}" | cut -d' ' -f9`; + chomp $maskedBases; + printf "

Assembly size: %s bases
\n", &AsmHub::commify($asmSize); + printf "Sequence masked: %s bases\n", &AsmHub::commify($maskedBases); + printf "

\n"; +} + if ( -s "$rmskVersion" ) { print <<_EOF_

RepeatMasker and libraries version

 _EOF_
 ;
 print `cat $rmskVersion`;
 print <<_EOF_
 

_EOF_ ;