77a819d426026e8a6ac3d7965af8f44fbb9a0272
hiram
  Wed Jan 10 12:42:07 2024 -0800
better manage large genome construction and add in RepeatModeler track refs #29545

diff --git src/hg/utils/automation/asmHubRmskJoinAlign.pl src/hg/utils/automation/asmHubRmskJoinAlign.pl
index 03040df..8fbd058 100755
--- src/hg/utils/automation/asmHubRmskJoinAlign.pl
+++ src/hg/utils/automation/asmHubRmskJoinAlign.pl
@@ -8,32 +8,38 @@
 use File::Basename;
 
 my $argc = scalar(@ARGV);
 
 if ($argc != 2) {
   printf STDERR "usage: asmHubRmskJoinAlign.pl asmId buildDir > asmId.repeatMasker.html\n";
   printf STDERR "where asmId is the assembly identifier,\n";
   printf STDERR "expecting to find buildDir/html/asmId.names.tab naming file for this assembly,\n";
   printf STDERR "and buildDir/trackData/repeatMasker/asmId.rmsk.class.profile counts of rmsk categories.\n";
   exit 255;
 }
 
 my $asmId = shift;
 my $buildDir = shift;
 my $namesFile = "$buildDir/html/$asmId.names.tab";
+my $faSizeFile = "$buildDir/trackData/repeatMasker/faSize.rmsk.txt";
 my $rmskClassProfile = "$buildDir/trackData/repeatMasker/$asmId.rmsk.class.profile.txt";
 my $rmskVersion = "$buildDir/$asmId.repeatMasker.version.txt";
+my $maskingPercent = "";
+if ( -s "${faSizeFile}" ) {
+  $maskingPercent=`grep -w masked "${faSizeFile}" | cut -d' ' -f1`;
+  chomp $maskingPercent;
+}
 
 my $errOut = 0;
 if ( ! -s $rmskClassProfile ) {
   printf STDERR "ERROR: can not find rmsk class profile file:\n\t'%s'\n", $rmskClassProfile;
   $errOut = 255;
 }
 
 if ( ! -s $namesFile ) {
-  printf STDERR "ERROR: can not find rmsk class profile file:\n\t'%s'\n", $rmskClassProfile;
+  printf STDERR "ERROR: can not find assembly names file:\n\t'%s'\n", $namesFile;
   $errOut = 255;
 }
 
 if ($errOut) {
   exit $errOut;
 }
@@ -60,30 +66,41 @@
 for interspersed repeats and low complexity DNA sequences. The program
 outputs a detailed annotation of the repeats that are present in the
 query sequence (represented by this track), as well as a modified version
 of the query sequence in which all the annotated repeats have been masked
 (generally available on the
 <a href="http://hgdownload.soe.ucsc.edu/downloads.html"
 target=_blank>Downloads</a> page). RepeatMasker uses the
 <a href="http://www.girinst.org/repbase/update/index.html"
 target=_blank>Repbase Update</a> library of repeats from the
 <a href="http://www.girinst.org/" target=_blank>Genetic 
 Information Research Institute</a> (GIRI).
 Repbase Update is described in Jurka (2000) in the References section below.</p>
 _EOF_
 ;
 
+if ( length($maskingPercent) ) {
+  printf "<h2>Percent masking of sequence: %s</h2>\n", $maskingPercent;
+  # on the 'bases' line of the faSize output, space-separated field 1 is
+  # the assembly size and field 9 the masked base count; grep it only once
+  my $basesLine = `grep -w bases "${faSizeFile}"`;
+  my ($asmSize, $maskedBases) = (split(/[ \n]/, $basesLine), ("") x 9)[0, 8];
+  printf "<p><b>Assembly size:</b> %s bases<br>\n", &AsmHub::commify($asmSize);
+  printf "<b>Sequence masked:</b> %s bases\n", &AsmHub::commify($maskedBases);
+  printf "</p>\n";
+}
+
 if ( -s "$rmskVersion" ) {
 
 print <<_EOF_
 <h2>RepeatMasker and libraries version</h2>
 <p>
 <pre>
 _EOF_
 ;
 print `cat $rmskVersion`;
 print <<_EOF_
 </pre>
 </p>
 _EOF_
 ;