967b763f92c9dc4d3e57399de0c875029d828ef1 hiram Tue Dec 13 20:58:42 2022 -0800 now running the next version of RepeatMasker v4.1.4 refs #29545 diff --git src/hg/utils/automation/doRepeatMasker.pl src/hg/utils/automation/doRepeatMasker.pl index 017ffa4..77d21ea 100755 --- src/hg/utils/automation/doRepeatMasker.pl +++ src/hg/utils/automation/doRepeatMasker.pl @@ -4,49 +4,52 @@ # edit ~/kent/src/hg/utils/automation/doRepeatMasker.pl instead. # $Id: doRepeatMasker.pl,v 1.14 2009/03/19 16:15:29 hiram Exp $ use Getopt::Long; use warnings; use strict; use Carp; use FindBin qw($Bin); use lib "$Bin"; use HgAutomate; use HgRemoteScript; use HgStepManager; # Hardcoded command path: -my $RepeatMaskerPath = "/hive/data/staging/data/RepeatMasker"; +my $RepeatMaskerPath = "/hive/data/staging/data/RepeatMasker221107"; my $RepeatMasker = "$RepeatMaskerPath/RepeatMasker"; -my $RepeatMaskerEngine = "-engine crossmatch -s"; -# Let parasol pick defaults -my $parasolRAM = ""; +# default engine changed from crossmatch to rmblast as of 2022-12 +# with RM version 4.1.4 +my $RepeatMaskerEngine = "-engine rmblast -pa 1"; +# per RM doc, rmblast uses 4 CPUs for each job +my $parasolRAM = "-cpu=4 -ram=32g"; # Option variable names, both common and peculiar to this script: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_buildDir $opt_ncbiRmsk $opt_dupList $opt_liftSpec $opt_species $opt_unmaskedSeq $opt_customLib $opt_useHMMER $opt_useRMBlastn + $opt_useCrossMatch $opt_splitTables $opt_noSplit $opt_updateTable /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'cluster', func => \&doCluster }, { name => 'cat', func => \&doCat }, { name => 'mask', func => \&doMask }, { name => 'install', func => \&doInstall }, { name => 'cleanup', func => \&doCleanup }, ] ); @@ -77,31 +80,32 @@ -buildDir dir Use dir instead of default $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/RepeatMasker.\$date (necessary when continuing at a later date). -species sp Use sp (which can be quoted genus and species, or a common name that RepeatMasker recognizes. Default: $defaultSpecies. -ncbiRmsk path/file_rm.out.gz - Use the repeat masker result supplied in the assembly as calculated by NCBI -dupList .../download/asmId.remove.dups.list - to remove duplicates from NCBI repeat masker file -liftSpec path/file.lift - Use this lift file to lift the NCBI coordinates to UCSC coordinates, usually used with ncbiRmsk -unmaskedSeq seq.2bit Use seq.2bit as the unmasked input sequence instead of default ($unmaskedSeq). -customLib lib.fa Use custom repeat library instead of RepeatMaskers\'s. - -useRMBlastn Use NCBI rmblastn instead of crossmatch + -useRMBlastn This is the default as of 2022-12 == NCBI rmblastn + -useCrossMatch Use crossmatch instead of NCBI rmblastn -useHMMER Use hmmer instead of crossmatch ( currently for human only ) -updateTable load into table name rmskUpdate (default: rmsk) -splitTables split the _rmsk tables (default is not split) -noSplit default behavior, this option no longer required. _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'workhorse' => '', 'bigClusterHub' => ''); print STDERR " Automates UCSC's RepeatMasker process for genome database \$db. Steps: cluster: Do a cluster run of RepeatMasker on 500kb sequence chunks. cat: Concatenate the cluster run results into \$db.sorted.fa.out. mask: Create a \$db.2bit masked by \$db.sorted.fa.out. install: Load \$db.sorted.fa.out into the rmsk table (possibly split) in \$db, @@ -128,30 +132,31 @@ my ($db); # Other: my ($buildDir, $chromBased, $updateTable, $secondsStart, $secondsEnd); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'buildDir=s', 'species=s', 'ncbiRmsk=s', 'dupList=s', 'liftSpec=s', 'unmaskedSeq=s', 'customLib=s', 'useRMBlastn', + 'useCrossMatch', 'useHMMER', 'splitTables', 'noSplit', 'updateTable', @HgAutomate::commonOptionSpec, ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if ($opt_dbHost); } ######################################################################### @@ -193,35 +198,35 @@ my $customLib = $opt_customLib; my $repeatLib = ""; if ($opt_customLib && $opt_species) { $repeatLib = "-species \'$species\' -lib $customLib"; } elsif ($opt_customLib) { $repeatLib = "-lib $customLib"; } else { $repeatLib = "-species \'$species\'"; } # updated for ku kluster operation -cpu option instead of ram option if ( $opt_useRMBlastn ) { $RepeatMaskerEngine = "-engine rmblast -pa 1"; + $parasolRAM = "-cpu=4 -ram=32g"; + } elsif ( $opt_useCrossMatch ) { + $RepeatMaskerEngine = "-engine crossmatch -s"; $parasolRAM = "-cpu=1"; - } - - # updated for ku kluster operation -cpu option instead of ram option - if ( $opt_useHMMER ) { + } elsif ( $opt_useHMMER ) { # NOTE: This is only applicable for 8gb one-job-per-node scheduling $RepeatMaskerEngine = "-engine hmmer -pa 4"; $parasolRAM = "-cpu=4 -ram=32g"; } # Script to do a dummy run of RepeatMasker, to test our invocation and # unpack library files before kicking off a large cluster run. # And now that RM is being run from local /scratch/data/RepeatMasker/ # this is also done in the cluster run script so each node will have # its library initialized my $fh = &HgAutomate::mustOpen(">$runDir/dummyRun.csh"); print $fh <<_EOF_ #!/bin/csh -ef set path = (/cluster/software/bin \$path) @@ -306,62 +311,71 @@ rm -rf \$tmpDir _EOF_ ; close($fh); &HgAutomate::makeGsub($runDir, "./RMRun.csh {check out line $partDir/\$(path1).out}"); my $whatItDoes = "It computes a logical partition of unmasked 2bit into 500k chunks and runs it on the cluster with the most available bandwidth."; my $bossScript = new HgRemoteScript("$runDir/doCluster.csh", $paraHub, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ + +set path = (/cluster/software/bin \$path) + chmod a+x dummyRun.csh chmod a+x RMRun.csh # Record RM version used: printf "The repeat files provided for this assembly were generated using RepeatMasker.\\ Smit, AFA, Hubley, R & Green, P.,\\ - RepeatMasker Open-4.0.\\ + RepeatMasker version 4.1.4\\ 1996-2010 .\\ \\ VERSION:\\n" > ../versionInfo.txt ./dummyRun.csh | grep -v "dev/null" >> ../versionInfo.txt $RepeatMasker -v >> ../versionInfo.txt -grep RELEASE $RepeatMaskerPath/Libraries/RepeatMaskerLib.embl >> ../versionInfo.txt +printf "# RMRBMeta.embl library version: %s\\n" "`grep RELEASE $RepeatMaskerPath/Libraries/RMRBMeta.embl`" >> ../versionInfo.txt printf "# RepeatMasker engine: %s\\n" "${RepeatMaskerEngine}" >> ../versionInfo.txt ls -ld $RepeatMaskerPath $RepeatMasker $RepeatMasker -v -grep RELEASE $RepeatMaskerPath/Libraries/RepeatMaskerLib.embl +echo -n "# RMRBMeta.embl library version: " +grep RELEASE $RepeatMaskerPath/Libraries/RMRBMeta.embl | sed -e 's/ *\\*\$//;' echo "# RepeatMasker engine: $RepeatMaskerEngine" _EOF_ ); - if ($opt_useRMBlastn) { + if ($opt_useCrossMatch) { + $bossScript->add(<<_EOF_ +printf "# using engine crossmatch\\n" >> ../versionInfo.txt +echo "# useCrossMatch crossmatch" +_EOF_ + ); + } elsif ($opt_useRMBlastn) { $bossScript->add(<<_EOF_ -printf "# using rmblastn:\\t" >> ../versionInfo.txt +printf "# using engine rmblastn:\\t" >> ../versionInfo.txt echo "# useRMBlastn: rmblastn:" grep -w value $RepeatMaskerPath/RepeatMaskerConfig.pm | grep rmblastn | awk '{print \$NF}' >> ../versionInfo.txt _EOF_ ); - } - if ($opt_useHMMER) { + } elsif ($opt_useHMMER) { $bossScript->add(<<_EOF_ printf "# using Dfam library and HMMER3:\\n" >> ../versionInfo.txt echo "# useHMMER: Dfam library: " ls -ld $RepeatMaskerPath/Libraries/Dfam.hmm grep Release: $RepeatMaskerPath/Libraries/Dfam.hmm >> ../versionInfo.txt echo "# useHMMER: HMMER3: " grep -m 1 ^HMMER3 $RepeatMaskerPath/Libraries/Dfam.hmm >> ../versionInfo.txt _EOF_ ); } if (length($repeatLib) > 0) { $bossScript->add(<<_EOF_ printf "# RepeatMasker library options: %s\\n" "${repeatLib}" >> ../versionInfo.txt echo "# RepeatMasker library options: '$repeatLib'" _EOF_