6a48e07c2fbefbf0a0e5e74662dd768ef93fca88 hiram Wed Apr 29 11:27:30 2020 -0700 adding ncbiRmsk option to use NCBI rm.out.gz file instead of cluster run and record versionInfo.txt of RepeatMasker operation refs #23734 diff --git src/hg/utils/automation/doRepeatMasker.pl src/hg/utils/automation/doRepeatMasker.pl index 286e999..8e05e16 100755 --- src/hg/utils/automation/doRepeatMasker.pl +++ src/hg/utils/automation/doRepeatMasker.pl @@ -15,74 +15,85 @@ use HgRemoteScript; use HgStepManager; # Hardcoded command path: my $RepeatMaskerPath = "/hive/data/staging/data/RepeatMasker"; my $RepeatMasker = "$RepeatMaskerPath/RepeatMasker"; my $RepeatMaskerEngine = "-engine crossmatch -s"; # Let parasol pick defaults my $parasolRAM = ""; # Option variable names, both common and peculiar to this script: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_buildDir + $opt_ncbiRmsk + $opt_liftSpec $opt_species $opt_unmaskedSeq $opt_customLib $opt_useHMMER $opt_useRMBlastn $opt_splitTables $opt_noSplit $opt_updateTable /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'cluster', func => \&doCluster }, { name => 'cat', func => \&doCat }, { name => 'mask', func => \&doMask }, { name => 'install', func => \&doInstall }, { name => 'cleanup', func => \&doCleanup }, ] ); # Option defaults: my $dbHost = 'hgwdev'; my $unmaskedSeq = "\$db.unmasked.2bit"; my $defaultSpecies = 'scientificName from dbDb'; +my $ncbiRmsk = ""; # path to NCBI *_rm.out.gz file to use NCBI result + # for repeat masking result + # in the assembly as calculated by NCBI +my $liftSpec = ""; # lift file from NCBI coordinates to UCSC coordinates + # usually required with ncbiRmsk my $base = $0; $base =~ s/^(.*\/)?//; sub usage { # Usage / help / self-documentation: my ($status, $detailed) = @_; # Basic help (for incorrect usage): print STDERR " usage: $base db options: "; print STDERR $stepper->getOptionHelp(); print STDERR <<_EOF_ -buildDir dir Use dir instead of default $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/RepeatMasker.\$date (necessary when continuing at a later date). -species sp Use sp (which can be quoted genus and species, or a common name that RepeatMasker recognizes. Default: $defaultSpecies. + -ncbiRmsk path/file_rm.out.gz - Use the repeat masker result supplied in + the assembly as calculated by NCBI + -liftSpec path/file.lift - Use this lift file to lift the NCBI coordinates + to UCSC coordinates, usually used with ncbiRmsk -unmaskedSeq seq.2bit Use seq.2bit as the unmasked input sequence instead of default ($unmaskedSeq). -customLib lib.fa Use custom repeat library instead of RepeatMaskers\'s. -useRMBlastn Use NCBI rmblastn instead of crossmatch -useHMMER Use hmmer instead of crossmatch ( currently for human only ) -updateTable load into table name rmskUpdate (default: rmsk) -splitTables split the _rmsk tables (default is not split) -noSplit default behavior, this option no longer required. _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'workhorse' => '', 'bigClusterHub' => ''); print STDERR " Automates UCSC's RepeatMasker process for genome database \$db. Steps: @@ -107,51 +118,63 @@ exit $status; } # Globals: # Command line args: db my ($db); # Other: my ($buildDir, $chromBased, $updateTable, $secondsStart, $secondsEnd); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'buildDir=s', 'species=s', + 'ncbiRmsk=s', + 'liftSpec=s', 'unmaskedSeq=s', 'customLib=s', 'useRMBlastn', 'useHMMER', 'splitTables', 'noSplit', 'updateTable', @HgAutomate::commonOptionSpec, ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if ($opt_dbHost); } ######################################################################### # * step: cluster [bigClusterHub] sub doCluster { + return if ($ncbiRmsk); my $runDir = "$buildDir/run.cluster"; + if ( -d "$runDir" ) { + if ( -s "$runDir/run.time" ) { + &HgAutomate::verbose(1, + "\ncluster step previously completed\n"); + return; + } else { + die "\nERROR: cluster step may not be completed properly\n"; + } + } &HgAutomate::mustMkdir($runDir); my $paraHub = $opt_bigClusterHub ? $opt_bigClusterHub : &HgAutomate::chooseClusterByBandwidth(); if (! -e $unmaskedSeq) { die "Error: required file $unmaskedSeq does not exist."; } my @okIn = grep !/scratch/, &HgAutomate::chooseFilesystemsForCluster($paraHub, "in"); my @okOut = &HgAutomate::chooseFilesystemsForCluster($paraHub, "out"); if (scalar(@okOut) > 1) { @okOut = grep !/$okIn[0]/, @okOut; } my $inHive = 0; $inHive = 1 if ($okIn[0] =~ m#/hive/data/genomes#); @@ -277,58 +300,73 @@ ; close($fh); &HgAutomate::makeGsub($runDir, "./RMRun.csh {check out line $partDir/\$(path1).out}"); my $whatItDoes = "It computes a logical partition of unmasked 2bit into 500k chunks and runs it on the cluster with the most available bandwidth."; my $bossScript = new HgRemoteScript("$runDir/doCluster.csh", $paraHub, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ chmod a+x dummyRun.csh chmod a+x RMRun.csh -./dummyRun.csh # Record RM version used: +printf "The repeat files provided for this assembly were generated using RepeatMasker.\\ + Smit, AFA, Hubley, R & Green, P.,\\ + RepeatMasker Open-4.0.\\ + 1996-2010 .\\ +\\ +VERSION:\\n" > ../versionInfo.txt + +./dummyRun.csh | grep -v "dev/null" >> ../versionInfo.txt + +grep 'version of RepeatMasker\$' $RepeatMasker >> ../versionInfo.txt +grep RELEASE $RepeatMaskerPath/Libraries/RepeatMaskerLib.embl >> ../versionInfo.txt +printf "# RepeatMasker engine: %s\\n" "${RepeatMaskerEngine}" >> ../versionInfo.txt + ls -ld $RepeatMaskerPath $RepeatMasker grep 'version of RepeatMasker\$' $RepeatMasker grep RELEASE $RepeatMaskerPath/Libraries/RepeatMaskerLib.embl echo "# RepeatMasker engine: $RepeatMaskerEngine" _EOF_ ); if ($opt_useRMBlastn) { $bossScript->add(<<_EOF_ +printf "# using rmblastn:\\n" >> ../versionInfo.txt echo "# useRMBlastn: rmblastn:" -grep RMBLAST_DIR $RepeatMaskerPath/RepeatMaskerConfig.pm | awk '{print \$NF}' +grep RMBLAST_DIR $RepeatMaskerPath/RepeatMaskerConfig.pm | grep rmblastn-2 | awk '{print \$NF}' >> ../versionInfo.txt _EOF_ ); } if ($opt_useHMMER) { $bossScript->add(<<_EOF_ +printf "# using Dfam library and HMMER3:\\n" >> ../versionInfo.txt echo "# useHMMER: Dfam library: " ls -ld $RepeatMaskerPath/Libraries/Dfam.hmm -grep Release: $RepeatMaskerPath/Libraries/Dfam.hmm +grep Release: $RepeatMaskerPath/Libraries/Dfam.hmm >> ../versionInfo.txt echo "# useHMMER: HMMER3: " -grep -m 1 ^HMMER3 $RepeatMaskerPath/Libraries/Dfam.hmm +grep -m 1 ^HMMER3 $RepeatMaskerPath/Libraries/Dfam.hmm >> ../versionInfo.txt _EOF_ ); } if (length($repeatLib) > 0) { $bossScript->add(<<_EOF_ +printf "# RepeatMasker library options: %s\\n" "${repeatLib}" >> ../versionInfo.txt echo "# RepeatMasker library options: '$repeatLib'" _EOF_ ); } if ( ! $opt_unmaskedSeq && ! $inHive) { $bossScript->add(<<_EOF_ mkdir -p $clusterSeqDir rsync -av $unmaskedSeq $clusterSeq _EOF_ ); } if ($opt_unmaskedSeq) { $bossScript->add(<<_EOF_ rm -rf $partDir $Bin/simplePartition.pl $clusterSeq 500000 $partDir @@ -339,107 +377,147 @@ rm -rf $partDir $Bin/simplePartition.pl $clusterSeq 500000 $partDir rm -f $buildDir/RMPart ln -s $partDir $buildDir/RMPart _EOF_ ); } my $binPara = "/parasol/bin/para"; if ( ! -s "$binPara" ) { # allow PATH to find the para command $binPara = "para"; } my $gensub2 = &HgAutomate::gensub2(); $bossScript->add(<<_EOF_ +printf "\\nPARAMETERS:\\ +$RepeatMasker $RepeatMaskerEngine -align $repeatLib\\n" >> ../versionInfo.txt + $gensub2 $partDir/partitions.lst single gsub jobList +exit 0 $binPara $parasolRAM make jobList $binPara check $binPara time > run.time cat run.time _EOF_ ); if (! $opt_unmaskedSeq && ! $inHive) { $bossScript->add(<<_EOF_ rm -f $clusterSeq _EOF_ ); } $bossScript->execute(); } # doCluster ######################################################################### # * step: cat [fileServer] sub doCat { my $runDir = "$buildDir"; + if (! -s "${ncbiRmsk}" ) { &HgAutomate::checkExistsUnlessDebug('cluster', 'cat', "$buildDir/run.cluster/run.time"); - + } + if ( -s "$runDir/doCat.bash" ) { + if ( ! (-s "$runDir/$db.sorted.fa.out.gz" || -s "$runDir/$db.sorted.fa.out" ) ) { + die "\nERROR: cat step may not be completed properly\n"; + } else { + &HgAutomate::verbose(1, + "\ncat step previously completed\n"); + return; + } + } my $whatItDoes = "It concatenates .out files from cluster run into a single $db.sorted.fa.out.\n" . "liftUp (with no lift specs) is used to concatenate .out files because it\n" . "uniquifies (per input file) the .out IDs which can then be used to join\n" . "fragmented repeats in the Nested Repeats track."; my $fileServer = &HgAutomate::chooseFileServer($runDir); my $bossScript = newBash HgRemoteScript("$runDir/doCat.bash", $fileServer, $runDir, $whatItDoes); - # Use symbolic link created in cluster step: - my $partDir = "$buildDir/RMPart"; - my $indent = ""; - my $path = ""; - my $levels = $opt_debug ? 1 : `cat $partDir/levels`; - chomp $levels; $bossScript->add("# Debug mode -- don't know the real number of levels\n") if ($opt_debug); + if ($ncbiRmsk) { + my $zippedSource = 0; # asssume not a gz zipped file + $zippedSource = 1 if ($ncbiRmsk =~ m/.gz$/); + my $liftOpts = ""; + if (-s "${liftSpec}" ) { + $liftOpts = "| liftUp -type=.out stdout $liftSpec carry stdin"; + } + $bossScript->add(<<_EOF_ +export db="${db}" +export ncbiRmOutFile="${ncbiRmsk}" +cat /dev/null > "\${db}.sorted.fa.out" +_EOF_ + ); + if ($zippedSource) { + $bossScript->add(<<_EOF_ +zcat "\${ncbiRmOutFile}" | tail -n +4 | sort -k5,5 -k6,6n $liftOpts >> "\${db}.sorted.fa.out" +_EOF_ + ); + } else { + $bossScript->add(<<_EOF_ +tail -n +4 "\${ncbiRmOutFile}" sort -k5,5 -k6,6n $liftOpts >> "\${db}.sorted.fa.out" +_EOF_ + ); + } + } else { # not using the NCBI supplied file, use local cluster run result + my $partDir = "$buildDir/RMPart"; + my $levels = $opt_debug ? 1 : `cat $partDir/levels`; + chomp $levels; + # Use symbolic link created in cluster step: + my $indent = ""; + my $path = ""; # Make the appropriate number of nested foreach's to match the number # of levels from the partitioning. If $levels is 1, no foreach required. for (my $l = $levels - 2; $l >= 0; $l--) { my $dir = ($l == ($levels - 2)) ? $partDir : "\$d" . ($l + 1); $bossScript->add(<<_EOF_ ${indent}for d$l in $dir/??? do _EOF_ ); $indent .= ' '; $path .= "\$d$l/"; } for (my $l = 0; $l <= $levels - 2; $l++) { $bossScript->add(<<_EOF_ ${indent} liftUp ${path}cat.out /dev/null carry ${path}???/*.out ${indent} liftUp ${path}cat.align /dev/null carry ${path}???/*.align ${indent}done _EOF_ ); $indent =~ s/ //; $path =~ s/\$d\d+\/$//; } $bossScript->add(<<_EOF_ liftUp $db.fa.out /dev/null carry $partDir/???/*.out liftUp $db.fa.align /dev/null carry $partDir/???/*.align head -3 $db.fa.out > $db.sorted.fa.out tail -n +4 $db.fa.out | sort -k5,5 -k6,6n >> $db.sorted.fa.out _EOF_ ); + } $bossScript->add(<<_EOF_ # Use the ID column to join up fragments of interrupted repeats for the # Nested Repeats track. Try to avoid the Undefined id errors. -($Bin/extractNestedRepeats.pl $db.fa.out 2> nestRep.err || true) | sort -k1,1 -k2,2n > $db.nestedRepeats.bed +($Bin/extractNestedRepeats.pl $db.sorted.fa.out 2> nestRep.err || true) | sort -k1,1 -k2,2n > $db.nestedRepeats.bed if [ -s "nestRep.err" ]; then export lineCount=`(grep "Undefined id, line" nestRep.err || true) | cut -d' ' -f6 | wc -l` if [ "\${lineCount}" -gt 0 ]; then if [ "\${lineCount}" -gt 10 ]; then printf "ERROR: too many Undefined id lines (> 10) reported by extractNestedRepeats.pl" 1>&2 exit 255 fi export sedExpr=`grep "Undefined id, line" nestRep.err | cut -d' ' -f6 | sed -e 's/\$/d;/;' | xargs echo` export sedCmd="sed -i.broken -e '\$sedExpr'" eval \$sedCmd $db.fa.out ($Bin/extractNestedRepeats.pl $db.fa.out 2> nestRep2.err || true) | sort -k1,1 -k2,2n > $db.nestedRepeats.bed if [ -s "nestRep2.err" ]; then printf "ERROR: the attempt of cleaning nestedRepeats did not work" 1>&2 exit 255 fi @@ -584,70 +662,72 @@ | splitFileByColumn -col=5 stdin /cluster/data/$db -chromDirs \\ -ending=.fa.out -head=/tmp/rmskHead.txt _EOF_ ); $bossScript->execute(); } } # doInstall ######################################################################### # * step: cleanup [fileServer] sub doCleanup { my $runDir = "$buildDir"; my $whatItDoes = "It cleans up or compresses intermediate files."; my $fileServer = &HgAutomate::chooseFileServer($runDir); - my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $fileServer, + my $bossScript = newBash HgRemoteScript("$runDir/doCleanup.bash", $fileServer, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ -if (-e $db.fa.align) then +if [ -e "$db.fa.align" ]; then gzip $db.fa.align -endif +fi rm -fr RMPart/* rm -fr RMPart -if ( -d /hive/data/genomes/$db/RMPart ) then +if [ -d "/hive/data/genomes/$db/RMPart" ]; then rmdir /hive/data/genomes/$db/RMPart -endif +fi _EOF_ ); $bossScript->execute(); } # doCleanup ######################################################################### # main # Prevent "Suspended (tty input)" hanging: &HgAutomate::closeStdin(); # Make sure we have valid options and exactly 1 argument: &checkOptions(); &usage(1) if (scalar(@ARGV) != 1); $secondsStart = `date "+%s"`; chomp $secondsStart; ($db) = @ARGV; # Now that we know the $db, figure out our paths: my $date = `date +%Y-%m-%d`; chomp $date; $buildDir = $opt_buildDir ? $opt_buildDir : "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/RepeatMasker.$date"; $unmaskedSeq = $opt_unmaskedSeq ? $opt_unmaskedSeq : "$HgAutomate::clusterData/$db/$db.unmasked.2bit"; my $seqCount = `twoBitInfo $unmaskedSeq stdout | wc -l`; $chromBased = ($seqCount <= $HgAutomate::splitThreshold) && $opt_splitTables; $updateTable = $opt_updateTable ? "Update" : ""; +$ncbiRmsk = $opt_ncbiRmsk ? $opt_ncbiRmsk : ""; +$liftSpec = $opt_liftSpec ? $opt_liftSpec : ""; # Do everything. $stepper->execute(); # Tell the user anything they should know. my $stopStep = $stepper->getStopStep(); my $upThrough = ($stopStep eq 'cleanup') ? "" : " (through the '$stopStep' step)"; $secondsEnd = `date "+%s"`; chomp $secondsEnd; my $elapsedSeconds = $secondsEnd - $secondsStart; my $elapsedMinutes = int($elapsedSeconds/60); $elapsedSeconds -= $elapsedMinutes * 60;