06d7be056190c14b85e71bc12523f18ea6815b5e markd Mon Dec 7 00:50:29 2020 -0800 BLAT mmap index support merge with master diff --git src/hg/utils/automation/doRepeatMasker.pl src/hg/utils/automation/doRepeatMasker.pl index b823e36..35bf619 100755 --- src/hg/utils/automation/doRepeatMasker.pl +++ src/hg/utils/automation/doRepeatMasker.pl @@ -16,82 +16,86 @@ use HgStepManager; # Hardcoded command path: my $RepeatMaskerPath = "/hive/data/staging/data/RepeatMasker"; my $RepeatMasker = "$RepeatMaskerPath/RepeatMasker"; my $RepeatMaskerEngine = "-engine crossmatch -s"; # Let parasol pick defaults my $parasolRAM = ""; # Option variable names, both common and peculiar to this script: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_buildDir $opt_ncbiRmsk + $opt_dupList $opt_liftSpec $opt_species $opt_unmaskedSeq $opt_customLib $opt_useHMMER $opt_useRMBlastn $opt_splitTables $opt_noSplit $opt_updateTable /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'cluster', func => \&doCluster }, { name => 'cat', func => \&doCat }, { name => 'mask', func => \&doMask }, { name => 'install', func => \&doInstall }, { name => 'cleanup', func => \&doCleanup }, ] ); # Option defaults: my $dbHost = 'hgwdev'; my $unmaskedSeq = "\$db.unmasked.2bit"; my $defaultSpecies = 'scientificName from dbDb'; my $ncbiRmsk = ""; # path to NCBI *_rm.out.gz file to use NCBI result # for repeat masking result # in the assembly as calculated by NCBI +my $dupList = ""; # path to download/asmId.remove.dups.list for NCBI my $liftSpec = ""; # lift file from NCBI coordinates to UCSC coordinates # usually required with ncbiRmsk my $base = $0; $base =~ s/^(.*\/)?//; sub usage { # Usage / help / self-documentation: my ($status, $detailed) = @_; # Basic help (for incorrect usage): print STDERR " usage: $base db options: "; print STDERR $stepper->getOptionHelp(); print STDERR <<_EOF_ -buildDir dir Use dir instead of default $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/RepeatMasker.\$date (necessary when continuing at a later date). -species sp Use sp (which can be quoted genus and species, or a common name that RepeatMasker recognizes. Default: $defaultSpecies. -ncbiRmsk path/file_rm.out.gz - Use the repeat masker result supplied in the assembly as calculated by NCBI + -dupList .../download/asmId.remove.dups.list - to remove duplicates from + NCBI repeat masker file -liftSpec path/file.lift - Use this lift file to lift the NCBI coordinates to UCSC coordinates, usually used with ncbiRmsk -unmaskedSeq seq.2bit Use seq.2bit as the unmasked input sequence instead of default ($unmaskedSeq). -customLib lib.fa Use custom repeat library instead of RepeatMaskers\'s. -useRMBlastn Use NCBI rmblastn instead of crossmatch -useHMMER Use hmmer instead of crossmatch ( currently for human only ) -updateTable load into table name rmskUpdate (default: rmsk) -splitTables split the _rmsk tables (default is not split) -noSplit default behavior, this option no longer required. _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'workhorse' => '', 'bigClusterHub' => ''); @@ -119,30 +123,31 @@ } # Globals: # Command line args: db my ($db); # Other: my ($buildDir, $chromBased, $updateTable, $secondsStart, $secondsEnd); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'buildDir=s', 'species=s', 'ncbiRmsk=s', + 'dupList=s', 'liftSpec=s', 'unmaskedSeq=s', 'customLib=s', 'useRMBlastn', 'useHMMER', 'splitTables', 'noSplit', 'updateTable', @HgAutomate::commonOptionSpec, ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); @@ -454,40 +459,56 @@ if (! -s "${liftSpec}" ) { printf STDERR "# WARNING: no liftSpec given with ncbiRmsk file, probably will need one ?\n"; $bossScript->add(<<_EOF_ ######### # WARNING: no liftSpec given with ncbiRmsk file, probably will need one ? ######### printf " SW perc perc perc query position in query matching repeat position in repeat score div. del. ins. sequence begin end (left) repeat class/family begin end (left) ID " >> "\${db}.sorted.fa.out" _EOF_ ); } if ($zippedSource) { + if ( -s "${dupList}" ) { + $bossScript->add(<<_EOF_ +zcat "\${ncbiRmOutFile}" | tail -n +4 | sort -k5,5 -k6,6n $liftOpts \\ + | grep -v -f ${dupList} >> "\${db}.sorted.fa.out" +_EOF_ + ); + } else { $bossScript->add(<<_EOF_ zcat "\${ncbiRmOutFile}" | tail -n +4 | sort -k5,5 -k6,6n $liftOpts >> "\${db}.sorted.fa.out" _EOF_ ); + } + } else { + if ( -s "${dupList}" ) { + $bossScript->add(<<_EOF_ +tail -n +4 "\${ncbiRmOutFile}" sort -k5,5 -k6,6n $liftOpts \\ + | grep -v -f ${dupList} >> "\${db}.sorted.fa.out" +_EOF_ + ); } else { $bossScript->add(<<_EOF_ tail -n +4 "\${ncbiRmOutFile}" sort -k5,5 -k6,6n $liftOpts >> "\${db}.sorted.fa.out" _EOF_ ); } + } } else { # not using the NCBI supplied file, use local cluster run result my $partDir = "$buildDir/RMPart"; my $levels = $opt_debug ? 1 : `cat $partDir/levels`; chomp $levels; # Use symbolic link created in cluster step: my $indent = ""; my $path = ""; # Make the appropriate number of nested foreach's to match the number # of levels from the partitioning. If $levels is 1, no foreach required. for (my $l = $levels - 2; $l >= 0; $l--) { my $dir = ($l == ($levels - 2)) ? $partDir : "\$d" . ($l + 1); $bossScript->add(<<_EOF_ ${indent}for d$l in $dir/??? do _EOF_ @@ -716,30 +737,31 @@ $secondsStart = `date "+%s"`; chomp $secondsStart; ($db) = @ARGV; # Now that we know the $db, figure out our paths: my $date = `date +%Y-%m-%d`; chomp $date; $buildDir = $opt_buildDir ? $opt_buildDir : "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/RepeatMasker.$date"; $unmaskedSeq = $opt_unmaskedSeq ? $opt_unmaskedSeq : "$HgAutomate::clusterData/$db/$db.unmasked.2bit"; my $seqCount = `twoBitInfo $unmaskedSeq stdout | wc -l`; $chromBased = ($seqCount <= $HgAutomate::splitThreshold) && $opt_splitTables; $updateTable = $opt_updateTable ? "Update" : ""; $ncbiRmsk = $opt_ncbiRmsk ? $opt_ncbiRmsk : ""; +$dupList = $opt_dupList ? $opt_dupList : ""; $liftSpec = $opt_liftSpec ? $opt_liftSpec : ""; # Do everything. $stepper->execute(); # Tell the user anything they should know. my $stopStep = $stepper->getStopStep(); my $upThrough = ($stopStep eq 'cleanup') ? "" : " (through the '$stopStep' step)"; $secondsEnd = `date "+%s"`; chomp $secondsEnd; my $elapsedSeconds = $secondsEnd - $secondsStart; my $elapsedMinutes = int($elapsedSeconds/60); $elapsedSeconds -= $elapsedMinutes * 60;