395595f60c883b2344ae20299687563b8ea6e215 hiram Thu Dec 15 14:10:23 2022 -0800 adding RepeatModeler option no redmine diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index a8d20df..4e4ae60 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -16,73 +16,76 @@ use strict; use FindBin qw($Bin); use lib "$Bin"; use HgAutomate; use HgRemoteScript; use HgStepManager; # Option variable names, both common and peculiar to this script: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_buildDir $opt_sourceDir $opt_species $opt_rmskSpecies + $opt_runRepeatModeler $opt_ncbiRmsk $opt_noRmsk $opt_augustusSpecies $opt_noAugustus $opt_xenoRefSeq $opt_noXenoRefSeq $opt_ucscNames $opt_dbName /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'download', func => \&doDownload }, { name => 'sequence', func => \&doSequence }, { name => 'assemblyGap', func => \&doAssemblyGap }, { name => 'chromAlias', func => \&doChromAlias }, { name => 'gatewayPage', func => \&doGatewayPage }, { name => 'cytoBand', func => \&doCytoBand }, { name => 'gc5Base', func => \&doGc5Base }, + { name => 'repeatModeler', func => \&doRepeatModeler }, { name => 'repeatMasker', func => \&doRepeatMasker }, { name => 'simpleRepeat', func => \&doSimpleRepeat }, { name => 'allGaps', func => \&doAllGaps }, { name => 'idKeys', func => \&doIdKeys }, { name => 'windowMasker', func => \&doWindowMasker }, { name => 'addMask', func => \&doAddMask }, { name => 'gapOverlap', func => \&doGapOverlap }, { name => 'tandemDups', func => \&doTandemDups }, { name => 'cpgIslands', func => \&doCpgIslands }, { name => 'ncbiGene', func => \&doNcbiGene }, { name => 'ncbiRefSeq', func => \&doNcbiRefSeq }, { name => 'xenoRefGene', func => \&doXenoRefGene }, { name => 'augustus', func => \&doAugustus }, { name => 'trackDb', func => \&doTrackDb }, { name => 'cleanup', func => \&doCleanup }, ] ); # Option defaults: my $dbHost = 'hgwdev'; my $sourceDir = "/hive/data/outside/ncbi/genomes"; my $species = ""; # usually found in asmId_assembly_report.txt my $ftpDir = ""; # will be determined from given asmId my $rmskSpecies = ""; +my $runRepeatModeler = 0; # default off my $noRmsk = 0; # when RepeatMasker is not possible, such as bacteria my $ncbiRmsk = 0; # when =1 call doRepeatMasker.pl # with -ncbiRmsk=path.out.gz and -liftSpec=... my $augustusSpecies = "human"; my $xenoRefSeq = "/hive/data/genomes/asmHubs/xenoRefSeq"; my $noAugustus = 0; # bacteria do *not* create an augustus track my $noXenoRefSeq = 0; # bacteria do *not* create a xenoRefSeq track my $ucscNames = 0; # default 'FALSE' (== 0) my $dbName = ""; # default uses NCBI asmId for name, can specify: abcDef1 my $defaultName = ""; # will be asmId or dbName if present my $workhorse = "hgwdev"; # default workhorse when none chosen my $fileServer = "hgwdev"; # default when none chosen my $bigClusterHub = "ku"; # default when none chosen my $smallClusterHub = "ku"; # default when none chosen @@ -107,58 +110,65 @@ print STDERR $stepper->getOptionHelp(); print STDERR <<_EOF_ -buildDir dir Construct assembly hub in dir instead of default $HgAutomate::clusterData/asmHubs/refseqBuild/GC[AF]/123/456/789/asmId/ -sourceDir dir Find assembly in dir instead of default: $sourceDir/GC[AF]/123/456/789/asmId -ucscNames Translate NCBI/INSDC/RefSeq names to UCSC names default is to use the given NCBI/INSDC/RefSeq names -dbName name for UCSC style database name, e.g. 'abcDef1' -species use this species designation if there is no asmId_assembly_report.txt with an 'Organism name:' entry to obtain species -rmskSpecies to override default 'species' name for repeat masker the default is found in the asmId_asssembly_report.txt e.g. -rmskSpecies=viruses + -runRepeatModeler run RepeatModeler to supply custom lib to RepeatMasker, + default is to not run RepeatModeler -noRmsk when RepeatMasker is not possible, such as bacteria -ncbiRmsk use NCBI rm.out.gz file instead of local cluster run for repeat masking -augustusSpecies default 'human' -noAugustus do *not* create the Augustus gene track -noXenoRefSeq do *not* create the Xeno RefSeq gene track -xenoRefSeq - location of xenoRefMrna.fa.gz expanded directory of mrnas/ and xenoRefMrna.sizes, default $xenoRefSeq _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'workhorse' => $workhorse, 'fileServer' => $fileServer, 'bigClusterHub' => $bigClusterHub, 'smallClusterHub' => $smallClusterHub); print STDERR " Automates build of assembly hub. Steps: download: sets up sym link working hierarchy from already mirrored files from NCBI in: $sourceDir/GC[AF]/123/456/789/asmId sequence: establish AGP and 2bit file from NCBI directory assemblyGap: create assembly and gap bigBed files and indexes for assembly track names chromAlias: construct asmId.chromAlias.txt for alias name recognition gatewayPage: create html/asmId.description.html contents cytoBand: create cytoBand track and navigation ideogram gc5Base: create bigWig file for gc5Base track + repeatModeler: optionally, run RepeatModeler to construct custom library + for repeatMasker run. Use: -runRepeatModeler to perform + this procedure, warning: can take a considerable amount + of time (12 to 48 hours or more), and consumes an entire + ku cluster node repeatMasker: run repeat masker cluster run and create bigBed files for the composite track categories of repeats simpleRepeat: run trf cluster run and create bigBed file for simple repeats allGaps: calculate all actual real gaps due to N's in sequence, can be more than were specified in the AGP file idKeys: calculate md5sum for each sequence in the assembly to be used to find identical sequences in similar assemblies windowMasker: run windowMasker cluster run, create windowMasker bigBed file and compute intersection with repeatMasker results addMask: combine the higher masking of (windowMasker or repeatMasker) with trf simpleRepeats into one 2bit file gapOverlap: find duplicated sequence on each side of a gap tandemDups: annotate all pairs of duplicated sequence with some gap between cpgIslands: run CpG islands cluster runs for both masked and unmasked sequences and create bigBed files for this composite track @@ -191,30 +201,31 @@ } # Globals: # Command line args: asmId my ( $asmId); # Other: my ($buildDir, $secondsStart, $secondsEnd, $assemblySource, $asmReport); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'buildDir=s', 'sourceDir=s', 'rmskSpecies=s', + 'runRepeatModeler', 'noRmsk', 'ncbiRmsk', 'augustusSpecies=s', 'xenoRefSeq=s', 'dbName=s', 'noXenoRefSeq', 'noAugustus', 'ucscNames', @HgAutomate::commonOptionSpec, ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); @@ -1158,85 +1169,129 @@ ../../\$asmId.2bit \\ | gzip -c > \$asmId.wigVarStep.gz wigToBigWig \$asmId.wigVarStep.gz ../../\$asmId.chrom.sizes \$asmId.gc5Base.bw rm -f \$asmId.wigVarStep.gz touch -r ../../\$asmId.2bit \$asmId.gc5Base.bw else printf "# gc5Base step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); } # gc5Base ######################################################################### +# * step: repeatModeler [workhorse, bigClusterHub] +sub doRepeatModeler { + if (! $runRepeatModeler) { + &HgAutomate::verbose(1, "# RepeatModeler not being run, add argument: -runRepeatModeler to run this step before RepeatMasker\n"); + return; + } + my $runDir = "$buildDir/trackData/repeatModeler"; + # check if been run before + if ( -d "${runDir}" ) { + if ( -s "${runDir}/${defaultName}-families.fa" ) { + &HgAutomate::verbose(1, "\nRepeatModeler step previously completed\n"); + return; + } elsif ( -s "$buildDir/trackData/repeatModeler/blastDb.bash" ) { + &HgAutomate::verbose(1, "\nERROR: repeatModeler step may be running\n"); + exit 255; + } + } + &HgAutomate::mustMkdir($runDir); + my $whatItDoes = "run RepeatModeler on sequence to prepare RepeatMasker custom library"; + my $bossScript = newBash HgRemoteScript("$runDir/doRepeatModeler.bash", + $workhorse, $runDir, $whatItDoes); + + $bossScript->add(<<_EOF_ +export asmId="${defaultName}" +export buildDir="${buildDir}" + +doRepeatModeler.pl -buildDir=`pwd` \\ + -unmaskedSeq=\$buildDir/\$asmId.unmasked.2bit \\ + -bigClusterHub=$bigClusterHub -workhorse=$workhorse \$asmId +_EOF_ + ); + $bossScript->execute(); +} # sub doRepeatModeler + +######################################################################### # * step: repeatMasker [workhorse] sub doRepeatMasker { if ($noRmsk) { &HgAutomate::verbose(1, "# -noRmsk == RepeatMasker not being run\n"); return; } my $runDir = "$buildDir/trackData/repeatMasker"; if ( -d "$buildDir/trackData/repeatMasker/run.cluster" ) { if ( ! -s "$buildDir/trackData/repeatMasker/faSize.rmsk.txt" ) { &HgAutomate::verbose(1, "\nERROR: step repeatmasker may be running\n"); exit 255; } } &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct repeatMasker track data"; my $bossScript = newBash HgRemoteScript("$runDir/doRepeatMasker.bash", $workhorse, $runDir, $whatItDoes); + my $repeatModeler = "$buildDir/trackData/repeatModeler"; + my $customLib = ""; + if ( -s "${repeatModeler}/${defaultName}-families.fa" ) { + $customLib = "-customLib=\"${repeatModeler}/${defaultName}-families.fa\""; + } my $rmskOpts = ""; if ($ncbiRmsk) { if ( -s "$buildDir/download/${asmId}_rm.out.gz" ) { $rmskOpts = " \\ -ncbiRmsk=\"$buildDir/download/${asmId}_rm.out.gz\" "; if ( -s "${buildDir}/download/${asmId}.remove.dups.list" ) { $rmskOpts .= " \\ -dupList=\"${buildDir}/download/${asmId}.remove.dups.list\" "; } if ($ucscNames) { $rmskOpts .= " \\ -liftSpec=\"$buildDir/sequence/$asmId.ncbiToUcsc.lift\""; } } } + my $speciesOrLib = "-species=\"\$species\""; + if ( length(${customLib}) ) { + $speciesOrLib = "${customLib}"; + } $bossScript->add(<<_EOF_ export asmId=$defaultName export ncbiAsmId=$asmId if [ $buildDir/\$asmId.2bit -nt faSize.rmsk.txt ]; then export species=`echo $rmskSpecies | sed -e 's/_/ /g;'` rm -f versionInfo.txt doRepeatMasker.pl -stop=mask -buildDir=`pwd` -unmaskedSeq=$buildDir/\$asmId.2bit $rmskOpts \\ - -bigClusterHub=$bigClusterHub -workhorse=$workhorse -species="\$species" \$asmId + -bigClusterHub=$bigClusterHub -workhorse=$workhorse $speciesOrLib \$asmId if [ -s "\$asmId.fa.out" ]; then gzip \$asmId.fa.out fi gzip \$asmId.sorted.fa.out \$asmId.nestedRepeats.bed doRepeatMasker.pl -continue=cleanup -buildDir=`pwd` -unmaskedSeq=$buildDir/\$asmId.2bit $rmskOpts \\ - -bigClusterHub=$bigClusterHub -workhorse=$workhorse -species="\$species" \$asmId + -bigClusterHub=$bigClusterHub -workhorse=$workhorse $speciesOrLib \$asmId if [ ! -s versionInfo.txt ]; then if [ -s ../../download/\${ncbiAsmId}_rm.run ]; then ln -s ../../download/\${ncbiAsmId}_rm.run versionInfo.txt fi fi \$HOME/kent/src/hg/utils/automation/asmHubRepeatMasker.sh \$asmId `pwd`/\$asmId.sorted.fa.out.gz `pwd` else printf "# repeatMasker step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); @@ -2045,54 +2100,56 @@ if (-s "$asmReport") { $species = `grep -i "organism name:" $asmReport`; chomp $species; $species =~ s/.*organism\s+name:\s+//i; $species =~ s/\s+\(.*//; } else { die "ERROR: no -species specified and can not find $asmReport"; } if (length($species) < 1) { die "ERROR: no -species specified and can not find Organism name: in $asmReport"; } } $dbName = $opt_dbName ? $opt_dbName : $dbName; $rmskSpecies = $opt_rmskSpecies ? $opt_rmskSpecies : $species; +$runRepeatModeler = $opt_runRepeatModeler ? $opt_runRepeatModeler : $runRepeatModeler; $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies; $xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq; $ucscNames = $opt_ucscNames ? 1 : $ucscNames; # '1' == 'TRUE' $noAugustus = $opt_noAugustus ? 1 : $noAugustus; # '1' == 'TRUE' $noXenoRefSeq = $opt_noXenoRefSeq ? 1 : $noXenoRefSeq; # '1' == 'TRUE' $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse; $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub; $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub; $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer; $ncbiRmsk = $opt_ncbiRmsk ? 1 : 0; $noRmsk = $opt_noRmsk ? 1 : 0; $defaultName = $asmId; $defaultName = $dbName if (length($dbName)); die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource); printf STDERR "# buildDir: %s\n", $buildDir; printf STDERR "# sourceDir %s\n", $sourceDir; if (length($dbName)) { printf STDERR "# dbName %s - building in /hive/data/genomes/%s\n", $dbName, $dbName; } printf STDERR "# augustusSpecies %s\n", $augustusSpecies; printf STDERR "# xenoRefSeq %s\n", $xenoRefSeq; printf STDERR "# assemblySource: %s\n", $assemblySource; +printf STDERR "# runRepeatModeler %s\n", $runRepeatModeler ? "TRUE" : "FALSE"; printf STDERR "# rmskSpecies %s\n", $rmskSpecies; printf STDERR "# augustusSpecies %s\n", $augustusSpecies; printf STDERR "# ncbiRmsk %s\n", $ncbiRmsk ? "TRUE" : "FALSE"; printf STDERR "# ucscNames %s\n", $ucscNames ? "TRUE" : "FALSE"; printf STDERR "# noAugustus %s\n", $noAugustus ? "TRUE" : "FALSE"; printf STDERR "# noXenoRefSeq %s\n", $noXenoRefSeq ? "TRUE" : "FALSE"; printf STDERR "# noRmsk %s\n", $noRmsk ? "TRUE" : "FALSE"; # Do everything. $stepper->execute(); # Tell the user anything they should know. my $stopStep = $stepper->getStopStep(); my $upThrough = ($stopStep eq 'cleanup') ? "" : " (through the '$stopStep' step)";