90b9985e1fc9106341b275badcd445531be9a48c hiram Thu Oct 15 09:23:22 2020 -0700 add noAugustus and noXenoRefSeq options for bacteria hub build refs #23891 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index b64d4bd..b021e16 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -19,31 +19,33 @@ use HgAutomate; use HgRemoteScript; use HgStepManager; # Option variable names, both common and peculiar to this script: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_buildDir $opt_sourceDir $opt_species $opt_rmskSpecies $opt_ncbiRmsk $opt_noRmsk $opt_augustusSpecies + $opt_noAugustus $opt_xenoRefSeq + $opt_noXenoRefSeq $opt_ucscNames $opt_asmHubName /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'download', func => \&doDownload }, { name => 'sequence', func => \&doSequence }, { name => 'assemblyGap', func => \&doAssemblyGap }, { name => 'chromAlias', func => \&doChromAlias }, { name => 'gatewayPage', func => \&doGatewayPage }, { name => 'cytoBand', func => \&doCytoBand }, { name => 'gc5Base', func => \&doGc5Base }, { name => 'repeatMasker', func => \&doRepeatMasker }, { name => 'simpleRepeat', func => \&doSimpleRepeat }, @@ -62,30 +64,32 @@ { name => 'cleanup', func => \&doCleanup }, ] ); # Option defaults: my $dbHost = 'hgwdev'; my $sourceDir = "/hive/data/outside/ncbi/genomes"; my $species = ""; # usually found in asmId_assembly_report.txt my $ftpDir = ""; # will be determined from given asmId my $rmskSpecies = ""; my $noRmsk = 0; # when RepeatMasker is not possible, such as bacteria my $ncbiRmsk = 0; # when =1 call doRepeatMasker.pl # with -ncbiRmsk=path.out.gz and -liftSpec=... my $augustusSpecies = "human"; my $xenoRefSeq = "/hive/data/genomes/asmHubs/xenoRefSeq"; +my $noAugustus = 0; # bacteria do *not* create an augustus track +my $noXenoRefSeq = 0; # bacteria do *not* create a xenoRefSeq track my $ucscNames = 0; # default 'FALSE' (== 0) my $asmHubName = "n/a"; # directory name in: /gbdb/hubs/asmHubName my $workhorse = "hgwdev"; # default workhorse when none chosen my $fileServer = "hgwdev"; # default when none chosen my $bigClusterHub = "ku"; # default when none chosen my $smallClusterHub = "ku"; # default when none chosen my $base = $0; $base =~ s/^(.*\/)?//; # key is original accession name from the remove.dups.list, value is 1 my %dupAccessionList; sub usage { # Usage / help / self-documentation: @@ -106,30 +110,32 @@ -sourceDir dir Find assembly in dir instead of default: $sourceDir/GC[AF]/123/456/789/asmId -ucscNames Translate NCBI/INSDC/RefSeq names to UCSC names default is to use the given NCBI/INSDC/RefSeq names -asmHubName <name> directory name in: /gbdb/hubs/asmHubName -species <name> use this species designation if there is no asmId_assembly_report.txt with an 'Organism name:' entry to obtain species -rmskSpecies <name> to override default 'species' name for repeat masker the default is found in the asmId_asssembly_report.txt e.g. -rmskSpecies=viruses -noRmsk when RepeatMasker is not possible, such as bacteria -ncbiRmsk use NCBI rm.out.gz file instead of local cluster run for repeat masking -augustusSpecies <human|chicken|zebrafish> default 'human' + -noAugustus do *not* create the Augustus gene track + -noXenoRefSeq do *not* create the Xeno RefSeq gene track -xenoRefSeq </path/to/xenoRefSeqMrna> - location of xenoRefMrna.fa.gz expanded directory of mrnas/ and xenoRefMrna.sizes, default $xenoRefSeq _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'workhorse' => $workhorse, 'fileServer' => $fileServer, 'bigClusterHub' => $bigClusterHub, 'smallClusterHub' => $smallClusterHub); print STDERR " Automates build of assembly hub. Steps: download: sets up sym link working hierarchy from already mirrored files from NCBI in: $sourceDir/GC[AF]/123/456/789/asmId @@ -828,31 +834,34 @@ ### construct sequence when no AGP files exist if (0 == $partsDone) { printf STDERR "creating fake AGP\n"; if ($ucscNames) { $bossScript->add(<<_EOF_ twoBitToFa ../download/\$asmId.2bit stdout | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" | gzip -c > \$asmId.fa.gz hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names _EOF_ ); } else { $bossScript->add(<<_EOF_ twoBitToFa ../download/\$asmId.2bit stdout | gzip -c > \$asmId.fa.gz hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz -zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names +twoBitInfo ../download/\$asmId.2bit stdout | cut -f1 \\ + | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" \\ + | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\$\\)/.\\1/;' \\ + | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names _EOF_ ); } } else { printf STDERR "partsDone: %d\n", $partsDone; } $bossScript->add(<<_EOF_ zcat *.agp.gz | gzip > ../\$asmId.agp.gz faToTwoBit *.fa.gz ../\$asmId.2bit faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit twoBitDup ../\$asmId.unmasked.2bit > \$asmId.dups.txt if [ -s "\$asmId.dups.txt" ]; then printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\\n" 1>&2 cat \$asmId.dups.txt 1>&2 @@ -1713,30 +1722,34 @@ -assemblyHub -bigClusterHub=$bigClusterHub -dbHost=$dbHost $liftSpec \\ -target2bit="\$target2bit" \\ -stop=load -fileServer=$fileServer -smallClusterHub=$smallClusterHub -workhorse=$workhorse \\ \$asmId \$asmId else printf "# ncbiRefSeq step previously completed\\n" 1>&2 fi _EOF_ ); $bossScript->execute(); } # ncbiRefSeq ######################################################################### # * step: augustus [workhorse] sub doAugustus { + if ($noAugustus) { + &HgAutomate::verbose(1, "# -noAugustus == Augustus gene track not created\n"); + return; + } my $runDir = "$buildDir/trackData/augustus"; if (! -s "$buildDir/$asmId.2bit") { &HgAutomate::verbose(1, "ERROR: augustus step can not find $buildDir/$asmId.2bit\n"); exit 255; } if (-d "${runDir}" ) { if (! -s "$runDir/$asmId.augustus.bb") { &HgAutomate::verbose(1, "WARNING augustus step may already be running, but not completed ?\n"); return; } elsif (! needsUpdate("$buildDir/$asmId.2bit", "$runDir/$asmId.augustus.bb")) { &HgAutomate::verbose(1, "# augustus step previously completed\n"); return; } @@ -1757,30 +1770,34 @@ -noDbGenePredCheck -maskedSeq=$buildDir/\$asmId.2bit \$asmId) > makeDb.log 2>&1 time (~/kent/src/hg/utils/automation/doAugustus.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` -dbHost=$dbHost \\ -bigClusterHub=$bigClusterHub -species=$augustusSpecies -workhorse=$workhorse \\ -noDbGenePredCheck -maskedSeq=$buildDir/\$asmId.2bit \$asmId) > cleanup.log 2>&1 else printf "# augustus genes step previously completed\\n" 1>&2 fi _EOF_ ); $bossScript->execute(); } # doAugustus ######################################################################### # * step: xenoRefGene [bigClusterHub] sub doXenoRefGene { + if ($noXenoRefSeq) { + &HgAutomate::verbose(1, "# -noXenoRefSeq == Xeno RefSeq gene track not created\n"); + return; + } my $runDir = "$buildDir/trackData/xenoRefGene"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "run xeno RefSeq gene mapping procedures"; my $bossScript = newBash HgRemoteScript("$runDir/doXenoRefGene.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId if [ $buildDir/\$asmId.2bit -nt \$asmId.xenoRefGene.bb ]; then time (~/kent/src/hg/utils/automation/doXenoRefGene.pl -buildDir=`pwd` -dbHost=$dbHost \\ -bigClusterHub=$bigClusterHub -mrnas=$xenoRefSeq -workhorse=$workhorse \\ -maskedSeq=$buildDir/trackData/addMask/\$asmId.masked.2bit \$asmId) > do.log 2>&1 @@ -1887,49 +1904,54 @@ chomp $species; $species =~ s/.*organism\s+name:\s+//i; $species =~ s/\s+\(.*//; } else { die "no -species specified and can not find $asmReport"; } if (length($species) < 1) { die "no -species specified and can not find Organism name: in $asmReport"; } } $rmskSpecies = $opt_rmskSpecies ? $opt_rmskSpecies : $species; $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies; $xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq; $ucscNames = $opt_ucscNames ? 1 : $ucscNames; # '1' == 'TRUE' +$noAugustus = $opt_noAugustus ? 1 : $noAugustus; # '1' == 'TRUE' +$noXenoRefSeq = $opt_noXenoRefSeq ? 1 : $noXenoRefSeq; # '1' == 'TRUE' $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse; $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub; $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub; $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer; $asmHubName = $opt_asmHubName ? $opt_asmHubName : $asmHubName; $ncbiRmsk = $opt_ncbiRmsk ? 1 : 0; $noRmsk = $opt_noRmsk ? 1 : 0; die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource); printf STDERR "# buildDir: %s\n", $buildDir; printf STDERR "# sourceDir %s\n", $sourceDir; printf STDERR "# augustusSpecies %s\n", $augustusSpecies; printf STDERR "# xenoRefSeq %s\n", $xenoRefSeq; printf STDERR "# assemblySource: %s\n", $assemblySource; printf STDERR "# asmHubName %s\n", $asmHubName; printf STDERR "# rmskSpecies %s\n", $rmskSpecies; printf STDERR "# augustusSpecies %s\n", $augustusSpecies; printf STDERR "# ncbiRmsk %s\n", $ncbiRmsk ? "TRUE" : "FALSE"; printf STDERR "# ucscNames %s\n", $ucscNames ? "TRUE" : "FALSE"; +printf STDERR "# noAugustus %s\n", $noAugustus ? "TRUE" : "FALSE"; +printf STDERR "# noXenoRefSeq %s\n", $noXenoRefSeq ? "TRUE" : "FALSE"; +printf STDERR "# noRmsk %s\n", $noRmsk ? "TRUE" : "FALSE"; # Do everything. $stepper->execute(); # Tell the user anything they should know. my $stopStep = $stepper->getStopStep(); my $upThrough = ($stopStep eq 'cleanup') ? "" : " (through the '$stopStep' step)"; $secondsEnd = `date "+%s"`; chomp $secondsEnd; my $elapsedSeconds = $secondsEnd - $secondsStart; my $elapsedMinutes = int($elapsedSeconds/60); $elapsedSeconds -= $elapsedMinutes * 60;