66a4bb8eccef8184fe80dbeb638c99f2dcbda169 hiram Wed Feb 26 14:53:01 2020 -0800 better detection if steps need to be done or have been done or are currently running refs #23891 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index cdd137e..58dd6e3 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -67,31 +67,31 @@ my $ftpDir = ""; # will be determined from given asmId my $rmskSpecies = ""; my $augustusSpecies = "human"; my $xenoRefSeq = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq"; my $ucscNames = 0; # default 'FALSE' (== 0) my $asmHubName = "n/a"; # directory name in: /gbdb/hubs/asmHubName my $workhorse = "hgwdev"; # default workhorse when none chosen my $fileServer = "hgwdev"; # default when none chosen my $bigClusterHub = "ku"; # default when none chosen my $smallClusterHub = "ku"; # default when none chosen my $base = $0; $base =~ s/^(.*\/)?//; # key is original accession name from the remove.dups.list, value is 1 -my %dupAccessionList = {}; +my %dupAccessionList; sub usage { # Usage / help / self-documentation: my ($status, $detailed) = @_; # Basic help (for incorrect usage): print STDERR " usage: $base [options] asmId required arguments: asmId - assembly identifier at NCBI FTP site, examples: - GCF_000001405.32_GRCh38.p6 GCF_000001635.24_GRCm38.p4 etc.. options: "; print STDERR $stepper->getOptionHelp(); print STDERR <<_EOF_ @@ -924,41 +924,43 @@ sub doGatewayPage { if ($asmHubName eq "n/a") { printf STDERR "ERROR: step gatewayPage needs argument -asmHubName <name>\n"; exit 255; } my $runDir = "$buildDir/html"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct html/$asmId.description.html"; my $bossScript = newBash HgRemoteScript("$runDir/doGatewayPage.bash", $workhorse, $runDir, $whatItDoes); my $photoJpg = "noPhoto"; my $photoCredit = "noPhoto"; my $photoLink = ""; - if ( -s "$runDir/../photo/$species.jpg" ) { + my $speciesNoBlank = $species; + $speciesNoBlank =~ s/ /_/g; + if ( -s "$runDir/../photo/$speciesNoBlank.jpg" ) { $photoJpg = "../photo/\${species}.jpg"; $photoCredit = "../photo/photoCredits.txt"; $photoLink = "rm -f \${species}.jpg; ln -s ../photo/\${species}.jpg ." } else { printf STDERR "# gatewayPage: warning: no photograph available\n"; } $bossScript->add(<<_EOF_ export asmId=$asmId -export species=$species +export species=$speciesNoBlank \$HOME/kent/src/hg/utils/automation/asmHubGatewayPage.pl \\ $asmHubName ../download/\${asmId}_assembly_report.txt \\ ../\${asmId}.chrom.sizes \\ $photoJpg $photoCredit \\ > \$asmId.description.html 2> \$asmId.names.tab \$HOME/kent/src/hg/utils/automation/genbank/buildStats.pl \\ ../\$asmId.chrom.sizes 2> \$asmId.build.stats.txt touch -r ../download/\${asmId}_assembly_report.txt \$asmId.description.html $photoLink _EOF_ ); $bossScript->execute(); } # gatewayPage @@ -1016,30 +1018,37 @@ rm -f \$asmId.wigVarStep.gz touch -r ../../\$asmId.2bit \$asmId.gc5Base.bw else printf "# gc5Base step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); } # gc5Base ######################################################################### # * step: repeatMasker [workhorse] sub doRepeatMasker { my $runDir = "$buildDir/trackData/repeatMasker"; + if ( -d "$buildDir/trackData/repeatMasker/run.cluster" ) { + if ( ! -s "$buildDir/trackData/repeatMasker/faSize.rmsk.txt" ) { + &HgAutomate::verbose(1, + "\nERROR: step repeatmasker may be running\n"); + exit 255; + } + } &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct repeatMasker track data"; my $bossScript = newBash HgRemoteScript("$runDir/doRepeatMasker.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId if [ $buildDir/\$asmId.2bit -nt faSize.rmsk.txt ]; then export species=`echo $rmskSpecies | sed -e 's/_/ /g;'` doRepeatMasker.pl -stop=mask -buildDir=`pwd` -unmaskedSeq=$buildDir/\$asmId.2bit \\ -bigClusterHub=$bigClusterHub -workhorse=$workhorse -species="\$species" \$asmId @@ -1174,41 +1183,51 @@ rm -f toBbi.bed gzip *.bed else printf "# allgaps step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); } # allGaps ######################################################################### # * step: idKeys [workhorse] sub doIdKeys { my $runDir = "$buildDir/trackData/idKeys"; + if (! -s "$buildDir/$asmId.2bit") { + &HgAutomate::verbose(1, "ERROR: idKeys can not find $asmId.2bit\n"); + exit 255; + } + if (! needsUpdate("$buildDir/$asmId.2bit", "$runDir/$asmId.keySignature.txt")) { + &HgAutomate::verbose(1, "# idKeys step previously completed\n"); + return; + } &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct ID key data for each contig/chr"; my $bossScript = newBash HgRemoteScript("$runDir/doIdKeys.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId +export twoBit=$buildDir/\$asmId.2bit if [ ../../\$asmId.2bit -nt \$asmId.keySignature.txt ]; then - doIdKeys.pl \$asmId -buildDir=`pwd` -twoBit=../../\$asmId.2bit + doIdKeys.pl \$asmId -buildDir=`pwd` -twoBit=\$twoBit + touch -r \$twoBit \$asmId.keySignature.txt else printf "# idKeys step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); } # doIdKeys ######################################################################### # * step: addMask [workhorse] sub doAddMask { my $runDir = "$buildDir/trackData/addMask"; my $goNoGo = 0; @@ -1330,42 +1349,60 @@ if [ ../../\$asmId.unmasked.2bit -nt \$asmId.gapOverlap.bed.gz ]; then doGapOverlap.pl -buildDir=`pwd` -bigClusterHub=$bigClusterHub -smallClusterHub=$smallClusterHub -workhorse=$workhorse -twoBit=../../\$asmId.2bit \$asmId else printf "# gapOverlap step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); } # doGapOverlap ######################################################################### # * step: tandemDups [workhorse] sub doTandemDups { - my $runDir = "$buildDir/trackData/gapOverlap"; + my $runDir = "$buildDir/trackData/tandemDups"; + if (! -s "$buildDir/$asmId.unmasked.2bit") { + &HgAutomate::verbose(1, + "ERROR: tandemDups: can not find $buildDir/$asmId.unmasked.2bit\n"); + exit 255; + } + if (-d "${runDir}" ) { + if (! -s "$runDir/$asmId.tandemDups.bb") { + &HgAutomate::verbose(1, + "WARNING tandemDups step may already be running, but not completed ?\n"); + return; + } elsif (! needsUpdate("$buildDir/$asmId.unmasked.2bit", "$runDir/$asmId.tandemDups.bb")) { + &HgAutomate::verbose(1, "# tandemDups step previously completed\n"); + return; + } + } + &HgAutomate::mustMkdir($runDir); - my $whatItDoes = "construct gap overlap track (duplicate sequence on each side of a gap)"; + my $whatItDoes = "construct tandem dups track (nearby pairs of exact duplicate sequence)"; my $bossScript = newBash HgRemoteScript("$runDir/doTandemDups.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId +export twoBit=$buildDir/\$asmId.unmasked.2bit -if [ ../../\$asmId.unmasked.2bit -nt \$asmId.gapOverlap.bed.gz ]; then - doTandemDup.pl -buildDir=`pwd` -bigClusterHub=$bigClusterHub -smallClusterHub=$smallClusterHub -workhorse=$workhorse -twoBit=../../\$asmId.2bit \$asmId +if [ \$twoBit -nt \$asmId.tandemDups.bb ]; then + doTandemDup.pl -buildDir=`pwd` -bigClusterHub=$bigClusterHub -smallClusterHub=$smallClusterHub -workhorse=$workhorse -twoBit=\$twoBit \$asmId + touch -r \$twoBit \$asmId.tandemDups.bb else printf "# tandemDups step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); } # doTandemDups ######################################################################### # * step: cpgIslands [workhorse] sub doCpgIslands { my $runDir = "$buildDir/trackData/cpgIslands"; &HgAutomate::mustMkdir($runDir); @@ -1409,31 +1446,31 @@ fi _EOF_ ); $bossScript->execute(); } # sub doCpgIslands ######################################################################### # * step: ncbiGene [workhorse] sub doNcbiGene { my $gffFile = "$assemblySource/${asmId}_genomic.gff.gz"; if ( ! -s "${gffFile}" ) { printf STDERR "# step ncbiGene: no gff file found at:\n# %s\n", $gffFile; return; } if ( ! -s "$buildDir/sequence/$asmId.ncbiToUcsc.lift" ) { - printf STDERR "# ERROR: ncbiGene: can not find ../../sequence/$asmId.ncbiToUcsc.lift\n"; + printf STDERR "# ERROR: ncbiGene: can not find $buildDir/sequence/$asmId.ncbiToUcsc.lift\n"; exit 255; } my $runDir = "$buildDir/trackData/ncbiGene"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "translate NCBI GFF3 gene definitions into a track"; my $bossScript = newBash HgRemoteScript("$runDir/doNcbiGene.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId export gffFile=$gffFile function cleanUp() { @@ -1463,31 +1500,31 @@ rm -f \$asmId.ncbiGene.ix.txt genePredToBigGenePred \$asmId.ncbiGene.ucsc.genePred.gz stdout \\ | sort -k1,1 -k2,2n > \$asmId.ncbiGene.bed (bedToBigBed -type=bed12+8 -tab -as=\$HOME/kent/src/hg/lib/bigGenePred.as \\ -extraIndex=name \$asmId.ncbiGene.bed \\ ../../\$asmId.chrom.sizes \$asmId.ncbiGene.bb || true) if [ ! -s "\$asmId.ncbiGene.bb" ]; then printf "# ncbiGene: failing bedToBigBed\\n" 1>&2 exit 255 fi touch -r\$gffFile \$asmId.ncbiGene.bb bigBedInfo \$asmId.ncbiGene.bb | egrep "^itemCount:|^basesCovered:" \\ | sed -e 's/,//g' > \$asmId.ncbiGene.stats.txt LC_NUMERIC=en_US /usr/bin/printf "# ncbiGene %s %'d %s %'d\\n" `cat \$asmId.ncbiGene.stats.txt` | xargs echo else - printf "# ncbiGene previously completed\\n" 1>&2 + printf "# ncbiGene step previously completed\\n" 1>&2 fi _EOF_ ); $bossScript->execute(); } # doNcbiGene ######################################################################### # * step: ncbiRefSeq [workhorse] sub doNcbiRefSeq { my $runDir = "$buildDir/trackData/ncbiRefSeq"; my $gffFile = "$assemblySource/${asmId}_genomic.gff.gz"; if ( ! -s "${gffFile}" ) { printf STDERR "# step ncbiRefSeq no gff file found at:\n# %s\n", $gffFile; return; } @@ -1501,91 +1538,91 @@ $bossScript->add(<<_EOF_ export asmId="$asmId" export buildDir="$buildDir" export liftFile="\$buildDir/sequence/\$asmId.ncbiToUcsc.lift" export target2bit="\$buildDir/\$asmId.2bit" if [ $buildDir/\$asmId.2bit -nt \$asmId.ncbiRefSeq.bb ]; then ~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -toGpWarnOnly -buildDir=`pwd` \\ -bigClusterHub=$bigClusterHub -dbHost=$dbHost \\ -liftFile="\$liftFile" \\ -target2bit="\$target2bit" \\ -stop=load -fileServer=$fileServer -smallClusterHub=$smallClusterHub -workhorse=$workhorse \\ \$asmId \$asmId else - printf "# ncbiRefSeq previously completed\\n" 1>&2 + printf "# ncbiRefSeq step previously completed\\n" 1>&2 fi _EOF_ ); $bossScript->execute(); } # ncbiRefSeq ######################################################################### # * step: augustus [workhorse] sub doAugustus { my $runDir = "$buildDir/trackData/augustus"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "run Augustus gene prediction procedures"; my $bossScript = newBash HgRemoteScript("$runDir/doAugustus.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId if [ $buildDir/\$asmId.2bit -nt \$asmId.augustus.bb ]; then time (~/kent/src/hg/utils/automation/doAugustus.pl -stop=makeGp -buildDir=`pwd` -dbHost=$dbHost \\ -bigClusterHub=$bigClusterHub -species=$augustusSpecies -workhorse=$workhorse \\ -noDbGenePredCheck -maskedSeq=$buildDir/\$asmId.2bit \$asmId) > makeDb.log 2>&1 time (~/kent/src/hg/utils/automation/doAugustus.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` -dbHost=$dbHost \\ -bigClusterHub=$bigClusterHub -species=$augustusSpecies -workhorse=$workhorse \\ -noDbGenePredCheck -maskedSeq=$buildDir/\$asmId.2bit \$asmId) > cleanup.log 2>&1 else - printf "# augustus genes previously completed\\n" 1>&2 + printf "# augustus genes step previously completed\\n" 1>&2 fi _EOF_ ); $bossScript->execute(); } # doAugustus ######################################################################### # * step: xenoRefGene [bigClusterHub] sub doXenoRefGene { my $runDir = "$buildDir/trackData/xenoRefGene"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "run xeno RefSeq gene mapping procedures"; my $bossScript = newBash HgRemoteScript("$runDir/doXenoRefGene.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId if [ $buildDir/\$asmId.2bit -nt \$asmId.xenoRefGene.bb ]; then time (~/kent/src/hg/utils/automation/doXenoRefGene.pl -buildDir=`pwd` -dbHost=$dbHost \\ -bigClusterHub=$bigClusterHub -mrnas=$xenoRefSeq -workhorse=$workhorse \\ -maskedSeq=$buildDir/trackData/addMask/\$asmId.masked.2bit \$asmId) > do.log 2>&1 if [ -s "\$asmId.xenoRefGene.bb" ]; then bigBedInfo \$asmId.xenoRefGene.bb | egrep "^itemCount:|^basesCovered:" \\ | sed -e 's/,//g' > \$asmId.xenoRefGene.stats.txt LC_NUMERIC=en_US /usr/bin/printf "# xenoRefGene %s %'d %s %'d\\n" `cat \$asmId.xenoRefGene.stats.txt` | xargs echo fi else - printf "# xenoRefGene previously completed\\n" 1>&2 + printf "# xenoRefGene step previously completed\\n" 1>&2 fi _EOF_ ); $bossScript->execute(); } # doXenoRefGene ######################################################################### # * step: trackDb [workhorse] sub doTrackDb { my $runDir = "$buildDir"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct asmId.trackDb.txt file"; my $bossScript = newBash HgRemoteScript("$runDir/doTrackDb.bash", $workhorse, $runDir, $whatItDoes); @@ -1634,67 +1671,70 @@ # GC[AF]_123456789.3_assembly_Name # 0 1 2 3 .... my @partNames = split('_', $asmId); $ftpDir = sprintf("%s/%s/%s/%s/%s", $partNames[0], substr($partNames[1],0,3), substr($partNames[1],3,3), substr($partNames[1],6,3), $asmId); # Force debug and verbose until this is looking pretty solid: # $opt_debug = 1; # $opt_verbose = 3 if ($opt_verbose < 3); # Establish what directory we will work in. $buildDir = $opt_buildDir ? $opt_buildDir : "$HgAutomate::clusterData/asmHubs/refseqBuild/$ftpDir"; -$assemblySource = $opt_sourceDir ? "$sourceDir" : "$sourceDir/$ftpDir"; +$sourceDir = $opt_sourceDir ? $opt_sourceDir : $sourceDir; +$assemblySource = $opt_sourceDir ? "$opt_sourceDir" : "$sourceDir/$ftpDir"; my $asmReport = "$assemblySource/${asmId}_assembly_report.txt"; $species = $opt_species ? $opt_species : $species; if (length($species) < 1) { if (-s "$asmReport") { $species = `grep -i "organism name:" $asmReport`; chomp $species; $species =~ s/.*organism\s+name:\s+//i; $species =~ s/\s+\(.*//; } else { die "no -species specified and can not find $asmReport"; } if (length($species) < 1) { die "no -species specified and can not find Organism name: in $asmReport"; } } -$sourceDir = $opt_sourceDir ? $opt_sourceDir : $sourceDir; $rmskSpecies = $opt_rmskSpecies ? $opt_rmskSpecies : $species; $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies; $xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq; $ucscNames = $opt_ucscNames ? 1 : $ucscNames; # '1' == 'TRUE' $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse; $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub; $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub; $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer; $asmHubName = $opt_asmHubName ? $opt_asmHubName : $asmHubName; die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource); printf STDERR "# buildDir: %s\n", $buildDir; printf STDERR "# sourceDir %s\n", $sourceDir; printf STDERR "# augustusSpecies %s\n", $augustusSpecies; printf STDERR "# xenoRefSeq %s\n", $xenoRefSeq; printf STDERR "# assemblySource: %s\n", $assemblySource; +printf STDERR "# asmHubName %s\n", $asmHubName; +printf STDERR "# rmskSpecies %s\n", $rmskSpecies; +printf STDERR "# augustusSpecies %s\n", $augustusSpecies; # Do everything. $stepper->execute(); # Tell the user anything they should know. my $stopStep = $stepper->getStopStep(); my $upThrough = ($stopStep eq 'cleanup') ? "" : " (through the '$stopStep' step)"; $secondsEnd = `date "+%s"`; chomp $secondsEnd; my $elapsedSeconds = $secondsEnd - $secondsStart; my $elapsedMinutes = int($elapsedSeconds/60); $elapsedSeconds -= $elapsedMinutes * 60;