2aa890be7b759af7095a891fa999cb27a63bf5a2 hiram Fri Feb 28 15:32:32 2020 -0800 better detection if step already done and need to fixup spaces in gene names from NCBI refs #23891 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index 58dd6e3..bd3e7fb 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -1442,38 +1442,52 @@ -chromSizes=$buildDir/\$asmId.chrom.sizes \$asmId else printf "# cpgIslands masked previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); } # sub doCpgIslands ######################################################################### # * step: ncbiGene [workhorse] sub doNcbiGene { my $gffFile = "$assemblySource/${asmId}_genomic.gff.gz"; if ( ! -s "${gffFile}" ) { - printf STDERR "# step ncbiGene: no gff file found at:\n# %s\n", $gffFile; + &HgAutomate::verbose(1, "# step ncbiGene: no gff file found at:\n# %s\n", $gffFile); return; } if ( ! -s "$buildDir/sequence/$asmId.ncbiToUcsc.lift" ) { - printf STDERR "# ERROR: ncbiGene: can not find $buildDir/sequence/$asmId.ncbiToUcsc.lift\n"; + &HgAutomate::verbose(1, "# ERROR: ncbiGene: can not find $buildDir/sequence/$asmId.ncbiToUcsc.lift\n"); exit 255; } my $runDir = "$buildDir/trackData/ncbiGene"; + if (-d "${runDir}" ) { + if (! -s "$runDir/$asmId.ncbiGene.bb") { + &HgAutomate::verbose(1, + "WARNING ncbiGene step may already be running, but not completed ?\n"); + return; + } elsif (! needsUpdate("$gffFile", "$runDir/$asmId.ncbiGene.bb")) { + &HgAutomate::verbose(1, "# ncbiGene step previously completed\n"); + return; + } + } + if (! -s "$buildDir/$asmId.faSize.txt") { + &HgAutomate::verbose(1, "# step ncbiGene: can not find faSize.txt at:\n# %s\n", "$buildDir/$asmId.faSize.txt"); + exit 255; + } &HgAutomate::mustMkdir($runDir); my $whatItDoes = "translate NCBI GFF3 gene definitions into a track"; my $bossScript = newBash HgRemoteScript("$runDir/doNcbiGene.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId export gffFile=$gffFile function cleanUp() { rm -f \$asmId.ncbiGene.genePred.gz \$asmId.ncbiGene.genePred rm -f \$asmId.geneAttrs.ncbi.txt } @@ -1482,47 +1496,54 @@ (gff3ToGenePred -warnAndContinue -useName \\ -attrsOut=\$asmId.geneAttrs.ncbi.txt \$gffFile stdout \\ 2>> \$asmId.ncbiGene.log.txt || true) | genePredFilter stdin stdout \\ | gzip -c > \$asmId.ncbiGene.genePred.gz genePredCheck \$asmId.ncbiGene.genePred.gz export howMany=`genePredCheck \$asmId.ncbiGene.genePred.gz 2>&1 | grep "^checked" | awk '{print \$2}'` if [ "\${howMany}" -eq 0 ]; then printf "# ncbiGene: no gene definitions found in \$gffFile\n"; cleanUp exit 0 fi liftUp -extGenePred -type=.gp stdout \\ ../../sequence/\$asmId.ncbiToUcsc.lift warn \\ \$asmId.ncbiGene.genePred.gz | gzip -c \\ > \$asmId.ncbiGene.ucsc.genePred.gz + genePredToBed -tab -fillSpace \$asmId.ncbiGene.ucsc.genePred.gz stdout \\ + | bedToExons stdin stdout | bedSingleCover.pl stdin > \$asmId.exons.bed + export baseCount=`awk '{sum+=\$3-\$2}END{printf "%d", sum}' \$asmId.exons.bed` + export asmSizeNoGaps=`grep sequences ../../\$asmId.faSize.txt | awk '{print \$1}'` + export perCent=`echo \$baseCount \$asmSizeNoGaps | awk '{printf "%.3f", 100.0*\$1/\$2}'` + rm -f \$asmId.exons.bed ~/kent/src/hg/utils/automation/gpToIx.pl \$asmId.ncbiGene.ucsc.genePred.gz \\ | sort -u > \$asmId.ncbiGene.ix.txt ixIxx \$asmId.ncbiGene.ix.txt \$asmId.ncbiGene.ix \$asmId.ncbiGene.ixx rm -f \$asmId.ncbiGene.ix.txt genePredToBigGenePred \$asmId.ncbiGene.ucsc.genePred.gz stdout \\ | sort -k1,1 -k2,2n > \$asmId.ncbiGene.bed (bedToBigBed -type=bed12+8 -tab -as=\$HOME/kent/src/hg/lib/bigGenePred.as \\ -extraIndex=name \$asmId.ncbiGene.bed \\ ../../\$asmId.chrom.sizes \$asmId.ncbiGene.bb || true) if [ ! -s "\$asmId.ncbiGene.bb" ]; then printf "# ncbiGene: failing bedToBigBed\\n" 1>&2 exit 255 fi touch -r\$gffFile \$asmId.ncbiGene.bb bigBedInfo \$asmId.ncbiGene.bb | egrep "^itemCount:|^basesCovered:" \\ | sed -e 's/,//g' > \$asmId.ncbiGene.stats.txt LC_NUMERIC=en_US /usr/bin/printf "# ncbiGene %s %'d %s %'d\\n" `cat \$asmId.ncbiGene.stats.txt` | xargs echo + printf "%d bases of %d (%s%%) in intersection\\n" "\$baseCount" "\$asmSizeNoGaps" "\$perCent" > fb.\$asmId.ncbiGene.txt else printf "# ncbiGene step previously completed\\n" 1>&2 fi _EOF_ ); $bossScript->execute(); } # doNcbiGene ######################################################################### # * step: ncbiRefSeq [workhorse] sub doNcbiRefSeq { my $runDir = "$buildDir/trackData/ncbiRefSeq"; my $gffFile = "$assemblySource/${asmId}_genomic.gff.gz"; if ( ! -s "${gffFile}" ) { printf STDERR "# step ncbiRefSeq no gff file found at:\n# %s\n", $gffFile; @@ -1549,30 +1570,45 @@ -target2bit="\$target2bit" \\ -stop=load -fileServer=$fileServer -smallClusterHub=$smallClusterHub -workhorse=$workhorse \\ \$asmId \$asmId else printf "# ncbiRefSeq step previously completed\\n" 1>&2 fi _EOF_ ); $bossScript->execute(); } # ncbiRefSeq ######################################################################### # * step: augustus [workhorse] sub doAugustus { my $runDir = "$buildDir/trackData/augustus"; + if (! -s "$buildDir/$asmId.2bit") { + &HgAutomate::verbose(1, + "ERROR: augustus step can not find $buildDir/$asmId.2bit\n"); + exit 255; + } + if (-d "${runDir}" ) { + if (! -s "$runDir/$asmId.augustus.bb") { + &HgAutomate::verbose(1, + "WARNING augustus step may already be running, but not completed ?\n"); + return; + } elsif (! needsUpdate("$buildDir/$asmId.2bit", "$runDir/$asmId.augustus.bb")) { + &HgAutomate::verbose(1, "# augustus step previously completed\n"); + return; + } + } &HgAutomate::mustMkdir($runDir); my $whatItDoes = "run Augustus gene prediction procedures"; my $bossScript = newBash HgRemoteScript("$runDir/doAugustus.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId if [ $buildDir/\$asmId.2bit -nt \$asmId.augustus.bb ]; then time (~/kent/src/hg/utils/automation/doAugustus.pl -stop=makeGp -buildDir=`pwd` -dbHost=$dbHost \\ -bigClusterHub=$bigClusterHub -species=$augustusSpecies -workhorse=$workhorse \\ -noDbGenePredCheck -maskedSeq=$buildDir/\$asmId.2bit \$asmId) > makeDb.log 2>&1 time (~/kent/src/hg/utils/automation/doAugustus.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` -dbHost=$dbHost \\