e3c90a0c1514cc8f4e0bf17618fd2d2d06e235ef
hiram
  Mon May 2 10:16:21 2022 -0700
clean up loose ends no redmine

diff --git src/hg/utils/automation/doAugustus.pl src/hg/utils/automation/doAugustus.pl
index d9face2..6534100 100755
--- src/hg/utils/automation/doAugustus.pl
+++ src/hg/utils/automation/doAugustus.pl
@@ -1,445 +1,446 @@
 #!/usr/bin/env perl
 
 # DO NOT EDIT the /cluster/bin/scripts copy of this file --
 # edit ~/kent/src/hg/utils/automation/doAugustus.pl instead.
 
 use Getopt::Long;
 use warnings;
 use strict;
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
     $opt_maskedSeq
     $opt_species
     $opt_utr
     $opt_noDbGenePredCheck
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'partition',   func => \&doPartition },
       { name => 'augustus', func => \&doAugustus },
       { name => 'makeGp', func => \&doMakeGp },
       { name => 'load', func => \&doLoadAugustus },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
 # my $bigClusterHub = 'swarm';
 my $bigClusterHub = 'ku';
 my $workhorse = 'hgwdev';
 my $dbHost = 'hgwdev';
 my $defaultWorkhorse = 'hgwdev';
 my $maskedSeq = "$HgAutomate::clusterData/\$db/\$db.2bit";
 my $utr = "off";
 my $noDbGenePredCheck = 1;    # default yes, use -db for genePredCheck
 my $species = "human";
 my $augustusDir = "/hive/data/outside/augustus/augustus-3.3.1";
 my $augustusConfig="$augustusDir/config";
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base db
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir         Use dir instead of default
                           $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/augustus
                           (necessary when continuing at a later date).
     -maskedSeq seq.2bit   Use seq.2bit as the masked input sequence instead
                           of default ($maskedSeq).
     -utr                  Obsolete, now is automatic (was: Use augustus arg: --UTR=on, default is --UTR=off)
     -noDbGenePredCheck    do not use -db= on genePredCheck, there is no real db
     -species <name>       name from list: human chicken zebrafish, default: human
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
                                 'bigClusterHub' => $bigClusterHub,
                                 'workhorse' => $defaultWorkhorse);
   print STDERR "
 Automates UCSC's Augustus track construction for the database \$db.  Steps:
     partition:  Creates hard-masked fastas needed for the CpG Island program.
     augustus:   Run gsBig on the hard-masked fastas
     makeGp:   Transform output from gsBig into augustus.gtf augustus.pep and
     load:      Load augustus.gtf and into \$db.
     cleanup:   Removes hard-masked fastas and output from gsBig.
 All operations are performed in the build directory which is
 $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/augustus unless -buildDir is given.
 ";
   # Detailed help (-help):
   print STDERR "
 Assumptions:
 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for
    database/assembly \$db.
 " if ($detailed);
   print "\n";
   exit $status;
 }
 
 
 # Globals:
 # Command line args: db
 my ($db);
 # Other:
 my ($buildDir, $secondsStart, $secondsEnd);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'buildDir=s',
 		      'maskedSeq=s',
 		      'species=s',
 		      'utr',
 		      'noDbGenePredCheck',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $workhorse = $opt_workhorse if ($opt_workhorse);
   $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub);
   $dbHost = $opt_dbHost if ($opt_dbHost);
 }
 
 #########################################################################
 # * step: partition [workhorse]
 sub doPartition {
   # Set up and perform the cluster run to run create the partitioned sequences.
   my $runDir = "$buildDir/partition";
 
   # First, make sure we're starting clean.
   if (-d "$runDir") {
     die "doPartition: partition step already done, remove directory 'partition' to rerun,\n" .
       "or '-continue augustus' to run next step.\n";
   }
 
   &HgAutomate::mustMkdir($runDir);
   my $whatItDoes="partition 2bit file into fasta chunks for augustus processing.";
   my $bossScript = newBash HgRemoteScript("$runDir/doPartition.bash", $workhorse,
 				      $runDir, $whatItDoes);
   $bossScript->add(<<_EOF_
 export db="$db"
 mkdir -p partBundles
 mkdir -p ../fasta
 
 twoBitInfo $maskedSeq stdout | sort -k2,2nr > \$db.chrom.sizes
 # aim for 1000 bundles or less
 export partSize=`cat \$db.chrom.sizes | wc -l | awk '{printf "%d", 1+\$1/1000}'`
 if [ "\$partSize" -gt 1000 ]; then
    partSize=1000
 fi
 
 /cluster/bin/scripts/partitionSequence.pl 11000000 1000000 \\
     $maskedSeq \\
     \$db.chrom.sizes \$partSize -rawDir=gtf \\
     -lstDir=partBundles > part.list
 
 (grep -v partBundles part.list || /bin/true) | sed -e 's/.*2bit://;' \\
    | awk -F':' '{print \$1}' | sort -u | while read chr
 do
    twoBitToFa $maskedSeq:\${chr} ../fasta/\${chr}.fa
 done
 if [ -s partBundles/part000.lst ]; then
   for F in partBundles/*.lst
   do
      B=`basename \$F | sed -e 's/.lst//;'`
      sed -e 's#.*.2bit:##;' \$F \\
       | twoBitToFa -seqList=stdin $maskedSeq stdout > ../fasta/\${B}.fa
   done
 fi
 _EOF_
   );
 
   $bossScript->execute();
 } # doPartition
 
 #########################################################################
 # * step: augustus [bigClusterHub]
 sub doAugustus {
   # Set up and perform the cluster run to run the augustus gene finder on the
   #     chunked fasta sequences.
   my $paraHub = $bigClusterHub;
   my $runDir = "$buildDir/run.augustus";
   # First, make sure we're starting clean.
   if (! -d "$buildDir/fasta" || ! -s "$buildDir/partition/part.list") {
     die "doAugustus: the previous step 'partition' did not complete \n" .
       "successfully (partition/part.list or fasta/*.fa does not exists).\nPlease " .
       "complete the previous step: -continue=-partition\n";
   } elsif (-e "$runDir/run.time") {
     die "doAugustus: looks like this step was run successfully already " .
       "(run.time exists).\nEither run with -continue makeGp or some later " .
 	"stage,\nor move aside/remove $runDir/ and run again.\n";
   } elsif ((-e "$runDir/jobList") && ! $opt_debug) {
     die "doAugustus: looks like we are not starting with a clean " .
       "slate.\n\tclean\n  $runDir/\n\tand run again.\n";
   }
   &HgAutomate::mustMkdir($runDir);
   my $whatItDoes = "runs one parasol augustus prediction";
   my $bossScript = new HgRemoteScript("$runDir/runOne", $dbHost,
 				      $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 # expected inputs:
 #   <integer> <integer> </fullPath/fasta.fa> <gtf/directory/file.gtf>
 #       1         2            3                   4
 #     start      end        chr fasta file        output file result
 #       start==end  for a multiple fasta file
 #     start is 0 relative, will increment by 1 here for augustus
 #     error result will end up in augErr/directory/file.err
 
 set db = "$db"
 set start = \$1
 set end = \$2
 set fasta = \$3
 set chrName = \$fasta:r:t
 
 set resultGz = \$4
 set gtfFile = \$resultGz:t:r
 set errFile = "augErr/\$resultGz:h:t/\$gtfFile:r:t.err"
 mkdir -p \$resultGz:h
 mkdir -p \$errFile:h
 set tmpDir = "/dev/shm/\$db.\$chrName.\$start.\$end"
 
 mkdir -p \$tmpDir
 pushd \$tmpDir
 
 if ( \$start == \$end ) then
 $augustusDir/bin/augustus --species=$species --softmasking=1 --UTR=$utr --protein=off \\
  --AUGUSTUS_CONFIG_PATH=$augustusConfig \\
   --alternatives-from-sampling=true --sample=100 --minexonintronprob=0.2 \\
    --minmeanexonintronprob=0.5 --maxtracks=3 --temperature=2 \\
     \$fasta --outfile=\$gtfFile --errfile=\$errFile:t
 else
  @ start++
 $augustusDir/bin/augustus --species=$species --softmasking=1 --UTR=$utr --protein=off \\
  --AUGUSTUS_CONFIG_PATH=$augustusConfig \\
   --alternatives-from-sampling=true --sample=100 --minexonintronprob=0.2 \\
    --minmeanexonintronprob=0.5 --maxtracks=3 --temperature=2 \\
      --predictionStart=\$start --predictionEnd=\$end \\
       \$fasta --outfile=\$gtfFile --errfile=\$errFile:t
 endif
 
 gzip \$gtfFile
 popd
 mv \$tmpDir/\$gtfFile.gz \$resultGz
 mv \$tmpDir/\$errFile:t \$errFile
 rm -fr \$tmpDir
 _EOF_
   );
 
   $whatItDoes = "Run augustus on chunked fasta sequences.";
   $bossScript = newBash HgRemoteScript("$runDir/runAugustus.bash", $paraHub,
 				      $runDir, $whatItDoes);
   my $paraRun = &HgAutomate::paraRun();
   $bossScript->add(<<_EOF_
 (grep -v partBundles ../partition/part.list || /bin/true) | while read twoBit
 do
   chr=`echo \$twoBit |  sed -e 's/.*2bit://;' | awk -F':' '{print \$1}'`
   chrStart=`echo \$twoBit |  sed -e 's/.*2bit://;' | awk -F':' '{print \$2}' | sed -e 's/-.*//;'`
   chrEnd=`echo \$twoBit |  sed -e 's/.*2bit://;' | awk -F':' '{print \$2}' | sed -e 's/.*-//;'`
   echo "runOne \$chrStart \$chrEnd {check in exists+ $buildDir/fasta/\${chr}.fa} {check out exists+ gtf/\$chr/\$chr.\$chrStart.\$chrEnd.gtf.gz}"
 done > jobList
 
 (grep partBundles ../partition/part.list || /bin/true) | while read bundleName
 do
   B=`basename \$bundleName | sed -e 's/.lst//;'`
   echo "runOne 0 0 {check in exists+ $buildDir/fasta/\${B}.fa} {check out exists+ gtf/\${B}/\${B}.0.0.gtf.gz}"
 done >> jobList
 
 chmod +x runOne
 
 $paraRun
 _EOF_
   );
   $bossScript->execute();
 } # doAugustus
 
 #########################################################################
 # * step: make gp [workhorse]
 sub doMakeGp {
   my $runDir = $buildDir;
   &HgAutomate::mustMkdir($runDir);
 
   # First, make sure we're starting clean.
   if (! -e "$runDir/run.augustus/run.time") {
     die "doMakeGp: the previous step augustus did not complete \n" .
       "successfully ($buildDir/run.augustus/run.time does not exist).\nPlease " .
       "complete the previous step: -continue=augustus\n";
   } elsif (-e "$runDir/$db.augustus.bb" ) {
     die "doMakeGp: looks like this was run successfully already\n" .
       "($db.augustus.bb exists).  Either run with -continue load or cleanup\n" .
 	"or move aside/remove $runDir/$db.augustus.bb\nand run again.\n";
   }
 
   my $whatItDoes = "Makes genePred file from augustus gff output.";
   my $bossScript = newBash HgRemoteScript("$runDir/makeGp.bash", $workhorse,
 				      $runDir, $whatItDoes);
 
   my $dbCheck = "-db=$db";
   $dbCheck = "" if (0 == $noDbGenePredCheck);
 
   $bossScript->add(<<_EOF_
 export db=$db
 find ./run.augustus/gtf -type f | grep ".gtf.gz\$" \\
   | sed -e 's#/# _D_ #g; s#\\.# _dot_ #g;' \\
     | sort -k11,11 -k13,13n \\
      | sed -e 's# _dot_ #.#g; s# _D_ #/#g' | xargs zcat \\
        | $augustusDir/scripts/join_aug_pred.pl \\
           | grep -P "\\t(CDS|exon|stop_codon|start_codon|tts|tss)\\t" \\
             > \$db.augustus.gtf
 gtfToGenePred -genePredExt -infoOut=\$db.info \$db.augustus.gtf \$db.augustus.gp
 genePredCheck $dbCheck \$db.augustus.gp
 genePredToBed \$db.augustus.gp stdout \\
     | bedToExons stdin stdout | bedSingleCover.pl stdin > \$db.exons.bed
 export baseCount=`awk '{sum+=\$3-\$2}END{printf "%d", sum}' \$db.exons.bed`
 export asmSizeNoGaps=`grep sequences ../../\$db.faSize.txt | awk '{print \$5}'`
 export perCent=`echo \$baseCount \$asmSizeNoGaps | awk '{printf "%.3f", 100.0*\$1/\$2}'`
 printf "%d bases of %d (%s%%) in intersection\\n" "\$baseCount" "\$asmSizeNoGaps" "\$perCent" > fb.\$db.augustus.txt
 rm -f \$db.exons.bed
 genePredToBigGenePred \$db.augustus.gp stdout | sort -k1,1 -k2,2n > \$db.augustus.bgp
 bedToBigBed -type=bed12+8 -tab -as=$ENV{'HOME'}/kent/src/hg/lib/bigGenePred.as \$db.augustus.bgp partition/\$db.chrom.sizes \$db.augustus.bb
 getRnaPred -genePredExt -keepMasking -genomeSeqs=$maskedSeq \$db \$db.augustus.gp all \$db.augustusGene.rna.fa
 getRnaPred -genePredExt -peptides -genomeSeqs=$maskedSeq \$db \$db.augustus.gp all \$db.augustusGene.faa
 _EOF_
   );
   $bossScript->execute();
 } # doMakeGp
 
 #########################################################################
 # * step: load [dbHost]
 sub doLoadAugustus {
   my $runDir = $buildDir;
   &HgAutomate::mustMkdir($runDir);
   my $tableName = "augustusGene";
 
   if (! -e "$runDir/$db.augustus.gp") {
     die "doLoadAugustus: the previous step makeGp did not complete \n" .
       "successfully ($db.augustus.gp does not exists).\nPlease " .
       "complete the previous step: -continue=-makeGp\n";
   } elsif (-e "$runDir/fb.$db.$tableName.txt" ) {
     die "doLoadAugustus: looks like this was run successfully already\n" .
       "(fb.$db.$tableName.txt exists).  Either run with -continue cleanup\n" .
 	"or move aside/remove\n$runDir/fb.$db.$tableName.txt and run again.\n";
   }
   my $whatItDoes = "Loads $db.augustus.gp.";
   my $bossScript = newBash HgRemoteScript("$runDir/loadAugustus.bash", $dbHost,
 				      $runDir, $whatItDoes);
 
   my $dbCheck = "-db=$db";
   $dbCheck = "" if (0 == $noDbGenePredCheck);
 
   $bossScript->add(<<_EOF_
 export db="$db"
 export table="$tableName"
 genePredCheck $dbCheck \$db.augustus.gp
 hgLoadGenePred \$db -genePredExt \$table \$db.augustus.gp
 genePredCheck $dbCheck \$table
 featureBits \$db \$table > fb.\$db.\$table.txt 2>&1
 checkTableCoords -verboseBlocks -table=\$table \$db
 cat fb.\$db.\$table.txt
 _EOF_
   );
   $bossScript->execute();
 } # doLoad
 
 #########################################################################
 # * step: cleanup [workhorse]
 sub doCleanup {
   my $runDir = $buildDir;
 
   if (-e "$runDir/augustus.gtf.gz" ) {
     die "doCleanup: looks like this was run successfully already\n" .
       "(augustus.gtf.gz exists).  Investigate the run directory:\n" .
 	" $runDir/\n";
   }
   my $whatItDoes = "It cleans up or compresses intermediate files.";
   my $bossScript = newBash HgRemoteScript("$runDir/cleanup.bash", $workhorse,
 				      $runDir, $whatItDoes);
   $bossScript->add(<<_EOF_
 export db="$db"
 rm -fr $buildDir/fasta
 rm -fr $buildDir/run.augustus/err/
 rm -fr $buildDir/run.augustus/gtf/
 rm -f $buildDir/run.augustus/batch.bak
+rm -fr $buildDir/run.augustus/gtf
 rm -fr $buildDir/run.augustus/augErr
 rm -f $buildDir/\$db.augustus.bgp
 gzip $buildDir/\$db.augustus.gtf
 gzip $buildDir/\$db.augustus.gp
 gzip $buildDir/\$db.augustusGene.rna.fa
 gzip $buildDir/\$db.augustusGene.faa
 _EOF_
   );
   $bossScript->execute();
 } # doCleanup
 
 #########################################################################
 # main
 
 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();
 
 # Make sure we have valid options and exactly 1 argument:
 &checkOptions();
 &usage(1) if (scalar(@ARGV) != 1);
 $secondsStart = `date "+%s"`;
 chomp $secondsStart;
 ($db) = @ARGV;
 
 # Force debug and verbose until this is looking pretty solid:
 #$opt_debug = 1;
 #$opt_verbose = 3 if ($opt_verbose < 3);
 
 $noDbGenePredCheck = $opt_noDbGenePredCheck ? 0 : $noDbGenePredCheck;
 
 # Establish what directory we will work in.
 $buildDir = $opt_buildDir ? $opt_buildDir :
   "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/augustus";
 $maskedSeq = $opt_maskedSeq ? $opt_maskedSeq :
   "$HgAutomate::clusterData/$db/$db.2bit";
 $species = $opt_species ? $opt_species : $species;
 if ( -s "${augustusConfig}/species/$species/${species}_utr_probs.pbl" ) {
   $utr = "on";
 } else {
   $utr = "off";
 }
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";
 
 $secondsEnd = `date "+%s"`;
 chomp $secondsEnd;
 my $elapsedSeconds = $secondsEnd - $secondsStart;
 my $elapsedMinutes = int($elapsedSeconds/60);
 $elapsedSeconds -= $elapsedMinutes * 60;
 
 &HgAutomate::verbose(1,
 	"\n *** All done !  Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s\n");
 &HgAutomate::verbose(1,
 	"\n *** All done!$upThrough\n");
 &HgAutomate::verbose(1,
 	" *** Steps were performed in $buildDir\n");
 &HgAutomate::verbose(1, "\n");