19eac373dbe1f4905c05238efc78f5becd8a31fc
hiram
  Thu Aug 8 10:55:14 2019 -0700
beginning of xenoRefGene track construction refs #23734

diff --git src/hg/utils/automation/doXenoRefGene.pl src/hg/utils/automation/doXenoRefGene.pl
new file mode 100755
index 0000000..d8c559d7
--- /dev/null
+++ src/hg/utils/automation/doXenoRefGene.pl
@@ -0,0 +1,405 @@
+#!/usr/bin/env perl
+
+# DO NOT EDIT the /cluster/bin/scripts copy of this file --
+# edit ~/kent/src/hg/utils/automation/doXenoRefGene.pl instead.
+
+use Getopt::Long;
+use warnings;
+use strict;
+use FindBin qw($Bin);
+use lib "$Bin";
+use HgAutomate;
+use HgRemoteScript;
+use HgStepManager;
+
+# Option variable names, both common and peculiar to this script:
+use vars @HgAutomate::commonOptionVars;
+use vars @HgStepManager::optionVars;
+use vars qw/
+    $opt_buildDir
+    $opt_maskedSeq
+    $opt_mrnas
+    $opt_noDbGenePredCheck
+    /;
+
+# Specify the steps supported with -continue / -stop:
+my $stepper = new HgStepManager(
+    [ { name => 'splitTarget',   func => \&doSplitTarget },
+      { name => 'blatRun', func => \&doBlatRun },
+      { name => 'filterPsl', func => \&doFilterPsl },
+      { name => 'makeGp', func => \&doMakeGp },
+      { name => 'makeBigGp', func => \&doMakeBigGp },
+      { name => 'cleanup', func => \&doCleanup },
+    ]
+				);
+
+# Option defaults:
+my $bigClusterHub = 'ku';
+my $workhorse = 'hgwdev';
+my $dbHost = 'hgwdev';
+my $defaultWorkhorse = 'hgwdev';
+my $maskedSeq = "$HgAutomate::clusterData/\$db/\$db.2bit";
+my $mrnas = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq";
+my $noDbGenePredCheck = 1;    # default yes, use -db for genePredCheck
+my $mrnas = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq";
+my $augustusDir = "/hive/data/outside/augustus/augustus-3.3.1";
+my $augustusConfig="$augustusDir/config";
+
+my $base = $0;
+$base =~ s/^(.*\/)?//;
+
+sub usage {
+  # Usage / help / self-documentation:
+  my ($status, $detailed) = @_;
+  # Basic help (for incorrect usage):
+  print STDERR "
+usage: $base db
+options:
+";
+  print STDERR $stepper->getOptionHelp();
+  print STDERR <<_EOF_
+    -buildDir dir         Use dir instead of default
+                          $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/xenoRefGene
+                          (necessary when continuing at a later date).
+    -maskedSeq seq.2bit   Use seq.2bit as the masked input sequence instead
+                          of default ($maskedSeq).
+    -mrnas </path/to/xenoRefSeqMrna> - location of xenoRefMrna.fa.gz
+                expanded directory of mrnas/ and xenoRefMrna.sizes, default
+                $mrnas
+    -noDbGenePredCheck    do not use -db= on genePredCheck, there is no real db
+_EOF_
+  ;
+
+  print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
+                                'bigClusterHub' => $bigClusterHub,
+                                'workhorse' => $defaultWorkhorse);
+  print STDERR "
+Automates construction of a xeno RefSeq gene track from RefSeq mRNAs.  Steps:
+    splitTarget split the masked target sequence into individual fasta sequences
+    blatRun:    Run blat with the xenoRefSeq mRNAs query to target sequence
+    filterPsl:  Run pslCDnaFilter on the blat psl results
+    makeGp:     Transform the filtered PSL into a genePred file
+    makeBigGp:  Construct the bigGenePred from the genePred file
+    cleanup:    Removes hard-masked fastas and output from gsBig.
+All operations are performed in the build directory which is
+$HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/augustus unless -buildDir is given.
+";
+  # Detailed help (-help):
+  print STDERR "
+Assumptions:
+1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for
+   database/assembly \$db.
+" if ($detailed);
+  print "\n";
+  exit $status;
+}
+
+
+# Globals:
+# Command line args: db
+my ($db);
+# Other:
+my ($buildDir, $secondsStart, $secondsEnd);
+
+sub checkOptions {
+  # Make sure command line options are valid/supported.
+  my $ok = GetOptions(@HgStepManager::optionSpec,
+		      'buildDir=s',
+		      'maskedSeq=s',
+		      'mrnas=s',
+		      'noDbGenePredCheck',
+		      @HgAutomate::commonOptionSpec,
+		      );
+  &usage(1) if (!$ok);
+  &usage(0, 1) if ($opt_help);
+  &HgAutomate::processCommonOptions();
+  my $err = $stepper->processOptions();
+  usage(1) if ($err);
+  $workhorse = $opt_workhorse if ($opt_workhorse);
+  $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub);
+  $dbHost = $opt_dbHost if ($opt_dbHost);
+}
+
+#########################################################################
+# * step: splitTarget [workhorse]
+sub doSplitTarget {
+  # run faSplit on the masked 2bit target sequence and prepare the target.list
+  my $runDir = "$buildDir";
+
+  # First, make sure we're starting clean.
+  if (-d "$runDir/target") {
+    die "doXenoRefGene splitTarget step already done, remove directory 'target' to rerun,\n" .
+      "or '-continue blatRun' to run next step.\n";
+  }
+
+  my $whatItDoes="split the masked 2bit file into fasta files for blat alignment processing.";
+  my $bossScript = newBash HgRemoteScript("$runDir/doSplitTarget.bash", $workhorse,
+				      $runDir, $whatItDoes);
+  $bossScript->add(<<_EOF_
+mkdir target
+twoBitToFa $maskedSeq stdout | faSplit byname stdin target/
+gzip target/*.fa
+ls target | sed -e 's/.fa.gz//;' > target.list
+faSize -detailed target/*.fa.gz | sort -k2,2nr > $db.chrom.sizes
+_EOF_
+  );
+
+  $bossScript->execute();
+} # doSplitTarget
+
+#########################################################################
+# * step: blatRun [bigClusterHub]
+sub doBlatRun {
+  # Set up and perform the cluster run to run the blat alignment of RefSeq
+  #     mrnas to the split target fasta sequences.
+  my $paraHub = $bigClusterHub;
+  my $runDir = "$buildDir/blatRun";
+  # First, make sure previous step has completed,
+  #       and starting clean without this step result present:
+  if (! -d "$buildDir/target" || ! -s "$buildDir/target.list") {
+    die "doXenoRefGene the previous step 'splitTarget' did not complete \n" .
+      "successfully (target.list or target/*.fa.gz does not exists).\nPlease " .
+      "complete the previous step: -continue=-splitTarget\n";
+  } elsif (-e "$runDir/run.time") {
+    die "doXenoRefGene looks like this step was run successfully already " .
+      "(run.time exists).\nEither run with -continue filterPsl or some later " .
+	"stage,\nor move aside/remove $runDir/ and run again.\n";
+  } elsif ((-e "$runDir/jobList") && ! $opt_debug) {
+    die "doXenoRefGene looks like we are not starting with a clean " .
+      "slate.\n\tclean\n  $runDir/\n\tand run again.\n";
+  }
+  &HgAutomate::mustMkdir($runDir);
+
+  my $templateCmd = ("blatOne " . '$(root1) ' . '$(root2) '
+                . "{check out exists result/" . '$(root1)/$(root2).psl}');
+  &HgAutomate::makeGsub($runDir, $templateCmd);
+ `touch "$runDir/para_hub_$paraHub"`;
+
+  my $whatItDoes = "runs blat mRNAs query sequence bundle to one target sequence";
+  my $bossScript = newBash HgRemoteScript("$runDir/blatOne", $paraHub,
+				      $runDir, $whatItDoes);
+
+  $bossScript->add(<<_EOF_
+export queryDir="$mrnas/mrnas";
+
+export target=\$1
+export query=\$2
+export result=\$3
+mkdir -p `dirname \$result`
+
+blat -noHead -q=rnax -t=dnax -mask=lower ../target/\$target.fa.gz \$queryDir/\$query.fa.gz \$result
+
+_EOF_
+  );
+
+  $whatItDoes = "Operate the blat run of the mRNAs query sequence to the target split sequence.";
+  $bossScript = newBash HgRemoteScript("$runDir/runBlat.bash", $paraHub,
+				      $runDir, $whatItDoes);
+  my $paraRun = &HgAutomate::paraRun();
+  $bossScript->add(<<_EOF_
+
+chmod +x blatOne
+gensub2 ../target.list $mrnas/query.list gsub jobList
+
+$paraRun
+_EOF_
+  );
+  $bossScript->execute();
+} # doBlatRun
+
+#########################################################################
+# * step: filterPsl [workhorse]
+sub doFilterPsl {
+  my $runDir = $buildDir;
+  &HgAutomate::mustMkdir($runDir);
+
+  # First, make sure we're starting clean.
+  if (! -e "$runDir/blatRun/run.time") {
+    die "doFilterPsl the previous step blatRun did not complete \n" .
+      "successfully ($buildDir/blatRun/run.time does not exist).\nPlease " .
+      "complete the previous step: -continue=blatRun\n";
+  } elsif (-e "$runDir/$db.xenoRefGene.psl" ) {
+    die "doFilterPsl looks like this was run successfully already\n" .
+      "($db.xenoRefGene.psl exists).  Either run with -continue makeGp or cleanup\n" .
+	"or move aside/remove $runDir/$db.xenoRefGene.psl\nand run again.\n";
+  }
+
+  my $whatItDoes = "Filters the raw psl results from the blatRun.";
+  my $bossScript = newBash HgRemoteScript("$runDir/filterPsl.bash", $workhorse,
+				      $runDir, $whatItDoes);
+
+  $bossScript->add(<<_EOF_
+export db="$db"
+find ./blatRun/result -type f | xargs cat \\
+    | gnusort -S100G --parallel=32 -k10,10 > \$db.all.psl
+
+pslCDnaFilter -minId=0.35 -minCover=0.25  -globalNearBest=0.0100 -minQSize=20 \\
+  -ignoreIntrons -repsAsMatch -ignoreNs -bestOverlap \\
+    \$db.all.psl \$db.xenoRefGene.psl
+
+pslCheck -targetSizes=\$db.chrom.sizes \\
+  -querySizes=/hive/data/genomes/asmHubs/VGP/xenoRefSeq/xenoRefMrna.sizes \\
+     \$db.xenoRefGene.psl
+_EOF_
+  );
+  $bossScript->execute();
+} # doFilterPsl
+
+#########################################################################
+# * step: make gp [workhorse]
+sub doMakeGp {
+  my $runDir = $buildDir;
+  &HgAutomate::mustMkdir($runDir);
+
+  # First, make sure we're starting clean.
+  if (! -e "$runDir/run.augustus/run.time") {
+    die "doMakeGp: the previous step augustus did not complete \n" .
+      "successfully ($buildDir/run.augustus/run.time does not exist).\nPlease " .
+      "complete the previous step: -continue=augustus\n";
+  } elsif (-e "$runDir/$db.augustus.bb" ) {
+    die "doMakeGp: looks like this was run successfully already\n" .
+      "($db.augustus.bb exists).  Either run with -continue load or cleanup\n" .
+	"or move aside/remove $runDir/$db.augustus.bb\nand run again.\n";
+  }
+
+  my $whatItDoes = "Makes genePred file from augustus gff output.";
+  my $bossScript = newBash HgRemoteScript("$runDir/makeGp.bash", $workhorse,
+				      $runDir, $whatItDoes);
+
+  my $dbCheck = "-db=$db";
+  $dbCheck = "" if (0 == $noDbGenePredCheck);
+
+  $bossScript->add(<<_EOF_
+export db=$db
+find ./run.augustus/gtf -type f | grep ".gtf.gz\$" \\
+  | sed -e 's#/# _D_ #g; s#\\.# _dot_ #g;' \\
+    | sort -k11,11 -k13,13n \\
+     | sed -e 's# _dot_ #.#g; s# _D_ #/#g' | xargs zcat \\
+       | $augustusDir/scripts/join_aug_pred.pl \\
+          | grep -P "\\t(CDS|exon|stop_codon|start_codon|tts|tss)\\t" \\
+            > \$db.augustus.gtf
+gtfToGenePred -genePredExt -infoOut=\$db.info \$db.augustus.gtf \$db.augustus.gp
+genePredCheck $dbCheck \$db.augustus.gp
+genePredToBigGenePred \$db.augustus.gp stdout | sort -k1,1 -k2,2n > \$db.augustus.bgp
+bedToBigBed -type=bed12+8 -tab -as=$ENV{'HOME'}/kent/src/hg/lib/bigGenePred.as \$db.augustus.bgp partition/\$db.chrom.sizes \$db.augustus.bb
+getRnaPred -genePredExt -keepMasking -genomeSeqs=$maskedSeq \$db \$db.augustus.gp all \$db.augustusGene.rna.fa
+getRnaPred -genePredExt -peptides -genomeSeqs=$maskedSeq \$db \$db.augustus.gp all \$db.augustusGene.faa
+_EOF_
+  );
+  $bossScript->execute();
+} # doMakeGp
+
+#########################################################################
+# * step: load [dbHost]
+sub doLoadAugustus {
+  my $runDir = $buildDir;
+  &HgAutomate::mustMkdir($runDir);
+  my $tableName = "augustusGene";
+
+  if (! -e "$runDir/$db.augustus.gp") {
+    die "doLoadAugustus: the previous step makeGp did not complete \n" .
+      "successfully ($db.augustus.gp does not exists).\nPlease " .
+      "complete the previous step: -continue=-makeGp\n";
+  } elsif (-e "$runDir/fb.$db.$tableName.txt" ) {
+    die "doLoadAugustus: looks like this was run successfully already\n" .
+      "(fb.$db.$tableName.txt exists).  Either run with -continue cleanup\n" .
+	"or move aside/remove\n$runDir/fb.$db.$tableName.txt and run again.\n";
+  }
+  my $whatItDoes = "Loads $db.augustus.gp.";
+  my $bossScript = newBash HgRemoteScript("$runDir/loadAugustus.bash", $dbHost,
+				      $runDir, $whatItDoes);
+
+  my $dbCheck = "-db=$db";
+  $dbCheck = "" if (0 == $noDbGenePredCheck);
+
+  $bossScript->add(<<_EOF_
+export db="$db"
+export table="$tableName"
+genePredCheck $dbCheck \$db.augustus.gp
+hgLoadGenePred \$db -genePredExt \$table \$db.augustus.gp
+genePredCheck $dbCheck \$table
+featureBits \$db \$table > fb.\$db.\$table.txt 2>&1
+checkTableCoords -verboseBlocks -table=\$table \$db
+cat fb.\$db.\$table.txt
+_EOF_
+  );
+  $bossScript->execute();
+} # doLoad
+
+#########################################################################
+# * step: cleanup [workhorse]
+sub doCleanup {
+  my $runDir = $buildDir;
+
+  if (-e "$runDir/augustus.gtf.gz" ) {
+    die "doCleanup: looks like this was run successfully already\n" .
+      "(augustus.gtf.gz exists).  Investigate the run directory:\n" .
+	" $runDir/\n";
+  }
+  my $whatItDoes = "It cleans up or compresses intermediate files.";
+  my $bossScript = newBash HgRemoteScript("$runDir/cleanup.bash", $workhorse,
+				      $runDir, $whatItDoes);
+  $bossScript->add(<<_EOF_
+export db="$db"
+rm -fr $buildDir/fasta
+rm -fr $buildDir/run.augustus/err/
+rm -f $buildDir/run.augustus/batch.bak
+rm -fr $buildDir/run.augustus/augErr
+rm -f $buildDir/\$db.augustus.bgp
+gzip $buildDir/\$db.augustus.gtf
+gzip $buildDir/\$db.augustus.gp
+gzip $buildDir/\$db.augustusGene.rna.fa
+gzip $buildDir/\$db.augustusGene.faa
+_EOF_
+  );
+  $bossScript->execute();
+} # doCleanup
+
+#########################################################################
+# main
+
+# Prevent "Suspended (tty input)" hanging:
+&HgAutomate::closeStdin();
+
+# Make sure we have valid options and exactly 1 argument:
+&checkOptions();
+&usage(1) if (scalar(@ARGV) != 1);
+$secondsStart = `date "+%s"`;
+chomp $secondsStart;
+($db) = @ARGV;
+
+# Force debug and verbose until this is looking pretty solid:
+#$opt_debug = 1;
+#$opt_verbose = 3 if ($opt_verbose < 3);
+
+$noDbGenePredCheck = $opt_noDbGenePredCheck ? 0 : $noDbGenePredCheck;
+
+# Establish what directory we will work in.
+$buildDir = $opt_buildDir ? $opt_buildDir :
+  "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/augustus";
+$maskedSeq = $opt_maskedSeq ? $opt_maskedSeq :
+  "$HgAutomate::clusterData/$db/$db.2bit";
+$mrnas = $opt_mrnas ? $opt_mrnas : $mrnas;
+
+# Do everything.
+$stepper->execute();
+
+# Tell the user anything they should know.
+my $stopStep = $stepper->getStopStep();
+my $upThrough = ($stopStep eq 'cleanup') ? "" :
+  "  (through the '$stopStep' step)";
+
+$secondsEnd = `date "+%s"`;
+chomp $secondsEnd;
+my $elapsedSeconds = $secondsEnd - $secondsStart;
+my $elapsedMinutes = int($elapsedSeconds/60);
+$elapsedSeconds -= $elapsedMinutes * 60;
+
+&HgAutomate::verbose(1,
+	"\n *** All done !  Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s\n");
+&HgAutomate::verbose(1,
+	"\n *** All done!$upThrough\n");
+&HgAutomate::verbose(1,
+	" *** Steps were performed in $buildDir\n");
+&HgAutomate::verbose(1, "\n");
+