2b42d05e58a971f49afdd6d52c2ad415e165add0
hiram
  Tue Jul 6 12:37:28 2021 -0700
create a GTF file for the xenoRefGene result refs #27512

diff --git src/hg/utils/automation/doXenoRefGene.pl src/hg/utils/automation/doXenoRefGene.pl
index fc2ca9a..8aa9818 100755
--- src/hg/utils/automation/doXenoRefGene.pl
+++ src/hg/utils/automation/doXenoRefGene.pl
@@ -28,32 +28,30 @@
       { name => 'blatRun', func => \&doBlatRun },
       { name => 'filterPsl', func => \&doFilterPsl },
       { name => 'makeGp', func => \&doMakeGp },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
 my $bigClusterHub = 'ku';
 my $workhorse = 'hgwdev';
 my $dbHost = 'hgwdev';
 my $defaultWorkhorse = 'hgwdev';
 my $maskedSeq = "$HgAutomate::clusterData/\$db/\$db.2bit";
 my $mrnas = "/hive/data/genomes/asmHubs/xenoRefSeq";
 my $noDbGenePredCheck = 1;    # default yes, use -db for genePredCheck
-my $augustusDir = "/hive/data/outside/augustus/augustus-3.3.1";
-my $augustusConfig="$augustusDir/config";
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base db
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir         Use dir instead of default
@@ -68,31 +66,31 @@
 _EOF_
   ;
 
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
                                 'bigClusterHub' => $bigClusterHub,
                                 'workhorse' => $defaultWorkhorse);
   print STDERR "
 Automates construction of a xeno RefSeq gene track from RefSeq mRNAs.  Steps:
     splitTarget split the masked target sequence into individual fasta sequences
     blatRun:    Run blat with the xenoRefSeq mRNAs query to target sequence
     filterPsl:  Run pslCDnaFilter on the blat psl results
     makeGp:     Transform the filtered PSL into a genePred file and create
                 bigGenePred from the genePred file
     cleanup:    Removes hard-masked fastas and output from gsBig.
 All operations are performed in the build directory which is
-$HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/augustus unless -buildDir is given.
+$HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/xenoRefGene unless -buildDir is given.
 ";
   # Detailed help (-help):
   print STDERR "
 Assumptions:
 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for
    database/assembly \$db.
 " if ($detailed);
   print "\n";
   exit $status;
 }
 
 
 # Globals:
 # Command line args: db
 my ($db);
@@ -277,129 +275,140 @@
   if (! -e "$runDir/$db.xenoRefGene.psl") {
     die "doMakeGp: the previous step filterPsl did not complete \n" .
       "successfully ($buildDir/$db.xenoRefGene.psl does not exist).\nPlease " .
       "complete the previous step: -continue=filterPsl\n";
   } elsif (-e "$runDir/$db.xenoRefGene.bb" ) {
     die "doMakeGp: looks like this was run successfully already\n" .
       "($db.xenoRefGene.bb exists).  Either run with -continue cleanup\n" .
 	"or move aside/remove $runDir/$db.xenoRefGene.bb\nand run again.\n";
   }
 
   my $whatItDoes = "Makes bigGenePred.bb file from filterPsl output.";
   my $bossScript = newBash HgRemoteScript("$runDir/makeGp.bash", $workhorse,
 				      $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
-export db=$db
+export db="$db"
+export buildDir="$buildDir"
+
 if [ -s "\$db.xenoRefGene.psl" ]; then
   grep NR_ \$db.xenoRefGene.psl > NR.psl
   grep NM_ \$db.xenoRefGene.psl > NM.psl
   mrnaToGene -cdsDb=hgFixed NM.psl NM.gp
   mrnaToGene -noCds NR.psl NR.gp
   cat NM.gp NR.gp | genePredSingleCover stdin \$db.xenoRefGene.gp
   genePredCheck -db=\$db -chromSizes=\$db.chrom.sizes \$db.xenoRefGene.gp
   genePredToBed \$db.xenoRefGene.gp stdout \\
     | bedToExons stdin stdout | bedSingleCover.pl stdin > \$db.exons.bed
   export baseCount=`awk '{sum+=\$3-\$2}END{printf "%d", sum}' \$db.exons.bed`
   export asmSizeNoGaps=`grep sequences ../../\$db.faSize.txt | awk '{print \$5}'`
   export perCent=`echo \$baseCount \$asmSizeNoGaps | awk '{printf "%.3f", 100.0*\$1/\$2}'`
   printf "%d bases of %d (%s%%) in intersection\\n" "\$baseCount" "\$asmSizeNoGaps" "\$perCent" > fb.\$db.xenoRefGene.txt
   rm -f \$db.exons.bed
   genePredToBigGenePred -geneNames=$mrnas/geneOrgXref.txt \$db.xenoRefGene.gp \\
      stdout | sort -k1,1 -k2,2n > \$db.bgpInput
   sed -e 's#Alternative/human readable gene name#species of origin of the mRNA#; s#Name or ID of item, ideally both human readable and unique#RefSeq accession id#; s#Primary identifier for gene#gene name#;' \\
     \$HOME/kent/src/hg/lib/bigGenePred.as > xenoRefGene.as
   bedToBigBed -extraIndex=name,geneName -type=bed12+8 -tab -as=xenoRefGene.as \\
      \$db.bgpInput \$db.chrom.sizes \$db.xenoRefGene.bb
   \$HOME/kent/src/hg/utils/automation/xenoRefGeneIx.pl \$db.bgpInput | sort -u > \$db.ix.txt
   ixIxx \$db.ix.txt \$db.xenoRefGene.ix \$db.xenoRefGene.ixx
+  mkdir -p /dev/shm/\$db
+  cp -p \$db.xenoRefGene.gp /dev/shm/\$db/xenoRefGene.\$db
+  cd /dev/shm/\$db
+  genePredToGtf -utr file xenoRefGene.\$db stdout | gzip -c \\
+    > \$buildDir/\$db.xenoRefGene.gtf.gz
+  cd \$buildDir
+  rm -f /dev/shm/\$db/xenoRefGene.\$db
+  rmdir /dev/shm/\$db
 fi
 _EOF_
   );
   $bossScript->execute();
 } # doMakeGp
 
 #########################################################################
 # * step: cleanup [workhorse]
 sub doCleanup {
   my $runDir = $buildDir;
 
   if (-e "$runDir/$db.xenoRefGene.gp.gz" ) {
     die "doCleanup: looks like this was run successfully already\n" .
       "($db.xenoRefGene.gp.gz exists).  Investigate the run directory:\n" .
 	" $runDir/\n";
   }
   my $whatItDoes = "It cleans up or compresses intermediate files.";
   my $bossScript = newBash HgRemoteScript("$runDir/cleanup.bash", $workhorse,
 				      $runDir, $whatItDoes);
   $bossScript->add(<<_EOF_
 export db="$db"
-rm -fr $buildDir/target/
-rm -fr $buildDir/blatRun/err/
-rm -fr $buildDir/blatRun/result/
-rm -f $buildDir/blatRun/batch.bak
-rm -f $buildDir/NM.gp
-rm -f $buildDir/NR.gp
-rm -f $buildDir/NM.psl
-rm -f $buildDir/NR.psl
-if [ -s "$buildDir/\$db.bgpInput" ]; then
-  gzip $buildDir/\$db.bgpInput &
+export buildDir="$buildDir"
+rm -fr \$buildDir/target/
+rm -fr \$buildDir/blatRun/err/
+rm -fr \$buildDir/blatRun/result/
+rm -f \$buildDir/blatRun/batch.bak
+rm -f \$buildDir/NM.gp
+rm -f \$buildDir/NR.gp
+rm -f \$buildDir/NM.psl
+rm -f \$buildDir/NR.psl
+if [ -s "\$buildDir/\$db.bgpInput" ]; then
+  gzip \$buildDir/\$db.bgpInput &
 fi
-if [ -s "$buildDir/\$db.ix.txt" ]; then
-  gzip $buildDir/\$db.ix.txt &
+if [ -s "\$buildDir/\$db.ix.txt" ]; then
+  gzip \$buildDir/\$db.ix.txt &
 fi
-if [ -s "$buildDir/\$db.all.psl" ]; then
-  gzip $buildDir/\$db.all.psl &
+if [ -s "\$buildDir/\$db.all.psl" ]; then
+  gzip \$buildDir/\$db.all.psl &
 else
-  rm -f $buildDir/\$db.all.psl
+  rm -f \$buildDir/\$db.all.psl
 fi
-if [ -s "$buildDir/\$db.xenoRefGene.psl" ]; then
-  gzip $buildDir/\$db.xenoRefGene.psl &
+if [ -s "\$buildDir/\$db.xenoRefGene.psl" ]; then
+  gzip \$buildDir/\$db.xenoRefGene.psl &
 else
-  rm -f $buildDir/\$db.xenoRefGene.psl
+  rm -f \$buildDir/\$db.xenoRefGene.psl
 fi
-if [ -s "$buildDir/\$db.xenoRefGene.gp" ]; then
-gzip $buildDir/\$db.xenoRefGene.gp
+if [ -s "\$buildDir/\$db.xenoRefGene.gp" ]; then
+gzip \$buildDir/\$db.xenoRefGene.gp
 fi
 wait
 _EOF_
   );
   $bossScript->execute();
 } # doCleanup
 
 #########################################################################
 # main
 
 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();
 
 # Make sure we have valid options and exactly 1 argument:
 &checkOptions();
 &usage(1) if (scalar(@ARGV) != 1);
 $secondsStart = `date "+%s"`;
 chomp $secondsStart;
 ($db) = @ARGV;
 
 # Force debug and verbose until this is looking pretty solid:
 #$opt_debug = 1;
 #$opt_verbose = 3 if ($opt_verbose < 3);
 
 $noDbGenePredCheck = $opt_noDbGenePredCheck ? 0 : $noDbGenePredCheck;
 
 # Establish what directory we will work in.
 $buildDir = $opt_buildDir ? $opt_buildDir :
-  "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/augustus";
+  "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/xenoRefGene";
 $maskedSeq = $opt_maskedSeq ? $opt_maskedSeq :
   "$HgAutomate::clusterData/$db/$db.2bit";
 $mrnas = $opt_mrnas ? $opt_mrnas : $mrnas;
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";
 
 $secondsEnd = `date "+%s"`;
 chomp $secondsEnd;
 my $elapsedSeconds = $secondsEnd - $secondsStart;