90b9985e1fc9106341b275badcd445531be9a48c
hiram
  Thu Oct 15 09:23:22 2020 -0700
add noAugustus and noXenoRefSeq options for bacteria hub build refs #23891

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index b64d4bd..b021e16 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -19,31 +19,33 @@
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
     $opt_sourceDir
     $opt_species
     $opt_rmskSpecies
     $opt_ncbiRmsk
     $opt_noRmsk
     $opt_augustusSpecies
+    $opt_noAugustus
     $opt_xenoRefSeq
+    $opt_noXenoRefSeq
     $opt_ucscNames
     $opt_asmHubName
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'download',   func => \&doDownload },
       { name => 'sequence',   func => \&doSequence },
       { name => 'assemblyGap',   func => \&doAssemblyGap },
       { name => 'chromAlias',   func => \&doChromAlias },
       { name => 'gatewayPage',   func => \&doGatewayPage },
       { name => 'cytoBand',   func => \&doCytoBand },
       { name => 'gc5Base',   func => \&doGc5Base },
       { name => 'repeatMasker',   func => \&doRepeatMasker },
       { name => 'simpleRepeat',   func => \&doSimpleRepeat },
@@ -62,30 +64,32 @@
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $sourceDir = "/hive/data/outside/ncbi/genomes";
 my $species = "";       # usually found in asmId_assembly_report.txt
 my $ftpDir = "";	# will be determined from given asmId
 my $rmskSpecies = "";
 my $noRmsk = 0;	# when RepeatMasker is not possible, such as bacteria
 my $ncbiRmsk = 0;	# when =1 call doRepeatMasker.pl
                         # with -ncbiRmsk=path.out.gz and -liftSpec=...
 my $augustusSpecies = "human";
 my $xenoRefSeq = "/hive/data/genomes/asmHubs/xenoRefSeq";
+my $noAugustus = 0;     # bacteria do *not* create an augustus track
+my $noXenoRefSeq = 0;	# bacteria do *not* create a xenoRefSeq track
 my $ucscNames = 0;  # default 'FALSE' (== 0)
 my $asmHubName = "n/a";  # directory name in: /gbdb/hubs/asmHubName
 my $workhorse = "hgwdev";  # default workhorse when none chosen
 my $fileServer = "hgwdev";  # default when none chosen
 my $bigClusterHub = "ku";  # default when none chosen
 my $smallClusterHub = "ku";  # default when none chosen
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 # key is original accession name from the remove.dups.list, value is 1
 my %dupAccessionList;
 
 sub usage {
   # Usage / help / self-documentation:
@@ -106,30 +110,32 @@
     -sourceDir dir    Find assembly in dir instead of default:
        $sourceDir/GC[AF]/123/456/789/asmId
     -ucscNames        Translate NCBI/INSDC/RefSeq names to UCSC names
                       default is to use the given NCBI/INSDC/RefSeq names
     -asmHubName <name>  directory name in: /gbdb/hubs/asmHubName
     -species <name>   use this species designation if there is no
                       asmId_assembly_report.txt with an
                       'Organism name:' entry to obtain species
     -rmskSpecies <name> to override default 'species' name for repeat masker
                       the default is found in the asmId_asssembly_report.txt
                       e.g. -rmskSpecies=viruses
     -noRmsk           when RepeatMasker is not possible, such as bacteria
     -ncbiRmsk         use NCBI rm.out.gz file instead of local cluster run
                       for repeat masking
     -augustusSpecies <human|chicken|zebrafish> default 'human'
+    -noAugustus       do *not* create the Augustus gene track
+    -noXenoRefSeq     do *not* create the Xeno RefSeq gene track
     -xenoRefSeq </path/to/xenoRefSeqMrna> - location of xenoRefMrna.fa.gz
                 expanded directory of mrnas/ and xenoRefMrna.sizes, default
                 $xenoRefSeq
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => $workhorse,
 						'fileServer' => $fileServer,
 						'bigClusterHub' => $bigClusterHub,
 						'smallClusterHub' => $smallClusterHub);
   print STDERR "
 Automates build of assembly hub.  Steps:
     download: sets up sym link working hierarchy from already mirrored
                 files from NCBI in:
                       $sourceDir/GC[AF]/123/456/789/asmId
@@ -828,31 +834,34 @@
 
 ### construct sequence when no AGP files exist
   if (0 == $partsDone) {
 printf STDERR "creating fake AGP\n";
     if ($ucscNames) {
       $bossScript->add(<<_EOF_
 twoBitToFa ../download/\$asmId.2bit stdout | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" | gzip -c > \$asmId.fa.gz
 hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz
 zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names
 _EOF_
       );
     } else {
       $bossScript->add(<<_EOF_
 twoBitToFa ../download/\$asmId.2bit stdout | gzip -c > \$asmId.fa.gz
 hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz
-zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names
+twoBitInfo ../download/\$asmId.2bit stdout | cut -f1 \\
+  | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" \\
+    | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\$\\)/.\\1/;' \\
+      | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names
 _EOF_
       );
     }
 } else {
 printf STDERR "partsDone: %d\n", $partsDone;
   }
 
   $bossScript->add(<<_EOF_
 zcat *.agp.gz | gzip > ../\$asmId.agp.gz
 faToTwoBit *.fa.gz ../\$asmId.2bit
 faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit
 twoBitDup ../\$asmId.unmasked.2bit > \$asmId.dups.txt
 if [ -s "\$asmId.dups.txt" ]; then
   printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\\n" 1>&2
   cat \$asmId.dups.txt 1>&2
@@ -1713,30 +1722,34 @@
       -assemblyHub -bigClusterHub=$bigClusterHub -dbHost=$dbHost $liftSpec \\
       -target2bit="\$target2bit" \\
       -stop=load -fileServer=$fileServer -smallClusterHub=$smallClusterHub -workhorse=$workhorse \\
       \$asmId \$asmId
 else
   printf "# ncbiRefSeq step previously completed\\n" 1>&2
 fi
 _EOF_
   );
   $bossScript->execute();
 } # ncbiRefSeq
 
 #########################################################################
 # * step: augustus [workhorse]
 sub doAugustus {
+  if ($noAugustus) {
+  &HgAutomate::verbose(1, "# -noAugustus == Augustus gene track not created\n");
+	return;
+  }
   my $runDir = "$buildDir/trackData/augustus";
   if (! -s "$buildDir/$asmId.2bit") {
     &HgAutomate::verbose(1,
 	"ERROR: augustus step can not find $buildDir/$asmId.2bit\n");
     exit 255;
   }
   if (-d "${runDir}" ) {
      if (! -s "$runDir/$asmId.augustus.bb") {
        &HgAutomate::verbose(1,
        "WARNING augustus step may already be running, but not completed ?\n");
        return;
      } elsif (! needsUpdate("$buildDir/$asmId.2bit", "$runDir/$asmId.augustus.bb")) {
        &HgAutomate::verbose(1, "# augustus step previously completed\n");
        return;
      }
@@ -1757,30 +1770,34 @@
     -noDbGenePredCheck -maskedSeq=$buildDir/\$asmId.2bit \$asmId) > makeDb.log 2>&1
   time (~/kent/src/hg/utils/automation/doAugustus.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` -dbHost=$dbHost \\
     -bigClusterHub=$bigClusterHub -species=$augustusSpecies -workhorse=$workhorse \\
     -noDbGenePredCheck -maskedSeq=$buildDir/\$asmId.2bit \$asmId) > cleanup.log 2>&1
 else
   printf "# augustus genes step previously completed\\n" 1>&2
 fi
 _EOF_
   );
   $bossScript->execute();
 } # doAugustus
 
 #########################################################################
 # * step: xenoRefGene [bigClusterHub]
 sub doXenoRefGene {
+  if ($noXenoRefSeq) {
+	&HgAutomate::verbose(1, "# -noXenoRefSeq == Xeno RefSeq gene track not created\n");
+	return;
+  }
   my $runDir = "$buildDir/trackData/xenoRefGene";
 
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "run xeno RefSeq gene mapping procedures";
   my $bossScript = newBash HgRemoteScript("$runDir/doXenoRefGene.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
 if [ $buildDir/\$asmId.2bit -nt \$asmId.xenoRefGene.bb ]; then
   time (~/kent/src/hg/utils/automation/doXenoRefGene.pl -buildDir=`pwd` -dbHost=$dbHost \\
     -bigClusterHub=$bigClusterHub -mrnas=$xenoRefSeq -workhorse=$workhorse \\
     -maskedSeq=$buildDir/trackData/addMask/\$asmId.masked.2bit \$asmId) > do.log 2>&1
@@ -1887,49 +1904,54 @@
      chomp $species;
      $species =~ s/.*organism\s+name:\s+//i;
      $species =~ s/\s+\(.*//;
   } else {
      die "no -species specified and can not find $asmReport";
   }
   if (length($species) < 1) {
      die "no -species specified and can not find Organism name: in $asmReport";
   }
 }
 
 $rmskSpecies = $opt_rmskSpecies ? $opt_rmskSpecies : $species;
 $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies;
 $xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq;
 $ucscNames = $opt_ucscNames ? 1 : $ucscNames;   # '1' == 'TRUE'
+$noAugustus = $opt_noAugustus ? 1 : $noAugustus;   # '1' == 'TRUE'
+$noXenoRefSeq = $opt_noXenoRefSeq ? 1 : $noXenoRefSeq;   # '1' == 'TRUE'
 $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse;
 $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub;
 $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub;
 $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer;
 $asmHubName = $opt_asmHubName ? $opt_asmHubName : $asmHubName;
 $ncbiRmsk = $opt_ncbiRmsk ? 1 : 0;
 $noRmsk = $opt_noRmsk ? 1 : 0;
 
 die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource);
 printf STDERR "# buildDir: %s\n", $buildDir;
 printf STDERR "# sourceDir %s\n", $sourceDir;
 printf STDERR "# augustusSpecies %s\n", $augustusSpecies;
 printf STDERR "# xenoRefSeq %s\n", $xenoRefSeq;
 printf STDERR "# assemblySource: %s\n", $assemblySource;
 printf STDERR "# asmHubName %s\n", $asmHubName;
 printf STDERR "# rmskSpecies %s\n", $rmskSpecies;
 printf STDERR "# augustusSpecies %s\n", $augustusSpecies;
 printf STDERR "# ncbiRmsk %s\n", $ncbiRmsk ? "TRUE" : "FALSE";
 printf STDERR "# ucscNames %s\n", $ucscNames ? "TRUE" : "FALSE";
+printf STDERR "# noAugustus %s\n", $noAugustus ? "TRUE" : "FALSE";
+printf STDERR "# noXenoRefSeq %s\n", $noXenoRefSeq ? "TRUE" : "FALSE";
+printf STDERR "# noRmsk %s\n", $noRmsk ? "TRUE" : "FALSE";
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";
 
 $secondsEnd = `date "+%s"`;
 chomp $secondsEnd;
 my $elapsedSeconds = $secondsEnd - $secondsStart;
 my $elapsedMinutes = int($elapsedSeconds/60);
 $elapsedSeconds -= $elapsedMinutes * 60;