6a48e07c2fbefbf0a0e5e74662dd768ef93fca88
hiram
  Wed Apr 29 11:27:30 2020 -0700
adding ncbiRmsk option to use NCBI rm.out.gz file instead of cluster run and record versionInfo.txt of RepeatMasker operation refs #23734

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index 0534908..41b0111 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -16,30 +16,31 @@
 use strict;
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
     $opt_sourceDir
     $opt_species
     $opt_rmskSpecies
+    $opt_ncbiRmsk
     $opt_augustusSpecies
     $opt_xenoRefSeq
     $opt_ucscNames
     $opt_asmHubName
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'download',   func => \&doDownload },
       { name => 'sequence',   func => \&doSequence },
       { name => 'assemblyGap',   func => \&doAssemblyGap },
       { name => 'gatewayPage',   func => \&doGatewayPage },
       { name => 'cytoBand',   func => \&doCytoBand },
       { name => 'gc5Base',   func => \&doGc5Base },
       { name => 'repeatMasker',   func => \&doRepeatMasker },
@@ -54,30 +55,32 @@
       { name => 'ncbiGene',   func => \&doNcbiGene },
       { name => 'ncbiRefSeq',   func => \&doNcbiRefSeq },
       { name => 'xenoRefGene',   func => \&doXenoRefGene },
       { name => 'augustus',   func => \&doAugustus },
       { name => 'trackDb',   func => \&doTrackDb },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $sourceDir = "/hive/data/outside/ncbi/genomes";
 my $species = "";       # usually found in asmId_assembly_report.txt
 my $ftpDir = "";	# will be determined from given asmId
 my $rmskSpecies = "";
+my $ncbiRmsk = 0;	# when 1, doRepeatMasker.pl is called with
+                        # -ncbiRmsk=path.out.gz (plus -liftSpec=... if -ucscNames)
 my $augustusSpecies = "human";
 my $xenoRefSeq = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq";
 my $ucscNames = 0;  # default 'FALSE' (== 0)
 my $asmHubName = "n/a";  # directory name in: /gbdb/hubs/asmHubName
 my $workhorse = "hgwdev";  # default workhorse when none chosen
 my $fileServer = "hgwdev";  # default when none chosen
 my $bigClusterHub = "ku";  # default when none chosen
 my $smallClusterHub = "ku";  # default when none chosen
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 # key is original accession name from the remove.dups.list, value is 1
 my %dupAccessionList;
 
@@ -96,30 +99,32 @@
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir     Construct assembly hub in dir instead of default
        $HgAutomate::clusterData/asmHubs/refseqBuild/GC[AF]/123/456/789/asmId/
     -sourceDir dir    Find assembly in dir instead of default:
        $sourceDir/GC[AF]/123/456/789/asmId
     -ucscNames        Translate NCBI/INSDC/RefSeq names to UCSC names
                       default is to use the given NCBI/INSDC/RefSeq names
     -asmHubName <name>  directory name in: /gbdb/hubs/asmHubName
     -species <name>   use this species designation if there is no
                       asmId_assembly_report.txt with an
                       'Organism name:' entry to obtain species
     -rmskSpecies <name> to override default 'species' name for repeat masker
                       the default is found in the asmId_asssembly_report.txt
                       e.g. -rmskSpecies=viruses
+    -ncbiRmsk         use NCBI rm.out.gz file instead of local cluster run
+                      for repeat masking
     -augustusSpecies <human|chicken|zebrafish> default 'human'
     -xenoRefSeq </path/to/xenoRefSeqMrna> - location of xenoRefMrna.fa.gz
                 expanded directory of mrnas/ and xenoRefMrna.sizes, default
                 $xenoRefSeq
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => $workhorse,
 						'fileServer' => $fileServer,
 						'bigClusterHub' => $bigClusterHub,
 						'smallClusterHub' => $smallClusterHub);
   print STDERR "
 Automates build of assembly hub.  Steps:
     download: sets up sym link working hierarchy from already mirrored
                 files from NCBI in:
@@ -174,30 +179,31 @@
 }
 
 
 # Globals:
 # Command line args: asmId
 my ( $asmId);
 # Other:
 my ($buildDir, $secondsStart, $secondsEnd, $assemblySource);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'buildDir=s',
 		      'sourceDir=s',
 		      'rmskSpecies=s',
+		      'ncbiRmsk',
 		      'augustusSpecies=s',
 		      'xenoRefSeq=s',
 		      'asmHubName=s',
 		      'ucscNames',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
 }
 
 #########################################################################
@@ -618,36 +624,38 @@
   my $runDir = "$buildDir/download";
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "setup work hierarchy of sym links to source files in\n\t$runDir/";
   my $bossScript = newBash HgRemoteScript("$runDir/doDownload.bash", $workhorse,
 				      $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
 if [ ! -s \${asmId}.2bit -o \${asmId}_genomic.fna.gz -nt \$asmId.2bit ]; then
   rm -f \${asmId}_genomic.fna.gz \\
     \${asmId}_genomic.fna.dups.gz \\
     \${asmId}_assembly_report.txt \\
     \${asmId}_rm.out.gz \\
+    \${asmId}_rm.run \\
     \${asmId}_assembly_structure \\
     \$asmId.2bit
 
   ln -s $assemblySource/\${asmId}_genomic.fna.gz .
   ln -s $assemblySource/\${asmId}_assembly_report.txt .
   ln -s $assemblySource/\${asmId}_rm.out.gz .
+  ln -s $assemblySource/\${asmId}_rm.run .
   if [ -d $assemblySource/\${asmId}_assembly_structure ]; then
     ln -s $assemblySource/\${asmId}_assembly_structure .
   fi
   faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit
   twoBitDup \$asmId.2bit > \$asmId.dups.txt
   if [ -s "\$asmId.dups.txt" ]; then
     printf "WARNING duplicate sequences found in \$asmId.2bit\\n" 1>&2
     cat \$asmId.dups.txt 1>&2
     awk '{print \$1}' \$asmId.dups.txt > \$asmId.remove.dups.list
     mv \${asmId}_genomic.fna.gz \${asmId}_genomic.fna.dups.gz
     faSomeRecords -exclude \${asmId}_genomic.fna.dups.gz \\
       \$asmId.remove.dups.list stdout | gzip -c > \${asmId}_genomic.fna.gz
     rm -f \$asmId.2bit
     faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit
   fi
@@ -1031,44 +1039,65 @@
 sub doRepeatMasker {
   my $runDir = "$buildDir/trackData/repeatMasker";
   if ( -d "$buildDir/trackData/repeatMasker/run.cluster" ) {
      if ( ! -s "$buildDir/trackData/repeatMasker/faSize.rmsk.txt" ) {
        &HgAutomate::verbose(1,
 	"\nERROR: step repeatmasker may be running\n");
        exit 255;
      }
   }
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "construct repeatMasker track data";
   my $bossScript = newBash HgRemoteScript("$runDir/doRepeatMasker.bash",
                     $workhorse, $runDir, $whatItDoes);
 
+  my $rmskOpts = "";
+  if ($ncbiRmsk) {
+     $rmskOpts = " \\
+  -ncbiRmsk=\"$buildDir/download/${asmId}_rm.out.gz\" ";
+     if ($ucscNames) {
+       $rmskOpts .= " \\
+  -liftSpec=\"$buildDir/sequence/$asmId.ncbiToUcsc.lift\"";
+     }
+  }
+
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
 if [ $buildDir/\$asmId.2bit -nt faSize.rmsk.txt ]; then
 export species=`echo $rmskSpecies | sed -e 's/_/ /g;'`
 
-doRepeatMasker.pl -stop=mask -buildDir=`pwd` -unmaskedSeq=$buildDir/\$asmId.2bit \\
+rm -f versionInfo.txt
+
+doRepeatMasker.pl -stop=mask -buildDir=`pwd` -unmaskedSeq=$buildDir/\$asmId.2bit $rmskOpts \\
   -bigClusterHub=$bigClusterHub -workhorse=$workhorse -species="\$species" \$asmId
 
-gzip \$asmId.sorted.fa.out \$asmId.fa.out \$asmId.nestedRepeats.bed
+if [ -s "\$asmId.fa.out" ]; then
+  gzip \$asmId.fa.out
+fi
+gzip \$asmId.sorted.fa.out \$asmId.nestedRepeats.bed
 
-doRepeatMasker.pl -continue=cleanup -buildDir=`pwd` -unmaskedSeq=$buildDir/\$asmId.2bit \\
+doRepeatMasker.pl -continue=cleanup -buildDir=`pwd` -unmaskedSeq=$buildDir/\$asmId.2bit $rmskOpts \\
   -bigClusterHub=$bigClusterHub -workhorse=$workhorse -species="\$species" \$asmId
 
+if [ ! -s versionInfo.txt ]; then
+  if [ -s ../../download/${asmId}_rm.run ]; then
+    ln -s ../../download/${asmId}_rm.run versionInfo.txt
+  fi
+fi
+
 \$HOME/kent/src/hg/utils/automation/asmHubRepeatMasker.sh \$asmId `pwd`/\$asmId.sorted.fa.out.gz `pwd`
 else
   printf "# repeatMasker step previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_
   );
   $bossScript->execute();
 } # repeatMasker
 
 #########################################################################
 # * step: simpleRepeat [workhorse]
 sub doSimpleRepeat {
   my $runDir = "$buildDir/trackData/simpleRepeat";
   &HgAutomate::mustMkdir($runDir);
@@ -1531,37 +1560,53 @@
   bigBedInfo \$asmId.ncbiGene.bb | egrep "^itemCount:|^basesCovered:" \\
     | sed -e 's/,//g' > \$asmId.ncbiGene.stats.txt
   LC_NUMERIC=en_US /usr/bin/printf "# ncbiGene %s %'d %s %'d\\n" `cat \$asmId.ncbiGene.stats.txt` | xargs echo
   printf "%d bases of %d (%s%%) in intersection\\n" "\$baseCount" "\$asmSizeNoGaps" "\$perCent" > fb.\$asmId.ncbiGene.txt
 else
   printf "# ncbiGene step previously completed\\n" 1>&2
 fi
 _EOF_
   );
   $bossScript->execute();
 } # doNcbiGene
 
 #########################################################################
 # * step: ncbiRefSeq [workhorse]
 sub doNcbiRefSeq {
-  my $runDir = "$buildDir/trackData/ncbiRefSeq";
+  # skip this procedure unless every one of the required files is available
   my $gffFile = "$assemblySource/${asmId}_genomic.gff.gz";
   if ( ! -s "${gffFile}" ) {
     printf STDERR "# step ncbiRefSeq no gff file found at:\n#  %s\n", $gffFile;
     return;
   }
+  my $filesFound = 0;	# number of required files that exist with non-zero size
+  my @requiredFiles = qw( genomic.gff.gz rna.fna.gz rna.gbff.gz protein.faa.gz );
+  my $filesExpected = scalar(@requiredFiles);
+  foreach my $expectFile (@requiredFiles) {
+    if ( -s "$assemblySource/${asmId}_${expectFile}" ) {
+      ++$filesFound;
+    } else {	# keep looping so every missing file is reported, not only the first
+      printf STDERR "# step ncbiRefSeq missing required file %s\n", "$assemblySource/${asmId}_${expectFile}";	# path is passed as data so a '%' in it is not parsed as a format directive
+    }
+  }
 
+  if ($filesFound < $filesExpected) {
+    printf STDERR "# step ncbiRefSeq does not have all files required\n";
+    return;
+  }
+
+  my $runDir = "$buildDir/trackData/ncbiRefSeq";
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "run NCBI RefSeq gene procedures";
   my $bossScript = newBash HgRemoteScript("$runDir/doNcbiRefSeq.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId="$asmId"
 export buildDir="$buildDir"
 export liftFile="\$buildDir/sequence/\$asmId.ncbiToUcsc.lift"
 export target2bit="\$buildDir/\$asmId.2bit"
 
 if [ $buildDir/\$asmId.2bit -nt \$asmId.ncbiRefSeq.bb ]; then
 
 ~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -toGpWarnOnly -buildDir=`pwd` \\
@@ -1736,31 +1781,31 @@
   }
   if (length($species) < 1) {
      die "no -species specified and can not find Organism name: in $asmReport";
   }
 }
 
 $rmskSpecies = $opt_rmskSpecies ? $opt_rmskSpecies : $species;
 $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies;
 $xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq;
 $ucscNames = $opt_ucscNames ? 1 : $ucscNames;   # '1' == 'TRUE'
 $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse;
 $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub;
 $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub;
 $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer;
 $asmHubName = $opt_asmHubName ? $opt_asmHubName : $asmHubName;
-
+$ncbiRmsk = $opt_ncbiRmsk ? 1 : 0;
 
 die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource);
 printf STDERR "# buildDir: %s\n", $buildDir;
 printf STDERR "# sourceDir %s\n", $sourceDir;
 printf STDERR "# augustusSpecies %s\n", $augustusSpecies;
 printf STDERR "# xenoRefSeq %s\n", $xenoRefSeq;
 printf STDERR "# assemblySource: %s\n", $assemblySource;
 printf STDERR "# asmHubName %s\n", $asmHubName;
 printf STDERR "# rmskSpecies %s\n", $rmskSpecies;
 printf STDERR "# augustusSpecies %s\n", $augustusSpecies;
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.