395595f60c883b2344ae20299687563b8ea6e215
hiram
  Thu Dec 15 14:10:23 2022 -0800
adding RepeatModeler option no redmine

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index a8d20df..4e4ae60 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -16,73 +16,76 @@
 use strict;
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
     $opt_sourceDir
     $opt_species
     $opt_rmskSpecies
+    $opt_runRepeatModeler
     $opt_ncbiRmsk
     $opt_noRmsk
     $opt_augustusSpecies
     $opt_noAugustus
     $opt_xenoRefSeq
     $opt_noXenoRefSeq
     $opt_ucscNames
     $opt_dbName
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'download',   func => \&doDownload },
       { name => 'sequence',   func => \&doSequence },
       { name => 'assemblyGap',   func => \&doAssemblyGap },
       { name => 'chromAlias',   func => \&doChromAlias },
       { name => 'gatewayPage',   func => \&doGatewayPage },
       { name => 'cytoBand',   func => \&doCytoBand },
       { name => 'gc5Base',   func => \&doGc5Base },
+      { name => 'repeatModeler',   func => \&doRepeatModeler },
       { name => 'repeatMasker',   func => \&doRepeatMasker },
       { name => 'simpleRepeat',   func => \&doSimpleRepeat },
       { name => 'allGaps',   func => \&doAllGaps },
       { name => 'idKeys',   func => \&doIdKeys },
       { name => 'windowMasker',   func => \&doWindowMasker },
       { name => 'addMask',   func => \&doAddMask },
       { name => 'gapOverlap',   func => \&doGapOverlap },
       { name => 'tandemDups',   func => \&doTandemDups },
       { name => 'cpgIslands',   func => \&doCpgIslands },
       { name => 'ncbiGene',   func => \&doNcbiGene },
       { name => 'ncbiRefSeq',   func => \&doNcbiRefSeq },
       { name => 'xenoRefGene',   func => \&doXenoRefGene },
       { name => 'augustus',   func => \&doAugustus },
       { name => 'trackDb',   func => \&doTrackDb },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $sourceDir = "/hive/data/outside/ncbi/genomes";
 my $species = "";       # usually found in asmId_assembly_report.txt
 my $ftpDir = "";	# will be determined from given asmId
 my $rmskSpecies = "";
+my $runRepeatModeler = 0;       # default off
 my $noRmsk = 0;	# when RepeatMasker is not possible, such as bacteria
 my $ncbiRmsk = 0;	# when =1 call doRepeatMasker.pl
                         # with -ncbiRmsk=path.out.gz and -liftSpec=...
 my $augustusSpecies = "human";
 my $xenoRefSeq = "/hive/data/genomes/asmHubs/xenoRefSeq";
 my $noAugustus = 0;     # bacteria do *not* create an augustus track
 my $noXenoRefSeq = 0;	# bacteria do *not* create a xenoRefSeq track
 my $ucscNames = 0;  # default 'FALSE' (== 0)
 my $dbName = "";  # default uses NCBI asmId for name, can specify: abcDef1
 my $defaultName = "";  # will be asmId or dbName if present
 my $workhorse = "hgwdev";  # default workhorse when none chosen
 my $fileServer = "hgwdev";  # default when none chosen
 my $bigClusterHub = "ku";  # default when none chosen
 my $smallClusterHub = "ku";  # default when none chosen
 
@@ -107,58 +110,65 @@
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir     Construct assembly hub in dir instead of default
        $HgAutomate::clusterData/asmHubs/refseqBuild/GC[AF]/123/456/789/asmId/
     -sourceDir dir    Find assembly in dir instead of default:
        $sourceDir/GC[AF]/123/456/789/asmId
     -ucscNames        Translate NCBI/INSDC/RefSeq names to UCSC names
                       default is to use the given NCBI/INSDC/RefSeq names
     -dbName <name>    name for UCSC style database name, e.g. 'abcDef1'
     -species <name>   use this species designation if there is no
                       asmId_assembly_report.txt with an
                       'Organism name:' entry to obtain species
     -rmskSpecies <name> to override default 'species' name for repeat masker
                       the default is found in the asmId_asssembly_report.txt
                       e.g. -rmskSpecies=viruses
+    -runRepeatModeler run RepeatModeler to supply custom lib to RepeatMasker,
+                      default is to not run RepeatModeler
     -noRmsk           when RepeatMasker is not possible, such as bacteria
     -ncbiRmsk         use NCBI rm.out.gz file instead of local cluster run
                       for repeat masking
     -augustusSpecies <human|chicken|zebrafish> default 'human'
     -noAugustus       do *not* create the Augustus gene track
     -noXenoRefSeq     do *not* create the Xeno RefSeq gene track
     -xenoRefSeq </path/to/xenoRefSeqMrna> - location of xenoRefMrna.fa.gz
                 expanded directory of mrnas/ and xenoRefMrna.sizes, default
                 $xenoRefSeq
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => $workhorse,
 						'fileServer' => $fileServer,
 						'bigClusterHub' => $bigClusterHub,
 						'smallClusterHub' => $smallClusterHub);
   print STDERR "
 Automates build of assembly hub.  Steps:
     download: sets up sym link working hierarchy from already mirrored
                 files from NCBI in:
                       $sourceDir/GC[AF]/123/456/789/asmId
     sequence: establish AGP and 2bit file from NCBI directory
     assemblyGap: create assembly and gap bigBed files and indexes
                  for assembly track names
     chromAlias:  construct asmId.chromAlias.txt for alias name recognition
     gatewayPage: create html/asmId.description.html contents
     cytoBand: create cytoBand track and navigation ideogram
     gc5Base: create bigWig file for gc5Base track
+    repeatModeler: optionally, run RepeatModeler to construct custom library
+                   for repeatMasker run.  Use: -runRepeatModeler to perform
+                   this procedure, warning: can take a considerable amount
+                   of time (12 to 48 hours or more), and consumes an entire
+                   ku cluster node
     repeatMasker: run repeat masker cluster run and create bigBed files for
                   the composite track categories of repeats
     simpleRepeat: run trf cluster run and create bigBed file for simple repeats
     allGaps: calculate all actual real gaps due to N's in sequence, can be
                   more than were specified in the AGP file
     idKeys: calculate md5sum for each sequence in the assembly to be used to
             find identical sequences in similar assemblies
     windowMasker: run windowMasker cluster run, create windowMasker bigBed file
                   and compute intersection with repeatMasker results
     addMask: combine the higher masking of (windowMasker or repeatMasker) with
                   trf simpleRepeats into one 2bit file
     gapOverlap: find duplicated sequence on each side of a gap
     tandemDups: annotate all pairs of duplicated sequence with some gap between
     cpgIslands: run CpG islands cluster runs for both masked and unmasked
                 sequences and create bigBed files for this composite track
@@ -191,30 +201,31 @@
 }
 
 
 # Globals:
 # Command line args: asmId
 my ( $asmId);
 # Other:
 my ($buildDir, $secondsStart, $secondsEnd, $assemblySource, $asmReport);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'buildDir=s',
 		      'sourceDir=s',
 		      'rmskSpecies=s',
+		      'runRepeatModeler',
 		      'noRmsk',
 		      'ncbiRmsk',
 		      'augustusSpecies=s',
 		      'xenoRefSeq=s',
 		      'dbName=s',
 		      'noXenoRefSeq',
 		      'noAugustus',
 		      'ucscNames',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
@@ -1158,85 +1169,129 @@
     ../../\$asmId.2bit \\
       | gzip -c > \$asmId.wigVarStep.gz
   wigToBigWig \$asmId.wigVarStep.gz ../../\$asmId.chrom.sizes \$asmId.gc5Base.bw
   rm -f \$asmId.wigVarStep.gz
   touch -r ../../\$asmId.2bit \$asmId.gc5Base.bw
 else
   printf "# gc5Base step previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_
   );
   $bossScript->execute();
 } # gc5Base
 
 #########################################################################
+# * step: repeatModeler [workhorse, bigClusterHub]
+sub doRepeatModeler {
+  if (! $runRepeatModeler) {
+       &HgAutomate::verbose(1, "# RepeatModeler not being run, add argument: -runRepeatModeler to run this step before RepeatMasker\n");
+	return;
+  }
+  my $runDir = "$buildDir/trackData/repeatModeler";
+  #  check if been run before
+  if ( -d "${runDir}" ) {
+    if ( -s "${runDir}/${defaultName}-families.fa" ) {
+      &HgAutomate::verbose(1, "\nRepeatModeler step previously completed\n");
+      return;
+    } elsif ( -s "$buildDir/trackData/repeatModeler/blastDb.bash" ) {
+        &HgAutomate::verbose(1, "\nERROR: repeatModeler step may be running\n");
+         exit 255;
+    }
+  }
+  &HgAutomate::mustMkdir($runDir);
+  my $whatItDoes = "run RepeatModeler on sequence to prepare RepeatMasker custom library";
+  my $bossScript = newBash HgRemoteScript("$runDir/doRepeatModeler.bash",
+                    $workhorse, $runDir, $whatItDoes);
+
+  $bossScript->add(<<_EOF_
+export asmId="${defaultName}"
+export buildDir="${buildDir}"
+
+doRepeatModeler.pl -buildDir=`pwd` \\
+  -unmaskedSeq=\$buildDir/\$asmId.unmasked.2bit \\
+    -bigClusterHub=$bigClusterHub -workhorse=$workhorse \$asmId
+_EOF_
+  );
+  $bossScript->execute();
+}	# sub doRepeatModeler
+
+#########################################################################
 # * step: repeatMasker [workhorse]
 sub doRepeatMasker {
   if ($noRmsk) {
        &HgAutomate::verbose(1, "# -noRmsk == RepeatMasker not being run\n");
 	return;
   }
   my $runDir = "$buildDir/trackData/repeatMasker";
   if ( -d "$buildDir/trackData/repeatMasker/run.cluster" ) {
      if ( ! -s "$buildDir/trackData/repeatMasker/faSize.rmsk.txt" ) {
        &HgAutomate::verbose(1,
 	"\nERROR: step repeatmasker may be running\n");
        exit 255;
      }
   }
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "construct repeatMasker track data";
   my $bossScript = newBash HgRemoteScript("$runDir/doRepeatMasker.bash",
                     $workhorse, $runDir, $whatItDoes);
 
+  my $repeatModeler = "$buildDir/trackData/repeatModeler";
+  my $customLib = "";
+  if ( -s "${repeatModeler}/${defaultName}-families.fa" ) {
+     $customLib = "-customLib=\"${repeatModeler}/${defaultName}-families.fa\"";
+  }
   my $rmskOpts = "";
   if ($ncbiRmsk) {
      if ( -s "$buildDir/download/${asmId}_rm.out.gz" ) {
        $rmskOpts = " \\
   -ncbiRmsk=\"$buildDir/download/${asmId}_rm.out.gz\" ";
        if ( -s "${buildDir}/download/${asmId}.remove.dups.list" ) {
          $rmskOpts .= " \\
   -dupList=\"${buildDir}/download/${asmId}.remove.dups.list\" ";
        }
        if ($ucscNames) {
          $rmskOpts .= " \\
   -liftSpec=\"$buildDir/sequence/$asmId.ncbiToUcsc.lift\"";
        }
      }
   }
 
+  my $speciesOrLib = "-species=\"\$species\"";
+  if ( length(${customLib}) ) {
+     $speciesOrLib = "${customLib}";
+  }
   $bossScript->add(<<_EOF_
 export asmId=$defaultName
 export ncbiAsmId=$asmId
 
 if [ $buildDir/\$asmId.2bit -nt faSize.rmsk.txt ]; then
 export species=`echo $rmskSpecies | sed -e 's/_/ /g;'`
 
 rm -f versionInfo.txt
 
 doRepeatMasker.pl -stop=mask -buildDir=`pwd` -unmaskedSeq=$buildDir/\$asmId.2bit $rmskOpts \\
-  -bigClusterHub=$bigClusterHub -workhorse=$workhorse -species="\$species" \$asmId
+  -bigClusterHub=$bigClusterHub -workhorse=$workhorse $speciesOrLib \$asmId
 
 if [ -s "\$asmId.fa.out" ]; then
   gzip \$asmId.fa.out
 fi
 gzip \$asmId.sorted.fa.out \$asmId.nestedRepeats.bed
 
 doRepeatMasker.pl -continue=cleanup -buildDir=`pwd` -unmaskedSeq=$buildDir/\$asmId.2bit $rmskOpts \\
-  -bigClusterHub=$bigClusterHub -workhorse=$workhorse -species="\$species" \$asmId
+  -bigClusterHub=$bigClusterHub -workhorse=$workhorse $speciesOrLib \$asmId
 
 if [ ! -s versionInfo.txt ]; then
   if [ -s ../../download/\${ncbiAsmId}_rm.run ]; then
     ln -s ../../download/\${ncbiAsmId}_rm.run versionInfo.txt
   fi
 fi
 
 \$HOME/kent/src/hg/utils/automation/asmHubRepeatMasker.sh \$asmId `pwd`/\$asmId.sorted.fa.out.gz `pwd`
 else
   printf "# repeatMasker step previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_
   );
   $bossScript->execute();
@@ -2045,54 +2100,56 @@
   if (-s "$asmReport") {
      $species = `grep -i "organism name:" $asmReport`;
      chomp $species;
      $species =~ s/.*organism\s+name:\s+//i;
      $species =~ s/\s+\(.*//;
   } else {
      die "ERROR: no -species specified and can not find $asmReport";
   }
   if (length($species) < 1) {
      die "ERROR: no -species specified and can not find Organism name: in $asmReport";
   }
 }
 
 $dbName = $opt_dbName ? $opt_dbName : $dbName;
 $rmskSpecies = $opt_rmskSpecies ? $opt_rmskSpecies : $species;
+$runRepeatModeler = $opt_runRepeatModeler ? $opt_runRepeatModeler : $runRepeatModeler;
 $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies;
 $xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq;
 $ucscNames = $opt_ucscNames ? 1 : $ucscNames;   # '1' == 'TRUE'
 $noAugustus = $opt_noAugustus ? 1 : $noAugustus;   # '1' == 'TRUE'
 $noXenoRefSeq = $opt_noXenoRefSeq ? 1 : $noXenoRefSeq;   # '1' == 'TRUE'
 $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse;
 $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub;
 $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub;
 $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer;
 $ncbiRmsk = $opt_ncbiRmsk ? 1 : 0;
 $noRmsk = $opt_noRmsk ? 1 : 0;
 $defaultName = $asmId;
 $defaultName = $dbName if (length($dbName));
 
 
 die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource);
 printf STDERR "# buildDir: %s\n", $buildDir;
 printf STDERR "# sourceDir %s\n", $sourceDir;
 if (length($dbName)) {
 printf STDERR "# dbName %s - building in /hive/data/genomes/%s\n", $dbName, $dbName;
 }
 printf STDERR "# augustusSpecies %s\n", $augustusSpecies;
 printf STDERR "# xenoRefSeq %s\n", $xenoRefSeq;
 printf STDERR "# assemblySource: %s\n", $assemblySource;
+printf STDERR "# runRepeatModeler %s\n", $runRepeatModeler ? "TRUE" : "FALSE";
 printf STDERR "# rmskSpecies %s\n", $rmskSpecies;
 printf STDERR "# augustusSpecies %s\n", $augustusSpecies;
 printf STDERR "# ncbiRmsk %s\n", $ncbiRmsk ? "TRUE" : "FALSE";
 printf STDERR "# ucscNames %s\n", $ucscNames ? "TRUE" : "FALSE";
 printf STDERR "# noAugustus %s\n", $noAugustus ? "TRUE" : "FALSE";
 printf STDERR "# noXenoRefSeq %s\n", $noXenoRefSeq ? "TRUE" : "FALSE";
 printf STDERR "# noRmsk %s\n", $noRmsk ? "TRUE" : "FALSE";
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";