64f5aaf6b778b991cadb76a605325760cef3fd71
hiram
  Fri Oct 31 15:25:00 2025 -0700
Adding Gene Meyers' FasTAN tandem repeat finder procedure

diff --git src/hg/utils/automation/doFasTAN.pl src/hg/utils/automation/doFasTAN.pl
new file mode 100755
index 00000000000..109d20972c9
--- /dev/null
+++ src/hg/utils/automation/doFasTAN.pl
@@ -0,0 +1,300 @@
+#!/usr/bin/env perl
+
+# DO NOT EDIT the /cluster/bin/scripts copy of this file --
+# edit ~/kent/src/hg/utils/automation/doFasTAN.pl instead.
+
+use Getopt::Long;
+use warnings;
+use strict;
+use FindBin qw($Bin);
+use lib "$Bin";
+use HgAutomate;
+use HgRemoteScript;
+use HgStepManager;
+
+# Option variable names, both common and peculiar to this script:
+use vars @HgAutomate::commonOptionVars;
+use vars @HgStepManager::optionVars;
+use vars qw/
+    $opt_buildDir
+    $opt_unmaskedSeq
+    /;
+
+# Specify the steps supported with -continue / -stop:
+my $stepper = new HgStepManager(
+    [ { name => 'setup',   func => \&doSetup },
+      { name => 'cluster', func => \&doCluster },
+      { name => 'bedResult', func => \&doBedResult },
+      { name => 'cleanup', func => \&doCleanup },
+    ]
+				);
+
+# FasTAN and tanbed commands
+my $FasTAN = "/cluster/bin/x86_64/FasTAN";
+my $tanbed = "/cluster/bin/x86_64/tanbed";
+
+# Option defaults:
+my $bigClusterHub = 'hgwdev';
+my $defaultWorkhorse = 'least loaded';
+my $dbHost = 'hgwdev';
+my $unmaskedSeq = "";
+
+my $base = $0;
+$base =~ s/^(.*\/)?//;
+
+sub usage {
+  # Usage / help / self-documentation:
+  my ($status, $detailed) = @_;
+  # Basic help (for incorrect usage):
+  print STDERR "usage: $base db path/to/unmasked.2bit
+options:
+";
+  print STDERR $stepper->getOptionHelp();
+  print STDERR <<_EOF_
+    -buildDir dir         Use dir instead of default
+                          $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/longdust.\$date
+                          (necessary when continuing at a later date).
+_EOF_
+  ;
+  print STDERR HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
+					'workhorse' => $defaultWorkhorse);
+  print STDERR "
+Automates Gene Meyers' 'FasTAN' process for the given unmasked.2bit sequence.
+Steps:
+    setup:   Prepares partitioned sequence listings from the given unmasked.2bit
+    cluster: Does the cluster run of 'FasTAN' on the partitioned sequences
+    bedResult: Gathers the individual cluster job results into
+               one .bed and .bb bigBed result
+    cleanup: Removes temporary files
+All operations are performed in the build directory which is
+$HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/fasTan.\$date unless -buildDir is given.";
+  # There is no detailed help (-help):
+  print "\n";
+  exit $status;
+}
+
+
+# Globals:
+# Command line args: db
+my ($db);
+# Other:
+my ($buildDir);
+
+sub checkOptions {
+  # Make sure command line options are valid/supported.
+  my $ok = GetOptions(@HgStepManager::optionSpec,
+		      'buildDir=s',
+		      'unmaskedSeq=s',
+		      @HgAutomate::commonOptionSpec,
+		      );
+  usage(1) if (!$ok);
+  usage(0, 1) if ($opt_help);
+  HgAutomate::processCommonOptions();
+  my $err = $stepper->processOptions();
+  usage(1) if ($err);
+  $dbHost = $opt_dbHost if ($opt_dbHost);
+}
+
+#########################################################################
+# * step: setup [workhorse]
+sub doSetup {
+  my $runDir = "$buildDir";
+  if ( ! $opt_debug && (-s "$runDir/chrom.sizes" && -s "$runDir/part.list" )) {
+     printf STDERR "# setup step already complete\n";
+     return;
+  }
+  if (! $opt_debug) {
+    my @outs = ("$runDir/setup.bash",
+                "$runDir/chrom.sizes",
+                "$runDir/part.list");
+    HgAutomate::checkCleanSlate('setup', 'cluster', @outs);
+  }
+  HgAutomate::mustMkdir($runDir);
+
+  my $whatItDoes = "prepare files for FasTAN cluster run.";
+  my $workhorse = $opt_debug ? "hgwdev" : HgAutomate::chooseWorkhorse();
+  my $bossScript = newBash HgRemoteScript("$runDir/setup.bash", $workhorse,
+				      $runDir, $whatItDoes);
+
+  my $tmpDir = HgAutomate::tmpDir();
+  $bossScript->add(<<_EOF_
+rm -fr unmasked.2bit listFiles
+ln -s "$unmaskedSeq" unmasked.2bit
+
+twoBitInfo unmasked.2bit stdout | sort -k2nr > chrom.sizes
+
+export seqMax=`head -1 chrom.sizes | awk '{printf "%d", \$2+1}'`
+
+partitionSequence.pl -lstDir listFiles \$seqMax 0 \\
+  unmasked.2bit chrom.sizes 10000
+ls -S listFiles/*.lst > part.list
+printf "#####################################################\n" > version.txt
+printf "# FasTAN version: %s\n" "`ls -og $FasTAN`" >> version.txt
+printf "# tanbed version: %s\n" "`ls -og $tanbed`" >> version.txt
+printf "#####################################################\n" >> version.txt
+printf "# running FasTAN with -T8 option for 8 threads\n" >> version.txt
+printf "#####################################################\n" >> version.txt
+printf "# %s\n" "$FasTAN -v tmp/seqName.fa tmp/seqName" >> version.txt
+printf "# %s\n" "$tanbed tmp/seqName.1aln >> result/seqName.bed.gz" >> version.txt
+printf "#####################################################\n" >> version.txt
+$FasTAN >> version.txt 2>&1 || true
+printf "#####################################################\n" >> version.txt
+_EOF_
+  );
+  $bossScript->execute() if (! $opt_debug);
+} # doSetup
+
+#########################################################################
+# * step: cluster [bigClusterHub]
+sub doCluster {
+  my $paraHub = $bigClusterHub;
+  my $runDir = "$buildDir";
+  if ( ! $opt_debug && -s "$runDir/run.time") {
+     printf STDERR "# cluster step already complete\n";
+     return;
+  }
+  my $partList = "part.list";	# from doSetup
+  HgAutomate::checkExistsUnlessDebug('setup', 'bedResult', "$runDir/part.list");
+  my $whatItDoes = "Cluster run FasTAN on the part.list sequences.  Results into ./result/*.bed.gz";
+  my $templateCmd = ('runOne $(path1) {check out exists result/$(root1).bed.gz}');
+  HgAutomate::makeGsub($runDir, $templateCmd);
+  `touch "$runDir/para_hub_$paraHub"`;
+  my $paraRun = <<'_EOF_';
+# 24 Gb = 3 Gb each thread X 8 threads
+para make -cpu=8 -ram=24g jobList
+para check
+para time > run.time
+cat run.time
+_EOF_
+
+  my $bossScript = newBash HgRemoteScript("$runDir/doCluster.bash", $paraHub,
+				      $runDir, $whatItDoes);
+  $bossScript->add(<<_EOF_
+cat <<'_EOM_' > runOne
+#!/bin/bash
+
+set -beEu -o pipefail
+
+# do not let a lot of jobs all start up at the same instant
+# sleep between 12 and 32 seconds
+export sleepTime=\$((RANDOM % 21 + 12))
+sleep "\$sleepTime"
+
+export listFile="\${1}"
+export resultBedGz="\${2}"
+export resultBed="\${resultBedGz%.gz}"
+
+mkdir -p tmp result
+
+rm -f "\${resultBed}" "\${resultBedGz}"
+touch "\${resultBed}"
+
+for seqSpec in `cat \$listFile`
+do
+   seqName=`basename \$seqSpec | cut -d":" -f2`
+   rm -f tmp/\${seqName}.fa
+   twoBitToFa \$seqSpec tmp/\${seqName}.fa
+   $FasTAN -T8 -v tmp/\${seqName}.fa tmp/\${seqName}
+   $tanbed tmp/\${seqName}.1aln >> \${resultBed}
+   rm -f tmp/\${seqName}.fa tmp/\$seqName.1aln
+done
+gzip \${resultBed}
+_EOM_
+
+chmod +x runOne
+
+gensub2 $partList single gsub jobList
+$paraRun
+_EOF_
+  );
+
+  $bossScript->execute() if (! $opt_debug);
+} # doCluster
+
+#########################################################################
+# * step: bedResult [fileServer]
+sub doBedResult {
+  my $runDir = "$buildDir";
+  if ( ! $opt_debug && -s "$runDir/fasTAN.bb") {
+     printf STDERR "# bedResult step already complete\n";
+     return;
+  }
+  my $whatItDoes = "Consolidate the cluster run bed.gz files.  Make single bed and bigBed file.";
+  HgAutomate::checkExistsUnlessDebug('cluster', 'cleanup', "$runDir/run.time");
+  my $fileServer = $opt_debug ? "hgwdev" : HgAutomate::chooseFileServer($runDir);
+  my $bossScript = newBash HgRemoteScript("$runDir/makeBed.bash", $fileServer,
+				      $runDir, $whatItDoes);
+  $bossScript->add(<<_EOF_
+ls -S result/*.bed.gz | xargs zcat | gzip -c > fasTAN.bed.gz
+bedToBigBed -type=bed5 fasTAN.bed.gz chrom.sizes fasTAN.bb
+export totalBases=`ave -col=2 chrom.sizes | grep total | awk '{printf "%d", \$NF}'`
+export basesCovered=`bigBedInfo fasTAN.bb | grep basesCovered | awk '{printf "%s", \$NF}' | tr -d ','`
+export percentCovered=`echo \$basesCovered \$totalBases | awk '{printf "%.2f", 100*\$1/\$2}'`
+printf "%d bases of %d (%s%%) in intersection\n" "\$basesCovered" "\$totalBases" "\$percentCovered" > fb.fasTAN.txt
+cat fb.fasTAN.txt
+_EOF_
+  );
+  $bossScript->execute() if (! $opt_debug);
+} #doBedResult
+
+#########################################################################
+# * step: cleanup [fileServer]
+sub doCleanup {
+  my $runDir = "$buildDir";
+  if ( ! $opt_debug && ( ! -d "$runDir/tmp" && ! -d "$runDir/result")) {
+     printf STDERR "# cleanup step already complete\n";
+     return;
+  }
+  my $whatItDoes = "Cleans up or compresses intermediate files.";
+  my $fileServer = $opt_debug ? "hgwdev" : HgAutomate::chooseFileServer($runDir);
+  my $bossScript = newBash HgRemoteScript("$runDir/cleanup.bash", $fileServer,
+				      $runDir, $whatItDoes);
+  $bossScript->add(<<_EOF_
+rm -fr tmp result liftFiles err part.list chrom.sizes
+_EOF_
+  );
+  $bossScript->execute() if (! $opt_debug);
+} # doCleanup
+
+#########################################################################
+# main
+
+# Prevent "Suspended (tty input)" hanging:
+HgAutomate::closeStdin();
+
+# Make sure we have valid options and exactly 1 argument:
+checkOptions();
+usage(1) if (scalar(@ARGV) != 2);
+my $secondsStart = `date "+%s"`;
+chomp $secondsStart;
+($db, $unmaskedSeq) = @ARGV;
+
+# Force debug and verbose until this is looking pretty solid:
+#$opt_debug = 1;
+$opt_verbose = 3 if ($opt_verbose < 3);
+
+# Establish what directory we will work in.
+my $date = `date +%Y-%m-%d`;
+chomp $date;
+$buildDir = $opt_buildDir ? $opt_buildDir :
+  "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/longdust.$date";
+
+# Do everything.
+$stepper->execute();
+
+# Tell the user anything they should know.
+my $stopStep = $stepper->getStopStep();
+my $upThrough = ($stopStep eq 'cleanup') ? "" :
+  "  (through the '$stopStep' step)";
+
+my $secondsEnd = `date "+%s"`;
+chomp $secondsEnd;
+my $elapsedSeconds = $secondsEnd - $secondsStart;
+my $elapsedMinutes = int($elapsedSeconds/60);
+$elapsedSeconds -= $elapsedMinutes * 60;
+
+HgAutomate::verbose(1,
+	"\n *** All done !$upThrough - Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s\n");
+HgAutomate::verbose(1,
+	" *** Steps were performed in $buildDir\n");
+HgAutomate::verbose(1, "\n");