901ed01df267d53d48309161193d3791bbc5d780
hiram
  Wed Jan 10 12:40:18 2024 -0800
alternate split procedure for large genomes in CpG Island track construction refs #29545

diff --git src/hg/utils/automation/doCpgIslands.pl src/hg/utils/automation/doCpgIslands.pl
index b674f3f..fe4b21b 100755
--- src/hg/utils/automation/doCpgIslands.pl
+++ src/hg/utils/automation/doCpgIslands.pl
@@ -32,30 +32,34 @@
     ]
 				);
 
 my $cpgLh = "/hive/data/staging/data/cpgIslandExt/cpglh";
 
 # Option defaults:
 # my $bigClusterHub = 'swarm';
 my $bigClusterHub = 'ku';
 # my $smallClusterHub = 'encodek';
 my $smallClusterHub = 'ku';
 my $workhorse = 'hgwdev';
 my $dbHost = 'hgwdev';
 my $defaultWorkhorse = 'hgwdev';
 my $maskedSeq = "$HgAutomate::clusterData/\$db/\$db.2bit";
 my $chromSizes = "$HgAutomate::clusterData/\$db/chrom.sizes";
+my $maxSplitSize = 0;	# will be set to maxSeqSize or 1,000,000,000 when split
+my $splitRun = 0;	# normally not, will become true if total genome
+                        # size if more than 4Gb or if single largest sequence
+                        # is more than 1Gb
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base db
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir         Use dir instead of default
@@ -110,60 +114,143 @@
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $workhorse = $opt_workhorse if ($opt_workhorse);
   $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub);
   $smallClusterHub = $opt_smallClusterHub if ($opt_smallClusterHub);
   $dbHost = $opt_dbHost if ($opt_dbHost);
 }
 
 #########################################################################
 # * step: hard mask [workhorse]
+#  the hard masking is always going to happen in the doCpg() step during
+#  setup.  If the incoming sequence is not masked at all, there will be
+#  no hard masking.  If it is masked, it will be hard masked.
 sub doHardMask {
   printf STDERR "# doHardMask: obsolete step, no longer needed\n";
   return 0;
 } # doHardMask
 
 #########################################################################
 # * step: cpg [workhorse]
 sub doCpg {
   # Set up and perform the cluster run to run the CpG function on the
   #     hard masked sequence.
   my $paraHub = $bigClusterHub;
   my $runDir = $buildDir;
   # Second, make sure we're starting clean.
   if (-e "$runDir/cpglh.result") {
     die "doCpg: looks like this was run successfully already " .
       "(cpglh.result exists).  Either run with -continue makeBed or some later " .
 	"stage, or move aside/remove $runDir/ and run again.\n";
   }
   &HgAutomate::mustMkdir($runDir);
+  my $bossScript;
+
+  if ($splitRun) {
+    my $templateCmd = ("./runCpg.bash " . '$(root1) '
+                . '{check out exists results/$(root1).cpg}');
+    &HgAutomate::makeGsub($runDir, $templateCmd);
+    `touch "$runDir/para_hub_$paraHub"`;
+
+    my $fh = &HgAutomate::mustOpen(">$runDir/runCpg.bash");
+    print $fh <<_EOF_
+#!/bin/bash
+set -beEu -o pipefail
+export partName=\$1
+export part2bit=partFa/\$partName.2bit
+export result=\$2
+twoBitToFa \$part2bit stdout | /hive/data/staging/data/cpgIslandExt/cpglh /dev/stdin > \$result
+_EOF_
+    ;
+    close($fh);
+
+    my $fh = &HgAutomate::mustOpen(">$runDir/oneSplit.bash");
+    print $fh <<_EOF_
+#!/bin/bash
+set -beEu -o pipefail
+export tmpFile=`mktemp -p /dev/shm doCpg.\$\$.XXXXX`
+export chromSizes=$chromSizes
+export fileSpec="\${1}"
+export file=`echo \$fileSpec | cut -d':' -f1`
+export seq=`echo \$fileSpec | cut -d':' -f2`
+export range=`echo \$fileSpec | cut -d':' -f3`
+export start=`echo \$range | cut -d'-' -f1`
+export end=`echo \$range | cut -d'-' -f2`
+export seqSize=`grep -w "\${seq}" \$chromSizes | awk '{print \$2}'`
+
+twoBitToFa \$fileSpec stdout | maskOutFa stdin hard stdout | /hive/data/staging/data/cpgIslandExt/cpglh \\
+   /dev/stdin | sed -e "s/\\t /\\t/g;" > "\${tmpFile}"
+printf "%d\\t%s:%d-%d\\t%d\\t%s\\t%d\\n" "\${start}" "\${seq}"  "\${start}" "\${end}" "\${seqSize}" "\${seq}" "\${seqSize}" \\
+  | liftUp -type=.bed results/\${seq}:\${start}-\${end}.cpg stdin error "\${tmpFile}"
+
+rm -f "\${tmpFile}"
+_EOF_
+    ;
+    close($fh);
 
   my $whatItDoes = "Run /hive/data/staging/data/cpgIslandExt/cpglh on masked sequence.";
-  my $bossScript = newBash HgRemoteScript("$runDir/doCpg.bash", $workhorse,
+  $bossScript = newBash HgRemoteScript("$runDir/doCpg.bash", $workhorse,
+				      $runDir, $whatItDoes);
+  my $paraRun = &HgAutomate::paraRun();
+  my $gensub2 = &HgAutomate::gensub2();
+
+  $bossScript->add(<<_EOF_
+export twoBit=\"$maskedSeq\"
+rm -fr parts partFa
+mkdir partFa
+/cluster/bin/scripts/partitionSequence.pl -lstDir parts $maxSplitSize 0 \$twoBit $chromSizes 1000 > part.list
+for L in parts/part*.lst
+do
+  B=`basename \$L | sed -e 's/.lst//;'`
+  sed -e 's/.*.2bit://; s/:0-.*//;' \${L} > \${B}.list
+  twoBitToFa -seqList=\$B.list \${twoBit} stdout | maskOutFa stdin hard stdout \\
+     | faToTwoBit stdin partFa/\$B.t.2bit
+  rm -f \${B}.list
+  twoBitToFa partFa/\$B.t.2bit stdout | faCount stdin | egrep -v \"^total|^#seq\" | awk '\$2-\$7 > 200 { printf \"%s\\n\", \$1}' > \$B.list
+  if [ -s \$B.list ]; then
+    twoBitToFa -seqList=\$B.list partFa/\$B.t.2bit stdout | faToTwoBit stdin partFa/\$B.2bit
+  fi
+  rm -f partFa/\$B.t.2bit \$B.list
+done
+mkdir -p results
+chmod a+x runCpg.bash oneSplit.bash
+grep -v "parts/part" part.list | xargs -L 1 --no-run-if-empty ./oneSplit.bash
+rm -f file.list
+find ./partFa -type f > file.list
+$gensub2 file.list single gsub jobList
+$paraRun
+catDir -r results | sort -k1,1 -k2,2n > cpglh.result
+_EOF_
+  );
+  } else {
+
+  my $whatItDoes = "Run /hive/data/staging/data/cpgIslandExt/cpglh on masked sequence.";
+  $bossScript = newBash HgRemoteScript("$runDir/doCpg.bash", $workhorse,
 				      $runDir, $whatItDoes);
   $bossScript->add(<<_EOF_
 export twoBit=\"$maskedSeq\"
 twoBitToFa \$twoBit stdout | maskOutFa stdin hard stdout \\
    | /hive/data/staging/data/cpgIslandExt/cpglh /dev/stdin 2> cpglh.stderr \\
      > cpglh.result
 _EOF_
   );
+  }
 
   $bossScript->execute();
 } # doCpg
 
 #########################################################################
 # * step: make bed [workhorse]
 sub doMakeBed {
   my $runDir = $buildDir;
   &HgAutomate::mustMkdir($runDir);
 
   # First, make sure we're starting clean.
   if (-e "$runDir/cpgIsland.bed") {
     die "doMakeBed: looks like this was run successfully already " .
       "(cpgIsland.bed exists).  Either run with -continue load or cleanup " .
 	"or move aside/remove $runDir/cpgIsland.bed and run again.\n";
@@ -181,56 +268,56 @@
      | sort -k1,1 -k2,2n > cpgIsland.bed
 bedToBigBed -tab -type=bed4+6 -as=\$HOME/kent/src/hg/lib/cpgIslandExt.as \\
   cpgIsland.bed $chromSizes $db.$tableName.bb
 _EOF_
   );
   $bossScript->execute();
 } # doMakeBed
 
 #########################################################################
 # * step: load [dbHost]
 sub doLoadCpg {
   my $runDir = $buildDir;
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "Loads cpgIsland.bed.";
-  my $bossScript = new HgRemoteScript("$runDir/doLoadCpg.csh", $dbHost,
+  my $bossScript = newBash HgRemoteScript("$runDir/doLoadCpg.bash", $dbHost,
 				      $runDir, $whatItDoes);
 
   if (! -e "$runDir/cpgIsland.bed") {
     die "doLoadCpg previous step doMakeBed has not completed, cpgIsland.bed not found\n";
   }
 
   $bossScript->add(<<_EOF_
-set C=`cut -f1 cpgIsland.bed | sort -u | awk '{print length(\$0)}' | sort -rn | sed -n -e '1,1 p'`
+export C=`cut -f1 cpgIsland.bed | sort -u | awk '{print length(\$0)}' | sort -rn | sed -n -e '1,1 p'`
 sed -e "s/14/\${C}/" \$HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandExt.sql
 hgLoadBed -noLoad -sqlTable=cpgIslandExt.sql -tab $db $tableName cpgIsland.bed
 hgLoadSqlTab $db $tableName cpgIslandExt.sql bed.tab
 checkTableCoords -verboseBlocks -table=$tableName $db
 featureBits $db $tableName >&fb.$db.$tableName.txt
 _EOF_
   );
   $bossScript->execute();
 } # doLoad
 
 #########################################################################
 # * step: cleanup [fileServer]
 sub doCleanup {
   my $runDir = $buildDir;
   my $whatItDoes = "It cleans up or compresses intermediate files.";
   my $fileServer = &HgAutomate::chooseFileServer($runDir);
-  my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $fileServer,
+  my $bossScript = newBash HgRemoteScript("$runDir/doCleanup.bash", $fileServer,
 				      $runDir, $whatItDoes);
   $bossScript->add(<<_EOF_
 rm -f bed.tab cpgIslandExt.sql
 gzip cpgIsland.bed
 _EOF_
   );
   $bossScript->execute();
 } # doCleanup
 
 #########################################################################
 # main
 
 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();
 
@@ -242,30 +329,44 @@
 ($db) = @ARGV;
 
 # Force debug and verbose until this is looking pretty solid:
 #$opt_debug = 1;
 #$opt_verbose = 3 if ($opt_verbose < 3);
 
 # Establish what directory we will work in.
 $buildDir = $opt_buildDir ? $opt_buildDir :
   "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/cpgIslands";
 $maskedSeq = $opt_maskedSeq ? $opt_maskedSeq :
   "$HgAutomate::clusterData/$db/$db.2bit";
 $chromSizes = $opt_chromSizes ? $opt_chromSizes :
   "$HgAutomate::clusterData/$db/chrom.sizes";
 $tableName = $opt_tableName ? $opt_tableName : "cpgIslandExt";
 
+my $maxSeqSize=`sort -k2,2nr $chromSizes | head -1 | awk '{printf "%d", \$NF}'`;
+my $totalSeqSize=`ave -col=2 $chromSizes | grep -w total | awk '{printf "%d", \$NF}'`;
+chomp $maxSeqSize;
+chomp $totalSeqSize;
+$maxSplitSize = $maxSeqSize;
+$splitRun = 0;
+#   big genomes are over 4Gb: 4*1024*1024*1024 = 4294967296
+#   or if maxSeqSize over 1Gb
+if ( ($maxSeqSize > 4*1024**3) || ($maxSeqSize > 1024**3) ) {
+   $splitRun = 1;
+   $maxSplitSize = 1000000000;
+}
+printf STDERR "# total genome size %d, max sequence size: %d, splitRun: %s\n", $totalSeqSize, $maxSeqSize, $splitRun ? "TRUE" : "FALSE";
+
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";
 
 $secondsEnd = `date "+%s"`;
 chomp $secondsEnd;
 my $elapsedSeconds = $secondsEnd - $secondsStart;
 my $elapsedMinutes = int($elapsedSeconds/60);
 $elapsedSeconds -= $elapsedMinutes * 60;
 
 &HgAutomate::verbose(1,