7c934496e3650521dc129b65658326d542cbd20b
hiram
  Tue May 17 12:52:51 2022 -0700
trying out new rmsk format bigBed joined display no redmine

diff --git src/hg/utils/automation/doRepeatMasker.pl src/hg/utils/automation/doRepeatMasker.pl
index d27099e..216085e 100755
--- src/hg/utils/automation/doRepeatMasker.pl
+++ src/hg/utils/automation/doRepeatMasker.pl
@@ -212,40 +212,43 @@
   if ( $opt_useHMMER ) {
     # NOTE: This is only applicable for 8gb one-job-per-node scheduling
     $RepeatMaskerEngine = "-engine hmmer -pa 4";
     $parasolRAM = "-cpu=4 -ram=32g";
   }
 
   # Script to do a dummy run of RepeatMasker, to test our invocation and
   # unpack library files before kicking off a large cluster run.
   #  And now that RM is being run from local /scratch/data/RepeatMasker/
   #  this is also done in the cluster run script so each node will have
   #	its library initialized
   my $fh = &HgAutomate::mustOpen(">$runDir/dummyRun.csh");
   print $fh <<_EOF_
 #!/bin/csh -ef
 
+set path = (/cluster/software/bin \$path)
 $RepeatMasker $RepeatMaskerEngine $repeatLib /dev/null
 _EOF_
   ;
   close($fh);
 
   # Cluster job script:
   $fh = &HgAutomate::mustOpen(">$runDir/RMRun.csh");
   print $fh <<_EOF_
 #!/bin/csh -ef
 
+set path = (/cluster/software/bin \$path)
+
 set finalOut = \$1
 
 set inLst = \$finalOut:r
 set inLft = \$inLst:r.lft
 set alignOut = \$finalOut:r.align
 set catOut = \$finalOut:r.cat
 
 # Use local disk for output, and move the final result to \$outPsl
 # when done, to minimize I/O.
 set tmpDir = `mktemp -d -p /scratch/tmp doRepeatMasker.cluster.XXXXXX`
 pushd \$tmpDir
 
 # Initialize local library
 $RepeatMasker $RepeatMaskerEngine $repeatLib /dev/null
 
@@ -558,65 +561,78 @@
    fi
 fi
 _EOF_
   );
   $bossScript->execute();
 } # doCat
 
 #########################################################################
 # * step: mask [workhorse]
 sub doMask {
   my $runDir = "$buildDir";
   &HgAutomate::checkExistsUnlessDebug('cat', 'mask', "$buildDir/$db.sorted.fa.out");
 
   my $whatItDoes = "It makes a masked .2bit in this build directory.";
   my $workhorse = &HgAutomate::chooseWorkhorse();
-  my $bossScript = new HgRemoteScript("$runDir/doMask.csh", $workhorse,
+  my $bossScript = newBash HgRemoteScript("$runDir/doMask.bash", $workhorse,
 				      $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
-twoBitMask $unmaskedSeq $db.sorted.fa.out $db.rmsk$updateTable.2bit
-twoBitToFa $db.rmsk$updateTable.2bit stdout | faSize stdin > faSize.rmsk$updateTable.txt
+export db=$db
+twoBitMask $unmaskedSeq \$db.sorted.fa.out \$db.rmsk$updateTable.2bit
+twoBitToFa \$db.rmsk$updateTable.2bit stdout | faSize stdin > faSize.rmsk$updateTable.txt
 _EOF_
   );
   $bossScript->execute();
 } # doMask
 
-
 #########################################################################
 # * step: install [dbHost, maybe fileServer]
 sub doInstall {
   my $runDir = "$buildDir";
   &HgAutomate::checkExistsUnlessDebug('cat', 'install', "$buildDir/$db.sorted.fa.out");
 
   my $split = "";
   $split = " (split)" if ($opt_splitTables);
   my $whatItDoes =
 "It loads $db.sorted.fa.out into the$split rmsk$updateTable table and $db.nestedRepeats.bed\n" .
 "into the nestedRepeats table.  It also installs the masked 2bit.";
   my $bossScript = newBash HgRemoteScript("$runDir/doLoad.bash", $dbHost,
 				      $runDir, $whatItDoes);
 
   $split = "-nosplit";
   $split = "-split" if ($opt_splitTables);
   my $installDir = "$HgAutomate::clusterData/$db";
   $bossScript->add(<<_EOF_
 export db=$db
 
+# ensure sort functions properly despite kluster node environment
+export LC_COLLATE=C
+
 hgLoadOut -table=rmsk$updateTable $split \$db \$db.sorted.fa.out
 hgLoadOut -verbose=2 -tabFile=\$db.rmsk$updateTable.tab -table=rmsk$updateTable -nosplit \$db \$db.sorted.fa.out 2> \$db.bad.records.txt
 # construct bbi files for assembly hub
+$RepeatMaskerPath/util/rmToTrackHub.pl -out \$db.sorted.fa.out -align \$db.fa.align
+# sort each file in place: -o lets sort write its output over its own input file
+sort -k1,1 -k2,2n -o \$db.fa.align.tsv \$db.fa.align.tsv &
+sort -k1,1 -k2,2n -o \$db.sorted.fa.join.tsv \$db.sorted.fa.join.tsv
+wait
+bedToBigBed -tab -as=\$HOME/kent/src/hg/lib/bigRmskAlignBed.as -type=bed3+14 \\
+  \$db.fa.align.tsv ../../chrom.sizes \$db.rmsk.align.bb &
+bedToBigBed -tab -as=\$HOME/kent/src/hg/lib/bigRmskBed.as -type=bed9+5 \\
+  \$db.sorted.fa.join.tsv ../../chrom.sizes \$db.rmsk.bb
+wait
 rm -fr classBed classBbi rmskClass
 mkdir classBed classBbi rmskClass
 sort -k12,12 \$db.rmsk$updateTable.tab \\
   | splitFileByColumn -ending=tab  -col=12 -tab stdin rmskClass
 for T in SINE LINE LTR DNA Simple Low_complexity Satellite
 do
     fileCount=`(ls rmskClass/\${T}*.tab 2> /dev/null || true) | wc -l`
     if [ "\$fileCount" -gt 0 ]; then
        echo "working: "`ls rmskClass/\${T}*.tab | xargs echo`
        \$HOME/kent/src/hg/utils/automation/rmskBed6+10.pl rmskClass/\${T}*.tab \\
         | sort -k1,1 -k2,2n > classBed/\$db.rmsk.\${T}.bed
        bedToBigBed -tab -type=bed6+10 -as=\$HOME/kent/src/hg/lib/rmskBed6+10.as \\
          classBed/\$db.rmsk.\${T}.bed ../../chrom.sizes \\
            classBbi/\$db.rmsk.\${T}.bb
     fi
@@ -676,36 +692,37 @@
   }
 
   $bossScript->add(<<_EOF_
 rm -f $installDir/\$db.rmsk$updateTable.2bit
 ln -s $buildDir/\$db.rmsk$updateTable.2bit $installDir/\$db.rmsk$updateTable.2bit
 _EOF_
   );
   $bossScript->execute();
 
   # Make a new script for the fileServer if chrom-based:
   if ($chromBased) {
     my $fileServer = &HgAutomate::chooseFileServer($runDir);
     $whatItDoes =
 "It splits $db.sorted.fa.out into per-chromosome files in chromosome directories\n" .
 "where makeDownload.pl will expect to find them.\n";
-    my $bossScript = new HgRemoteScript("$runDir/doSplit.csh", $fileServer,
+    my $bossScript = newBash HgRemoteScript("$runDir/doSplit.bash", $fileServer,
 					$runDir, $whatItDoes);
     $bossScript->add(<<_EOF_
-head -3 $db.sorted.fa.out > /tmp/rmskHead.txt
-tail -n +4 $db.sorted.fa.out \\
-| splitFileByColumn -col=5 stdin /cluster/data/$db -chromDirs \\
+export db=$db
+head -3 \$db.sorted.fa.out > /tmp/rmskHead.txt
+tail -n +4 \$db.sorted.fa.out \\
+| splitFileByColumn -col=5 stdin /cluster/data/\$db -chromDirs \\
     -ending=.fa.out -head=/tmp/rmskHead.txt
 _EOF_
     );
     $bossScript->execute();
   }
 } # doInstall
 
 
 #########################################################################
 # * step: cleanup [fileServer]
 sub doCleanup {
   my $runDir = "$buildDir";
   my $whatItDoes = "It cleans up or compresses intermediate files.";
   my $fileServer = &HgAutomate::chooseFileServer($runDir);
   my $bossScript = newBash HgRemoteScript("$runDir/doCleanup.bash", $fileServer,