7c934496e3650521dc129b65658326d542cbd20b hiram Tue May 17 12:52:51 2022 -0700 trying out new rmsk format bigBed joined display no redmine diff --git src/hg/utils/automation/doRepeatMasker.pl src/hg/utils/automation/doRepeatMasker.pl index d27099e..216085e 100755 --- src/hg/utils/automation/doRepeatMasker.pl +++ src/hg/utils/automation/doRepeatMasker.pl @@ -212,40 +212,43 @@ if ( $opt_useHMMER ) { # NOTE: This is only applicable for 8gb one-job-per-node scheduling $RepeatMaskerEngine = "-engine hmmer -pa 4"; $parasolRAM = "-cpu=4 -ram=32g"; } # Script to do a dummy run of RepeatMasker, to test our invocation and # unpack library files before kicking off a large cluster run. # And now that RM is being run from local /scratch/data/RepeatMasker/ # this is also done in the cluster run script so each node will have # its library initialized my $fh = &HgAutomate::mustOpen(">$runDir/dummyRun.csh"); print $fh <<_EOF_ #!/bin/csh -ef +set path = (/cluster/software/bin \$path) $RepeatMasker $RepeatMaskerEngine $repeatLib /dev/null _EOF_ ; close($fh); # Cluster job script: $fh = &HgAutomate::mustOpen(">$runDir/RMRun.csh"); print $fh <<_EOF_ #!/bin/csh -ef +set path = (/cluster/software/bin \$path) + set finalOut = \$1 set inLst = \$finalOut:r set inLft = \$inLst:r.lft set alignOut = \$finalOut:r.align set catOut = \$finalOut:r.cat # Use local disk for output, and move the final result to \$outPsl # when done, to minimize I/O. set tmpDir = `mktemp -d -p /scratch/tmp doRepeatMasker.cluster.XXXXXX` pushd \$tmpDir # Initialize local library $RepeatMasker $RepeatMaskerEngine $repeatLib /dev/null @@ -558,65 +561,78 @@ fi fi _EOF_ ); $bossScript->execute(); } # doCat ######################################################################### # * step: mask [workhorse] sub doMask { my $runDir = "$buildDir"; &HgAutomate::checkExistsUnlessDebug('cat', 'mask', "$buildDir/$db.sorted.fa.out"); my $whatItDoes = "It makes a masked .2bit in this build directory."; my $workhorse = &HgAutomate::chooseWorkhorse(); - my $bossScript = new HgRemoteScript("$runDir/doMask.csh", $workhorse, + my $bossScript = newBash HgRemoteScript("$runDir/doMask.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ -twoBitMask $unmaskedSeq $db.sorted.fa.out $db.rmsk$updateTable.2bit -twoBitToFa $db.rmsk$updateTable.2bit stdout | faSize stdin > faSize.rmsk$updateTable.txt +export db=$db +twoBitMask $unmaskedSeq \$db.sorted.fa.out \$db.rmsk$updateTable.2bit +twoBitToFa \$db.rmsk$updateTable.2bit stdout | faSize stdin > faSize.rmsk$updateTable.txt & _EOF_ ); $bossScript->execute(); } # doMask - ######################################################################### # * step: install [dbHost, maybe fileServer] sub doInstall { my $runDir = "$buildDir"; &HgAutomate::checkExistsUnlessDebug('cat', 'install', "$buildDir/$db.sorted.fa.out"); my $split = ""; $split = " (split)" if ($opt_splitTables); my $whatItDoes = "It loads $db.sorted.fa.out into the$split rmsk$updateTable table and $db.nestedRepeats.bed\n" . "into the nestedRepeats table. It also installs the masked 2bit."; my $bossScript = newBash HgRemoteScript("$runDir/doLoad.bash", $dbHost, $runDir, $whatItDoes); $split = "-nosplit"; $split = "-split" if ($opt_splitTables); my $installDir = "$HgAutomate::clusterData/$db"; $bossScript->add(<<_EOF_ export db=$db +# ensure sort functions properly despite kluster node environment +export LC_COLLATE=C + hgLoadOut -table=rmsk$updateTable $split \$db \$db.sorted.fa.out hgLoadOut -verbose=2 -tabFile=\$db.rmsk$updateTable.tab -table=rmsk$updateTable -nosplit \$db \$db.sorted.fa.out 2> \$db.bad.records.txt # construct bbi files for assembly hub +$RepeatMaskerPath/util/rmToTrackHub.pl -out \$db.sorted.fa.out -align \$db.fa.align +# in place same file sort using the -o output option +sort -k1,1 -k2,2n -o \$db.fa.align.tsv \$db.fa.align.tsv & +sort -k1,1 -k2,2n -o \$db.sorted.fa.join.tsv \$db.sorted.fa.join.tsv +wait +bedToBigBed -tab -as=\$HOME/kent/src/hg/lib/bigRmskAlignBed.as -type=bed3+14 \\ + \$db.fa.align.tsv ../../chrom.sizes \$db.rmsk.align.bb & +bedToBigBed -tab -as=\$HOME/kent/src/hg/lib/bigRmskBed.as -type=bed9+5 \\ + \$db.sorted.fa.join.tsv ../../chrom.sizes \$db.rmsk.bb +wait rm -fr classBed classBbi rmskClass mkdir classBed classBbi rmskClass sort -k12,12 \$db.rmsk$updateTable.tab \\ | splitFileByColumn -ending=tab -col=12 -tab stdin rmskClass for T in SINE LINE LTR DNA Simple Low_complexity Satellite do fileCount=`(ls rmskClass/\${T}*.tab 2> /dev/null || true) | wc -l` if [ "\$fileCount" -gt 0 ]; then echo "working: "`ls rmskClass/\${T}*.tab | xargs echo` \$HOME/kent/src/hg/utils/automation/rmskBed6+10.pl rmskClass/\${T}*.tab \\ | sort -k1,1 -k2,2n > classBed/\$db.rmsk.\${T}.bed bedToBigBed -tab -type=bed6+10 -as=\$HOME/kent/src/hg/lib/rmskBed6+10.as \\ classBed/\$db.rmsk.\${T}.bed ../../chrom.sizes \\ classBbi/\$db.rmsk.\${T}.bb fi @@ -676,36 +692,37 @@ } $bossScript->add(<<_EOF_ rm -f $installDir/\$db.rmsk$updateTable.2bit ln -s $buildDir/\$db.rmsk$updateTable.2bit $installDir/\$db.rmsk$updateTable.2bit _EOF_ ); $bossScript->execute(); # Make a new script for the fileServer if chrom-based: if ($chromBased) { my $fileServer = &HgAutomate::chooseFileServer($runDir); $whatItDoes = "It splits $db.sorted.fa.out into per-chromosome files in chromosome directories\n" . "where makeDownload.pl will expect to find them.\n"; - my $bossScript = new HgRemoteScript("$runDir/doSplit.csh", $fileServer, + my $bossScript = newBash HgRemoteScript("$runDir/doSplit.bash", $fileServer, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ -head -3 $db.sorted.fa.out > /tmp/rmskHead.txt -tail -n +4 $db.sorted.fa.out \\ -| splitFileByColumn -col=5 stdin /cluster/data/$db -chromDirs \\ +export db=$db +head -3 \$db.sorted.fa.out > /tmp/rmskHead.txt +tail -n +4 \$db.sorted.fa.out \\ +| splitFileByColumn -col=5 stdin /cluster/data/\$db -chromDirs \\ -ending=.fa.out -head=/tmp/rmskHead.txt _EOF_ ); $bossScript->execute(); } } # doInstall ######################################################################### # * step: cleanup [fileServer] sub doCleanup { my $runDir = "$buildDir"; my $whatItDoes = "It cleans up or compresses intermediate files."; my $fileServer = &HgAutomate::chooseFileServer($runDir); my $bossScript = newBash HgRemoteScript("$runDir/doCleanup.bash", $fileServer,