463df2eb31411fdb0a5462ff1ac4c7bc7c834ade hiram Thu Jul 25 21:42:06 2019 -0700 manage long sequence names going into RM and work around nested repeats errors refs #23734 diff --git src/hg/utils/automation/doRepeatMasker.pl src/hg/utils/automation/doRepeatMasker.pl index 482bdb3..5d17072 100755 --- src/hg/utils/automation/doRepeatMasker.pl +++ src/hg/utils/automation/doRepeatMasker.pl @@ -212,37 +212,56 @@ set inLft = \$inLst:r.lft set alignOut = \$finalOut:r.align set catOut = \$finalOut:r.cat # Use local disk for output, and move the final result to \$outPsl # when done, to minimize I/O. set tmpDir = `mktemp -d -p /scratch/tmp doRepeatMasker.cluster.XXXXXX` pushd \$tmpDir # Initialize local library $RepeatMasker $RepeatMaskerEngine $repeatLib /dev/null foreach spec (`cat \$inLst`) # Remove path and .2bit filename to get just the seq:start-end spec: set base = `echo \$spec | sed -r -e 's/^[^:]+://'` + # RM has a limitation of the length of a sequence name + # in the case of name too long, create a shorter name, then lift + set nameLength = `echo \$base | wc -c` + set shortName = `echo \$base | sed -e 's/>//;' | md5sum | cut -d' ' -f1` # If \$spec is the whole sequence, twoBitToFa removes the :start-end part, # which causes liftUp to barf later. So tweak the header back to # seq:start-end for liftUp's sake: - twoBitToFa \$spec stdout \\ - | sed -e "s/^>.*/>\$base/" > \$base.fa + if ( \$nameLength > 45 ) then + twoBitToFa \$spec stdout | sed -e "s/^>.*/>\$shortName/" > \$base.fa + set fastaSize = `faSize \$base.fa | grep bases | cut -d' ' -f1` + /bin/printf "0\\t\$shortName\\t\$fastaSize\\t\$base\\t\$fastaSize" > lift.\$base.txt + else + twoBitToFa \$spec stdout | sed -e "s/^>.*/>\$base/" > \$base.fa + endif $RepeatMasker $RepeatMaskerEngine -align $repeatLib \$base.fa + if ( \$nameLength > 45 ) then + liftUp -type=.out stdout lift.\$base.txt error \$base.fa.out > lift.\$base.out + liftUp -type=.align stdout lift.\$base.txt error \$base.fa.align > lift.\$base.align + mv -f lift.\$base.out \$base.fa.out + mv -f lift.\$base.align \$base.fa.align + endif if (-e \$base.fa.cat) then + if ( \$nameLength > 45 ) then + liftUp -type=.align stdout lift.\$base.txt error \$base.fa.cat > lift.\$base.cat + mv -f lift.\$base.cat \$base.fa.cat + endif mv \$base.fa.cat \$catOut endif end # Lift up (leave the RepeatMasker header in place because we'll liftUp # again later): liftUp -type=.out stdout \$inLft error *.fa.out > tmpOut__out set nonomatch set alignFiles = ( *.align ) if ( \${#alignFiles} && -e \$alignFiles[1] ) then liftUp -type=.align stdout \$inLft error *.align \\ > tmpOut__align else touch tmpOut__align @@ -350,82 +369,99 @@ } # doCluster ######################################################################### # * step: cat [fileServer] sub doCat { my $runDir = "$buildDir"; &HgAutomate::checkExistsUnlessDebug('cluster', 'cat', "$buildDir/run.cluster/run.time"); my $whatItDoes = "It concatenates .out files from cluster run into a single $db.sorted.fa.out.\n" . "liftUp (with no lift specs) is used to concatenate .out files because it\n" . "uniquifies (per input file) the .out IDs which can then be used to join\n" . "fragmented repeats in the Nested Repeats track."; my $fileServer = &HgAutomate::chooseFileServer($runDir); - my $bossScript = new HgRemoteScript("$runDir/doCat.csh", $fileServer, + my $bossScript = newBash HgRemoteScript("$runDir/doCat.bash", $fileServer, $runDir, $whatItDoes); # Use symbolic link created in cluster step: my $partDir = "$buildDir/RMPart"; my $indent = ""; my $path = ""; my $levels = $opt_debug ? 1 : `cat $partDir/levels`; chomp $levels; $bossScript->add("# Debug mode -- don't know the real number of levels\n") if ($opt_debug); # Make the appropriate number of nested foreach's to match the number # of levels from the partitioning. If $levels is 1, no foreach required. for (my $l = $levels - 2; $l >= 0; $l--) { my $dir = ($l == ($levels - 2)) ? $partDir : "\$d" . ($l + 1); $bossScript->add(<<_EOF_ -${indent}foreach d$l ($dir/???) +${indent}for d$l in $dir/??? +do _EOF_ ); $indent .= ' '; $path .= "\$d$l/"; } for (my $l = 0; $l <= $levels - 2; $l++) { $bossScript->add(<<_EOF_ ${indent} liftUp ${path}cat.out /dev/null carry ${path}???/*.out ${indent} liftUp ${path}cat.align /dev/null carry ${path}???/*.align -${indent}end +${indent}done _EOF_ ); $indent =~ s/ //; $path =~ s/\$d\d+\/$//; } $bossScript->add(<<_EOF_ liftUp $db.fa.out /dev/null carry $partDir/???/*.out liftUp $db.fa.align /dev/null carry $partDir/???/*.align head -3 $db.fa.out > $db.sorted.fa.out tail -n +4 $db.fa.out | sort -k5,5 -k6,6n >> $db.sorted.fa.out _EOF_ ); $bossScript->add(<<_EOF_ # Use the ID column to join up fragments of interrupted repeats for the -# Nested Repeats track. -$Bin/extractNestedRepeats.pl $db.fa.out | sort -k1,1 -k2,2n > $db.nestedRepeats.bed +# Nested Repeats track. Try to avoid the Undefined id errors. +($Bin/extractNestedRepeats.pl $db.fa.out 2> nestRep.err || true) | sort -k1,1 -k2,2n > $db.nestedRepeats.bed +if [ -s "nestRep.err" ]; then + export lineCount=`(grep "Undefined id, line" nestRep.err || true) | cut -d' ' -f6 | wc -l` + if [ "\${lineCount}" -gt 0 ]; then + if [ "\${lineCount}" -gt 10 ]; then + printf "ERROR: too many Undefined id lines (> 10) reported by extractNestedRepeats.pl" 1>&2 + exit 255 + fi + export sedExpr=`grep "Undefined id, line" nestRep.err | cut -d' ' -f6 | sed -e 's/\$/d;/;' | xargs echo` + export sedCmd="sed -i.broken -e '\$sedExpr'" + eval \$sedCmd $db.fa.out + ($Bin/extractNestedRepeats.pl $db.fa.out 2> nestRep2.err || true) | sort -k1,1 -k2,2n > $db.nestedRepeats.bed + if [ -s "nestRep2.err" ]; then + printf "ERROR: the attempt of cleaning nestedRepeats did not work" 1>&2 + exit 255 + fi + fi +fi _EOF_ ); $bossScript->execute(); } # doCat - ######################################################################### # * step: mask [workhorse] sub doMask { my $runDir = "$buildDir"; &HgAutomate::checkExistsUnlessDebug('cat', 'mask', "$buildDir/$db.sorted.fa.out"); my $whatItDoes = "It makes a masked .2bit in this build directory."; my $workhorse = &HgAutomate::chooseWorkhorse(); my $bossScript = new HgRemoteScript("$runDir/doMask.csh", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ twoBitMask $unmaskedSeq $db.sorted.fa.out $db.rmsk$updateTable.2bit twoBitToFa $db.rmsk$updateTable.2bit stdout | faSize stdin > faSize.rmsk$updateTable.txt _EOF_