463df2eb31411fdb0a5462ff1ac4c7bc7c834ade
hiram
  Thu Jul 25 21:42:06 2019 -0700
manage long sequence names going into RM and work around nested repeats errors refs #23734

diff --git src/hg/utils/automation/doRepeatMasker.pl src/hg/utils/automation/doRepeatMasker.pl
index 482bdb3..5d17072 100755
--- src/hg/utils/automation/doRepeatMasker.pl
+++ src/hg/utils/automation/doRepeatMasker.pl
@@ -212,37 +212,56 @@
 set inLft = \$inLst:r.lft
 set alignOut = \$finalOut:r.align
 set catOut = \$finalOut:r.cat
 
 # Use local disk for output, and move the final result to \$outPsl
 # when done, to minimize I/O.
 set tmpDir = `mktemp -d -p /scratch/tmp doRepeatMasker.cluster.XXXXXX`
 pushd \$tmpDir
 
 # Initialize local library
 $RepeatMasker $RepeatMaskerEngine $repeatLib /dev/null
 
 foreach spec (`cat \$inLst`)
   # Remove path and .2bit filename to get just the seq:start-end spec:
   set base = `echo \$spec | sed -r -e 's/^[^:]+://'`
+  # RM has a limitation of the length of a sequence name
+  # in the case of name too long, create a shorter name, then lift
+  set nameLength = `echo \$base | wc -c`
+  set shortName = `echo \$base  | sed -e 's/>//;' | md5sum | cut -d' ' -f1`
   # If \$spec is the whole sequence, twoBitToFa removes the :start-end part,
   # which causes liftUp to barf later.  So tweak the header back to
   # seq:start-end for liftUp's sake:
-  twoBitToFa \$spec stdout \\
-  | sed -e "s/^>.*/>\$base/" > \$base.fa
+  if ( \$nameLength > 45 ) then
+    twoBitToFa \$spec stdout | sed -e "s/^>.*/>\$shortName/" > \$base.fa
+    set fastaSize = `faSize \$base.fa | grep bases | cut -d' ' -f1`
+    /bin/printf "0\\t\$shortName\\t\$fastaSize\\t\$base\\t\$fastaSize" > lift.\$base.txt
+  else
+    twoBitToFa \$spec stdout | sed -e "s/^>.*/>\$base/" > \$base.fa
+  endif
   $RepeatMasker $RepeatMaskerEngine -align $repeatLib \$base.fa
+  if ( \$nameLength > 45 ) then
+    liftUp -type=.out stdout lift.\$base.txt error \$base.fa.out > lift.\$base.out
+    liftUp -type=.align stdout lift.\$base.txt error \$base.fa.align > lift.\$base.align
+    mv -f lift.\$base.out \$base.fa.out
+    mv -f lift.\$base.align \$base.fa.align
+  endif
   if (-e \$base.fa.cat) then
+    if ( \$nameLength > 45 ) then
+      liftUp -type=.align stdout lift.\$base.txt error \$base.fa.cat > lift.\$base.cat
+      mv -f lift.\$base.cat \$base.fa.cat
+    endif
     mv \$base.fa.cat \$catOut
   endif
 end
 
 # Lift up (leave the RepeatMasker header in place because we'll liftUp
 # again later):
 liftUp -type=.out stdout \$inLft error *.fa.out > tmpOut__out
 
 set nonomatch
 set alignFiles = ( *.align )
 if ( \${#alignFiles} && -e \$alignFiles[1] ) then
   liftUp -type=.align stdout \$inLft error *.align \\
     > tmpOut__align
 else
   touch tmpOut__align
@@ -350,82 +369,99 @@
 } # doCluster
 
 #########################################################################
 # * step: cat [fileServer]
 sub doCat {
   my $runDir = "$buildDir";
   &HgAutomate::checkExistsUnlessDebug('cluster', 'cat',
 				      "$buildDir/run.cluster/run.time");
 
   my $whatItDoes =
 "It concatenates .out files from cluster run into a single $db.sorted.fa.out.\n" .
 "liftUp (with no lift specs) is used to concatenate .out files because it\n" .
 "uniquifies (per input file) the .out IDs which can then be used to join\n" .
 "fragmented repeats in the Nested Repeats track.";
   my $fileServer = &HgAutomate::chooseFileServer($runDir);
-  my $bossScript = new HgRemoteScript("$runDir/doCat.csh", $fileServer,
+  my $bossScript = newBash HgRemoteScript("$runDir/doCat.bash", $fileServer,
 				      $runDir, $whatItDoes);
 
   # Use symbolic link created in cluster step:
   my $partDir = "$buildDir/RMPart";
   my $indent = "";
   my $path = "";
   my $levels = $opt_debug ? 1 : `cat $partDir/levels`;
   chomp $levels;
   $bossScript->add("# Debug mode -- don't know the real number of levels\n")
     if ($opt_debug);
 
   # Make the appropriate number of nested foreach's to match the number
   # of levels from the partitioning.  If $levels is 1, no foreach required.
   for (my $l = $levels - 2;  $l >= 0;  $l--) {
     my $dir = ($l == ($levels - 2)) ? $partDir : "\$d" . ($l + 1);
     $bossScript->add(<<_EOF_
-${indent}foreach d$l ($dir/???)
+${indent}for d$l in $dir/???
+do
 _EOF_
     );
     $indent .= '  ';
     $path .= "\$d$l/";
   }
   for (my $l = 0;  $l <= $levels - 2;  $l++) {
     $bossScript->add(<<_EOF_
 ${indent}  liftUp ${path}cat.out /dev/null carry ${path}???/*.out
 ${indent}  liftUp ${path}cat.align /dev/null carry ${path}???/*.align
-${indent}end
+${indent}done
 _EOF_
     );
     $indent =~ s/  //;
     $path =~ s/\$d\d+\/$//;
   }
   $bossScript->add(<<_EOF_
 
 liftUp $db.fa.out /dev/null carry $partDir/???/*.out
 liftUp $db.fa.align /dev/null carry $partDir/???/*.align
 head -3 $db.fa.out > $db.sorted.fa.out
 tail -n +4 $db.fa.out | sort -k5,5 -k6,6n >> $db.sorted.fa.out
 _EOF_
   );
   $bossScript->add(<<_EOF_
 
 # Use the ID column to join up fragments of interrupted repeats for the
-# Nested Repeats track.
-$Bin/extractNestedRepeats.pl $db.fa.out | sort -k1,1 -k2,2n > $db.nestedRepeats.bed
+# Nested Repeats track.  Try to avoid the Undefined id errors.
+($Bin/extractNestedRepeats.pl $db.fa.out 2> nestRep.err || true) | sort -k1,1 -k2,2n > $db.nestedRepeats.bed
+if [ -s "nestRep.err" ]; then
+  export lineCount=`(grep "Undefined id, line" nestRep.err || true) | cut -d' ' -f6 | wc -l`
+   if [ "\${lineCount}" -gt 0 ]; then
+     if [ "\${lineCount}" -gt 10 ]; then
+        printf "ERROR: too many Undefined id lines (> 10) reported by extractNestedRepeats.pl" 1>&2
+        exit 255
+     fi
+     export sedExpr=`grep "Undefined id, line" nestRep.err | cut -d' ' -f6 | sed -e 's/\$/d;/;' | xargs echo`
+     export sedCmd="sed -i.broken -e '\$sedExpr'"
+     eval \$sedCmd $db.fa.out
+     ($Bin/extractNestedRepeats.pl $db.fa.out 2> nestRep2.err || true) | sort -k1,1 -k2,2n > $db.nestedRepeats.bed
+     if [ -s "nestRep2.err" ]; then
+        printf "ERROR: the attempt of cleaning nestedRepeats did not work" 1>&2
+        exit 255
+     fi
+   fi
+fi
 _EOF_
   );
   $bossScript->execute();
 } # doCat
 
-
 #########################################################################
 # * step: mask [workhorse]
 sub doMask {
   my $runDir = "$buildDir";
   &HgAutomate::checkExistsUnlessDebug('cat', 'mask', "$buildDir/$db.sorted.fa.out");
 
   my $whatItDoes = "It makes a masked .2bit in this build directory.";
   my $workhorse = &HgAutomate::chooseWorkhorse();
   my $bossScript = new HgRemoteScript("$runDir/doMask.csh", $workhorse,
 				      $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 twoBitMask $unmaskedSeq $db.sorted.fa.out $db.rmsk$updateTable.2bit
 twoBitToFa $db.rmsk$updateTable.2bit stdout | faSize stdin > faSize.rmsk$updateTable.txt
 _EOF_