3c8ffb11a3e0a8830f64c3e19ca7f7ff8b167f33
hiram
  Wed Jul 20 17:19:51 2022 -0700
handle cases where there is no result from RepeatMasker or TRF simple repeats for virus browser building refs #29484

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index cdae480..0d8dfb5 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -1262,31 +1262,39 @@
   }
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 export buildDir=$buildDir
 
 if [ \$buildDir/\$asmId.2bit -nt trfMask.bed.gz ]; then
   doSimpleRepeat.pl -stop=filter -buildDir=`pwd` \\
     -unmaskedSeq=\$buildDir/\$asmId.2bit \\
       -trf409=6 -dbHost=$dbHost -smallClusterHub=$trfClusterHub \\
         -workhorse=$workhorse \$asmId
   doSimpleRepeat.pl -buildDir=`pwd` \\
     -continue=cleanup -stop=cleanup -unmaskedSeq=\$buildDir/\$asmId.2bit \\
       -trf409=6 -dbHost=$dbHost -smallClusterHub=$trfClusterHub \\
         -workhorse=$workhorse \$asmId
-  gzip simpleRepeat.bed trfMask.bed
+  if [ -s simpleRepeat.bed ]; then
+    gzip simpleRepeat.bed &
+  else
+    rm -f simpleRepeat.bed
+  fi
+  if [ -s trfMask.bed ]; then
+    gzip trfMask.bed &
+  fi
+  wait
 fi
 _EOF_
   );
   $bossScript->execute();
 } # simpleRepeat
 
 ##   my $rmskResult = "$buildDir/trackData/repeatMasker/$asmId.rmsk.2bit";
 ##   if (! -s $rmskResult) {
 ##     die "simpleRepeat: previous step repeatMasker has not completed\n" .
 ##       "# not found: $rmskResult\n";
 ##   }
 ##   twoBitMask ../repeatMasker/\$asmId.rmsk.2bit -add trfMask.bed \\
 ##     \$asmId.RM_TRF_masked.2bit
 
 #########################################################################
@@ -1415,63 +1423,72 @@
   my $runDir = "$buildDir/trackData/addMask";
 
   my $goNoGo = 0;
   if (! $noRmsk) {
     if ( ! -s "$buildDir/trackData/repeatMasker/$asmId.rmsk.2bit" ) {
        printf STDERR "ERROR: repeatMasker step not completed\n";
        printf STDERR "can not find: $buildDir/trackData/repeatMasker/$asmId.rmsk.2bit\n";
        $goNoGo = 1;
     }
   }
   if ( ! -s "$buildDir/trackData/windowMasker/$asmId.cleanWMSdust.2bit" ) {
       printf STDERR "ERROR: windowMasker step not completed\n";
       printf STDERR "can not find: $buildDir/trackData/windowMasker/$asmId.cleanWMSdust.2bit\n";
       $goNoGo = 1;
   }
-  if ( ! -s "$buildDir/trackData/simpleRepeat/trfMask.bed.gz" ) {
+  if ( ! -s "$buildDir/trackData/simpleRepeat/doCleanup.csh" ) {
       printf STDERR "ERROR: simpleRepeat step not completed\n";
-      printf STDERR "can not find: $buildDir/trackData/simpleRepeat/trfMask.bed.gz\n";
+      printf STDERR "can not find: $buildDir/trackData/simpleRepeat/doCleanup.csh\n";
       $goNoGo = 1;
   }
   if ($goNoGo) {
       printf STDERR "ERROR: must complete repeatMasker, windowMasker and simpleRepeat before addMask\n";
       exit 255;
   }
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "add together (windowMasker or repeatMasker) and trf/simpleRepeats to construct masked 2bit file";
   my $bossScript = newBash HgRemoteScript("$runDir/doAddMask.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   my $wmMasked=`grep "masked total" $buildDir/trackData/windowMasker/faSize.$asmId.cleanWMSdust.txt | awk '{print \$1}' | sed -e 's/%//;'`;
   my $rmMasked = 0;
   if (! $noRmsk) {
     $rmMasked=`grep "masked total" $buildDir/trackData/repeatMasker/faSize.rmsk.txt | awk '{print \$1}' | sed -e 's/%//;'`;
   }
 
   my $src2BitToMask = "../repeatMasker/$asmId.rmsk.2bit";
   if ($noRmsk || ($wmMasked > $rmMasked)) {
     $src2BitToMask = "../windowMasker/$asmId.cleanWMSdust.2bit";
   }
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
+export src2Bit=$src2BitToMask
 export accessionId=`echo \$asmId | cut -d'_' -f1-2`
 
+# if simple repeat has a result, add it, otherwise no add
+if [ -s ../simpleRepeat/trfMask.bed.gz ]; then
   if [ ../simpleRepeat/trfMask.bed.gz -nt \$asmId.masked.faSize.txt ]; then
-  twoBitMask $src2BitToMask -type=.bed \\
+    twoBitMask \$src2Bit -type=.bed \\
        -add ../simpleRepeat/trfMask.bed.gz \$asmId.masked.2bit
+  fi
+else
+  cp -p \$src2Bit \$asmId.masked.2bit
+fi
+
+if [ \$asmId.masked.2bit -nt \$asmId.masked.faSize.txt ]; then
   twoBitToFa \$asmId.masked.2bit stdout | faSize stdin > \$asmId.masked.faSize.txt
   touch -r \$asmId.masked.2bit \$asmId.masked.faSize.txt
   bptForTwoBit \$asmId.masked.2bit \$asmId.masked.2bit.bpt
   touch -r \$asmId.masked.2bit \$asmId.masked.2bit.bpt
   twoBitToFa \$asmId.masked.2bit stdout | gzip -c > \$asmId.fa.gz
   touch -r \$asmId.masked.2bit \$asmId.fa.gz
   cp -p \$asmId.fa.gz ../../\$asmId.fa.gz
   cp -p \$asmId.masked.faSize.txt ../../\$asmId.faSize.txt
   cp -p \$asmId.masked.2bit.bpt ../../\$asmId.2bit.bpt
   size=`grep -w bases \$asmId.masked.faSize.txt | cut -d' ' -f1`
   if [ \$size -lt 4294967297 ]; then
     ln \$asmId.masked.2bit \$accessionId.2bit
     gfServer -trans index ../../\$accessionId.trans.gfidx \$accessionId.2bit &
     gfServer -stepSize=5 index ../../\$accessionId.untrans.gfidx \$accessionId.2bit
     wait
@@ -1493,52 +1510,59 @@
 
 #########################################################################
 # * step: windowMasker [workhorse]
 sub doWindowMasker {
   my $runDir = "$buildDir/trackData/windowMasker";
 
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "run windowMasker procedure";
   my $bossScript = newBash HgRemoteScript("$runDir/doWindowMasker.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
-### if [ ../../\$asmId.unmasked.2bit -nt fb.\$asmId.rmsk.windowmaskerSdust.txt ]; then
 if [ ../../\$asmId.unmasked.2bit -nt faSize.\$asmId.cleanWMSdust.txt ]; then
   \$HOME/kent/src/hg/utils/automation/doWindowMasker.pl -stop=twobit -buildDir=`pwd` -dbHost=$dbHost \\
     -workhorse=$workhorse -unmaskedSeq=$buildDir/\$asmId.unmasked.2bit \$asmId
   bedInvert.pl ../../\$asmId.chrom.sizes ../allGaps/\$asmId.allGaps.bed.gz \\
     > not.gap.bed
   bedIntersect -minCoverage=0.0000000014 windowmasker.sdust.bed \\
      not.gap.bed stdout | sort -k1,1  -k2,2n > cleanWMask.bed
   twoBitMask $buildDir/\$asmId.unmasked.2bit cleanWMask.bed \\
     \$asmId.cleanWMSdust.2bit
   twoBitToFa \$asmId.cleanWMSdust.2bit stdout \\
     | faSize stdin > faSize.\$asmId.cleanWMSdust.txt
+  export intersectRmskWM=0
   if [ -s ../repeatMasker/\$asmId.sorted.fa.out.gz ]; then
     zcat ../repeatMasker/\$asmId.sorted.fa.out.gz | sed -e 's/^  *//; /^\$/d;' \\
-      | egrep -v "^SW|^score" | awk '{printf "%s\\t%d\\t%d\\n", \$5, \$6-1, \$7}' \\
+      | (egrep -v "^SW|^score" || true) | awk '{printf "%s\\t%d\\t%d\\n", \$5, \$6-1, \$7}' \\
         | bedSingleCover.pl stdin > rmsk.bed
+    if [ -s rmsk.bed ]; then
+      anyOneHome=`bedIntersect -minCoverage=0.0000000014 cleanWMask.bed \\
+          rmsk.bed stdout | bedSingleCover.pl stdin | wc -l`
+      if [ \$anyOneHome -gt 0 ]; then
        intersectRmskWM=`bedIntersect -minCoverage=0.0000000014 cleanWMask.bed \\
           rmsk.bed stdout | bedSingleCover.pl stdin | ave -col=4 stdin \\
            | grep "^total" | awk '{print \$2}' | sed -e 's/.000000//;'`
-  else
+       if [ "x\${intersectRmskWM}y" == "xy" ]; then
          intersectRmskWM=0
        fi
+      fi
+    fi
+  fi
   chromSize=`ave -col=2 ../../\$asmId.chrom.sizes \\
      | grep "^total" | awk '{print \$2}' | sed -e 's/.000000//;'`
   echo \$intersectRmskWM \$chromSize | \\
      awk '{ printf "%d bases of %d (%.3f%%) in intersection\\n", \$1, \$2, 100.0*\$1/\$2}' \\
       > fb.\$asmId.rmsk.windowmaskerSdust.txt
   rm -f not.gap.bed rmsk.bed
   bedToBigBed -type=bed3 cleanWMask.bed ../../\$asmId.chrom.sizes \$asmId.windowMasker.bb
   gzip cleanWMask.bed
   \$HOME/kent/src/hg/utils/automation/doWindowMasker.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` -dbHost=$dbHost \\
     -workhorse=$workhorse -unmaskedSeq=$buildDir/\$asmId.unmasked.2bit \$asmId
 else
   printf "# windowMasker step previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_