3c8ffb11a3e0a8830f64c3e19ca7f7ff8b167f33 hiram Wed Jul 20 17:19:51 2022 -0700 handle cases where there is no result from RepeatMasker or TRF simple repeats for virus browser building refs #29484 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index cdae480..0d8dfb5 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -1262,31 +1262,39 @@ } $bossScript->add(<<_EOF_ export asmId=$asmId export buildDir=$buildDir if [ \$buildDir/\$asmId.2bit -nt trfMask.bed.gz ]; then doSimpleRepeat.pl -stop=filter -buildDir=`pwd` \\ -unmaskedSeq=\$buildDir/\$asmId.2bit \\ -trf409=6 -dbHost=$dbHost -smallClusterHub=$trfClusterHub \\ -workhorse=$workhorse \$asmId doSimpleRepeat.pl -buildDir=`pwd` \\ -continue=cleanup -stop=cleanup -unmaskedSeq=\$buildDir/\$asmId.2bit \\ -trf409=6 -dbHost=$dbHost -smallClusterHub=$trfClusterHub \\ -workhorse=$workhorse \$asmId - gzip simpleRepeat.bed trfMask.bed + if [ -s simpleRepeat.bed ]; then + gzip simpleRepeat.bed & + else + rm -f simpleRepeat.bed + fi + if [ -s trfMask.bed ]; then + gzip trfMask.bed & + fi + wait fi _EOF_ ); $bossScript->execute(); } # simpleRepeat ## my $rmskResult = "$buildDir/trackData/repeatMasker/$asmId.rmsk.2bit"; ## if (! -s $rmskResult) { ## die "simpleRepeat: previous step repeatMasker has not completed\n" . ## "# not found: $rmskResult\n"; ## } ## twoBitMask ../repeatMasker/\$asmId.rmsk.2bit -add trfMask.bed \\ ## \$asmId.RM_TRF_masked.2bit ######################################################################### @@ -1415,63 +1423,72 @@ my $runDir = "$buildDir/trackData/addMask"; my $goNoGo = 0; if (! $noRmsk) { if ( ! -s "$buildDir/trackData/repeatMasker/$asmId.rmsk.2bit" ) { printf STDERR "ERROR: repeatMasker step not completed\n"; printf STDERR "can not find: $buildDir/trackData/repeatMasker/$asmId.rmsk.2bit\n"; $goNoGo = 1; } } if ( ! -s "$buildDir/trackData/windowMasker/$asmId.cleanWMSdust.2bit" ) { printf STDERR "ERROR: windowMasker step not completed\n"; printf STDERR "can not find: $buildDir/trackData/windowMasker/$asmId.cleanWMSdust.2bit\n"; $goNoGo = 1; } - if ( ! -s "$buildDir/trackData/simpleRepeat/trfMask.bed.gz" ) { + if ( ! -s "$buildDir/trackData/simpleRepeat/doCleanup.csh" ) { printf STDERR "ERROR: simpleRepeat step not completed\n"; - printf STDERR "can not find: $buildDir/trackData/simpleRepeat/trfMask.bed.gz\n"; + printf STDERR "can not find: $buildDir/trackData/simpleRepeat/doCleanup.csh\n"; $goNoGo = 1; } if ($goNoGo) { printf STDERR "ERROR: must complete repeatMasker, windowMasker and simpleRepeat before addMask\n"; exit 255; } &HgAutomate::mustMkdir($runDir); my $whatItDoes = "add together (windowMasker or repeatMasker) and trf/simpleRepeats to construct masked 2bit file"; my $bossScript = newBash HgRemoteScript("$runDir/doAddMask.bash", $workhorse, $runDir, $whatItDoes); my $wmMasked=`grep "masked total" $buildDir/trackData/windowMasker/faSize.$asmId.cleanWMSdust.txt | awk '{print \$1}' | sed -e 's/%//;'`; my $rmMasked = 0; if (! $noRmsk) { $rmMasked=`grep "masked total" $buildDir/trackData/repeatMasker/faSize.rmsk.txt | awk '{print \$1}' | sed -e 's/%//;'`; } my $src2BitToMask = "../repeatMasker/$asmId.rmsk.2bit"; if ($noRmsk || ($wmMasked > $rmMasked)) { $src2BitToMask = "../windowMasker/$asmId.cleanWMSdust.2bit"; } $bossScript->add(<<_EOF_ export asmId=$asmId +export src2Bit=$src2BitToMask export accessionId=`echo \$asmId | cut -d'_' -f1-2` +# if simple repeat has a result, add it, otherwise no add +if [ -s ../simpleRepeat/trfMask.bed.gz ]; then if [ ../simpleRepeat/trfMask.bed.gz -nt \$asmId.masked.faSize.txt ]; then - twoBitMask $src2BitToMask -type=.bed \\ + twoBitMask \$src2Bit -type=.bed \\ -add ../simpleRepeat/trfMask.bed.gz \$asmId.masked.2bit + fi +else + cp -p \$src2Bit \$asmId.masked.2bit +fi + +if [ \$asmId.masked.2bit -nt \$asmId.masked.faSize.txt ]; then twoBitToFa \$asmId.masked.2bit stdout | faSize stdin > \$asmId.masked.faSize.txt touch -r \$asmId.masked.2bit \$asmId.masked.faSize.txt bptForTwoBit \$asmId.masked.2bit \$asmId.masked.2bit.bpt touch -r \$asmId.masked.2bit \$asmId.masked.2bit.bpt twoBitToFa \$asmId.masked.2bit stdout | gzip -c > \$asmId.fa.gz touch -r \$asmId.masked.2bit \$asmId.fa.gz cp -p \$asmId.fa.gz ../../\$asmId.fa.gz cp -p \$asmId.masked.faSize.txt ../../\$asmId.faSize.txt cp -p \$asmId.masked.2bit.bpt ../../\$asmId.2bit.bpt size=`grep -w bases \$asmId.masked.faSize.txt | cut -d' ' -f1` if [ \$size -lt 4294967297 ]; then ln \$asmId.masked.2bit \$accessionId.2bit gfServer -trans index ../../\$accessionId.trans.gfidx \$accessionId.2bit & gfServer -stepSize=5 index ../../\$accessionId.untrans.gfidx \$accessionId.2bit wait @@ -1493,52 +1510,59 @@ ######################################################################### # * step: windowMasker [workhorse] sub doWindowMasker { my $runDir = "$buildDir/trackData/windowMasker"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "run windowMasker procedure"; my $bossScript = newBash HgRemoteScript("$runDir/doWindowMasker.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId -### if [ ../../\$asmId.unmasked.2bit -nt fb.\$asmId.rmsk.windowmaskerSdust.txt ]; then if [ ../../\$asmId.unmasked.2bit -nt faSize.\$asmId.cleanWMSdust.txt ]; then \$HOME/kent/src/hg/utils/automation/doWindowMasker.pl -stop=twobit -buildDir=`pwd` -dbHost=$dbHost \\ -workhorse=$workhorse -unmaskedSeq=$buildDir/\$asmId.unmasked.2bit \$asmId bedInvert.pl ../../\$asmId.chrom.sizes ../allGaps/\$asmId.allGaps.bed.gz \\ > not.gap.bed bedIntersect -minCoverage=0.0000000014 windowmasker.sdust.bed \\ not.gap.bed stdout | sort -k1,1 -k2,2n > cleanWMask.bed twoBitMask $buildDir/\$asmId.unmasked.2bit cleanWMask.bed \\ \$asmId.cleanWMSdust.2bit twoBitToFa \$asmId.cleanWMSdust.2bit stdout \\ | faSize stdin > faSize.\$asmId.cleanWMSdust.txt + export intersectRmskWM=0 if [ -s ../repeatMasker/\$asmId.sorted.fa.out.gz ]; then zcat ../repeatMasker/\$asmId.sorted.fa.out.gz | sed -e 's/^ *//; /^\$/d;' \\ - | egrep -v "^SW|^score" | awk '{printf "%s\\t%d\\t%d\\n", \$5, \$6-1, \$7}' \\ + | (egrep -v "^SW|^score" || true) | awk '{printf "%s\\t%d\\t%d\\n", \$5, \$6-1, \$7}' \\ | bedSingleCover.pl stdin > rmsk.bed + if [ -s rmsk.bed ]; then + anyOneHome=`bedIntersect -minCoverage=0.0000000014 cleanWMask.bed \\ + rmsk.bed stdout | bedSingleCover.pl stdin | wc -l` + if [ \$anyOneHome -gt 0 ]; then intersectRmskWM=`bedIntersect -minCoverage=0.0000000014 cleanWMask.bed \\ rmsk.bed stdout | bedSingleCover.pl stdin | ave -col=4 stdin \\ | grep "^total" | awk '{print \$2}' | sed -e 's/.000000//;'` - else + if [ "x\${intersectRmskWM}y" == "xy" ]; then intersectRmskWM=0 fi + fi + fi + fi chromSize=`ave -col=2 ../../\$asmId.chrom.sizes \\ | grep "^total" | awk '{print \$2}' | sed -e 's/.000000//;'` echo \$intersectRmskWM \$chromSize | \\ awk '{ printf "%d bases of %d (%.3f%%) in intersection\\n", \$1, \$2, 100.0*\$1/\$2}' \\ > fb.\$asmId.rmsk.windowmaskerSdust.txt rm -f not.gap.bed rmsk.bed bedToBigBed -type=bed3 cleanWMask.bed ../../\$asmId.chrom.sizes \$asmId.windowMasker.bb gzip cleanWMask.bed \$HOME/kent/src/hg/utils/automation/doWindowMasker.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` -dbHost=$dbHost \\ -workhorse=$workhorse -unmaskedSeq=$buildDir/\$asmId.unmasked.2bit \$asmId else printf "# windowMasker step previously completed\\n" 1>&2 exit 0 fi _EOF_