2b2a19a35ceb667d2c0659ff422a980e7e1e22c1 hiram Thu Jun 23 13:23:49 2022 -0700 correctly sizing numbers in allGaps refs #29545 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index bed66a8..cdae480 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -1302,74 +1302,74 @@ $bossScript->add(<<_EOF_ export asmId=$asmId export buildDir=$buildDir if [ \$buildDir/\$asmId.2bit -nt \$asmId.allGaps.bb ]; then twoBitInfo -nBed ../../\$asmId.2bit stdout | awk '{printf "%s\\t%d\\t%d\\t%d\\t%d\\t+\\n", \$1, \$2, \$3, NR, \$3-\$2}' > \$asmId.allGaps.bed if [ ! -s \$asmId.allGaps.bed ]; then exit 0 fi if [ -s ../assemblyGap/\$asmId.gap.bb ]; then bigBedToBed ../assemblyGap/\$asmId.gap.bb \$asmId.gap.bed # verify the 'all' gaps should include the gap track items bedIntersect -minCoverage=0.0000000014 \$asmId.allGaps.bed \$asmId.gap.bed \\ \$asmId.verify.annotated.gap.bed gapTrackCoverage=`awk '{print \$3-\$2}' \$asmId.gap.bed \\ - | ave stdin | grep "^total" | sed -e 's/.000000//;'` + | ave stdin | grep "^total" | awk '{print \$NF}' | sed -e 's/.000000//;'` intersectCoverage=`ave -col=5 \$asmId.verify.annotated.gap.bed \\ - | grep "^total" | sed -e 's/.000000//;'` + | grep "^total" | awk '{print \$NF}' | sed -e 's/.000000//;'` if [ \$gapTrackCoverage -ne \$intersectCoverage ]; then printf "ERROR: 'all' gaps does not include gap track coverage\\n" 1>&2 printf "gapTrackCoverage: \$gapTrackCoverage != \$intersectCoverage intersection\\n" 1>&2 exit 255 fi else touch \$asmId.gap.bed fi bedInvert.pl ../../\$asmId.chrom.sizes \$asmId.allGaps.bed \\ > \$asmId.NOT.allGaps.bed # verify bedInvert worked correctly # sum of both sizes should equal genome size both=`cat \$asmId.NOT.allGaps.bed \$asmId.allGaps.bed \\ | awk '{print \$3-\$2}' | ave stdin | grep "^total" \\ - | sed -e 's/.000000//;'` + | awk '{print \$NF}' | sed -e 's/.000000//;'` genomeSize=`ave -col=2 ../../\$asmId.chrom.sizes | grep "^total" \\ - | sed -e 's/.000000//;'` + | awk '{print \$NF}' | sed -e 's/.000000//;'` if [ \$genomeSize -ne \$both ]; then printf "ERROR: bedInvert.pl did not function correctly on allGaps.bed\n" 1>&2 printf "genomeSize: \$genomeSize != \$both both gaps data\n" 1>&2 exit 255 fi bedInvert.pl ../../\$asmId.chrom.sizes \$asmId.gap.bed \\ > \$asmId.NOT.gap.bed # again, verify bedInvert is working correctly, sum of both == genomeSize both=`cat \$asmId.NOT.gap.bed \$asmId.gap.bed \\ | awk '{print \$3-\$2}' | ave stdin | grep "^total" \\ - | sed -e 's/.000000//;'` + | awk '{print \$NF}' | sed -e 's/.000000//;'` if [ \$genomeSize -ne \$both ]; then printf "ERROR: bedInvert did not function correctly on gap.bed\n" 1>&2 printf "genomeSize: \$genomeSize != \$both both gaps data\n" 1>&2 exit 255 fi bedIntersect -minCoverage=0.0000000014 \$asmId.allGaps.bed \\ \$asmId.NOT.gap.bed \$asmId.notAnnotated.gap.bed # verify the intersect functioned correctly # sum of new gaps plus gap track should equal all gaps allGapCoverage=`awk '{print \$3-\$2}' \$asmId.allGaps.bed \\ - | ave stdin | grep "^total" | sed -e 's/.000000//;'` + | ave stdin | grep "^total" | awk '{print \$NF}' | sed -e 's/.000000//;'` both=`cat \$asmId.notAnnotated.gap.bed \$asmId.gap.bed \\ - | awk '{print \$3-\$2}' | ave stdin | grep "^total" | sed -e 's/.000000//;'` + | awk '{print \$3-\$2}' | ave stdin | grep "^total" | awk '{print \$NF}' | sed -e 's/.000000//;'` if [ \$allGapCoverage -ne \$both ]; then printf "ERROR: bedIntersect to identify new gaps did not function correctly\n" 1>&2 printf "allGaps: \$allGapCoverage != \$both (new + gap track)\n" 1>&2 fi cut -f1-3 \$asmId.allGaps.bed | sort -k1,1 -k2,2n > toBbi.bed bedToBigBed -type=bed3 toBbi.bed ../../\$asmId.chrom.sizes \$asmId.allGaps.bb rm -f toBbi.bed gzip *.bed else printf "# allgaps step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute();