9c0a3a2843c69011bd7061bc61a73c04033a8af0 hiram Fri Dec 6 15:20:47 2019 -0800 now building a fake AGP when there are none in the source no redmine diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index fa6f160..a6a243b 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -498,131 +498,152 @@ ######################################################################### # * step: sequence [workhorse] sub doSequence { my $runDir = "$buildDir/sequence"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "process source files into 2bit sequence and agp"; my $bossScript = newBash HgRemoteScript("$runDir/doSequence.bash", $workhorse, $runDir, $whatItDoes); my $twoBitFile = "$buildDir/download/$asmId.2bit"; my $otherChrParts = 0; # to see if this is unplaced scaffolds only my $primaryAssembly = "$buildDir/download/${asmId}_assembly_structure/Primary_Assembly"; + my $partsDone = 0; ########### Assembled chromosomes ################ my $chr2acc = "$primaryAssembly/assembled_chromosomes/chr2acc"; if ( -s $chr2acc ) { ++$otherChrParts; my $agpSource = "$primaryAssembly/assembled_chromosomes/AGP"; my $agpOutput = "$runDir/$asmId.chr.agp.gz"; my $agpNames = "$runDir/$asmId.chr.names"; my $fastaOut = "$runDir/$asmId.chr.fa.gz"; + $partsDone += 1; if (needsUpdate($chr2acc, $agpOutput)) { compositeAgp($chr2acc, $agpSource, $agpOutput, $agpNames); `touch -r $chr2acc $agpOutput`; } if (needsUpdate($twoBitFile, $fastaOut)) { compositeFasta($chr2acc, $twoBitFile, $fastaOut); `touch -r $twoBitFile $fastaOut`; } } ########### unlocalized sequence ################ my $chr2scaf = "$primaryAssembly/unlocalized_scaffolds/unlocalized.chr2scaf"; if ( -s $chr2scaf ) { ++$otherChrParts; my $agpSource = "$primaryAssembly/unlocalized_scaffolds/AGP"; my $agpOutput = "$runDir/$asmId.unlocalized.agp.gz"; my $agpNames = "$runDir/$asmId.unlocalized.names"; my $fastaOut = "$runDir/$asmId.unlocalized.fa.gz"; + $partsDone += 1; if (needsUpdate($chr2scaf, $agpOutput)) { unlocalizedAgp($chr2scaf, $agpSource, $agpOutput, $agpNames); `touch -r $chr2scaf $agpOutput`; } if (needsUpdate($twoBitFile, $fastaOut)) { unlocalizedFasta($chr2scaf, $twoBitFile, $fastaOut); `touch -r $twoBitFile $fastaOut`; } } ########### unplaced sequence ################ my $unplacedScafAgp = "$primaryAssembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; if ( -s $unplacedScafAgp ) { my $chrPrefix = ""; # no prefix if no other chrom parts $chrPrefix = "chrUn_" if ($otherChrParts); my $agpOutput = "$runDir/$asmId.unplaced.agp.gz"; my $agpNames = "$runDir/$asmId.unplaced.names"; + $partsDone += 1; if (needsUpdate($unplacedScafAgp, $agpOutput)) { unplacedAgp($unplacedScafAgp, $agpOutput, $agpNames, $chrPrefix); `touch -r $unplacedScafAgp $agpOutput`; } my $fastaOut = "$runDir/$asmId.unplaced.fa.gz"; if (needsUpdate($twoBitFile, $fastaOut)) { unplacedFasta($unplacedScafAgp, $twoBitFile, $chrPrefix, $fastaOut); `touch -r $twoBitFile $fastaOut`; } } ########### non-nuclear chromosome sequence ################ my $nonNucAsm = "$buildDir/download/${asmId}_assembly_structure/non-nuclear"; my $nonNucChr2acc = "$nonNucAsm/assembled_chromosomes/chr2acc"; my $agpSource = "$nonNucAsm/assembled_chromosomes/AGP"; if ( -s $nonNucChr2acc ) { my $agpOutput = "$runDir/$asmId.nonNucChr.agp.gz"; my $agpNames = "$runDir/$asmId.nonNucChr.names"; my $fastaOut = "$runDir/$asmId.nonNucChr.fa.gz"; + $partsDone += 1; if (needsUpdate($nonNucChr2acc, $agpOutput)) { compositeAgp($nonNucChr2acc, $agpSource, $agpOutput, $agpNames); `touch -r $nonNucChr2acc $agpOutput`; } if (needsUpdate($twoBitFile, $fastaOut)) { compositeFasta($nonNucChr2acc, $twoBitFile, $fastaOut); `touch -r $twoBitFile $fastaOut`; } } ########### non-nuclear scaffold unlocalized sequence ################ my $nonNucChr2scaf = "$nonNucAsm/unlocalized_scaffolds/unlocalized.chr2scaf"; if ( -s $nonNucChr2scaf ) { my $agpOutput = "$runDir/$asmId.nonNucUnlocalized.agp.gz"; my $agpNames = "$runDir/$asmId.nonNucUnlocalized.names"; my $fastaOut = "$runDir/$asmId.nonNucUnlocalized.fa.gz"; + $partsDone += 1; if (needsUpdate($nonNucChr2scaf, $agpOutput)) { compositeAgp($nonNucChr2scaf, $agpSource, $agpOutput, $agpNames); `touch -r $nonNucChr2scaf $agpOutput`; } if (needsUpdate($twoBitFile, $fastaOut)) { compositeFasta($nonNucChr2scaf, $twoBitFile, $fastaOut); `touch -r $twoBitFile $fastaOut`; } } -### XXX TO BE DONE - construct sequence when no AGP files exist - $bossScript->add(<<_EOF_ export asmId="$asmId" if [ -s ../\$asmId.chrom.sizes ]; then printf "sequence step previously completed\\n" 1>&2 exit 0 fi +_EOF_ + ); + +### construct sequence when no AGP files exist + if (0 == $partsDone) { +printf STDERR "creating fake AGP\n"; + $bossScript->add(<<_EOF_ +twoBitToFa ../download/\$asmId.2bit stdout | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" | gzip -c > \$asmId.fa.gz +hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz +zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names +_EOF_ + ); +} else { +printf STDERR "partsDone: %d\n", $partsDone; + } + + $bossScript->add(<<_EOF_ zcat *.agp.gz | gzip > ../\$asmId.agp.gz faToTwoBit *.fa.gz ../\$asmId.2bit faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit twoBitDup -keyList=stdout ../\$asmId.unmasked.2bit > \$asmId.dupCheck.txt (grep "are identical" \$asmId.dupCheck.txt || true) > \$asmId.dups.txt if [ -s "\$asmId.dups.txt" ]; then printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\n" 1>&2 grep "are identical" \$asmId.dupCheck.txt 1>&2 exit 255 else rm -f \$asmId.dups.txt fi touch -r ../download/\$asmId.2bit ../\$asmId.2bit touch -r ../download/\$asmId.2bit ../\$asmId.unmasked.2bit touch -r ../download/\$asmId.2bit ../\$asmId.agp.gz @@ -880,30 +901,33 @@ # * step: allGaps [workhorse] sub doAllGaps { my $runDir = "$buildDir/trackData/allGaps"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct 'all' gap track data"; my $bossScript = newBash HgRemoteScript("$runDir/doAllGaps.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId export buildDir=$buildDir if [ \$buildDir/\$asmId.2bit -nt \$asmId.allGaps.bb ]; then twoBitInfo -nBed ../../\$asmId.2bit stdout | awk '{printf "%s\\t%d\\t%d\\t%d\\t%d\\t+\\n", \$1, \$2, \$3, NR, \$3-\$2}' > \$asmId.allGaps.bed + if [ ! -s \$asmId.allGaps.bed ]; then + exit 0 + fi if [ -s ../assemblyGap/\$asmId.gap.bb ]; then bigBedToBed ../assemblyGap/\$asmId.gap.bb \$asmId.gap.bed # verify the 'all' gaps should include the gap track items bedIntersect -minCoverage=0.0000000014 \$asmId.allGaps.bed \$asmId.gap.bed \\ \$asmId.verify.annotated.gap.bed gapTrackCoverage=`awk '{print \$3-\$2}' \$asmId.gap.bed \\ | ave stdin | grep "^total" | sed -e 's/.000000//;'` intersectCoverage=`ave -col=5 \$asmId.verify.annotated.gap.bed \\ | grep "^total" | sed -e 's/.000000//;'` if [ \$gapTrackCoverage -ne \$intersectCoverage ]; then printf "ERROR: 'all' gaps does not include gap track coverage\\n" 1>&2 printf "gapTrackCoverage: \$gapTrackCoverage != \$intersectCoverage intersection\\n" 1>&2 exit 255 fi else