1c0cb9b18d0e449cbdae2c681d0a0abdcacd1e7e hiram Fri Jul 19 10:34:46 2019 -0700 add augustusSpecies argument rearrange masking fix allGaps refs #23734 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index 4b538f6..b4829df 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -14,119 +14,121 @@ use File::stat; use warnings; use strict; use FindBin qw($Bin); use lib "$Bin"; use HgAutomate; use HgRemoteScript; use HgStepManager; # Option variable names, both common and peculiar to this script: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_buildDir $opt_sourceDir + $opt_augustusSpecies $opt_ucscNames /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'download', func => \&doDownload }, { name => 'sequence', func => \&doSequence }, { name => 'assemblyGap', func => \&doAssemblyGap }, { name => 'gatewayPage', func => \&doGatewayPage }, { name => 'gc5Base', func => \&doGc5Base }, { name => 'repeatMasker', func => \&doRepeatMasker }, { name => 'simpleRepeat', func => \&doSimpleRepeat }, { name => 'allGaps', func => \&doAllGaps }, { name => 'idKeys', func => \&doIdKeys }, - { name => 'addMask', func => \&doAddMask }, { name => 'windowMasker', func => \&doWindowMasker }, + { name => 'addMask', func => \&doAddMask }, { name => 'cpgIslands', func => \&doCpgIslands }, { name => 'augustus', func => \&doAugustus }, { name => 'trackDb', func => \&doTrackDb }, { name => 'cleanup', func => \&doCleanup }, ] ); # Option defaults: my $dbHost = 'hgwdev'; my $sourceDir = "/hive/data/outside/ncbi/genomes"; +my $augustusSpecies = "human"; my $ucscNames = 0; # default 'FALSE' (== 0) my $workhorse = "hgwdev"; # default workhorse when none chosen my $fileServer = "hgwdev"; # default when none chosen my $bigClusterHub = "ku"; # default when none chosen my $smallClusterHub = "ku"; # default when none chosen my $base = $0; $base =~ s/^(.*\/)?//; sub usage { # Usage / help / self-documentation: my ($status, $detailed) = @_; # Basic help (for incorrect usage): print STDERR " usage: $base [options] genbank|refseq subGroup species asmId required arguments: genbank|refseq - specify either genbank or refseq hierarchy source subGroup - specify subGroup at NCBI FTP site, examples: - vertebrate_mammalian vertebrate_other plant etc... species - species directory at NCBI FTP site, examples: - Homo_sapiens Mus_musculus etc... asmId - assembly identifier at NCBI FTP site, examples: - GCF_000001405.32_GRCh38.p6 GCF_000001635.24_GRCm38.p4 etc.. options: "; print STDERR $stepper->getOptionHelp(); print STDERR <<_EOF_ -buildDir dir Construct assembly hub in dir instead of default $HgAutomate::clusterData/asmHubs/{genbank|refseq}/subGroup/species/asmId/ - -sourceDir dir Find assembly in dir instead of default - $sourceDir - the assembly is found at: - sourceDir/{genbank|refseq}/subGroup/species/all_assembly_versions/asmId/ + -sourceDir dir Find assembly in dir instead of default: + $sourceDir//subGroup/species/all_assembly_versions/asmId -ucscNames Translate NCBI/INSDC/RefSeq names to UCSC names default is to use the given NCBI/INSDC/RefSeq names + -augustusSpecies default 'human' _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'workhorse' => $workhorse, 'fileServer' => $fileServer, 'bigClusterHub' => $bigClusterHub, 'smallClusterHub' => $smallClusterHub); print STDERR " Automates build of assembly hub. Steps: download: sets up sym link working hierarchy from already mirrored files from NCBI in: $sourceDir/{genbank|refseq}/ sequence: establish AGP and 2bit file from NCBI directory assemblyGap: create assembly and gap bigBed files and indexes for assembly track names gatewayPage: create html/asmId.description.html contents gc5Base: create bigWig file for gc5Base track repeatMasker: run repeat masker cluster run and create bigBed files for the composite track categories of repeats simpleRepeat: run trf cluster run and create bigBed file for simple repeats allGaps: calculate all actual real gaps due to N's in sequence, can be more than were specified in the AGP file idKeys: calculate md5sum for each sequence in the assembly to be used to find identical sequences in similar assemblies - addMask: combine repeatMasker and trf simpleRepeats into one 2bit file windowMasker: run windowMasker cluster run, create windowMasker bigBed file and compute intersection with repeatMasker results + addMask: combine the higher masking of (windowMasker or repeatMasker) with + trf simpleRepeats into one 2bit file cpgIslands: run CpG islands cluster runs for both masked and unmasked sequences and create bigBed files for this composite track trackDb: create trackDb.txt file for assembly hub to include all constructed bigBed and bigWig tracks cleanup: Removes or compresses intermediate files. All operations are performed in the build directory which is $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/template.\$date unless -buildDir is given. "; # Detailed help (-help): print STDERR " Assumptions: 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for database/assembly \$db. 2. $HgAutomate::clusterData/\$db/chrom.sizes contains all sequence names and sizes from \$db.2bit. @@ -137,30 +139,32 @@ print "\n"; exit $status; } # Globals: # Command line args: genbankRefseq subGroup species asmId my ($genbankRefseq, $subGroup, $species, $asmId); # Other: my ($buildDir, $secondsStart, $secondsEnd, $assemblySource); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'buildDir=s', + 'sourceDir=s', + 'augustusSpecies=s', 'ucscNames', @HgAutomate::commonOptionSpec, ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if ($opt_dbHost); } ######################################################################### ######################################################################### # assistant subroutines here. The 'do' steps follow this section ######################################################################### @@ -590,30 +594,31 @@ faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit touch -r ../download/\$asmId.2bit ../\$asmId.2bit touch -r ../download/\$asmId.2bit ../\$asmId.unmasked.2bit touch -r ../download/\$asmId.2bit ../\$asmId.agp.gz twoBitInfo ../\$asmId.2bit stdout | sort -k2nr > ../\$asmId.chrom.sizes touch -r ../\$asmId.2bit ../\$asmId.chrom.sizes # verify everything is there twoBitInfo ../download/\$asmId.2bit stdout | sort -k2nr > source.\$asmId.chrom.sizes export newTotal=`ave -col=2 ../\$asmId.chrom.sizes | grep "^total"` export oldTotal=`ave -col=2 source.\$asmId.chrom.sizes | grep "^total"` if [ "\$newTotal" != "\$oldTotal" ]; then printf "# ERROR: sequence construction error: not same totals source vs. new:\n" 1>&2 printf "# \$newTotal != \$oldTotal\n" 1>&2 exit 255 fi +rm source.\$asmId.chrom.sizes export checkAgp=`checkAgpAndFa ../\$asmId.agp.gz ../\$asmId.2bit 2>&1 | tail -1` if [ "\$checkAgp" != "All AGP and FASTA entries agree - both files are valid" ]; then printf "# ERROR: checkAgpAndFa \$asmId.agp.gz \$asmId.2bit failing\n" 1>&2 exit 255 fi twoBitToFa ../\$asmId.2bit stdout | faCount stdin | gzip -c > \$asmId.faCount.txt.gz touch -r ../\$asmId.2bit \$asmId.faCount.txt.gz zgrep -P "^total\t" \$asmId.faCount.txt.gz > \$asmId.faCount.signature.txt touch -r ../\$asmId.2bit \$asmId.faCount.signature.txt _EOF_ ); $bossScript->execute(); } # doSequence @@ -811,47 +816,48 @@ ######################################################################### # * step: allGaps [workhorse] sub doAllGaps { my $runDir = "$buildDir/trackData/allGaps"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct 'all' gap track data"; my $bossScript = newBash HgRemoteScript("$runDir/doAllGaps.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId export buildDir=$buildDir if [ \$buildDir/\$asmId.2bit -nt \$asmId.allGaps.bb ]; then - findMotif -motif=gattaca -verbose=4 -strand=+ ../../\$asmId.2bit \\ - > \$asmId.findMotif.txt 2>&1 - grep "^#GAP " \$asmId.findMotif.txt | sed -e 's/^#GAP //;' \\ - > \$asmId.allGaps.bed + twoBitInfo -nBed ../../\$asmId.2bit stdout | awk '{printf "%s\\t%d\\t%d\\t%d\\t%d\\t+\\n", \$1, \$2, \$3, NR, \$3-\$2}' > \$asmId.allGaps.bed + if [ -s ../assemblyGap/\$asmId.gap.bb ]; then bigBedToBed ../assemblyGap/\$asmId.gap.bb \$asmId.gap.bed # verify the 'all' gaps should include the gap track items bedIntersect -minCoverage=0.0000000014 \$asmId.allGaps.bed \$asmId.gap.bed \\ \$asmId.verify.annotated.gap.bed gapTrackCoverage=`awk '{print \$3-\$2}' \$asmId.gap.bed \\ | ave stdin | grep "^total" | sed -e 's/.000000//;'` intersectCoverage=`ave -col=5 \$asmId.verify.annotated.gap.bed \\ | grep "^total" | sed -e 's/.000000//;'` if [ \$gapTrackCoverage -ne \$intersectCoverage ]; then - printf "ERROR: 'all' gaps does not include gap track coverage\n" 1>&2 - printf "gapTrackCoverage: \$gapTrackCoverage != \$intersectCoverage intersection\n" 1>&2 + printf "ERROR: 'all' gaps does not include gap track coverage\\n" 1>&2 + printf "gapTrackCoverage: \$gapTrackCoverage != \$intersectCoverage intersection\\n" 1>&2 exit 255 fi + else + touch \$asmId.gap.bed + fi bedInvert.pl ../../\$asmId.chrom.sizes \$asmId.allGaps.bed \\ > \$asmId.NOT.allGaps.bed # verify bedInvert worked correctly # sum of both sizes should equal genome size both=`cat \$asmId.NOT.allGaps.bed \$asmId.allGaps.bed \\ | awk '{print \$3-\$2}' | ave stdin | grep "^total" \\ | sed -e 's/.000000//;'` genomeSize=`ave -col=2 ../../\$asmId.chrom.sizes | grep "^total" \\ | sed -e 's/.000000//;'` if [ \$genomeSize -ne \$both ]; then printf "ERROR: bedInvert.pl did not function correctly on allGaps.bed\n" 1>&2 printf "genomeSize: \$genomeSize != \$both both gaps data\n" 1>&2 exit 255 fi bedInvert.pl ../../\$asmId.chrom.sizes \$asmId.gap.bed \\ @@ -868,31 +874,31 @@ bedIntersect -minCoverage=0.0000000014 \$asmId.allGaps.bed \\ \$asmId.NOT.gap.bed \$asmId.notAnnotated.gap.bed # verify the intersect functioned correctly # sum of new gaps plus gap track should equal all gaps allGapCoverage=`awk '{print \$3-\$2}' \$asmId.allGaps.bed \\ | ave stdin | grep "^total" | sed -e 's/.000000//;'` both=`cat \$asmId.notAnnotated.gap.bed \$asmId.gap.bed \\ | awk '{print \$3-\$2}' | ave stdin | grep "^total" | sed -e 's/.000000//;'` if [ \$allGapCoverage -ne \$both ]; then printf "ERROR: bedIntersect to identify new gaps did not function correctly\n" 1>&2 printf "allGaps: \$allGapCoverage != \$both (new + gap track)\n" 1>&2 fi cut -f1-3 \$asmId.allGaps.bed | sort -k1,1 -k2,2n > toBbi.bed bedToBigBed -type=bed3 toBbi.bed ../../\$asmId.chrom.sizes \$asmId.allGaps.bb rm -f toBbi.bed - gzip *.bed *.txt + gzip *.bed else printf "# allgaps step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); } # allGaps ######################################################################### # * step: idKeys [workhorse] sub doIdKeys { my $runDir = "$buildDir/trackData/idKeys"; &HgAutomate::mustMkdir($runDir); @@ -913,58 +919,72 @@ ); $bossScript->execute(); } # idKeys ######################################################################### # * step: addMask [workhorse] sub doAddMask { my $runDir = "$buildDir/trackData/addMask"; my $goNoGo = 0; if ( ! -s "$buildDir/trackData/repeatMasker/$asmId.rmsk.2bit" ) { printf STDERR "ERROR: repeatMasker step not completed\n"; printf STDERR "can not find: $buildDir/trackData/repeatMasker/$asmId.rmsk.2bit\n"; $goNoGo = 1; } + if ( ! -s "$buildDir/trackData/windowMasker/$asmId.cleanWMSdust.2bit" ) { + printf STDERR "ERROR: windowMasker step not completed\n"; + printf STDERR "can not find: $buildDir/trackData/windowMasker/$asmId.cleanWMSdust.2bit\n"; + $goNoGo = 1; + } if ( ! -s "$buildDir/trackData/simpleRepeat/trfMask.bed.gz" ) { printf STDERR "ERROR: simpleRepeat step not completed\n"; printf STDERR "can not find: $buildDir/trackData/simpleRepeat/trfMask.bed.gz\n"; $goNoGo = 1; } if ($goNoGo) { - printf STDERR "ERROR: must complete both repeatMasker and simpleRepeat before addMask\n"; + printf STDERR "ERROR: must complete repeatMasker, windowMasker and simpleRepeat before addMask\n"; exit 255; } &HgAutomate::mustMkdir($runDir); - my $whatItDoes = "add together repeatMasker and trf/simpleRepeats to construct masked 2bit file"; + my $whatItDoes = "add together (windowMasker or repeatMasker) and trf/simpleRepeats to construct masked 2bit file"; my $bossScript = newBash HgRemoteScript("$runDir/doAddMask.bash", $workhorse, $runDir, $whatItDoes); + my $wmMasked=`grep "masked total" $buildDir/trackData/windowMasker/faSize.$asmId.cleanWMSdust.txt | awk '{print \$1}' | sed -e 's/%//;'`; + my $rmMasked=`grep "masked total" $buildDir/trackData/repeatMasker/faSize.rmsk.txt | awk '{print \$1}' | sed -e 's/%//;'`; + + my $src2BitToMask = "../repeatMasker/$asmId.rmsk.2bit"; + if ($wmMasked > $rmMasked) { + $src2BitToMask = "../windowMasker/$asmId.cleanWMSdust.2bit"; + } + $bossScript->add(<<_EOF_ export asmId=$asmId -if [ ../simpleRepeat/trfMask.bed.gz -nt \$asmId.trfRM.faSize.txt ]; then - twoBitMask ../repeatMasker/\$asmId.rmsk.2bit -type=.bed \\ - -add ../simpleRepeat/trfMask.bed.gz \$asmId.trfRM.2bit - twoBitToFa \$asmId.trfRM.2bit stdout | faSize stdin > \$asmId.trfRM.faSize.txt +if [ ../simpleRepeat/trfMask.bed.gz -nt \$asmId.masked.faSize.txt ]; then + twoBitMask $src2BitToMask -type=.bed \\ + -add ../simpleRepeat/trfMask.bed.gz \$asmId.masked.2bit + twoBitToFa \$asmId.masked.2bit stdout | faSize stdin > \$asmId.masked.faSize.txt else printf "# addMask step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); + $bossScript->execute(); } # addMask ######################################################################### # * step: windowMasker [workhorse] sub doWindowMasker { my $runDir = "$buildDir/trackData/windowMasker"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "run windowMasker procedure"; my $bossScript = newBash HgRemoteScript("$runDir/doWindowMasker.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ @@ -1025,69 +1045,69 @@ cd unmasked if [ ../../../\$asmId.unmasked.2bit -nt \$asmId.cpgIslandExtUnmasked.bb ]; then doCpgIslands.pl -stop=makeBed -buildDir=`pwd` -dbHost=$dbHost \\ -smallClusterHub=$smallClusterHub -bigClusterHub=$bigClusterHub -tableName=cpgIslandExtUnmasked \\ -workhorse=$workhorse -maskedSeq=$buildDir/\$asmId.unmasked.2bit \\ -chromSizes=$buildDir/\$asmId.chrom.sizes \$asmId doCpgIslands.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` \\ -dbHost=$dbHost \\ -smallClusterHub=$smallClusterHub -bigClusterHub=$bigClusterHub -tableName=cpgIslandExtUnmasked \\ -workhorse=$workhorse -maskedSeq=$buildDir/\$asmId.unmasked.2bit \\ -chromSizes=$buildDir/\$asmId.chrom.sizes \$asmId else printf "# cpgIslands unmasked previously completed\\n" 1>&2 fi cd ../masked -if [ ../../addMask/\$asmId.trfRM.2bit -nt \$asmId.cpgIslandExt.bb ]; then +if [ ../../addMask/\$asmId.masked.2bit -nt \$asmId.cpgIslandExt.bb ]; then doCpgIslands.pl -stop=makeBed -buildDir=`pwd` -dbHost=$dbHost \\ -smallClusterHub=$smallClusterHub -bigClusterHub=$bigClusterHub -workhorse=$workhorse \\ - -maskedSeq=$buildDir/trackData/addMask/\$asmId.trfRM.2bit \\ + -maskedSeq=$buildDir/trackData/addMask/\$asmId.masked.2bit \\ -chromSizes=$buildDir/\$asmId.chrom.sizes \$asmId doCpgIslands.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` \\ -dbHost=$dbHost \\ -smallClusterHub=$smallClusterHub -bigClusterHub=$bigClusterHub -workhorse=$workhorse \\ - -maskedSeq=$buildDir/trackData/addMask/\$asmId.trfRM.2bit \\ + -maskedSeq=$buildDir/trackData/addMask/\$asmId.masked.2bit \\ -chromSizes=$buildDir/\$asmId.chrom.sizes \$asmId else printf "# cpgIslands masked previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); } # sub doCpgIslands ######################################################################### # * step: augustus [workhorse] sub doAugustus { my $runDir = "$buildDir/trackData/augustus"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "run Augustus gene prediction procedures"; my $bossScript = newBash HgRemoteScript("$runDir/doAugustus.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId if [ $buildDir/\$asmId.2bit -nt \$asmId.augustus.bb ]; then - time (/cluster/home/hiram/kent/src/hg/utils/automation/doAugustus.pl -stop=makeGp -buildDir=`pwd` -dbHost=$dbHost \\ - -bigClusterHub=$bigClusterHub -species=human -workhorse=$workhorse \\ + time (~/kent/src/hg/utils/automation/doAugustus.pl -stop=makeGp -buildDir=`pwd` -dbHost=$dbHost \\ + -bigClusterHub=$bigClusterHub -species=$augustusSpecies -workhorse=$workhorse \\ -noDbGenePredCheck -maskedSeq=$buildDir/\$asmId.2bit \$asmId) > makeDb.log 2>&1 - time (/cluster/home/hiram/kent/src/hg/utils/automation/doAugustus.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` -dbHost=$dbHost \\ - -bigClusterHub=$bigClusterHub -species=human -workhorse=$workhorse \\ + time (~/kent/src/hg/utils/automation/doAugustus.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` -dbHost=$dbHost \\ + -bigClusterHub=$bigClusterHub -species=$augustusSpecies -workhorse=$workhorse \\ -noDbGenePredCheck -maskedSeq=$buildDir/\$asmId.2bit \$asmId) > cleanup.log 2>&1 else printf "# augustus genes previously completed\\n" 1>&2 fi _EOF_ ); $bossScript->execute(); } # windowMasker ######################################################################### # * step: trackDb [workhorse] sub doTrackDb { my $runDir = "$buildDir"; &HgAutomate::mustMkdir($runDir); @@ -1132,40 +1152,43 @@ $secondsStart = `date "+%s"`; chomp $secondsStart; # expected command line arguments after options are processed ($genbankRefseq, $subGroup, $species, $asmId) = @ARGV; # Force debug and verbose until this is looking pretty solid: # $opt_debug = 1; # $opt_verbose = 3 if ($opt_verbose < 3); # Establish what directory we will work in. $buildDir = $opt_buildDir ? $opt_buildDir : "$HgAutomate::clusterData/asmHubs/$genbankRefseq/$subGroup/$species/$asmId"; $sourceDir = $opt_sourceDir ? $opt_sourceDir : $sourceDir; +$augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies; $ucscNames = $opt_ucscNames ? 1 : $ucscNames; # '1' == 'TRUE' $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse; $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub; $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub; $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer; -$assemblySource = "$sourceDir/$genbankRefseq/$subGroup/$species/all_assembly_versions/$asmId"; +$assemblySource = $opt_sourceDir ? "$sourceDir" : "$sourceDir/$genbankRefseq/$subGroup/$species/all_assembly_versions/$asmId"; die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource); printf STDERR "# buildDir: %s\n", $buildDir; +printf STDERR "# sourceDir %s\n", $sourceDir; +printf STDERR "# augustusSpecies %s\n", $augustusSpecies; printf STDERR "# assemblySource: %s\n", $assemblySource; # Do everything. $stepper->execute(); # Tell the user anything they should know. my $stopStep = $stepper->getStopStep(); my $upThrough = ($stopStep eq 'cleanup') ? "" : " (through the '$stopStep' step)"; $secondsEnd = `date "+%s"`; chomp $secondsEnd; my $elapsedSeconds = $secondsEnd - $secondsStart; my $elapsedMinutes = int($elapsedSeconds/60); $elapsedSeconds -= $elapsedMinutes * 60;