4da1a14630708d887c3481e3b16df5044c86c59a hiram Tue Oct 13 11:16:59 2020 -0700 better batch size for xenoRefGene and fixup to handle assemblies without repeat masker and RM can not run on those species refs #26347 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index 2a6f7b4..b64d4bd 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -17,30 +17,31 @@ use FindBin qw($Bin); use lib "$Bin"; use HgAutomate; use HgRemoteScript; use HgStepManager; # Option variable names, both common and peculiar to this script: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_buildDir $opt_sourceDir $opt_species $opt_rmskSpecies $opt_ncbiRmsk + $opt_noRmsk $opt_augustusSpecies $opt_xenoRefSeq $opt_ucscNames $opt_asmHubName /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'download', func => \&doDownload }, { name => 'sequence', func => \&doSequence }, { name => 'assemblyGap', func => \&doAssemblyGap }, { name => 'chromAlias', func => \&doChromAlias }, { name => 'gatewayPage', func => \&doGatewayPage }, { name => 'cytoBand', func => \&doCytoBand }, { name => 'gc5Base', func => \&doGc5Base }, @@ -56,34 +57,35 @@ { name => 'ncbiGene', func => \&doNcbiGene }, { name => 'ncbiRefSeq', func => \&doNcbiRefSeq }, { name => 'xenoRefGene', func => \&doXenoRefGene }, { name => 'augustus', func => \&doAugustus }, { name => 'trackDb', func => \&doTrackDb }, { name => 'cleanup', func => \&doCleanup }, ] ); # Option defaults: my $dbHost = 'hgwdev'; my $sourceDir = "/hive/data/outside/ncbi/genomes"; my $species = ""; # usually found in asmId_assembly_report.txt my $ftpDir = ""; # will be determined from given asmId my $rmskSpecies = ""; +my $noRmsk = 0; # when RepeatMasker is not possible, such as bacteria my $ncbiRmsk = 0; # when =1 call doRepeatMasker.pl # with -ncbiRmsk=path.out.gz and -liftSpec=... my $augustusSpecies = "human"; -my $xenoRefSeq = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq"; +my $xenoRefSeq = "/hive/data/genomes/asmHubs/xenoRefSeq"; my $ucscNames = 0; # default 'FALSE' (== 0) my $asmHubName = "n/a"; # directory name in: /gbdb/hubs/asmHubName my $workhorse = "hgwdev"; # default workhorse when none chosen my $fileServer = "hgwdev"; # default when none chosen my $bigClusterHub = "ku"; # default when none chosen my $smallClusterHub = "ku"; # default when none chosen my $base = $0; $base =~ s/^(.*\/)?//; # key is original accession name from the remove.dups.list, value is 1 my %dupAccessionList; sub usage { # Usage / help / self-documentation: @@ -100,30 +102,31 @@ print STDERR $stepper->getOptionHelp(); print STDERR <<_EOF_ -buildDir dir Construct assembly hub in dir instead of default $HgAutomate::clusterData/asmHubs/refseqBuild/GC[AF]/123/456/789/asmId/ -sourceDir dir Find assembly in dir instead of default: $sourceDir/GC[AF]/123/456/789/asmId -ucscNames Translate NCBI/INSDC/RefSeq names to UCSC names default is to use the given NCBI/INSDC/RefSeq names -asmHubName <name> directory name in: /gbdb/hubs/asmHubName -species <name> use this species designation if there is no asmId_assembly_report.txt with an 'Organism name:' entry to obtain species -rmskSpecies <name> to override default 'species' name for repeat masker the default is found in the asmId_asssembly_report.txt e.g. -rmskSpecies=viruses + -noRmsk when RepeatMasker is not possible, such as bacteria -ncbiRmsk use NCBI rm.out.gz file instead of local cluster run for repeat masking -augustusSpecies <human|chicken|zebrafish> default 'human' -xenoRefSeq </path/to/xenoRefSeqMrna> - location of xenoRefMrna.fa.gz expanded directory of mrnas/ and xenoRefMrna.sizes, default $xenoRefSeq _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'workhorse' => $workhorse, 'fileServer' => $fileServer, 'bigClusterHub' => $bigClusterHub, 'smallClusterHub' => $smallClusterHub); print STDERR " Automates build of assembly hub. Steps: @@ -181,30 +184,31 @@ } # Globals: # Command line args: asmId my ( $asmId); # Other: my ($buildDir, $secondsStart, $secondsEnd, $assemblySource); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'buildDir=s', 'sourceDir=s', 'rmskSpecies=s', + 'noRmsk', 'ncbiRmsk', 'augustusSpecies=s', 'xenoRefSeq=s', 'asmHubName=s', 'ucscNames', @HgAutomate::commonOptionSpec, ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if ($opt_dbHost); } @@ -643,32 +647,36 @@ $bossScript->add(<<_EOF_ export asmId=$asmId if [ ! -s \${asmId}.2bit -o \${asmId}_genomic.fna.gz -nt \$asmId.2bit ]; then rm -f \${asmId}_genomic.fna.gz \\ \${asmId}_genomic.fna.dups.gz \\ \${asmId}_assembly_report.txt \\ \${asmId}_rm.out.gz \\ \${asmId}_rm.run \\ \${asmId}_assembly_structure \\ \$asmId.2bit ln -s $assemblySource/\${asmId}_genomic.fna.gz . ln -s $assemblySource/\${asmId}_assembly_report.txt . + if [ -s $assemblySource/\${asmId}_rm.out.gz ]; then ln -s $assemblySource/\${asmId}_rm.out.gz . + fi + if [ -s $assemblySource/\${asmId}_rm.run ]; then ln -s $assemblySource/\${asmId}_rm.run . + fi if [ -d $assemblySource/\${asmId}_assembly_structure ]; then ln -s $assemblySource/\${asmId}_assembly_structure . fi faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit twoBitDup \$asmId.2bit > \$asmId.dups.txt if [ -s "\$asmId.dups.txt" ]; then printf "WARNING duplicate sequences found in \$asmId.2bit\\n" 1>&2 cat \$asmId.dups.txt 1>&2 awk '{print \$1}' \$asmId.dups.txt > \$asmId.remove.dups.list mv \${asmId}_genomic.fna.gz \${asmId}_genomic.fna.dups.gz faSomeRecords -exclude \${asmId}_genomic.fna.dups.gz \\ \$asmId.remove.dups.list stdout | gzip -c > \${asmId}_genomic.fna.gz rm -f \$asmId.2bit faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit fi @@ -809,37 +817,46 @@ $bossScript->add(<<_EOF_ export asmId="$asmId" if [ -s ../\$asmId.chrom.sizes ]; then printf "sequence step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); ### construct sequence when no AGP files exist if (0 == $partsDone) { printf STDERR "creating fake AGP\n"; + if ($ucscNames) { $bossScript->add(<<_EOF_ twoBitToFa ../download/\$asmId.2bit stdout | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" | gzip -c > \$asmId.fa.gz hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names _EOF_ ); } else { + $bossScript->add(<<_EOF_ +twoBitToFa ../download/\$asmId.2bit stdout | gzip -c > \$asmId.fa.gz +hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz +zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names +_EOF_ + ); + } +} else { printf STDERR "partsDone: %d\n", $partsDone; } $bossScript->add(<<_EOF_ zcat *.agp.gz | gzip > ../\$asmId.agp.gz faToTwoBit *.fa.gz ../\$asmId.2bit faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit twoBitDup ../\$asmId.unmasked.2bit > \$asmId.dups.txt if [ -s "\$asmId.dups.txt" ]; then printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\\n" 1>&2 cat \$asmId.dups.txt 1>&2 awk '{print \$1}' \$asmId.dups.txt > \$asmId.remove.dups.list mv ../\$asmId.unmasked.2bit ../\$asmId.unmasked.dups.2bit twoBitToFa ../\$asmId.unmasked.dups.2bit stdout | faSomeRecords -exclude \\ stdin \$asmId.remove.dups.list stdout | gzip -c > \$asmId.noDups.fasta.gz @@ -1077,53 +1094,59 @@ wigToBigWig \$asmId.wigVarStep.gz ../../\$asmId.chrom.sizes \$asmId.gc5Base.bw rm -f \$asmId.wigVarStep.gz touch -r ../../\$asmId.2bit \$asmId.gc5Base.bw else printf "# gc5Base step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); } # gc5Base ######################################################################### # * step: repeatMasker [workhorse] sub doRepeatMasker { + if ($noRmsk) { + &HgAutomate::verbose(1, "# -noRmsk == RepeatMasker not being run\n"); + return; + } my $runDir = "$buildDir/trackData/repeatMasker"; if ( -d "$buildDir/trackData/repeatMasker/run.cluster" ) { if ( ! -s "$buildDir/trackData/repeatMasker/faSize.rmsk.txt" ) { &HgAutomate::verbose(1, "\nERROR: step repeatmasker may be running\n"); exit 255; } } &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct repeatMasker track data"; my $bossScript = newBash HgRemoteScript("$runDir/doRepeatMasker.bash", $workhorse, $runDir, $whatItDoes); my $rmskOpts = ""; if ($ncbiRmsk) { + if ( -s "$buildDir/download/${asmId}_rm.out.gz" ) { $rmskOpts = " \\ -ncbiRmsk=\"$buildDir/download/${asmId}_rm.out.gz\" "; if ($ucscNames) { $rmskOpts .= " \\ -liftSpec=\"$buildDir/sequence/$asmId.ncbiToUcsc.lift\""; } } + } $bossScript->add(<<_EOF_ export asmId=$asmId if [ $buildDir/\$asmId.2bit -nt faSize.rmsk.txt ]; then export species=`echo $rmskSpecies | sed -e 's/_/ /g;'` rm -f versionInfo.txt doRepeatMasker.pl -stop=mask -buildDir=`pwd` -unmaskedSeq=$buildDir/\$asmId.2bit $rmskOpts \\ -bigClusterHub=$bigClusterHub -workhorse=$workhorse -species="\$species" \$asmId if [ -s "\$asmId.fa.out" ]; then gzip \$asmId.fa.out fi @@ -1300,60 +1323,65 @@ else printf "# idKeys step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); } # doIdKeys ######################################################################### # * step: addMask [workhorse] sub doAddMask { my $runDir = "$buildDir/trackData/addMask"; my $goNoGo = 0; + if (! $noRmsk) { if ( ! -s "$buildDir/trackData/repeatMasker/$asmId.rmsk.2bit" ) { printf STDERR "ERROR: repeatMasker step not completed\n"; printf STDERR "can not find: $buildDir/trackData/repeatMasker/$asmId.rmsk.2bit\n"; $goNoGo = 1; } + } if ( ! -s "$buildDir/trackData/windowMasker/$asmId.cleanWMSdust.2bit" ) { printf STDERR "ERROR: windowMasker step not completed\n"; printf STDERR "can not find: $buildDir/trackData/windowMasker/$asmId.cleanWMSdust.2bit\n"; $goNoGo = 1; } if ( ! -s "$buildDir/trackData/simpleRepeat/trfMask.bed.gz" ) { printf STDERR "ERROR: simpleRepeat step not completed\n"; printf STDERR "can not find: $buildDir/trackData/simpleRepeat/trfMask.bed.gz\n"; $goNoGo = 1; } if ($goNoGo) { printf STDERR "ERROR: must complete repeatMasker, windowMasker and simpleRepeat before addMask\n"; exit 255; } &HgAutomate::mustMkdir($runDir); my $whatItDoes = "add together (windowMasker or repeatMasker) and trf/simpleRepeats to construct masked 2bit file"; my $bossScript = newBash HgRemoteScript("$runDir/doAddMask.bash", $workhorse, $runDir, $whatItDoes); my $wmMasked=`grep "masked total" $buildDir/trackData/windowMasker/faSize.$asmId.cleanWMSdust.txt | awk '{print \$1}' | sed -e 's/%//;'`; - my $rmMasked=`grep "masked total" $buildDir/trackData/repeatMasker/faSize.rmsk.txt | awk '{print \$1}' | sed -e 's/%//;'`; + my $rmMasked = 0; + if (! $noRmsk) { + $rmMasked=`grep "masked total" $buildDir/trackData/repeatMasker/faSize.rmsk.txt | awk '{print \$1}' | sed -e 's/%//;'`; + } my $src2BitToMask = "../repeatMasker/$asmId.rmsk.2bit"; - if ($wmMasked > $rmMasked) { + if ($noRmsk || ($wmMasked > $rmMasked)) { $src2BitToMask = "../windowMasker/$asmId.cleanWMSdust.2bit"; } $bossScript->add(<<_EOF_ export asmId=$asmId if [ ../simpleRepeat/trfMask.bed.gz -nt \$asmId.masked.faSize.txt ]; then twoBitMask $src2BitToMask -type=.bed \\ -add ../simpleRepeat/trfMask.bed.gz \$asmId.masked.2bit twoBitToFa \$asmId.masked.2bit stdout | faSize stdin > \$asmId.masked.faSize.txt touch -r \$asmId.masked.2bit \$asmId.masked.faSize.txt cp -p \$asmId.masked.faSize.txt ../../\$asmId.faSize.txt else printf "# addMask step previously completed\\n" 1>&2 exit 0 @@ -1378,36 +1406,40 @@ $bossScript->add(<<_EOF_ export asmId=$asmId ### if [ ../../\$asmId.unmasked.2bit -nt fb.\$asmId.rmsk.windowmaskerSdust.txt ]; then if [ ../../\$asmId.unmasked.2bit -nt faSize.\$asmId.cleanWMSdust.txt ]; then \$HOME/kent/src/hg/utils/automation/doWindowMasker.pl -stop=twobit -buildDir=`pwd` -dbHost=$dbHost \\ -workhorse=$workhorse -unmaskedSeq=$buildDir/\$asmId.unmasked.2bit \$asmId bedInvert.pl ../../\$asmId.chrom.sizes ../allGaps/\$asmId.allGaps.bed.gz \\ > not.gap.bed bedIntersect -minCoverage=0.0000000014 windowmasker.sdust.bed \\ not.gap.bed stdout | sort -k1,1 -k2,2n > cleanWMask.bed twoBitMask $buildDir/\$asmId.unmasked.2bit cleanWMask.bed \\ \$asmId.cleanWMSdust.2bit twoBitToFa \$asmId.cleanWMSdust.2bit stdout \\ | faSize stdin > faSize.\$asmId.cleanWMSdust.txt + if [ -s ../repeatMasker/\$asmId.sorted.fa.out.gz ]; then zcat ../repeatMasker/\$asmId.sorted.fa.out.gz | sed -e 's/^ *//; /^\$/d;' \\ | egrep -v "^SW|^score" | awk '{printf "%s\\t%d\\t%d\\n", \$5, \$6-1, \$7}' \\ | bedSingleCover.pl stdin > rmsk.bed intersectRmskWM=`bedIntersect -minCoverage=0.0000000014 cleanWMask.bed \\ rmsk.bed stdout | bedSingleCover.pl stdin | ave -col=4 stdin \\ | grep "^total" | awk '{print \$2}' | sed -e 's/.000000//;'` + else + intersectRmskWM=0 + fi chromSize=`ave -col=2 ../../\$asmId.chrom.sizes \\ | grep "^total" | awk '{print \$2}' | sed -e 's/.000000//;'` echo \$intersectRmskWM \$chromSize | \\ awk '{ printf "%d bases of %d (%.3f%%) in intersection\\n", \$1, \$2, 100.0*\$1/\$2}' \\ > fb.\$asmId.rmsk.windowmaskerSdust.txt rm -f not.gap.bed rmsk.bed bedToBigBed -type=bed3 cleanWMask.bed ../../\$asmId.chrom.sizes \$asmId.windowMasker.bb gzip cleanWMask.bed \$HOME/kent/src/hg/utils/automation/doWindowMasker.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` -dbHost=$dbHost \\ -workhorse=$workhorse -unmaskedSeq=$buildDir/\$asmId.unmasked.2bit \$asmId else printf "# windowMasker step previously completed\\n" 1>&2 exit 0 fi _EOF_ @@ -1861,30 +1893,31 @@ if (length($species) < 1) { die "no -species specified and can not find Organism name: in $asmReport"; } } $rmskSpecies = $opt_rmskSpecies ? $opt_rmskSpecies : $species; $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies; $xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq; $ucscNames = $opt_ucscNames ? 1 : $ucscNames; # '1' == 'TRUE' $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse; $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub; $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub; $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer; $asmHubName = $opt_asmHubName ? $opt_asmHubName : $asmHubName; $ncbiRmsk = $opt_ncbiRmsk ? 1 : 0; +$noRmsk = $opt_noRmsk ? 1 : 0; die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource); printf STDERR "# buildDir: %s\n", $buildDir; printf STDERR "# sourceDir %s\n", $sourceDir; printf STDERR "# augustusSpecies %s\n", $augustusSpecies; printf STDERR "# xenoRefSeq %s\n", $xenoRefSeq; printf STDERR "# assemblySource: %s\n", $assemblySource; printf STDERR "# asmHubName %s\n", $asmHubName; printf STDERR "# rmskSpecies %s\n", $rmskSpecies; printf STDERR "# augustusSpecies %s\n", $augustusSpecies; printf STDERR "# ncbiRmsk %s\n", $ncbiRmsk ? "TRUE" : "FALSE"; printf STDERR "# ucscNames %s\n", $ucscNames ? "TRUE" : "FALSE"; # Do everything. $stepper->execute();