623b6e9bb69c43d27321823c079d9367582c75e9 hiram Thu Jan 30 17:19:40 2020 -0800 add rmskSpecies argument and protect against empty xenoRefGene result refs #23891 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index 8a1f131..05d1cb2 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -14,30 +14,31 @@ use File::stat; use warnings; use strict; use FindBin qw($Bin); use lib "$Bin"; use HgAutomate; use HgRemoteScript; use HgStepManager; # Option variable names, both common and peculiar to this script: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_buildDir $opt_sourceDir + $opt_rmskSpecies $opt_augustusSpecies $opt_xenoRefSeq $opt_ucscNames $opt_asmHubName /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'download', func => \&doDownload }, { name => 'sequence', func => \&doSequence }, { name => 'assemblyGap', func => \&doAssemblyGap }, { name => 'gatewayPage', func => \&doGatewayPage }, { name => 'cytoBand', func => \&doCytoBand }, { name => 'gc5Base', func => \&doGc5Base }, { name => 'repeatMasker', func => \&doRepeatMasker }, @@ -49,30 +50,31 @@ { name => 'gapOverlap', func => \&doGapOverlap }, { name => 'tandemDups', func => \&doTandemDups }, { name => 'cpgIslands', func => \&doCpgIslands }, { name => 'ncbiGene', func => \&doNcbiGene }, { name => 'ncbiRefSeq', func => \&doNcbiRefSeq }, { name => 'xenoRefGene', func => \&doXenoRefGene }, { name => 'augustus', func => \&doAugustus }, { name => 'trackDb', func => \&doTrackDb }, { name => 'cleanup', func => \&doCleanup }, ] ); # Option defaults: my $dbHost = 'hgwdev'; my $sourceDir = "/hive/data/outside/ncbi/genomes"; +my $rmskSpecies = ""; my $augustusSpecies = "human"; my $xenoRefSeq = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq"; my $ucscNames = 0; # default 'FALSE' (== 0) my $asmHubName = "n/a"; # directory name in: /gbdb/hubs/asmHubName my $workhorse = "hgwdev"; # default workhorse when none chosen my $fileServer = "hgwdev"; # default when none chosen my $bigClusterHub = "ku"; # default when none chosen my $smallClusterHub = "ku"; # default when none chosen my $base = $0; $base =~ s/^(.*\/)?//; # key is original accession name from the remove.dups.list, value is 1 my %dupAccessionList = {}; @@ -90,30 +92,32 @@ - Homo_sapiens Mus_musculus etc... asmId - assembly identifier at NCBI FTP site, examples: - GCF_000001405.32_GRCh38.p6 GCF_000001635.24_GRCm38.p4 etc.. options: "; print STDERR $stepper->getOptionHelp(); print STDERR <<_EOF_ -buildDir dir Construct assembly hub in dir instead of default $HgAutomate::clusterData/asmHubs/{genbank|refseq}/subGroup/species/asmId/ -sourceDir dir Find assembly in dir instead of default: $sourceDir/<genbank|refseq>/subGroup/species/all_assembly_versions/asmId -ucscNames Translate NCBI/INSDC/RefSeq names to UCSC names default is to use the given NCBI/INSDC/RefSeq names -asmHubName <name> directory name in: /gbdb/hubs/asmHubName + -rmskSpecies to override command line 'species' name for repeat masker + e.g. -rmskSpecies=viruses -augustusSpecies <human|chicken|zebrafish> default 'human' -xenoRefSeq </path/to/xenoRefSeqMrna> - location of xenoRefMrna.fa.gz expanded directory of mrnas/ and xenoRefMrna.sizes, default $xenoRefSeq _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'workhorse' => $workhorse, 'fileServer' => $fileServer, 'bigClusterHub' => $bigClusterHub, 'smallClusterHub' => $smallClusterHub); print STDERR " Automates build of assembly hub. Steps: download: sets up sym link working hierarchy from already mirrored files from NCBI in: @@ -167,30 +171,31 @@ exit $status; } # Globals: # Command line args: genbankRefseq subGroup species asmId my ($genbankRefseq, $subGroup, $species, $asmId); # Other: my ($buildDir, $secondsStart, $secondsEnd, $assemblySource); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'buildDir=s', 'sourceDir=s', + 'rmskSpecies=s', 'augustusSpecies=s', 'xenoRefSeq=s', 'asmHubName=s', 'ucscNames', @HgAutomate::commonOptionSpec, ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if ($opt_dbHost); } ######################################################################### @@ -1018,31 +1023,31 @@ ######################################################################### # * step: repeatMasker [workhorse] sub doRepeatMasker { my $runDir = "$buildDir/trackData/repeatMasker"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct repeatMasker track data"; my $bossScript = newBash HgRemoteScript("$runDir/doRepeatMasker.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId if [ $buildDir/\$asmId.2bit -nt faSize.rmsk.txt ]; then -export species=`echo $species | sed -e 's/_/ /g;'` +export species=`echo $rmskSpecies | sed -e 's/_/ /g;'` doRepeatMasker.pl -stop=mask -buildDir=`pwd` -unmaskedSeq=$buildDir/\$asmId.2bit \\ -bigClusterHub=$bigClusterHub -workhorse=$workhorse -species="\$species" \$asmId gzip \$asmId.sorted.fa.out \$asmId.fa.out \$asmId.nestedRepeats.bed doRepeatMasker.pl -continue=cleanup -buildDir=`pwd` -unmaskedSeq=$buildDir/\$asmId.2bit \\ -bigClusterHub=$bigClusterHub -workhorse=$workhorse -species="\$species" \$asmId \$HOME/kent/src/hg/utils/automation/asmHubRepeatMasker.sh \$asmId `pwd`/\$asmId.sorted.fa.out.gz `pwd` else printf "# repeatMasker step previously completed\\n" 1>&2 exit 0 fi _EOF_ @@ -1395,31 +1400,31 @@ -smallClusterHub=$smallClusterHub -bigClusterHub=$bigClusterHub -workhorse=$workhorse \\ -maskedSeq=$buildDir/trackData/addMask/\$asmId.masked.2bit \\ -chromSizes=$buildDir/\$asmId.chrom.sizes \$asmId else printf "# cpgIslands masked previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); } # sub doCpgIslands ######################################################################### # * step: ncbiGene [workhorse] sub doNcbiGene { - my $gffFile = "$sourceDir/${asmId}_genomic.gff.gz"; + my $gffFile = "$assemblySource/${asmId}_genomic.gff.gz"; if ( ! -s "${gffFile}" ) { printf STDERR "# step ncbiGene: no gff file found at:\n# %s\n", $gffFile; return; } if ( ! -s "$buildDir/sequence/$asmId.ncbiToUcsc.lift" ) { printf STDERR "# ERROR: ncbiGene: can not find ../../sequence/$asmId.ncbiToUcsc.lift\n"; exit 255; } my $runDir = "$buildDir/trackData/ncbiGene"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "translate NCBI GFF3 gene definitions into a track"; my $bossScript = newBash HgRemoteScript("$runDir/doNcbiGene.bash", $workhorse, $runDir, $whatItDoes); @@ -1548,33 +1553,35 @@ my $runDir = "$buildDir/trackData/xenoRefGene"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "run xeno RefSeq gene mapping procedures"; my $bossScript = newBash HgRemoteScript("$runDir/doXenoRefGene.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId if [ $buildDir/\$asmId.2bit -nt \$asmId.xenoRefGene.bb ]; then time (~/kent/src/hg/utils/automation/doXenoRefGene.pl -buildDir=`pwd` -dbHost=$dbHost \\ -bigClusterHub=$bigClusterHub -mrnas=$xenoRefSeq -workhorse=$workhorse \\ -maskedSeq=$buildDir/trackData/addMask/\$asmId.masked.2bit \$asmId) > do.log 2>&1 + if [ -s "\$asmId.xenoRefGene.bb" ]; then bigBedInfo \$asmId.xenoRefGene.bb | egrep "^itemCount:|^basesCovered:" \\ | sed -e 's/,//g' > \$asmId.xenoRefGene.stats.txt LC_NUMERIC=en_US /usr/bin/printf "# xenoRefGene %s %'d %s %'d\\n" `cat \$asmId.xenoRefGene.stats.txt` | xargs echo + fi else printf "# xenoRefGene previously completed\\n" 1>&2 fi _EOF_ ); $bossScript->execute(); } # doXenoRefGene ######################################################################### # * step: trackDb [workhorse] sub doTrackDb { my $runDir = "$buildDir"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct asmId.trackDb.txt file"; @@ -1618,30 +1625,31 @@ $secondsStart = `date "+%s"`; chomp $secondsStart; # expected command line arguments after options are processed ($genbankRefseq, $subGroup, $species, $asmId) = @ARGV; # Force debug and verbose until this is looking pretty solid: # $opt_debug = 1; # $opt_verbose = 3 if ($opt_verbose < 3); # Establish what directory we will work in. $buildDir = $opt_buildDir ? $opt_buildDir : "$HgAutomate::clusterData/asmHubs/$genbankRefseq/$subGroup/$species/$asmId"; $sourceDir = $opt_sourceDir ? $opt_sourceDir : $sourceDir; +$rmskSpecies = $opt_rmskSpecies ? $opt_rmskSpecies : $species; $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies; $xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq; $ucscNames = $opt_ucscNames ? 1 : $ucscNames; # '1' == 'TRUE' $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse; $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub; $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub; $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer; $asmHubName = $opt_asmHubName ? $opt_asmHubName : $asmHubName; $assemblySource = $opt_sourceDir ? "$sourceDir" : "$sourceDir/$genbankRefseq/$subGroup/$species/all_assembly_versions/$asmId"; die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource); printf STDERR "# buildDir: %s\n", $buildDir; printf STDERR "# sourceDir %s\n", $sourceDir; printf STDERR "# augustusSpecies %s\n", $augustusSpecies;