565a1be447a21c7cc142eb1aa6571387aecd3b37 hiram Thu Aug 8 21:31:28 2019 -0700 adding doXenoRefGene procedure refs #23734 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index 2aab979..56bab0f 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -15,61 +15,64 @@ use warnings; use strict; use FindBin qw($Bin); use lib "$Bin"; use HgAutomate; use HgRemoteScript; use HgStepManager; # Option variable names, both common and peculiar to this script: use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_buildDir $opt_sourceDir $opt_augustusSpecies + $opt_xenoRefSeq $opt_ucscNames /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'download', func => \&doDownload }, { name => 'sequence', func => \&doSequence }, { name => 'assemblyGap', func => \&doAssemblyGap }, { name => 'gatewayPage', func => \&doGatewayPage }, { name => 'cytoBand', func => \&doCytoBand }, { name => 'gc5Base', func => \&doGc5Base }, { name => 'repeatMasker', func => \&doRepeatMasker }, { name => 'simpleRepeat', func => \&doSimpleRepeat }, { name => 'allGaps', func => \&doAllGaps }, { name => 'idKeys', func => \&doIdKeys }, { name => 'windowMasker', func => \&doWindowMasker }, { name => 'addMask', func => \&doAddMask }, { name => 'gapOverlap', func => \&doGapOverlap }, { name => 'tandemDups', func => \&doTandemDups }, { name => 'cpgIslands', func => \&doCpgIslands }, { name => 'ncbiGene', func => \&doNcbiGene }, + { name => 'xenoRefGene', func => \&doXenoRefGene }, { name => 'augustus', func => \&doAugustus }, { name => 'trackDb', func => \&doTrackDb }, { name => 'cleanup', func => \&doCleanup }, ] ); # Option defaults: my $dbHost = 'hgwdev'; my $sourceDir = "/hive/data/outside/ncbi/genomes"; my $augustusSpecies = "human"; +my $xenoRefSeq = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq"; my $ucscNames = 0; # default 'FALSE' (== 0) my $workhorse = "hgwdev"; # default workhorse when none chosen my $fileServer = "hgwdev"; # default when none chosen my $bigClusterHub = "ku"; # default when none chosen my $smallClusterHub = "ku"; # default when none chosen my $base = $0; $base =~ s/^(.*\/)?//; sub usage { # Usage / help / self-documentation: my ($status, $detailed) = @_; # Basic help (for incorrect usage): print STDERR " usage: $base [options] genbank|refseq subGroup species asmId @@ -81,64 +84,74 @@ - Homo_sapiens Mus_musculus etc... asmId - assembly identifier at NCBI FTP site, examples: - GCF_000001405.32_GRCh38.p6 GCF_000001635.24_GRCm38.p4 etc.. options: "; print STDERR $stepper->getOptionHelp(); print STDERR <<_EOF_ -buildDir dir Construct assembly hub in dir instead of default $HgAutomate::clusterData/asmHubs/{genbank|refseq}/subGroup/species/asmId/ -sourceDir dir Find assembly in dir instead of default: $sourceDir//subGroup/species/all_assembly_versions/asmId -ucscNames Translate NCBI/INSDC/RefSeq names to UCSC names default is to use the given NCBI/INSDC/RefSeq names -augustusSpecies default 'human' + -xenoRefSeq - location of xenoRefMrna.fa.gz + expanded directory of mrnas/ and xenoRefMrna.sizes, default + $xenoRefSeq _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'workhorse' => $workhorse, 'fileServer' => $fileServer, 'bigClusterHub' => $bigClusterHub, 'smallClusterHub' => $smallClusterHub); print STDERR " Automates build of assembly hub. Steps: download: sets up sym link working hierarchy from already mirrored files from NCBI in: $sourceDir/{genbank|refseq}/ sequence: establish AGP and 2bit file from NCBI directory assemblyGap: create assembly and gap bigBed files and indexes for assembly track names gatewayPage: create html/asmId.description.html contents cytoBand: create cytoBand track and navigation ideogram gc5Base: create bigWig file for gc5Base track repeatMasker: run repeat masker cluster run and create bigBed files for the composite track categories of repeats simpleRepeat: run trf cluster run and create bigBed file for simple repeats allGaps: calculate all actual real gaps due to N's in sequence, can be more than were specified in the AGP file idKeys: calculate md5sum for each sequence in the assembly to be used to find identical sequences in similar assemblies windowMasker: run windowMasker cluster run, create windowMasker bigBed file and compute intersection with repeatMasker results addMask: combine the higher masking of (windowMasker or repeatMasker) with trf simpleRepeats into one 2bit file + gapOverlap: find duplicated sequence on each side of a gap + tandemDups: annotate all pairs of duplicated sequence with some gap between cpgIslands: run CpG islands cluster runs for both masked and unmasked sequences and create bigBed files for this composite track + ncbiGene: on RefSeq assemblies, construct a gene track from the + NCBI gff3 predictions + xenoRefSeq: map RefSeq mRNAs to the assembly to construct a 'xeno' + gene prediction track + augustus: run the augustus gene prediction on the assembly trackDb: create trackDb.txt file for assembly hub to include all constructed bigBed and bigWig tracks - cleanup: Removes or compresses intermediate files. + cleanup: Removes or compresses intermediate files. (NOOP at this time !) All operations are performed in the build directory which is $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/template.\$date unless -buildDir is given. "; # Detailed help (-help): print STDERR " Assumptions: 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for database/assembly \$db. 2. $HgAutomate::clusterData/\$db/chrom.sizes contains all sequence names and sizes from \$db.2bit. 3. The \$db.2bit files have already been distributed to cluster-scratch (/scratch/hg or /iscratch, /san etc). 4. template? " if ($detailed); print "\n"; @@ -146,30 +159,31 @@ } # Globals: # Command line args: genbankRefseq subGroup species asmId my ($genbankRefseq, $subGroup, $species, $asmId); # Other: my ($buildDir, $secondsStart, $secondsEnd, $assemblySource); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'buildDir=s', 'sourceDir=s', 'augustusSpecies=s', + 'xenoRefSeq=s', 'ucscNames', @HgAutomate::commonOptionSpec, ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if ($opt_dbHost); } ######################################################################### ######################################################################### # assistant subroutines here. The 'do' steps follow this section ######################################################################### @@ -1231,61 +1245,86 @@ printf "# ncbiGene: failing bedToBigBed\\n" 1>&2 exit 255 fi touch -r\$gffFile \$asmId.ncbiGene.bb bigBedInfo \$asmId.ncbiGene.bb | egrep "^itemCount:|^basesCovered:" \\ | sed -e 's/,//g' > \$asmId.ncbiGene.stats.txt LC_NUMERIC=en_US /usr/bin/printf "# ncbiGene %s %'d %s %'d\\n" `cat \$asmId.ncbiGene.stats.txt` | xargs echo else printf "# ncbiGene previously completed\\n" 1>&2 fi _EOF_ ); $bossScript->execute(); } # doNcbiGene - ######################################################################### # * step: augustus [workhorse] sub doAugustus { my $runDir = "$buildDir/trackData/augustus"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "run Augustus gene prediction procedures"; my $bossScript = newBash HgRemoteScript("$runDir/doAugustus.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId if [ $buildDir/\$asmId.2bit -nt \$asmId.augustus.bb ]; then time (~/kent/src/hg/utils/automation/doAugustus.pl -stop=makeGp -buildDir=`pwd` -dbHost=$dbHost \\ -bigClusterHub=$bigClusterHub -species=$augustusSpecies -workhorse=$workhorse \\ -noDbGenePredCheck -maskedSeq=$buildDir/\$asmId.2bit \$asmId) > makeDb.log 2>&1 time (~/kent/src/hg/utils/automation/doAugustus.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` -dbHost=$dbHost \\ -bigClusterHub=$bigClusterHub -species=$augustusSpecies -workhorse=$workhorse \\ -noDbGenePredCheck -maskedSeq=$buildDir/\$asmId.2bit \$asmId) > cleanup.log 2>&1 else printf "# augustus genes previously completed\\n" 1>&2 fi _EOF_ ); $bossScript->execute(); } # doAugustus ######################################################################### +# * step: xenoRefGene [bigClusterHub] +sub doXenoRefGene { + my $runDir = "$buildDir/trackData/xenoRefGene"; + + &HgAutomate::mustMkdir($runDir); + + my $whatItDoes = "run xeno RefSeq gene mapping procedures"; + my $bossScript = newBash HgRemoteScript("$runDir/doXenoRefGene.bash", + $workhorse, $runDir, $whatItDoes); + + $bossScript->add(<<_EOF_ +export asmId=$asmId + +if [ $buildDir/\$asmId.2bit -nt \$asmId.xenoRefGene.bb ]; then + time (~/kent/src/hg/utils/automation/doXenoRefGene.pl -buildDir=`pwd` -dbHost=$dbHost \\ + -bigClusterHub=$bigClusterHub -mrnas=$xenoRefSeq -workhorse=$workhorse \\ + -maskedSeq=$buildDir/trackData/addMask/\$asmId.masked.2bit \$asmId) > do.log 2>&1 +else + printf "# xenoRefGene previously completed\\n" 1>&2 +fi +_EOF_ + ); + $bossScript->execute(); +} # doXenoRefGene + +######################################################################### # * step: trackDb [workhorse] sub doTrackDb { my $runDir = "$buildDir"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct asmId.trackDb.txt file"; my $bossScript = newBash HgRemoteScript("$runDir/doTrackDb.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId \$HOME/kent/src/hg/utils/automation/asmHubTrackDb.sh $genbankRefseq \$asmId $runDir \\ > \$asmId.trackDb.txt @@ -1321,42 +1360,44 @@ chomp $secondsStart; # expected command line arguments after options are processed ($genbankRefseq, $subGroup, $species, $asmId) = @ARGV; # Force debug and verbose until this is looking pretty solid: # $opt_debug = 1; # $opt_verbose = 3 if ($opt_verbose < 3); # Establish what directory we will work in. $buildDir = $opt_buildDir ? $opt_buildDir : "$HgAutomate::clusterData/asmHubs/$genbankRefseq/$subGroup/$species/$asmId"; $sourceDir = $opt_sourceDir ? $opt_sourceDir : $sourceDir; $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies; +$xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq; $ucscNames = $opt_ucscNames ? 1 : $ucscNames; # '1' == 'TRUE' $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse; $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub; $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub; $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer; $assemblySource = $opt_sourceDir ? "$sourceDir" : "$sourceDir/$genbankRefseq/$subGroup/$species/all_assembly_versions/$asmId"; die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource); printf STDERR "# buildDir: %s\n", $buildDir; printf STDERR "# sourceDir %s\n", $sourceDir; printf STDERR "# augustusSpecies %s\n", $augustusSpecies; +printf STDERR "# xenoRefSeq %s\n", $xenoRefSeq; printf STDERR "# assemblySource: %s\n", $assemblySource; # Do everything. $stepper->execute(); # Tell the user anything they should know. my $stopStep = $stepper->getStopStep(); my $upThrough = ($stopStep eq 'cleanup') ? "" : " (through the '$stopStep' step)"; $secondsEnd = `date "+%s"`; chomp $secondsEnd; my $elapsedSeconds = $secondsEnd - $secondsStart; my $elapsedMinutes = int($elapsedSeconds/60); $elapsedSeconds -= $elapsedMinutes * 60;