940dbe5d9aa8d7d58f364d2e152324f2ec4ec331 hiram Tue Jan 14 11:38:33 2020 -0800 now running up ncbiRefSeq gene tracks refs #24748 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index b39de80..fe6735a 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -38,30 +38,31 @@ { name => 'sequence', func => \&doSequence }, { name => 'assemblyGap', func => \&doAssemblyGap }, { name => 'gatewayPage', func => \&doGatewayPage }, { name => 'cytoBand', func => \&doCytoBand }, { name => 'gc5Base', func => \&doGc5Base }, { name => 'repeatMasker', func => \&doRepeatMasker }, { name => 'simpleRepeat', func => \&doSimpleRepeat }, { name => 'allGaps', func => \&doAllGaps }, { name => 'idKeys', func => \&doIdKeys }, { name => 'windowMasker', func => \&doWindowMasker }, { name => 'addMask', func => \&doAddMask }, { name => 'gapOverlap', func => \&doGapOverlap }, { name => 'tandemDups', func => \&doTandemDups }, { name => 'cpgIslands', func => \&doCpgIslands }, { name => 'ncbiGene', func => \&doNcbiGene }, + { name => 'ncbiRefSeq', func => \&doNcbiRefSeq }, { name => 'xenoRefGene', func => \&doXenoRefGene }, { name => 'augustus', func => \&doAugustus }, { name => 'trackDb', func => \&doTrackDb }, { name => 'cleanup', func => \&doCleanup }, ] ); # Option defaults: my $dbHost = 'hgwdev'; my $sourceDir = "/hive/data/outside/ncbi/genomes"; my $augustusSpecies = "human"; my $xenoRefSeq = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq"; my $ucscNames = 0; # default 'FALSE' (== 0) my $asmHubName = "n/a"; # directory name in: /gbdb/hubs/asmHubName my $workhorse = "hgwdev"; # default workhorse when none chosen @@ -128,30 +129,32 @@ simpleRepeat: run trf cluster run and create bigBed file for simple repeats allGaps: calculate all actual real gaps due to N's in sequence, can be more than were specified in the AGP file idKeys: calculate md5sum for each sequence in the assembly to be used to find identical sequences 
in similar assemblies windowMasker: run windowMasker cluster run, create windowMasker bigBed file and compute intersection with repeatMasker results addMask: combine the higher masking of (windowMasker or repeatMasker) with trf simpleRepeats into one 2bit file gapOverlap: find duplicated sequence on each side of a gap tandemDups: annotate all pairs of duplicated sequence with some gap between cpgIslands: run CpG islands cluster runs for both masked and unmasked sequences and create bigBed files for this composite track ncbiGene: on RefSeq assemblies, construct a gene track from the NCBI gff3 predictions + ncbiRefSeq: on RefSeq assemblies, construct a gene track from the + NCBI gff3 predictions xenoRefSeq: map RefSeq mRNAs to the assembly to construct a 'xeno' gene prediction track augustus: run the augustus gene prediction on the assembly trackDb: create trackDb.txt file for assembly hub to include all constructed bigBed and bigWig tracks cleanup: Removes or compresses intermediate files. (NOOP at this time !) All operations are performed in the build directory which is $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/template.\$date unless -buildDir is given. "; # Detailed help (-help): print STDERR " Assumptions: 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for database/assembly \$db. 2. 
$HgAutomate::clusterData/\$db/chrom.sizes contains all sequence names and sizes from @@ -1458,30 +1461,69 @@ exit 255 fi touch -r\$gffFile \$asmId.ncbiGene.bb bigBedInfo \$asmId.ncbiGene.bb | egrep "^itemCount:|^basesCovered:" \\ | sed -e 's/,//g' > \$asmId.ncbiGene.stats.txt LC_NUMERIC=en_US /usr/bin/printf "# ncbiGene %s %'d %s %'d\\n" `cat \$asmId.ncbiGene.stats.txt` | xargs echo else printf "# ncbiGene previously completed\\n" 1>&2 fi _EOF_ ); $bossScript->execute(); } # doNcbiGene ######################################################################### +# * step: ncbiRefSeq [workhorse] +sub doNcbiRefSeq { + my $runDir = "$buildDir/trackData/ncbiRefSeq"; + my $gffFile = "$assemblySource/${asmId}_genomic.gff.gz"; + if ( ! -s "${gffFile}" ) { + printf STDERR "# step ncbiRefSeq no gff file found at:\n# %s\n", $gffFile; + return; + } + + &HgAutomate::mustMkdir($runDir); + + my $whatItDoes = "run NCBI RefSeq gene procedures"; + my $bossScript = newBash HgRemoteScript("$runDir/doNcbiRefSeq.bash", + $workhorse, $runDir, $whatItDoes); + + $bossScript->add(<<_EOF_ +export asmId="$asmId" +export buildDir="$buildDir" +export liftFile="\$buildDir/sequence/\$asmId.ncbiToUcsc.lift" +export target2bit="\$buildDir/\$asmId.2bit" + +if [ $buildDir/\$asmId.2bit -nt \$asmId.ncbiRefSeq.bb ]; then + +~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \\ + -bigClusterHub=$bigClusterHub -dbHost=$dbHost \\ + -liftFile="\$liftFile" \\ + -target2bit="\$target2bit" \\ + -stop=load -fileServer=$fileServer -smallClusterHub=$smallClusterHub -workhorse=$workhorse \\ + $genbankRefseq $subGroup $species \\ + \$asmId \$asmId +else + printf "# ncbiRefSeq previously completed\\n" 1>&2 +fi +_EOF_ + ); + $bossScript->execute(); +} # doNcbiRefSeq + +######################################################################### # * step: augustus [workhorse] sub doAugustus { my $runDir = "$buildDir/trackData/augustus"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "run Augustus gene 
prediction procedures"; my $bossScript = newBash HgRemoteScript("$runDir/doAugustus.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId if [ $buildDir/\$asmId.2bit -nt \$asmId.augustus.bb ]; then time (~/kent/src/hg/utils/automation/doAugustus.pl -stop=makeGp -buildDir=`pwd` -dbHost=$dbHost \\