bffb0697d62c7c19251d617ecd0233995921afb4 hiram Thu Aug 8 21:30:15 2019 -0700 a little bit quick and dirty but might be good enough for now refs #23734 diff --git src/hg/utils/automation/doXenoRefGene.pl src/hg/utils/automation/doXenoRefGene.pl index d8c559d7..d0cbe6c 100755 --- src/hg/utils/automation/doXenoRefGene.pl +++ src/hg/utils/automation/doXenoRefGene.pl @@ -16,31 +16,30 @@ use vars @HgAutomate::commonOptionVars; use vars @HgStepManager::optionVars; use vars qw/ $opt_buildDir $opt_maskedSeq $opt_mrnas $opt_noDbGenePredCheck /; # Specify the steps supported with -continue / -stop: my $stepper = new HgStepManager( [ { name => 'splitTarget', func => \&doSplitTarget }, { name => 'blatRun', func => \&doBlatRun }, { name => 'filterPsl', func => \&doFilterPsl }, { name => 'makeGp', func => \&doMakeGp }, - { name => 'makeBigGp', func => \&doMakeBigGp }, { name => 'cleanup', func => \&doCleanup }, ] ); # Option defaults: my $bigClusterHub = 'ku'; my $workhorse = 'hgwdev'; my $dbHost = 'hgwdev'; my $defaultWorkhorse = 'hgwdev'; my $maskedSeq = "$HgAutomate::clusterData/\$db/\$db.2bit"; my $mrnas = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq"; my $noDbGenePredCheck = 1; # default yes, use -db for genePredCheck my $mrnas = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq"; my $augustusDir = "/hive/data/outside/augustus/augustus-3.3.1"; my $augustusConfig="$augustusDir/config"; @@ -66,32 +65,32 @@ -mrnas - location of xenoRefMrna.fa.gz expanded directory of mrnas/ and xenoRefMrna.sizes, default $mrnas -noDbGenePredCheck do not use -db= on genePredCheck, there is no real db _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'bigClusterHub' => $bigClusterHub, 'workhorse' => $defaultWorkhorse); print STDERR " Automates construction of a xeno RefSeq gene track from RefSeq mRNAs. Steps: splitTarget split the masked target sequence into individual fasta sequences blatRun: Run blat with the xenoRefSeq mRNAs query to target sequence filterPsl: Run pslCDnaFilter on the blat psl results - makeGp: Transform the filtered PSL into a genePred file - makeBigGp: Construct the bigGenePred from the genePred file + makeGp: Transform the filtered PSL into a genePred file and create + bigGenePred from the genePred file cleanup: Removes hard-masked fastas and output from gsBig. All operations are performed in the build directory which is $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/augustus unless -buildDir is given. "; # Detailed help (-help): print STDERR " Assumptions: 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for database/assembly \$db. " if ($detailed); print "\n"; exit $status; } @@ -240,128 +239,89 @@ pslCheck -targetSizes=\$db.chrom.sizes \\ -querySizes=/hive/data/genomes/asmHubs/VGP/xenoRefSeq/xenoRefMrna.sizes \\ \$db.xenoRefGene.psl _EOF_ ); $bossScript->execute(); } # doFilterPsl ######################################################################### # * step: make gp [workhorse] sub doMakeGp { my $runDir = $buildDir; &HgAutomate::mustMkdir($runDir); # First, make sure we're starting clean. - if (! -e "$runDir/run.augustus/run.time") { - die "doMakeGp: the previous step augustus did not complete \n" . - "successfully ($buildDir/run.augustus/run.time does not exist).\nPlease " . - "complete the previous step: -continue=augustus\n"; - } elsif (-e "$runDir/$db.augustus.bb" ) { + if (! -e "$runDir/$db.xenoRefGene.psl") { + die "doMakeGp: the previous step filterPsl did not complete \n" . + "successfully ($buildDir/$db.xenoRefGene.psl does not exist).\nPlease " . + "complete the previous step: -continue=filterPsl\n"; + } elsif (-e "$runDir/$db.xenoRefGene.bb" ) { die "doMakeGp: looks like this was run successfully already\n" . - "($db.augustus.bb exists). Either run with -continue load or cleanup\n" . - "or move aside/remove $runDir/$db.augustus.bb\nand run again.\n"; + "($db.xenoRefGene.bb exists). Either run with -continue cleanup\n" . + "or move aside/remove $runDir/$db.xenoRefGene.bb\nand run again.\n"; } - my $whatItDoes = "Makes genePred file from augustus gff output."; + my $whatItDoes = "Makes bigGenePred.bb file from filterPsl output."; my $bossScript = newBash HgRemoteScript("$runDir/makeGp.bash", $workhorse, $runDir, $whatItDoes); - my $dbCheck = "-db=$db"; - $dbCheck = "" if (0 == $noDbGenePredCheck); - $bossScript->add(<<_EOF_ export db=$db -find ./run.augustus/gtf -type f | grep ".gtf.gz\$" \\ - | sed -e 's#/# _D_ #g; s#\\.# _dot_ #g;' \\ - | sort -k11,11 -k13,13n \\ - | sed -e 's# _dot_ #.#g; s# _D_ #/#g' | xargs zcat \\ - | $augustusDir/scripts/join_aug_pred.pl \\ - | grep -P "\\t(CDS|exon|stop_codon|start_codon|tts|tss)\\t" \\ - > \$db.augustus.gtf -gtfToGenePred -genePredExt -infoOut=\$db.info \$db.augustus.gtf \$db.augustus.gp -genePredCheck $dbCheck \$db.augustus.gp -genePredToBigGenePred \$db.augustus.gp stdout | sort -k1,1 -k2,2n > \$db.augustus.bgp -bedToBigBed -type=bed12+8 -tab -as=$ENV{'HOME'}/kent/src/hg/lib/bigGenePred.as \$db.augustus.bgp partition/\$db.chrom.sizes \$db.augustus.bb -getRnaPred -genePredExt -keepMasking -genomeSeqs=$maskedSeq \$db \$db.augustus.gp all \$db.augustusGene.rna.fa -getRnaPred -genePredExt -peptides -genomeSeqs=$maskedSeq \$db \$db.augustus.gp all \$db.augustusGene.faa +grep NR_ \$db.xenoRefGene.psl > NR.psl +grep NM_ \$db.xenoRefGene.psl > NM.psl +mrnaToGene -cdsDb=hgFixed NM.psl NM.gp +mrnaToGene -noCds NR.psl NR.gp +cat NM.gp NR.gp | genePredSingleCover stdin \$db.xenoRefGene.gp +genePredCheck -db=\$db -chromSizes=\$db.chrom.sizes \$db.xenoRefGene.gp +genePredToBigGenePred -geneNames=$mrnas/geneOrgXref.txt \$db.xenoRefGene.gp \\ + stdout | sort -k1,1 -k2,2n > \$db.bgpInput +bedToBigBed -type=bed12+8 -tab -as=\$HOME/kent/src/hg/lib/bigGenePred.as \\ + \$db.bgpInput \$db.chrom.sizes \$db.xenoRefGene.bb _EOF_ ); $bossScript->execute(); } # doMakeGp ######################################################################### -# * step: load [dbHost] -sub doLoadAugustus { - my $runDir = $buildDir; - &HgAutomate::mustMkdir($runDir); - my $tableName = "augustusGene"; - - if (! -e "$runDir/$db.augustus.gp") { - die "doLoadAugustus: the previous step makeGp did not complete \n" . - "successfully ($db.augustus.gp does not exists).\nPlease " . - "complete the previous step: -continue=-makeGp\n"; - } elsif (-e "$runDir/fb.$db.$tableName.txt" ) { - die "doLoadAugustus: looks like this was run successfully already\n" . - "(fb.$db.$tableName.txt exists). Either run with -continue cleanup\n" . - "or move aside/remove\n$runDir/fb.$db.$tableName.txt and run again.\n"; - } - my $whatItDoes = "Loads $db.augustus.gp."; - my $bossScript = newBash HgRemoteScript("$runDir/loadAugustus.bash", $dbHost, - $runDir, $whatItDoes); - - my $dbCheck = "-db=$db"; - $dbCheck = "" if (0 == $noDbGenePredCheck); - - $bossScript->add(<<_EOF_ -export db="$db" -export table="$tableName" -genePredCheck $dbCheck \$db.augustus.gp -hgLoadGenePred \$db -genePredExt \$table \$db.augustus.gp -genePredCheck $dbCheck \$table -featureBits \$db \$table > fb.\$db.\$table.txt 2>&1 -checkTableCoords -verboseBlocks -table=\$table \$db -cat fb.\$db.\$table.txt -_EOF_ - ); - $bossScript->execute(); -} # doLoad - -######################################################################### # * step: cleanup [workhorse] sub doCleanup { my $runDir = $buildDir; - if (-e "$runDir/augustus.gtf.gz" ) { + if (-e "$runDir/$db.xenoRefGene.gp.gz" ) { die "doCleanup: looks like this was run successfully already\n" . - "(augustus.gtf.gz exists). Investigate the run directory:\n" . + "($db.xenoRefGene.gp.gz exists). Investigate the run directory:\n" . " $runDir/\n"; } my $whatItDoes = "It cleans up or compresses intermediate files."; my $bossScript = newBash HgRemoteScript("$runDir/cleanup.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export db="$db" -rm -fr $buildDir/fasta -rm -fr $buildDir/run.augustus/err/ -rm -f $buildDir/run.augustus/batch.bak -rm -fr $buildDir/run.augustus/augErr -rm -f $buildDir/\$db.augustus.bgp -gzip $buildDir/\$db.augustus.gtf -gzip $buildDir/\$db.augustus.gp -gzip $buildDir/\$db.augustusGene.rna.fa -gzip $buildDir/\$db.augustusGene.faa +rm -fr $buildDir/target/ +rm -fr $buildDir/blatRun/err/ +rm -fr $buildDir/blatRun/result/ +rm -f $buildDir/blatRun/batch.bak +rm -f $buildDir/NM.gp +rm -f $buildDir/NR.gp +rm -f $buildDir/NM.psl +rm -f $buildDir/NR.psl +rm -f $buildDir/\$db.bgpInput +gzip $buildDir/\$db.all.psl & +gzip $buildDir/\$db.xenoRefGene.psl & +gzip $buildDir/\$db.xenoRefGene.gp +wait _EOF_ ); $bossScript->execute(); } # doCleanup ######################################################################### # main # Prevent "Suspended (tty input)" hanging: &HgAutomate::closeStdin(); # Make sure we have valid options and exactly 1 argument: &checkOptions(); &usage(1) if (scalar(@ARGV) != 1); $secondsStart = `date "+%s"`;