2b42d05e58a971f49afdd6d52c2ad415e165add0 hiram Tue Jul 6 12:37:28 2021 -0700 create a GTF file for the xenoRefGene result refs #27512 diff --git src/hg/utils/automation/doXenoRefGene.pl src/hg/utils/automation/doXenoRefGene.pl index fc2ca9a..8aa9818 100755 --- src/hg/utils/automation/doXenoRefGene.pl +++ src/hg/utils/automation/doXenoRefGene.pl @@ -28,32 +28,30 @@ { name => 'blatRun', func => \&doBlatRun }, { name => 'filterPsl', func => \&doFilterPsl }, { name => 'makeGp', func => \&doMakeGp }, { name => 'cleanup', func => \&doCleanup }, ] ); # Option defaults: my $bigClusterHub = 'ku'; my $workhorse = 'hgwdev'; my $dbHost = 'hgwdev'; my $defaultWorkhorse = 'hgwdev'; my $maskedSeq = "$HgAutomate::clusterData/\$db/\$db.2bit"; my $mrnas = "/hive/data/genomes/asmHubs/xenoRefSeq"; my $noDbGenePredCheck = 1; # default yes, use -db for genePredCheck -my $augustusDir = "/hive/data/outside/augustus/augustus-3.3.1"; -my $augustusConfig="$augustusDir/config"; my $base = $0; $base =~ s/^(.*\/)?//; sub usage { # Usage / help / self-documentation: my ($status, $detailed) = @_; # Basic help (for incorrect usage): print STDERR " usage: $base db options: "; print STDERR $stepper->getOptionHelp(); print STDERR <<_EOF_ -buildDir dir Use dir instead of default @@ -68,31 +66,31 @@ _EOF_ ; print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, 'bigClusterHub' => $bigClusterHub, 'workhorse' => $defaultWorkhorse); print STDERR " Automates construction of a xeno RefSeq gene track from RefSeq mRNAs. Steps: splitTarget split the masked target sequence into individual fasta sequences blatRun: Run blat with the xenoRefSeq mRNAs query to target sequence filterPsl: Run pslCDnaFilter on the blat psl results makeGp: Transform the filtered PSL into a genePred file and create bigGenePred from the genePred file cleanup: Removes hard-masked fastas and output from gsBig. All operations are performed in the build directory which is -$HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/augustus unless -buildDir is given. +$HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/xenoRefGene unless -buildDir is given. "; # Detailed help (-help): print STDERR " Assumptions: 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for database/assembly \$db. " if ($detailed); print "\n"; exit $status; } # Globals: # Command line args: db my ($db); @@ -277,129 +275,140 @@ if (! -e "$runDir/$db.xenoRefGene.psl") { die "doMakeGp: the previous step filterPsl did not complete \n" . "successfully ($buildDir/$db.xenoRefGene.psl does not exist).\nPlease " . "complete the previous step: -continue=filterPsl\n"; } elsif (-e "$runDir/$db.xenoRefGene.bb" ) { die "doMakeGp: looks like this was run successfully already\n" . "($db.xenoRefGene.bb exists). Either run with -continue cleanup\n" . "or move aside/remove $runDir/$db.xenoRefGene.bb\nand run again.\n"; } my $whatItDoes = "Makes bigGenePred.bb file from filterPsl output."; my $bossScript = newBash HgRemoteScript("$runDir/makeGp.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ -export db=$db +export db="$db" +export buildDir="$buildDir" + if [ -s "\$db.xenoRefGene.psl" ]; then grep NR_ \$db.xenoRefGene.psl > NR.psl grep NM_ \$db.xenoRefGene.psl > NM.psl mrnaToGene -cdsDb=hgFixed NM.psl NM.gp mrnaToGene -noCds NR.psl NR.gp cat NM.gp NR.gp | genePredSingleCover stdin \$db.xenoRefGene.gp genePredCheck -db=\$db -chromSizes=\$db.chrom.sizes \$db.xenoRefGene.gp genePredToBed \$db.xenoRefGene.gp stdout \\ | bedToExons stdin stdout | bedSingleCover.pl stdin > \$db.exons.bed export baseCount=`awk '{sum+=\$3-\$2}END{printf "%d", sum}' \$db.exons.bed` export asmSizeNoGaps=`grep sequences ../../\$db.faSize.txt | awk '{print \$5}'` export perCent=`echo \$baseCount \$asmSizeNoGaps | awk '{printf "%.3f", 100.0*\$1/\$2}'` printf "%d bases of %d (%s%%) in intersection\\n" "\$baseCount" "\$asmSizeNoGaps" "\$perCent" > fb.\$db.xenoRefGene.txt rm -f \$db.exons.bed genePredToBigGenePred -geneNames=$mrnas/geneOrgXref.txt \$db.xenoRefGene.gp \\ stdout | sort -k1,1 -k2,2n > \$db.bgpInput sed -e 's#Alternative/human readable gene name#species of origin of the mRNA#; s#Name or ID of item, ideally both human readable and unique#RefSeq accession id#; s#Primary identifier for gene#gene name#;' \\ \$HOME/kent/src/hg/lib/bigGenePred.as > xenoRefGene.as bedToBigBed -extraIndex=name,geneName -type=bed12+8 -tab -as=xenoRefGene.as \\ \$db.bgpInput \$db.chrom.sizes \$db.xenoRefGene.bb \$HOME/kent/src/hg/utils/automation/xenoRefGeneIx.pl \$db.bgpInput | sort -u > \$db.ix.txt ixIxx \$db.ix.txt \$db.xenoRefGene.ix \$db.xenoRefGene.ixx + mkdir -p /dev/shm/\$db + cp -p \$db.xenoRefGene.gp /dev/shm/\$db/xenoRefGene.\$db + cd /dev/shm/\$db + genePredToGtf -utr file xenoRefGene.\$db stdout | gzip -c \\ + > \$buildDir/\$db.xenoRefGene.gtf.gz + cd \$buildDir + rm -f /dev/shm/\$db/xenoRefGene.\$db + rmdir /dev/shm/\$db fi _EOF_ ); $bossScript->execute(); } # doMakeGp ######################################################################### # * step: cleanup [workhorse] sub doCleanup { my $runDir = $buildDir; if (-e "$runDir/$db.xenoRefGene.gp.gz" ) { die "doCleanup: looks like this was run successfully already\n" . "($db.xenoRefGene.gp.gz exists). Investigate the run directory:\n" . " $runDir/\n"; } my $whatItDoes = "It cleans up or compresses intermediate files."; my $bossScript = newBash HgRemoteScript("$runDir/cleanup.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export db="$db" -rm -fr $buildDir/target/ -rm -fr $buildDir/blatRun/err/ -rm -fr $buildDir/blatRun/result/ -rm -f $buildDir/blatRun/batch.bak -rm -f $buildDir/NM.gp -rm -f $buildDir/NR.gp -rm -f $buildDir/NM.psl -rm -f $buildDir/NR.psl -if [ -s "$buildDir/\$db.bgpInput" ]; then - gzip $buildDir/\$db.bgpInput & +export buildDir="$buildDir" +rm -fr \$buildDir/target/ +rm -fr \$buildDir/blatRun/err/ +rm -fr \$buildDir/blatRun/result/ +rm -f \$buildDir/blatRun/batch.bak +rm -f \$buildDir/NM.gp +rm -f \$buildDir/NR.gp +rm -f \$buildDir/NM.psl +rm -f \$buildDir/NR.psl +if [ -s "\$buildDir/\$db.bgpInput" ]; then + gzip \$buildDir/\$db.bgpInput & fi -if [ -s "$buildDir/\$db.ix.txt" ]; then - gzip $buildDir/\$db.ix.txt & +if [ -s "\$buildDir/\$db.ix.txt" ]; then + gzip \$buildDir/\$db.ix.txt & fi -if [ -s "$buildDir/\$db.all.psl" ]; then - gzip $buildDir/\$db.all.psl & +if [ -s "\$buildDir/\$db.all.psl" ]; then + gzip \$buildDir/\$db.all.psl & else - rm -f $buildDir/\$db.all.psl + rm -f \$buildDir/\$db.all.psl fi -if [ -s "$buildDir/\$db.xenoRefGene.psl" ]; then - gzip $buildDir/\$db.xenoRefGene.psl & +if [ -s "\$buildDir/\$db.xenoRefGene.psl" ]; then + gzip \$buildDir/\$db.xenoRefGene.psl & else - rm -f $buildDir/\$db.xenoRefGene.psl + rm -f \$buildDir/\$db.xenoRefGene.psl fi -if [ -s "$buildDir/\$db.xenoRefGene.gp" ]; then -gzip $buildDir/\$db.xenoRefGene.gp +if [ -s "\$buildDir/\$db.xenoRefGene.gp" ]; then +gzip \$buildDir/\$db.xenoRefGene.gp fi wait _EOF_ ); $bossScript->execute(); } # doCleanup ######################################################################### # main # Prevent "Suspended (tty input)" hanging: &HgAutomate::closeStdin(); # Make sure we have valid options and exactly 1 argument: &checkOptions(); &usage(1) if (scalar(@ARGV) != 1); $secondsStart = `date "+%s"`; chomp $secondsStart; ($db) = @ARGV; # Force debug and verbose until this is looking pretty solid: #$opt_debug = 1; #$opt_verbose = 3 if ($opt_verbose < 3); $noDbGenePredCheck = $opt_noDbGenePredCheck ? 0 : $noDbGenePredCheck; # Establish what directory we will work in. $buildDir = $opt_buildDir ? $opt_buildDir : - "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/augustus"; + "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/xenoRefGene"; $maskedSeq = $opt_maskedSeq ? $opt_maskedSeq : "$HgAutomate::clusterData/$db/$db.2bit"; $mrnas = $opt_mrnas ? $opt_mrnas : $mrnas; # Do everything. $stepper->execute(); # Tell the user anything they should know. my $stopStep = $stepper->getStopStep(); my $upThrough = ($stopStep eq 'cleanup') ? "" : " (through the '$stopStep' step)"; $secondsEnd = `date "+%s"`; chomp $secondsEnd; my $elapsedSeconds = $secondsEnd - $secondsStart;