9fb7e2a933dd0b378f251d348153730c9972ddcd hiram Wed Sep 1 15:24:05 2021 -0700 eliminate small errors in unlocalized scaffold names and genePredFilter for ncbiGene to eliminate problem gene definitions no redmine diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index b6f4a00..8c3e1ef 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -777,50 +777,50 @@ if (needsUpdate($unplacedScafAgp, $agpOutput)) { unplacedAgp($unplacedScafAgp, $agpOutput, $agpNames, $chrPrefix); `touch -r $unplacedScafAgp $agpOutput`; } my $fastaOut = "$runDir/$asmId.unplaced.fa.gz"; if (needsUpdate($twoBitFile, $fastaOut)) { unplacedFasta($unplacedScafAgp, $twoBitFile, $chrPrefix, $fastaOut); `touch -r $twoBitFile $fastaOut`; } } ########### non-nuclear chromosome sequence ################ my $nonNucAsm = "$buildDir/download/${asmId}_assembly_structure/non-nuclear"; my $nonNucChr2acc = "$nonNucAsm/assembled_chromosomes/chr2acc"; - my $agpSource = "$nonNucAsm/assembled_chromosomes/AGP"; if ( -s $nonNucChr2acc ) { + my $agpSource = "$nonNucAsm/assembled_chromosomes/AGP"; my $agpOutput = "$runDir/$asmId.nonNucChr.agp.gz"; my $agpNames = "$runDir/$asmId.nonNucChr.names"; my $fastaOut = "$runDir/$asmId.nonNucChr.fa.gz"; $partsDone += 1; if (needsUpdate($nonNucChr2acc, $agpOutput)) { compositeAgp($nonNucChr2acc, $agpSource, $agpOutput, $agpNames); `touch -r $nonNucChr2acc $agpOutput`; } if (needsUpdate($twoBitFile, $fastaOut)) { compositeFasta($nonNucChr2acc, $twoBitFile, $fastaOut); `touch -r $twoBitFile $fastaOut`; } } ########### non-nuclear scaffold unlocalized sequence ################ my $nonNucChr2scaf = "$nonNucAsm/unlocalized_scaffolds/unlocalized.chr2scaf"; - my $agpSource = "$nonNucAsm/unlocalized_scaffolds/AGP"; if ( -s $nonNucChr2scaf ) { + my $agpSource = "$nonNucAsm/unlocalized_scaffolds/AGP"; my $agpOutput = "$runDir/$asmId.nonNucUnlocalized.agp.gz"; my $agpNames = "$runDir/$asmId.nonNucUnlocalized.names"; my $fastaOut = "$runDir/$asmId.nonNucUnlocalized.fa.gz"; $partsDone += 1; if (needsUpdate($nonNucChr2scaf, $agpOutput)) { unlocalizedAgp($nonNucChr2scaf, $agpSource, $agpOutput, $agpNames); `touch -r $nonNucChr2scaf $agpOutput`; } if (needsUpdate($twoBitFile, $fastaOut)) { unlocalizedFasta($nonNucChr2scaf, $twoBitFile, $fastaOut); `touch -r $twoBitFile $fastaOut`; } } $bossScript->add(<<_EOF_ @@ -1627,31 +1627,32 @@ $dupList = " | grep -v -f \"${buildDir}/download/${asmId}.remove.dups.list\" "; } $bossScript->add(<<_EOF_ export asmId=$asmId export gffFile=$gffFile function cleanUp() { rm -f \$asmId.ncbiGene.genePred.gz \$asmId.ncbiGene.genePred rm -f \$asmId.geneAttrs.ncbi.txt } if [ \$gffFile -nt \$asmId.ncbiGene.bb ]; then (gff3ToGenePred -warnAndContinue -useName \\ -attrsOut=\$asmId.geneAttrs.ncbi.txt \$gffFile stdout \\ - 2>> \$asmId.ncbiGene.log.txt || true) | genePredFilter stdin stdout \\ + 2>> \$asmId.ncbiGene.log.txt || true) | genePredFilter \\ + -chromSizes=../../\$asmId.chrom.sizes stdin stdout \\ $dupList | gzip -c > \$asmId.ncbiGene.genePred.gz genePredCheck \$asmId.ncbiGene.genePred.gz export howMany=`genePredCheck \$asmId.ncbiGene.genePred.gz 2>&1 | grep "^checked" | awk '{print \$2}'` if [ "\${howMany}" -eq 0 ]; then printf "# ncbiGene: no gene definitions found in \$gffFile\n"; cleanUp exit 0 fi export ncbiGenePred="\$asmId.ncbiGene.genePred.gz" _EOF_ ); if ($ucscNames) { $bossScript->add(<<_EOF_ liftUp -extGenePred -type=.gp stdout \\ ../../sequence/\$asmId.ncbiToUcsc.lift warn \\ @@ -1913,34 +1914,34 @@ "$HgAutomate::clusterData/asmHubs/refseqBuild/$ftpDir"; $sourceDir = $opt_sourceDir ? $opt_sourceDir : $sourceDir; $assemblySource = $opt_sourceDir ? "$opt_sourceDir" : "$sourceDir/$ftpDir"; my $asmReport = "$assemblySource/${asmId}_assembly_report.txt"; $species = $opt_species ? $opt_species : $species; if (length($species) < 1) { if (-s "$asmReport") { $species = `grep -i "organism name:" $asmReport`; chomp $species; $species =~ s/.*organism\s+name:\s+//i; $species =~ s/\s+\(.*//; } else { - die "no -species specified and can not find $asmReport"; + die "ERROR: no -species specified and can not find $asmReport"; } if (length($species) < 1) { - die "no -species specified and can not find Organism name: in $asmReport"; + die "ERROR: no -species specified and can not find Organism name: in $asmReport"; } } $rmskSpecies = $opt_rmskSpecies ? $opt_rmskSpecies : $species; $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies; $xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq; $ucscNames = $opt_ucscNames ? 1 : $ucscNames; # '1' == 'TRUE' $noAugustus = $opt_noAugustus ? 1 : $noAugustus; # '1' == 'TRUE' $noXenoRefSeq = $opt_noXenoRefSeq ? 1 : $noXenoRefSeq; # '1' == 'TRUE' $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse; $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub; $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub; $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer; $asmHubName = $opt_asmHubName ? $opt_asmHubName : $asmHubName; $ncbiRmsk = $opt_ncbiRmsk ? 1 : 0;