9a693302d1b139fb9e045ac1032345ccbef293a3 hiram Mon Nov 30 15:48:45 2020 -0800 correct order of names file for fakeAgp refs #24396 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index 14a9717..c0042d1 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -195,30 +195,32 @@ my ( $asmId); # Other: my ($buildDir, $secondsStart, $secondsEnd, $assemblySource); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'buildDir=s', 'sourceDir=s', 'rmskSpecies=s', 'noRmsk', 'ncbiRmsk', 'augustusSpecies=s', 'xenoRefSeq=s', 'asmHubName=s', + 'noXenoRefSeq', + 'noAugustus', 'ucscNames', @HgAutomate::commonOptionSpec, ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if ($opt_dbHost); } ######################################################################### ######################################################################### # assistant subroutines here. The 'do' steps follow this section ######################################################################### @@ -837,31 +839,31 @@ printf STDERR "creating fake AGP\n"; if ($ucscNames) { $bossScript->add(<<_EOF_ twoBitToFa ../download/\$asmId.2bit stdout | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" | gzip -c > \$asmId.fa.gz hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names _EOF_ ); } else { $bossScript->add(<<_EOF_ twoBitToFa ../download/\$asmId.2bit stdout | gzip -c > \$asmId.fa.gz hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz twoBitInfo ../download/\$asmId.2bit stdout | cut -f1 \\ | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" \\ | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\$\\)/.\\1/;' \\ - | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names + | awk '{printf "%s\\t%s\\n", \$1, \$2}' | sort > \$asmId.fake.names _EOF_ ); } } else { printf STDERR "partsDone: %d\n", $partsDone; } $bossScript->add(<<_EOF_ zcat *.agp.gz | gzip > ../\$asmId.agp.gz faToTwoBit *.fa.gz ../\$asmId.2bit faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit twoBitDup ../\$asmId.unmasked.2bit > \$asmId.dups.txt if [ -s "\$asmId.dups.txt" ]; then printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\\n" 1>&2 cat \$asmId.dups.txt 1>&2 @@ -1599,44 +1601,49 @@ &HgAutomate::verbose(1, "# ncbiGene step previously completed\n"); return; } } if (! -s "$buildDir/$asmId.faSize.txt") { &HgAutomate::verbose(1, "# step ncbiGene: can not find faSize.txt at:\n# $buildDir/$asmId.faSize.txt\n"); exit 255; } &HgAutomate::mustMkdir($runDir); my $whatItDoes = "translate NCBI GFF3 gene definitions into a track"; my $bossScript = newBash HgRemoteScript("$runDir/doNcbiGene.bash", $workhorse, $runDir, $whatItDoes); + my $dupList = ""; + if ( -s "${buildDir}/download/${asmId}.remove.dups.list" ) { + $dupList = " | grep -v -f \"${buildDir}/download/${asmId}.remove.dups.list\" "; + } + $bossScript->add(<<_EOF_ export asmId=$asmId export gffFile=$gffFile function cleanUp() { rm -f \$asmId.ncbiGene.genePred.gz \$asmId.ncbiGene.genePred rm -f \$asmId.geneAttrs.ncbi.txt } if [ \$gffFile -nt \$asmId.ncbiGene.bb ]; then (gff3ToGenePred -warnAndContinue -useName \\ -attrsOut=\$asmId.geneAttrs.ncbi.txt \$gffFile stdout \\ 2>> \$asmId.ncbiGene.log.txt || true) | genePredFilter stdin stdout \\ - | gzip -c > \$asmId.ncbiGene.genePred.gz + $dupList | gzip -c > \$asmId.ncbiGene.genePred.gz genePredCheck \$asmId.ncbiGene.genePred.gz export howMany=`genePredCheck \$asmId.ncbiGene.genePred.gz 2>&1 | grep "^checked" | awk '{print \$2}'` if [ "\${howMany}" -eq 0 ]; then printf "# ncbiGene: no gene definitions found in \$gffFile\n"; cleanUp exit 0 fi export ncbiGenePred="\$asmId.ncbiGene.genePred.gz" _EOF_ ); if ($ucscNames) { $bossScript->add(<<_EOF_ liftUp -extGenePred -type=.gp stdout \\ ../../sequence/\$asmId.ncbiToUcsc.lift warn \\ \$asmId.ncbiGene.genePred.gz | gzip -c \\