06d7be056190c14b85e71bc12523f18ea6815b5e markd Mon Dec 7 00:50:29 2020 -0800 BLAT mmap index support merge with master diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index b021e16..c0042d1 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -195,30 +195,32 @@ my ( $asmId); # Other: my ($buildDir, $secondsStart, $secondsEnd, $assemblySource); sub checkOptions { # Make sure command line options are valid/supported. my $ok = GetOptions(@HgStepManager::optionSpec, 'buildDir=s', 'sourceDir=s', 'rmskSpecies=s', 'noRmsk', 'ncbiRmsk', 'augustusSpecies=s', 'xenoRefSeq=s', 'asmHubName=s', + 'noXenoRefSeq', + 'noAugustus', 'ucscNames', @HgAutomate::commonOptionSpec, ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if ($opt_dbHost); } ######################################################################### ######################################################################### # assistant subroutines here. The 'do' steps follow this section ######################################################################### @@ -837,31 +839,31 @@ printf STDERR "creating fake AGP\n"; if ($ucscNames) { $bossScript->add(<<_EOF_ twoBitToFa ../download/\$asmId.2bit stdout | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" | gzip -c > \$asmId.fa.gz hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names _EOF_ ); } else { $bossScript->add(<<_EOF_ twoBitToFa ../download/\$asmId.2bit stdout | gzip -c > \$asmId.fa.gz hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz twoBitInfo ../download/\$asmId.2bit stdout | cut -f1 \\ | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" \\ | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\$\\)/.\\1/;' \\ - | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names + | awk '{printf "%s\\t%s\\n", \$1, \$2}' | sort > \$asmId.fake.names _EOF_ ); } } else { printf STDERR "partsDone: %d\n", $partsDone; } $bossScript->add(<<_EOF_ zcat *.agp.gz | gzip > ../\$asmId.agp.gz faToTwoBit *.fa.gz ../\$asmId.2bit faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit twoBitDup ../\$asmId.unmasked.2bit > \$asmId.dups.txt if [ -s "\$asmId.dups.txt" ]; then printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\\n" 1>&2 cat \$asmId.dups.txt 1>&2 @@ -982,31 +984,31 @@ my $whatItDoes = "construct asmId.chromAlias.txt for alias name recognition"; my $bossScript = newBash HgRemoteScript("$runDir/doChromAlias.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export buildDir=$buildDir export asmId=$asmId \$HOME/kent/src/hg/utils/automation/asmHubChromAlias.pl \\ \${asmId} | sort > \${asmId}.chromAlias.txt # verify each sequence name has an alias export sizeCount=`cat ../../\${asmId}.chrom.sizes | wc -l` export aliasCount=`grep -v "^#" \${asmId}.chromAlias.txt | wc -l` if [ "\${sizeCount}" -ne "\${aliasCount}" ]; then - printf "ERROR: chromAlias: incorrect number of aliases %d != %d\\n" "\${sizeCount}" "\${aliasCount}" 1>&2 + printf "ERROR: chromAlias: incorrect number of aliases chromSizes %d > %d aliasCount\\n" "\${sizeCount}" "\${aliasCount}" 1>&2 exit 255 fi exit 0 _EOF_ ); $bossScript->execute(); } # chromAlias ######################################################################### # * step: gatewayPage [workhorse] sub doGatewayPage { if ($asmHubName eq "n/a") { printf STDERR "ERROR: step gatewayPage needs argument -asmHubName <name>\n"; @@ -1126,30 +1128,34 @@ "\nERROR: step repeatmasker may be running\n"); exit 255; } } &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct repeatMasker track data"; my $bossScript = newBash HgRemoteScript("$runDir/doRepeatMasker.bash", $workhorse, $runDir, $whatItDoes); my $rmskOpts = ""; if ($ncbiRmsk) { if ( -s "$buildDir/download/${asmId}_rm.out.gz" ) { $rmskOpts = " \\ -ncbiRmsk=\"$buildDir/download/${asmId}_rm.out.gz\" "; + if ( -s "${buildDir}/download/${asmId}.remove.dups.list" ) { + $rmskOpts .= " \\ + -dupList=\"${buildDir}/download/${asmId}.remove.dups.list\" "; + } if ($ucscNames) { $rmskOpts .= " \\ -liftSpec=\"$buildDir/sequence/$asmId.ncbiToUcsc.lift\""; } } } $bossScript->add(<<_EOF_ export asmId=$asmId if [ $buildDir/\$asmId.2bit -nt faSize.rmsk.txt ]; then export species=`echo $rmskSpecies | sed -e 's/_/ /g;'` rm -f versionInfo.txt @@ -1595,44 +1601,49 @@ &HgAutomate::verbose(1, "# ncbiGene step previously completed\n"); return; } } if (! -s "$buildDir/$asmId.faSize.txt") { &HgAutomate::verbose(1, "# step ncbiGene: can not find faSize.txt at:\n# $buildDir/$asmId.faSize.txt\n"); exit 255; } &HgAutomate::mustMkdir($runDir); my $whatItDoes = "translate NCBI GFF3 gene definitions into a track"; my $bossScript = newBash HgRemoteScript("$runDir/doNcbiGene.bash", $workhorse, $runDir, $whatItDoes); + my $dupList = ""; + if ( -s "${buildDir}/download/${asmId}.remove.dups.list" ) { + $dupList = " | grep -v -f \"${buildDir}/download/${asmId}.remove.dups.list\" "; + } + $bossScript->add(<<_EOF_ export asmId=$asmId export gffFile=$gffFile function cleanUp() { rm -f \$asmId.ncbiGene.genePred.gz \$asmId.ncbiGene.genePred rm -f \$asmId.geneAttrs.ncbi.txt } if [ \$gffFile -nt \$asmId.ncbiGene.bb ]; then (gff3ToGenePred -warnAndContinue -useName \\ -attrsOut=\$asmId.geneAttrs.ncbi.txt \$gffFile stdout \\ 2>> \$asmId.ncbiGene.log.txt || true) | genePredFilter stdin stdout \\ - | gzip -c > \$asmId.ncbiGene.genePred.gz + $dupList | gzip -c > \$asmId.ncbiGene.genePred.gz genePredCheck \$asmId.ncbiGene.genePred.gz export howMany=`genePredCheck \$asmId.ncbiGene.genePred.gz 2>&1 | grep "^checked" | awk '{print \$2}'` if [ "\${howMany}" -eq 0 ]; then printf "# ncbiGene: no gene definitions found in \$gffFile\n"; cleanUp exit 0 fi export ncbiGenePred="\$asmId.ncbiGene.genePred.gz" _EOF_ ); if ($ucscNames) { $bossScript->add(<<_EOF_ liftUp -extGenePred -type=.gp stdout \\ ../../sequence/\$asmId.ncbiToUcsc.lift warn \\ \$asmId.ncbiGene.genePred.gz | gzip -c \\