653eaa635409806f4885b0486125efd4390c0931 hiram Thu Aug 27 14:00:07 2020 -0700 fix scripts to behave properly when not using UCSC sequence names refs #23734 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index bc49a4b..222bfa4 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -843,44 +843,52 @@ # verify everything is there twoBitInfo ../download/\$asmId.2bit stdout | sort -k2nr > source.\$asmId.chrom.sizes export newTotal=`ave -col=2 ../\$asmId.chrom.sizes | grep "^total"` export oldTotal=`ave -col=2 source.\$asmId.chrom.sizes | grep "^total"` if [ "\$newTotal" != "\$oldTotal" ]; then printf "# ERROR: sequence construction error: not same totals source vs. new:\\n" 1>&2 printf "# \$newTotal != \$oldTotal\\n" 1>&2 exit 255 fi rm source.\$asmId.chrom.sizes export checkAgp=`checkAgpAndFa ../\$asmId.agp.gz ../\$asmId.2bit 2>&1 | tail -1` if [ "\$checkAgp" != "All AGP and FASTA entries agree - both files are valid" ]; then printf "# ERROR: checkAgpAndFa \$asmId.agp.gz \$asmId.2bit failing\\n" 1>&2 exit 255 fi +_EOF_ + ); + if ($ucscNames) { + $bossScript->add(<<_EOF_ join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$3,\$2,\$1,\$2}' > \$asmId.ncbiToUcsc.lift join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$1,\$2,\$3,\$2}' > \$asmId.ucscToNcbi.lift export c0=`cat \$asmId.ncbiToUcsc.lift | wc -l` export c1=`cat \$asmId.ucscToNcbi.lift | wc -l` export c2=`cat ../\$asmId.chrom.sizes | wc -l` # verify all names are accounted for if [ "\$c0" -ne "\$c2" ]; then printf "# ERROR: not all names accounted for in \$asmId.ncbiToUcsc.lift" 1>&2 exit 255 fi if [ "\$c1" -ne "\$c2" ]; then printf "# ERROR: not all names accounted for in \$asmId.ucscToNcbi.lift" 1>&2 exit 255 fi +_EOF_ + ); + } + $bossScript->add(<<_EOF_ twoBitToFa ../\$asmId.2bit stdout | faCount stdin | gzip -c > \$asmId.faCount.txt.gz touch -r ../\$asmId.2bit \$asmId.faCount.txt.gz zgrep -P "^total\t" \$asmId.faCount.txt.gz > \$asmId.faCount.signature.txt touch -r ../\$asmId.2bit \$asmId.faCount.signature.txt _EOF_ ); $bossScript->execute(); } # doSequence ######################################################################### # * step: assemblyGap [workhorse] sub doAssemblyGap { my $runDir = "$buildDir/trackData/assemblyGap"; &HgAutomate::mustMkdir($runDir); @@ -1474,34 +1482,36 @@ exit 0 fi _EOF_ ); $bossScript->execute(); } # sub doCpgIslands ######################################################################### # * step: ncbiGene [workhorse] sub doNcbiGene { my $gffFile = "$assemblySource/${asmId}_genomic.gff.gz"; if ( ! -s "${gffFile}" ) { &HgAutomate::verbose(1, "# step ncbiGene: no gff file found at:\n# $gffFile\n"); return; } + if ($ucscNames) { if ( ! -s "$buildDir/sequence/$asmId.ncbiToUcsc.lift" ) { &HgAutomate::verbose(1, "# ERROR: ncbiGene: can not find $buildDir/sequence/$asmId.ncbiToUcsc.lift\n"); exit 255; } + } my $runDir = "$buildDir/trackData/ncbiGene"; if (-d "${runDir}" ) { if (! -s "$runDir/$asmId.ncbiGene.bb") { &HgAutomate::verbose(1, "WARNING ncbiGene step may already be running, but not completed ?\n"); return; } elsif (! needsUpdate("$gffFile", "$runDir/$asmId.ncbiGene.bb")) { &HgAutomate::verbose(1, "# ncbiGene step previously completed\n"); return; } } if (! -s "$buildDir/$asmId.faSize.txt") { &HgAutomate::verbose(1, "# step ncbiGene: can not find faSize.txt at:\n# $buildDir/$asmId.faSize.txt\n"); exit 255; } @@ -1521,45 +1531,55 @@ rm -f \$asmId.geneAttrs.ncbi.txt } if [ \$gffFile -nt \$asmId.ncbiGene.bb ]; then (gff3ToGenePred -warnAndContinue -useName \\ -attrsOut=\$asmId.geneAttrs.ncbi.txt \$gffFile stdout \\ 2>> \$asmId.ncbiGene.log.txt || true) | genePredFilter stdin stdout \\ | gzip -c > \$asmId.ncbiGene.genePred.gz genePredCheck \$asmId.ncbiGene.genePred.gz export howMany=`genePredCheck \$asmId.ncbiGene.genePred.gz 2>&1 | grep "^checked" | awk '{print \$2}'` if [ "\${howMany}" -eq 0 ]; then printf "# ncbiGene: no gene definitions found in \$gffFile\n"; cleanUp exit 0 fi + export ncbiGenePred="\$asmId.ncbiGene.genePred.gz" +_EOF_ + ); + if ($ucscNames) { + $bossScript->add(<<_EOF_ liftUp -extGenePred -type=.gp stdout \\ ../../sequence/\$asmId.ncbiToUcsc.lift warn \\ \$asmId.ncbiGene.genePred.gz | gzip -c \\ > \$asmId.ncbiGene.ucsc.genePred.gz - genePredToBed -tab -fillSpace \$asmId.ncbiGene.ucsc.genePred.gz stdout \\ + ncbiGenePred="\$asmId.ncbiGene.ucsc.genePred.gz" +_EOF_ + ); + } + $bossScript->add(<<_EOF_ + genePredToBed -tab -fillSpace \$ncbiGenePred stdout \\ | bedToExons stdin stdout | bedSingleCover.pl stdin > \$asmId.exons.bed export baseCount=`awk '{sum+=\$3-\$2}END{printf "%d", sum}' \$asmId.exons.bed` export asmSizeNoGaps=`grep sequences ../../\$asmId.faSize.txt | awk '{print \$5}'` export perCent=`echo \$baseCount \$asmSizeNoGaps | awk '{printf "%.3f", 100.0*\$1/\$2}'` rm -f \$asmId.exons.bed - ~/kent/src/hg/utils/automation/gpToIx.pl \$asmId.ncbiGene.ucsc.genePred.gz \\ + ~/kent/src/hg/utils/automation/gpToIx.pl \$ncbiGenePred \\ | sort -u > \$asmId.ncbiGene.ix.txt ixIxx \$asmId.ncbiGene.ix.txt \$asmId.ncbiGene.ix \$asmId.ncbiGene.ixx rm -f \$asmId.ncbiGene.ix.txt - genePredToBigGenePred \$asmId.ncbiGene.ucsc.genePred.gz stdout \\ + genePredToBigGenePred \$ncbiGenePred stdout \\ | sort -k1,1 -k2,2n > \$asmId.ncbiGene.bed (bedToBigBed -type=bed12+8 -tab -as=\$HOME/kent/src/hg/lib/bigGenePred.as \\ -extraIndex=name \$asmId.ncbiGene.bed \\ ../../\$asmId.chrom.sizes \$asmId.ncbiGene.bb || true) if [ ! -s "\$asmId.ncbiGene.bb" ]; then printf "# ncbiGene: failing bedToBigBed\\n" 1>&2 exit 255 fi touch -r\$gffFile \$asmId.ncbiGene.bb bigBedInfo \$asmId.ncbiGene.bb | egrep "^itemCount:|^basesCovered:" \\ | sed -e 's/,//g' > \$asmId.ncbiGene.stats.txt LC_NUMERIC=en_US /usr/bin/printf "# ncbiGene %s %'d %s %'d\\n" `cat \$asmId.ncbiGene.stats.txt` | xargs echo printf "%d bases of %d (%s%%) in intersection\\n" "\$baseCount" "\$asmSizeNoGaps" "\$perCent" > fb.\$asmId.ncbiGene.txt else printf "# ncbiGene step previously completed\\n" 1>&2 @@ -1589,41 +1609,44 @@ } } if ($filesFound < $filesExpected) { printf STDERR "# step ncbiRefSeq does not have all files required\n"; return; } my $runDir = "$buildDir/trackData/ncbiRefSeq"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "run NCBI RefSeq gene procedures"; my $bossScript = newBash HgRemoteScript("$runDir/doNcbiRefSeq.bash", $workhorse, $runDir, $whatItDoes); + my $liftSpec = ""; + if ($ucscNames) { + $liftSpec="-liftFile=\"\$buildDir/sequence/\$asmId.ncbiToUcsc.lift\""; + } $bossScript->add(<<_EOF_ export asmId="$asmId" export buildDir="$buildDir" -export liftFile="\$buildDir/sequence/\$asmId.ncbiToUcsc.lift" +export liftSpec="$liftSpec" export target2bit="\$buildDir/\$asmId.2bit" if [ $buildDir/\$asmId.2bit -nt \$asmId.ncbiRefSeq.bb ]; then ~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -toGpWarnOnly -buildDir=`pwd` \\ - -bigClusterHub=$bigClusterHub -dbHost=$dbHost \\ - -liftFile="\$liftFile" \\ + -assemblyHub -bigClusterHub=$bigClusterHub -dbHost=$dbHost $liftSpec \\ -target2bit="\$target2bit" \\ -stop=load -fileServer=$fileServer -smallClusterHub=$smallClusterHub -workhorse=$workhorse \\ \$asmId \$asmId else printf "# ncbiRefSeq step previously completed\\n" 1>&2 fi _EOF_ ); $bossScript->execute(); } # ncbiRefSeq ######################################################################### # * step: augustus [workhorse] sub doAugustus { my $runDir = "$buildDir/trackData/augustus";