bf57fe2786a732c3e49ae6453aed45b8062b6ed6 hiram Thu Aug 1 14:57:57 2019 -0700 add twoBitDup check and construct liftOver files and add doNcbiGene refs #23734 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index 514dee5..5b64ed1 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -35,30 +35,31 @@ [ { name => 'download', func => \&doDownload }, { name => 'sequence', func => \&doSequence }, { name => 'assemblyGap', func => \&doAssemblyGap }, { name => 'gatewayPage', func => \&doGatewayPage }, { name => 'cytoBand', func => \&doCytoBand }, { name => 'gc5Base', func => \&doGc5Base }, { name => 'repeatMasker', func => \&doRepeatMasker }, { name => 'simpleRepeat', func => \&doSimpleRepeat }, { name => 'allGaps', func => \&doAllGaps }, { name => 'idKeys', func => \&doIdKeys }, { name => 'windowMasker', func => \&doWindowMasker }, { name => 'addMask', func => \&doAddMask }, { name => 'gapOverlap', func => \&doGapOverlap }, { name => 'tandemDups', func => \&doTandemDups }, { name => 'cpgIslands', func => \&doCpgIslands }, + { name => 'ncbiGene', func => \&doNcbiGene }, { name => 'augustus', func => \&doAugustus }, { name => 'trackDb', func => \&doTrackDb }, { name => 'cleanup', func => \&doCleanup }, ] ); # Option defaults: my $dbHost = 'hgwdev'; my $sourceDir = "/hive/data/outside/ncbi/genomes"; my $augustusSpecies = "human"; my $ucscNames = 0; # default 'FALSE' (== 0) my $workhorse = "hgwdev"; # default workhorse when none chosen my $fileServer = "hgwdev"; # default when none chosen my $bigClusterHub = "ku"; # default when none chosen my $smallClusterHub = "ku"; # default when none chosen @@ -277,38 +278,41 @@ } # sub compositeFasta($$$) ######################################################################### # process NCBI unlocalized AGP file, perhaps translate into UCSC naming scheme # the agpNames result file is a naming correspondence file for later use sub unlocalizedAgp($$$$) { my ($chr2acc, $agpSource, $agpOutput, $agpNames) = @_; my %accToChr; readChr2Acc($chr2acc, \%accToChr); if ($ucscNames) { foreach my $acc (keys %accToChr) { my $ucscAcc = $acc; $ucscAcc =~ s/\./v/; $accToChr{$ucscAcc} = $accToChr{$acc}; - $accToChr{$acc} = undef; + delete $accToChr{$acc}; } } open (AGP, "|gzip -c >$agpOutput") or die "can not write to $agpOutput"; open (NAMES, "|sort -u >$agpNames") or die "can not write to $agpNames"; + my %chrNDone; # key is chrom name, value is 1 when done foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; + next if (exists($chrNDone{$chrN})); + $chrNDone{$chrN} = 1; my $agpFile = "$agpSource/chr$chrN.unlocalized.scaf.agp.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; while (my $line = <FH>) { if ($line !~ m/^#/) { chomp $line; my (@a) = split('\t', $line); my $ncbiAcc = $a[0]; my $acc = $ncbiAcc; $acc =~ s/\./v/ if ($ucscNames); die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); my $ucscName = "${acc}"; $ucscName = "chr${chrN}_${acc}_random" if ($ucscNames); printf NAMES "%s\t%s\n", $ucscName, $ncbiAcc; printf AGP "%s", $ucscName; # begin AGP line with accession name @@ -584,50 +588,61 @@ ### XXX TO BE DONE - construct sequence when no AGP files exist $bossScript->add(<<_EOF_ export asmId="$asmId" if [ -s ../\$asmId.chrom.sizes ]; then printf "sequence step previously completed\\n" 1>&2 exit 0 fi zcat *.agp.gz | gzip > ../\$asmId.agp.gz faToTwoBit *.fa.gz ../\$asmId.2bit faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit +twoBitDup -keyList=stdout ../\$asmId.unmasked.2bit > \$asmId.dupCheck.txt +(grep "are identical" \$asmId.dupCheck.txt || true) > \$asmId.dups.txt +if [ -s "\$asmId.dups.txt" ]; then + printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\n" 1>&2 + grep "are identical" \$asmId.dupCheck.txt 1>&2 + exit 255 +else + rm -f \$asmId.dups.txt +fi touch -r ../download/\$asmId.2bit ../\$asmId.2bit touch -r ../download/\$asmId.2bit ../\$asmId.unmasked.2bit touch -r ../download/\$asmId.2bit ../\$asmId.agp.gz twoBitInfo ../\$asmId.2bit stdout | sort -k2nr > ../\$asmId.chrom.sizes touch -r ../\$asmId.2bit ../\$asmId.chrom.sizes # verify everything is there twoBitInfo ../download/\$asmId.2bit stdout | sort -k2nr > source.\$asmId.chrom.sizes export newTotal=`ave -col=2 ../\$asmId.chrom.sizes | grep "^total"` export oldTotal=`ave -col=2 source.\$asmId.chrom.sizes | grep "^total"` if [ "\$newTotal" != "\$oldTotal" ]; then printf "# ERROR: sequence construction error: not same totals source vs. new:\n" 1>&2 printf "# \$newTotal != \$oldTotal\n" 1>&2 exit 255 fi rm source.\$asmId.chrom.sizes export checkAgp=`checkAgpAndFa ../\$asmId.agp.gz ../\$asmId.2bit 2>&1 | tail -1` if [ "\$checkAgp" != "All AGP and FASTA entries agree - both files are valid" ]; then printf "# ERROR: checkAgpAndFa \$asmId.agp.gz \$asmId.2bit failing\n" 1>&2 exit 255 fi +join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$3,\$2,\$1,\$2}' > \$asmId.ncbiToUcsc.lift +join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$1,\$2,\$3,\$2}' > \$asmId.ucscToNcbi.lift twoBitToFa ../\$asmId.2bit stdout | faCount stdin | gzip -c > \$asmId.faCount.txt.gz touch -r ../\$asmId.2bit \$asmId.faCount.txt.gz zgrep -P "^total\t" \$asmId.faCount.txt.gz > \$asmId.faCount.signature.txt touch -r ../\$asmId.2bit \$asmId.faCount.signature.txt _EOF_ ); $bossScript->execute(); } # doSequence ######################################################################### # * step: assemblyGap [workhorse] sub doAssemblyGap { my $runDir = "$buildDir/trackData/assemblyGap"; &HgAutomate::mustMkdir($runDir); @@ -793,31 +808,33 @@ $bossScript->add(<<_EOF_ export asmId=$asmId if [ $buildDir/\$asmId.2bit -nt faSize.rmsk.txt ]; then export species=`echo $species | sed -e 's/_/ /g;'` doRepeatMasker.pl -stop=mask -buildDir=`pwd` -unmaskedSeq=$buildDir/\$asmId.2bit \\ -bigClusterHub=$bigClusterHub -workhorse=$workhorse -species="\$species" \$asmId gzip \$asmId.sorted.fa.out \$asmId.fa.out \$asmId.nestedRepeats.bed doRepeatMasker.pl -continue=cleanup -buildDir=`pwd` -unmaskedSeq=$buildDir/\$asmId.2bit \\ -bigClusterHub=$bigClusterHub -workhorse=$workhorse -species="\$species" \$asmId \$HOME/kent/src/hg/utils/automation/asmHubRepeatMasker.sh \$asmId `pwd`/\$asmId.sorted.fa.out.gz `pwd` - +else + printf "# repeatMasker step previously completed\\n" 1>&2 + exit 0 fi _EOF_ ); $bossScript->execute(); } # repeatMasker ######################################################################### # * step: simpleRepeat [workhorse] sub doSimpleRepeat { my $runDir = "$buildDir/trackData/simpleRepeat"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct TRF/simpleRepeat track data"; my $bossScript = newBash HgRemoteScript("$runDir/doSimpleRepeat.bash", $workhorse, $runDir, $whatItDoes); @@ -1150,57 +1167,128 @@ doCpgIslands.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` \\ -dbHost=$dbHost \\ -smallClusterHub=$smallClusterHub -bigClusterHub=$bigClusterHub -workhorse=$workhorse \\ -maskedSeq=$buildDir/trackData/addMask/\$asmId.masked.2bit \\ -chromSizes=$buildDir/\$asmId.chrom.sizes \$asmId else printf "# cpgIslands masked previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); } # sub doCpgIslands ######################################################################### +# * step: ncbiGene [workhorse] +sub doNcbiGene { + my $gffFile = "$sourceDir/${asmId}_genomic.gff.gz"; + if ( ! -s "${gffFile}" ) { + printf STDERR "# step ncbiGene: no gff file found at:\n# %s\n", $gffFile; + return; + } + if ( ! -s "$buildDir/sequence/$asmId.ncbiToUcsc.lift" ) { + printf STDERR "# ERROR: ncbiGene: can not find ../../sequence/$asmId.ncbiToUcsc.lift\n"; + exit 255; + } + my $runDir = "$buildDir/trackData/ncbiGene"; + + &HgAutomate::mustMkdir($runDir); + + my $whatItDoes = "translate NCBI GFF3 gene definitions into a track"; + my $bossScript = newBash HgRemoteScript("$runDir/doNcbiGene.bash", + $workhorse, $runDir, $whatItDoes); + + $bossScript->add(<<_EOF_ +export asmId=$asmId +export gffFile=$gffFile + +function cleanUp() { + rm -f \$asmId.ncbiGene.genePred.gz \$asmId.ncbiGene.genePred + rm -f \$asmId.geneAttrs.ncbi.txt +} + +if [ \$gffFile -nt \$asmId.ncbiGene.bb ]; then + (gff3ToGenePred -warnAndContinue -useName \\ + -attrsOut=\$asmId.geneAttrs.ncbi.txt \$gffFile stdout \\ + 2>> \$asmId.ncbiGene.log.txt || true) | genePredFilter stdin stdout \\ + | gzip -c > \$asmId.ncbiGene.genePred.gz + genePredCheck \$asmId.ncbiGene.genePred.gz + export howMany=`genePredCheck \$asmId.ncbiGene.genePred.gz 2>&1 | grep "^checked" | awk '{print \$2}'` + if [ "\${howMany}" -eq 0 ]; then + printf "# ncbiGene: no gene definitions found in \$gffFile\n"; + cleanUp + exit 0 + fi + liftUp -extGenePred -type=.gp stdout \\ + ../../sequence/\$asmId.ncbiToUcsc.lift error \\ + \$asmId.ncbiGene.genePred.gz | gzip -c \\ + > \$asmId.ncbiGene.ucsc.genePred.gz + ~/kent/src/hg/utils/automation/gpToIx.pl \$asmId.ncbiGene.ucsc.genePred.gz \\ + | sort -u > \$asmId.ncbiGene.ix.txt + ixIxx \$asmId.ncbiGene.ix.txt \$asmId.ncbiGene.ix \$asmId.ncbiGene.ixx + rm -f \$asmId.ncbiGene.ix.txt + genePredToBigGenePred \$asmId.ncbiGene.ucsc.genePred.gz stdout \\ + | sort -k1,1 -k2,2n > \$asmId.ncbiGene.bed + (bedToBigBed -type=bed12+8 -tab -as=\$HOME/kent/src/hg/lib/bigGenePred.as \\ + -extraIndex=name \$asmId.ncbiGene.bed \\ + ../../\$asmId.chrom.sizes \$asmId.ncbiGene.bb || true) + if [ ! -s "\$asmId.ncbiGene.bb" ]; then + printf "# ncbiGene: failing bedToBigBed\\n" 1>&2 + exit 255 + fi + touch -r\$gffFile \$asmId.ncbiGene.bb + bigBedInfo \$asmId.ncbiGene.bb | egrep "^itemCount:|^basesCovered:" \\ + | sed -e 's/,//g' > \$asmId.ncbiGene.stats.txt + LC_NUMERIC=en_US /usr/bin/printf "# ncbiGene %s %'d %s %'d\\n" `cat \$asmId.ncbiGene.stats.txt` | xargs echo +else + printf "# ncbiGene previously completed\\n" 1>&2 +fi +_EOF_ + ); + $bossScript->execute(); +} # doNcbiGene + + +######################################################################### # * step: augustus [workhorse] sub doAugustus { my $runDir = "$buildDir/trackData/augustus"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "run Augustus gene prediction procedures"; my $bossScript = newBash HgRemoteScript("$runDir/doAugustus.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId if [ $buildDir/\$asmId.2bit -nt \$asmId.augustus.bb ]; then time (~/kent/src/hg/utils/automation/doAugustus.pl -stop=makeGp -buildDir=`pwd` -dbHost=$dbHost \\ -bigClusterHub=$bigClusterHub -species=$augustusSpecies -workhorse=$workhorse \\ -noDbGenePredCheck -maskedSeq=$buildDir/\$asmId.2bit \$asmId) > makeDb.log 2>&1 time (~/kent/src/hg/utils/automation/doAugustus.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` -dbHost=$dbHost \\ -bigClusterHub=$bigClusterHub -species=$augustusSpecies -workhorse=$workhorse \\ -noDbGenePredCheck -maskedSeq=$buildDir/\$asmId.2bit \$asmId) > cleanup.log 2>&1 else printf "# augustus genes previously completed\\n" 1>&2 fi _EOF_ ); $bossScript->execute(); -} # windowMasker +} # doAugustus ######################################################################### # * step: trackDb [workhorse] sub doTrackDb { my $runDir = "$buildDir"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "construct asmId.trackDb.txt file"; my $bossScript = newBash HgRemoteScript("$runDir/doTrackDb.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId \$HOME/kent/src/hg/utils/automation/asmHubTrackDb.sh $genbankRefseq \$asmId $runDir \\