c2d8f10c6d0ce5ebb68e8a76c5d9a2e94ab3dcd8 hiram Tue Dec 28 15:51:34 2021 -0800 correct ncbiGene procedure and correctly run up unplaced contig assembly when no AGP present no redmine diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index 74da44a..cb93b70 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -585,72 +585,100 @@ $line =~ s/\./v/; printf AGP "%s%s", $chrPrefix, $line; } else { printf AGP "%s", $line; } } } close (FH); close (NAMES); close (AGP); } # sub unplacedAgp($$$$) ######################################################################### # process NCBI unplaced FASTA file, perhaps translate into UCSC naming scheme # optional chrPrefix can be "chrUn_" for assemblies that have other chr names -sub unplacedFasta($$$$) { - my ($agpFile, $twoBitFile, $chrPrefix, $fastaOut) = @_; +sub unplacedFasta($$$$$$$) { + my ($agpFile, $faFile, $twoBitFile, $chrPrefix, $fastaOut, $agpOutput, $agpNames) = @_; my %contigName; # key is NCBI contig name, value is UCSC contig name + if ( -s $agpFile ) { open (FH, "zcat $agpFile|") or die "can not read $agpFile"; while (my $line = <FH>) { if ($line !~ m/^#/) { my ($ncbiAcc, undef) = split('\s+', $line, 2); my $ucscAcc = $ncbiAcc; $ucscAcc =~ s/\./v/ if ($ucscNames); $contigName{$ncbiAcc} = $ucscAcc; } } close (FH); + } else { # no AGP file, get the contig names from the fasta file + open (FH, "zgrep '^>' $faFile | cut -d' ' -f1 | tr -d '>'|") or die "can not read $faFile"; + while (my $ncbiAcc = <FH>) { + chomp $ncbiAcc; + my $ucscAcc = $ncbiAcc; + $ucscAcc =~ s/\./v/ if ($ucscNames); + $contigName{$ncbiAcc} = $ucscAcc; + } + close (FH); + } my ($fh, $tmpFile) = tempfile("seqList_XXXX", DIR=>'/dev/shm', SUFFIX=>'.txt', UNLINK=>1); foreach my $ctg (sort keys %contigName) { if (! exists($dupAccessionList{$ctg})) { printf $fh "%s\n", $ctg; } } close $fh; open (FA, "|gzip -c > $fastaOut") or die "can not write to $fastaOut"; open (FH, "twoBitToFa -seqList=$tmpFile $twoBitFile stdout|") or die "can not twoBitToFa $twoBitFile"; while (my $line = <FH>) { if ($line =~ m/^>/) { chomp $line; $line =~ s/^>//; $line =~ s/ .*//; die "ERROR: twoBitToFa $twoBitFile returns unknown acc $line" if (! exists($contigName{$line})); if ($ucscNames) { printf FA ">%s%s\n", $chrPrefix, $contigName{$line}; } else { printf FA ">%s\n", $contigName{$line}; } } else { print FA $line; } } close(FH); close(FA); + if ( ! -s $agpFile ) { # there was no AGP file, make a fake one + printf STDERR "# no AGP for unplaced sequence, making up a fake AGP\n"; + print `hgFakeAgp -singleContigs -minContigGap=1 -minScaffoldGap=50000 $fastaOut stdout | gzip -c > $agpOutput`; + open (NAMES, "|sort -u >$agpNames") or die "can not write to $agpNames"; + open (FH, "zcat $agpOutput|") or die "can not read $agpOutput"; + while (my $line = <FH>) { + if ($line !~ m/^#/) { + my ($ncbiAcc, undef) = split('\s+', $line, 2); + next if (exists($dupAccessionList{$ncbiAcc})); + my $ucscAcc = $ncbiAcc; + $ucscAcc =~ s/\./v/; + printf NAMES "%s%s\t%s\n", $chrPrefix, $ucscAcc, $ncbiAcc; + } + } + close (FH); + close (NAMES); + } } # sub unplacedFasta($$$$) ######################################################################### ######################################################################### # do Steps section ######################################################################### ######################################################################### # * step: download [workhorse] sub doDownload { my $runDir = "$buildDir/download"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "setup work hierarchy of sym links to source files in\n\t$runDir/"; my $bossScript = newBash HgRemoteScript("$runDir/doDownload.bash", $workhorse, $runDir, $whatItDoes); @@ -758,47 +786,52 @@ } } ########### alternate sequences ############## my $assemblyStructure = "$buildDir/download/${asmId}_assembly_structure"; my $altCount = `find -L "${assemblyStructure}" -type f | grep "/alt_scaffolds/alt_scaffold_placement.txt" | wc -l`; chomp $altCount; if ($altCount > 0) { $partsDone += 1; ++$otherChrParts; altSequences($assemblyStructure); } ########### unplaced sequence ################ my $unplacedScafAgp = "$primaryAssembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; - if ( -s $unplacedScafAgp ) { + my $unplacedScafFa = "$primaryAssembly/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz"; + if ( -s $unplacedScafAgp || -s $unplacedScafFa) { my $chrPrefix = ""; # no prefix if no other chrom parts $chrPrefix = "chrUn_" if ($otherChrParts && ! $ucscNames); my $agpOutput = "$runDir/$asmId.unplaced.agp.gz"; my $agpNames = "$runDir/$asmId.unplaced.names"; $partsDone += 1; + if ( -s $unplacedScafAgp ) { if (needsUpdate($unplacedScafAgp, $agpOutput)) { unplacedAgp($unplacedScafAgp, $agpOutput, $agpNames, $chrPrefix); `touch -r $unplacedScafAgp $agpOutput`; } + } + if ( -s $unplacedScafFa ) { # will make an AGP file if there was none my $fastaOut = "$runDir/$asmId.unplaced.fa.gz"; if (needsUpdate($twoBitFile, $fastaOut)) { - unplacedFasta($unplacedScafAgp, $twoBitFile, $chrPrefix, $fastaOut); + unplacedFasta($unplacedScafAgp, $unplacedScafFa, $twoBitFile, $chrPrefix, $fastaOut, $agpOutput, $agpNames); `touch -r $twoBitFile $fastaOut`; } } + } ########### non-nuclear chromosome sequence ################ my $nonNucAsm = "$buildDir/download/${asmId}_assembly_structure/non-nuclear"; my $nonNucChr2acc = "$nonNucAsm/assembled_chromosomes/chr2acc"; if ( -s $nonNucChr2acc ) { my $agpSource = "$nonNucAsm/assembled_chromosomes/AGP"; my $agpOutput = "$runDir/$asmId.nonNucChr.agp.gz"; my $agpNames = "$runDir/$asmId.nonNucChr.names"; my $fastaOut = "$runDir/$asmId.nonNucChr.fa.gz"; $partsDone += 1; if (needsUpdate($nonNucChr2acc, $agpOutput)) { compositeAgp($nonNucChr2acc, $agpSource, $agpOutput, $agpNames); `touch -r $nonNucChr2acc $agpOutput`; } if (needsUpdate($twoBitFile, $fastaOut)) { @@ -827,42 +860,42 @@ $bossScript->add(<<_EOF_ export asmId="$asmId" if [ -s ../\$asmId.chrom.sizes ]; then printf "sequence step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); ### construct sequence when no AGP files exist if (0 == $partsDone) { -printf STDERR "creating fake AGP\n"; + printf STDERR "# partsDone is zero, creating fake AGP\n"; if ($ucscNames) { $bossScript->add(<<_EOF_ twoBitToFa ../download/\$asmId.2bit stdout | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" | gzip -c > \$asmId.fa.gz -hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz +hgFakeAgp -singleContigs -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names _EOF_ ); } else { $bossScript->add(<<_EOF_ twoBitToFa ../download/\$asmId.2bit stdout | gzip -c > \$asmId.fa.gz -hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz +hgFakeAgp -singleContigs -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz twoBitInfo ../download/\$asmId.2bit stdout | cut -f1 \\ | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" \\ | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\$\\)/.\\1/;' \\ | awk '{printf "%s\\t%s\\n", \$1, \$2}' | sort > \$asmId.fake.names _EOF_ ); } } else { printf STDERR "partsDone: %d\n", $partsDone; } $bossScript->add(<<_EOF_ zcat *.agp.gz | gzip > ../\$asmId.agp.gz faToTwoBit *.fa.gz ../\$asmId.2bit faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit @@ -1639,31 +1672,31 @@ } } if (! -s "$buildDir/$asmId.faSize.txt") { &HgAutomate::verbose(1, "# step ncbiGene: can not find faSize.txt at:\n# $buildDir/$asmId.faSize.txt\n"); exit 255; } &HgAutomate::mustMkdir($runDir); my $whatItDoes = "translate NCBI GFF3 gene definitions into a track"; my $bossScript = newBash HgRemoteScript("$runDir/doNcbiGene.bash", $workhorse, $runDir, $whatItDoes); my $dupList = ""; if ( -s "${buildDir}/download/${asmId}.remove.dups.list" ) { - $dupList = " | grep -v -f \"${buildDir}/download/${asmId}.remove.dups.list\" || true"; + $dupList = " | (grep -v -f \"${buildDir}/download/${asmId}.remove.dups.list\" || true)"; } $bossScript->add(<<_EOF_ export asmId=$asmId export gffFile=$gffFile function cleanUp() { rm -f \$asmId.ncbiGene.genePred.gz \$asmId.ncbiGene.genePred rm -f \$asmId.geneAttrs.ncbi.txt } if [ \$gffFile -nt \$asmId.ncbiGene.bb ]; then (gff3ToGenePred -warnAndContinue -useName \\ -attrsOut=\$asmId.geneAttrs.ncbi.txt \$gffFile stdout \\ 2>> \$asmId.ncbiGene.log.txt || true) | genePredFilter \\