06d156f363b5dc4c4aa59b90084724822ee7781a hiram Mon Jan 6 15:22:09 2020 -0800 correctly handle duplicates and faster refs #23891 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index c3764fb..b39de80 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -60,30 +60,33 @@ # Option defaults: my $dbHost = 'hgwdev'; my $sourceDir = "/hive/data/outside/ncbi/genomes"; my $augustusSpecies = "human"; my $xenoRefSeq = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq"; my $ucscNames = 0; # default 'FALSE' (== 0) my $asmHubName = "n/a"; # directory name in: /gbdb/hubs/asmHubName my $workhorse = "hgwdev"; # default workhorse when none chosen my $fileServer = "hgwdev"; # default when none chosen my $bigClusterHub = "ku"; # default when none chosen my $smallClusterHub = "ku"; # default when none chosen my $base = $0; $base =~ s/^(.*\/)?//; +# key is original accession name from the remove.dups.list, value is 1 +my %dupAccessionList = {}; + sub usage { # Usage / help / self-documentation: my ($status, $detailed) = @_; # Basic help (for incorrect usage): print STDERR " usage: $base [options] genbank|refseq subGroup species asmId required arguments: genbank|refseq - specify either genbank or refseq hierarchy source subGroup - specify subGroup at NCBI FTP site, examples: - vertebrate_mammalian vertebrate_other plant etc... species - species directory at NCBI FTP site, examples: - Homo_sapiens Mus_musculus etc... asmId - assembly identifier at NCBI FTP site, examples: - GCF_000001405.32_GRCh38.p6 GCF_000001635.24_GRCm38.p4 etc.. @@ -180,30 +183,45 @@ @HgAutomate::commonOptionSpec, ); &usage(1) if (!$ok); &usage(0, 1) if ($opt_help); &HgAutomate::processCommonOptions(); my $err = $stepper->processOptions(); usage(1) if ($err); $dbHost = $opt_dbHost if ($opt_dbHost); } ######################################################################### ######################################################################### # assistant subroutines here. The 'do' steps follow this section ######################################################################### ######################################################################### +# if there are duplicates in original sequence, read in that list for use here +sub readDupsList() { + my $downloadDir = "$buildDir/download"; + + if ( -s "${downloadDir}/${asmId}.remove.dups.list" ) { + open (FH, "<${downloadDir}/${asmId}.remove.dups.list") or die "can not read ${downloadDir}/${asmId}.remove.dups.list"; + while (my $accession = <FH>) { + chomp $accession; + $dupAccessionList{$accession} = 1; + } + close (FH); + } +} + +######################################################################### # return true if result does not exist or it is older than source # else return false sub needsUpdate($$) { my ($source, $result) = @_; if (-s $result) { if (stat($source)->mtime > stat($result)->mtime ) { return 1; } else { return 0; } } return 1; } ######################################################################### @@ -254,32 +272,34 @@ ######################################################################### # process NCBI fasta sequence, use UCSC chrom names if requested sub compositeFasta($$$) { my ($chr2acc, $twoBitFile, $fastaOut) = @_; my %accToChr; readChr2Acc($chr2acc, \%accToChr); if (! $ucscNames) { foreach my $acc (keys %accToChr) { $accToChr{$acc} = $acc; } } my ($fh, $tmpFile) = tempfile("seqList_XXXX", DIR=>'/dev/shm', SUFFIX=>'.txt', UNLINK=>1); foreach my $acc (sort keys %accToChr) { + if (! exists($dupAccessionList{$acc})) { printf $fh "%s\n", $acc; } + } close $fh; open (FA, "|gzip -c > $fastaOut") or die "can not write to $fastaOut"; open (FH, "twoBitToFa -seqList=$tmpFile $twoBitFile stdout|") or die "can not twoBitToFa $twoBitFile"; while (my $line = <FH>) { if ($line =~ m/^>/) { chomp $line; $line =~ s/^>//; $line =~ s/ .*//; die "ERROR: twoBitToFa $twoBitFile returns unknown acc $line" if (! exists($accToChr{$line})); if (! $ucscNames) { printf FA ">%s\n", $accToChr{$line}; } else { if ( $accToChr{$line} eq "MT" ) { printf FA ">chrM\n"; @@ -314,30 +334,31 @@ open (AGP, "|gzip -c >$agpOutput") or die "can not write to $agpOutput"; open (NAMES, "|sort -u >$agpNames") or die "can not write to $agpNames"; my %chrNDone; # key is chrom name, value is 1 when done foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; next if (exists($chrNDone{$chrN})); $chrNDone{$chrN} = 1; my $agpFile = "$agpSource/chr$chrN.unlocalized.scaf.agp.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; while (my $line = <FH>) { if ($line !~ m/^#/) { chomp $line; my (@a) = split('\t', $line); my $ncbiAcc = $a[0]; + next if (exists($dupAccessionList{$ncbiAcc})); my $acc = $ncbiAcc; $acc =~ s/\./v/ if ($ucscNames); die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); my $ucscName = "${acc}"; $ucscName = "chr${chrN}_${acc}_random" if ($ucscNames); printf NAMES "%s\t%s\n", $ucscName, $ncbiAcc; printf AGP "%s", $ucscName; # begin AGP line with accession name for (my $i = 1; $i < scalar(@a); ++$i) { # the rest of the AGP line printf AGP "\t%s", $a[$i]; } printf AGP "\n"; } } close (FH); @@ -349,32 +370,34 @@ ######################################################################### # process unlocalized NCBI fasta sequence, use UCSC chrom names if requested sub unlocalizedFasta($$$) { my ($chr2acc, $twoBitFile, $fastaOut) = @_; my %accToChr; readChr2Acc($chr2acc, \%accToChr); if (! $ucscNames) { foreach my $acc (keys %accToChr) { $accToChr{$acc} = $acc; } } my ($fh, $tmpFile) = tempfile("seqList_XXXX", DIR=>'/dev/shm', SUFFIX=>'.txt', UNLINK=>1); foreach my $acc (sort keys %accToChr) { + if (! exists($dupAccessionList{$acc})) { printf $fh "%s\n", $acc; } + } close $fh; open (FA, "|gzip -c > $fastaOut") or die "can not write to $fastaOut"; open (FH, "twoBitToFa -seqList=$tmpFile $twoBitFile stdout|") or die "can not twoBitToFa $twoBitFile"; while (my $line = <FH>) { if ($line =~ m/^>/) { chomp $line; $line =~ s/^>//; $line =~ s/ .*//; die "ERROR: twoBitToFa $twoBitFile returns unknown acc $line" if (! exists($accToChr{$line})); my $chrN = $accToChr{$line}; my $acc = $line; $acc =~ s/\./v/ if ($ucscNames); my $ucscName = "${acc}"; $ucscName = "chr${chrN}_${acc}_random" if ($ucscNames); @@ -412,30 +435,31 @@ } close (AP); } # sub readAltPlacement($$) ######################################################################### ### process one of the alternate AGP files, changing names via the nameHash ### and writing to the given fileHandle (fh) sub processAltAgp($$$) { my ($agpFile, $nameHash, $fh) = @_; open (AG, "zcat $agpFile|") or die "can not read $agpFile"; while (my $line = <AG>) { next if ($line =~ m/^#/); chomp $line; my (@a) = split('\t', $line); my $ncbiAcc = $a[0]; + next if (exists($dupAccessionList{$ncbiAcc})); my $ucscName = $nameHash->{$ncbiAcc}; printf $fh "%s", $ucscName; # begin AGP line with accession nam for (my $i = 1; $i < scalar(@a); ++$i) { # the reset of the AGP line printf $fh "\t%s", $a[$i]; } printf $fh "\n"; } close (AG); } # sub processAltAgp($$$) ######################################################################### ### process one of the alternate FASTA files, changing names via the nameHash ### and writing to the given fileHandle (fh) sub processAltFasta($$$) { @@ -455,31 +479,30 @@ } close (FF); } # sub processAltFasta($$$) ######################################################################### # there are alternate sequences, process their multiple AGP and FASTA files # into single AGP and FASTA files sub altSequences($) { my $runDir = "$buildDir/sequence"; my ($assemblyStructure) = @_; # construct the name correspondence hash ################################# my %accToChr; # hash for accession name to browser chromosome name open (FH, "find -L '${assemblyStructure}' -type f | grep '/alt_scaffolds/alt_scaffold_placement.txt'|") or die "can not find alt_scaffolds in ${assemblyStructure}";; while (my $line = <FH>) { chomp $line; -### printf "%s\n", $line; readAltPlacement($line, \%accToChr); } close (FH); # and write to alt.names open (AN, ">$runDir/$asmId.alt.names"); foreach my $acc (sort keys %accToChr) { printf AN "%s\t%s\n", $accToChr{$acc}, $acc; } close (AN); # process the AGP files, writing the agpOutput file ###################### my $agpOutput = "$runDir/$asmId.alt.agp.gz"; open (AGP, "|gzip -c > $agpOutput") or die "can not write to $agpOutput"; open (FH, "find -L '${assemblyStructure}' -type f | grep '/alt_scaffolds/AGP/alt.scaf.agp.gz'|") or die "can not find alt.scaf.agp.gz in ${assemblyStructure}";; while (my $agpFile = <FH>) { chomp $agpFile; @@ -503,32 +526,33 @@ ######################################################################### # process NCBI unplaced AGP file, perhaps translate into UCSC naming scheme # the agpNames result file is a naming correspondence file for later use # optional chrPrefix can be "chrUn_" for assemblies that have other chr names sub unplacedAgp($$$$) { my ($agpFile, $agpOutput, $agpNames, $chrPrefix) = @_; open (AGP, "|gzip -c >$agpOutput") or die "can not write to $agpOutput"; open (NAMES, "|sort -u >$agpNames") or die "can not write to $agpNames"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; while (my $line = <FH>) { if ($line =~ m/^#/) { print AGP $line; } else { - if ($ucscNames) { my ($ncbiAcc, undef) = split('\s+', $line, 2); + next if (exists($dupAccessionList{$ncbiAcc})); + if ($ucscNames) { my $ucscAcc = $ncbiAcc; $ucscAcc =~ s/\./v/; printf NAMES "%s%s\t%s\n", $chrPrefix, $ucscAcc, $ncbiAcc; $line =~ s/\./v/; } printf AGP "%s%s", $chrPrefix, $line; } } close (FH); close (NAMES); close (AGP); } # sub unplacedAgp($$$$) ######################################################################### # process NCBI unplaced FASTA file, perhaps translate into UCSC naming scheme @@ -538,32 +562,34 @@ my %contigName; # key is NCBI contig name, value is UCSC contig name open (FH, "zcat $agpFile|") or die "can not read $agpFile"; while (my $line = <FH>) { if ($line !~ m/^#/) { my ($ncbiAcc, undef) = split('\s+', $line, 2); my $ucscAcc = $ncbiAcc; $ucscAcc =~ s/\./v/ if ($ucscNames); $contigName{$ncbiAcc} = $ucscAcc; } } close (FH); my ($fh, $tmpFile) = tempfile("seqList_XXXX", DIR=>'/dev/shm', SUFFIX=>'.txt', UNLINK=>1); foreach my $ctg (sort keys %contigName) { + if (! exists($dupAccessionList{$ctg})) { printf $fh "%s\n", $ctg; } + } close $fh; open (FA, "|gzip -c > $fastaOut") or die "can not write to $fastaOut"; open (FH, "twoBitToFa -seqList=$tmpFile $twoBitFile stdout|") or die "can not twoBitToFa $twoBitFile"; while (my $line = <FH>) { if ($line =~ m/^>/) { chomp $line; $line =~ s/^>//; $line =~ s/ .*//; die "ERROR: twoBitToFa $twoBitFile returns unknown acc $line" if (! exists($contigName{$line})); printf FA ">%s%s\n", $chrPrefix, $contigName{$line}; } else { print FA $line; } } @@ -576,66 +602,68 @@ # do Steps section ######################################################################### ######################################################################### # * step: download [workhorse] sub doDownload { my $runDir = "$buildDir/download"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "setup work hierarchy of sym links to source files in\n\t$runDir/"; my $bossScript = newBash HgRemoteScript("$runDir/doDownload.bash", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ export asmId=$asmId -if [ ! -L \${asmId}_genomic.fna.gz -o \${asmId}_genomic.fna.gz -nt \$asmId.2bit ]; then +if [ ! -s \${asmId}.2bit -o \${asmId}_genomic.fna.gz -nt \$asmId.2bit ]; then rm -f \${asmId}_genomic.fna.gz \\ + \${asmId}_genomic.fna.dups.gz \\ \${asmId}_assembly_report.txt \\ \${asmId}_rm.out.gz \\ \${asmId}_assembly_structure \\ \$asmId.2bit ln -s $assemblySource/\${asmId}_genomic.fna.gz . ln -s $assemblySource/\${asmId}_assembly_report.txt . ln -s $assemblySource/\${asmId}_rm.out.gz . if [ -d $assemblySource/\${asmId}_assembly_structure ]; then ln -s $assemblySource/\${asmId}_assembly_structure . fi faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit - twoBitDup -keyList=stdout \$asmId.2bit > \$asmId.dupCheck.txt - (grep "are identical" \$asmId.dupCheck.txt || true) > \$asmId.dups.txt + twoBitDup \$asmId.2bit > \$asmId.dups.txt if [ -s "\$asmId.dups.txt" ]; then printf "WARNING duplicate sequences found in \$asmId.2bit\\n" 1>&2 - grep "are identical" \$asmId.dupCheck.txt 1>&2 - awk '{print \$3}' \$asmId.dups.txt > \$asmId.remove.dups.list + cat \$asmId.dups.txt 1>&2 + awk '{print \$1}' \$asmId.dups.txt > \$asmId.remove.dups.list mv \${asmId}_genomic.fna.gz \${asmId}_genomic.fna.dups.gz faSomeRecords -exclude \${asmId}_genomic.fna.dups.gz \\ \$asmId.remove.dups.list stdout | gzip -c > \${asmId}_genomic.fna.gz rm -f \$asmId.2bit faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit fi - rm -f \$asmId.dups.txt - gzip -f \$asmId.dupCheck.txt + gzip -f \$asmId.dups.txt touch -r \${asmId}_genomic.fna.gz \$asmId.2bit else printf "# download step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); + + readDupsList(); + } # doDownload ######################################################################### # * step: sequence [workhorse] sub doSequence { my $runDir = "$buildDir/sequence"; &HgAutomate::mustMkdir($runDir); my $whatItDoes = "process source files into 2bit sequence and agp"; my $bossScript = newBash HgRemoteScript("$runDir/doSequence.bash", $workhorse, $runDir, $whatItDoes); my $twoBitFile = "$buildDir/download/$asmId.2bit"; my $otherChrParts = 0; # to see if this is unplaced scaffolds only @@ -763,45 +791,43 @@ printf STDERR "creating fake AGP\n"; $bossScript->add(<<_EOF_ twoBitToFa ../download/\$asmId.2bit stdout | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" | gzip -c > \$asmId.fa.gz hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names _EOF_ ); } else { printf STDERR "partsDone: %d\n", $partsDone; } $bossScript->add(<<_EOF_ zcat *.agp.gz | gzip > ../\$asmId.agp.gz faToTwoBit *.fa.gz ../\$asmId.2bit faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit -twoBitDup -keyList=stdout ../\$asmId.unmasked.2bit > \$asmId.dupCheck.txt -(grep "are identical" \$asmId.dupCheck.txt || true) > \$asmId.dups.txt +twoBitDup ../\$asmId.unmasked.2bit > \$asmId.dups.txt if [ -s "\$asmId.dups.txt" ]; then printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\\n" 1>&2 - grep "are identical" \$asmId.dupCheck.txt 1>&2 - awk '{print \$3}' \$asmId.dups.txt > \$asmId.remove.dups.list - mv \$asmId.unmasked.2bit \$asmId.unmasked.dups.2bit - twoBitToFa \$asmId.unmasked.dups.2bit stdout | faSomeRecords -exclude \\ + cat \$asmId.dups.txt 1>&2 + awk '{print \$1}' \$asmId.dups.txt > \$asmId.remove.dups.list + mv ../\$asmId.unmasked.2bit ../\$asmId.unmasked.dups.2bit + twoBitToFa ../\$asmId.unmasked.dups.2bit stdout | faSomeRecords -exclude \\ stdin \$asmId.remove.dups.list stdout | gzip -c > \$asmId.noDups.fasta.gz rm -f ../\$asmId.2bit ../\$asmId.unmasked.2bit faToTwoBit \$asmId.noDups.fasta.gz ../\$asmId.2bit faToTwoBit -noMask \$asmId.noDups.fasta.gz ../\$asmId.unmasked.2bit fi -rm -f \$asmId.dups.txt -gzip -f \$asmId.dupCheck.txt +gzip -f \$asmId.dups.txt touch -r ../download/\$asmId.2bit ../\$asmId.2bit touch -r ../download/\$asmId.2bit ../\$asmId.unmasked.2bit touch -r ../download/\$asmId.2bit ../\$asmId.agp.gz twoBitInfo ../\$asmId.2bit stdout | sort -k2nr > ../\$asmId.chrom.sizes touch -r ../\$asmId.2bit ../\$asmId.chrom.sizes # verify everything is there twoBitInfo ../download/\$asmId.2bit stdout | sort -k2nr > source.\$asmId.chrom.sizes export newTotal=`ave -col=2 ../\$asmId.chrom.sizes | grep "^total"` export oldTotal=`ave -col=2 source.\$asmId.chrom.sizes | grep "^total"` if [ "\$newTotal" != "\$oldTotal" ]; then printf "# ERROR: sequence construction error: not same totals source vs. new:\\n" 1>&2 printf "# \$newTotal != \$oldTotal\\n" 1>&2 exit 255 fi rm source.\$asmId.chrom.sizes