dbdb4c9fb976266d03316b80d53c54a65784f1b9 hiram Thu Dec 12 13:06:05 2019 -0800 can verify the name lists have the correct number refs #23891 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index 5c33ed6..7ca3a40 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -395,74 +395,74 @@ my ($alt_asm_name, $prim_asm_name, $alt_scaf_name, $alt_scaf_acc, $parent_type, $parent_name, $parent_acc, $region_name, $ori, $alt_scaf_start, $alt_scaf_stop, $parent_start, $parent_stop, $alt_start_tail, $alt_stop_tail) = split('\t', $line); my $acc = $alt_scaf_acc; $alt_scaf_acc = $acc; my $ucscName = $acc; if ($ucscNames) { $alt_scaf_acc =~ s/\./v/; $ucscName = sprintf("chr%s_%s_alt", $parent_name, $alt_scaf_acc); if ( $prim_asm_name ne "Primary Assembly" ) { $ucscName = sprintf("%s_alt", $alt_scaf_acc); } } $accToChr->{$acc} = $ucscName; printf STDERR "# warning: name longer than 31 characters: '%s'\n# in: '%s'\n", $ucscName, $altPlacementFile if (length($ucscName) > 31); } close (AP); -} +} # sub readAltPlacement($$) ######################################################################### ### process one of the alternate AGP files, changing names via the nameHash ### and writing to the given fileHandle (fh) sub processAltAgp($$$) { my ($agpFile, $nameHash, $fh) = @_; open (AG, "zcat $agpFile|") or die "can not read $agpFile"; while (my $line = <AG>) { next if ($line =~ m/^#/); chomp $line; my (@a) = split('\t', $line); my $ncbiAcc = $a[0]; my $ucscName = $nameHash->{$ncbiAcc}; printf $fh "%s", $ucscName; # begin AGP line with accession nam for (my $i = 1; $i < scalar(@a); ++$i) { # the reset of the AGP line printf $fh "\t%s", $a[$i]; } printf $fh "\n"; } close (AG); -} +} # sub processAltAgp($$$) ######################################################################### ### process one of the alternate FASTA files, changing names via the nameHash ### and writing to the given fileHandle (fh) sub processAltFasta($$$) { my ($faFile, $nameHash, $fh) = @_; open (FF, "zcat $faFile|") or die "can not read $faFile"; while (my $line = <FF>) { if ($line =~ m/^>/) { chomp $line; $line =~ s/^>//; $line =~ s/ .*//; die "ERROR: can not find accession $line in nameHash" if (! exists($nameHash->{$line})); my $ucscName = $nameHash->{$line}; printf $fh ">%s\n", $ucscName; } else { print $fh $line; } } close (FF); -} +} # sub processAltFasta($$$) ######################################################################### # there are alternate sequences, process their multiple AGP and FASTA files # into single AGP and FASTA files sub altSequences($) { my $runDir = "$buildDir/sequence"; my ($assemblyStructure) = @_; # construct the name correspondence hash ################################# my %accToChr; # hash for accession name to browser chromosome name open (FH, "find -L '${assemblyStructure}' -type f | grep '/alt_scaffolds/alt_scaffold_placement.txt'|") or die "can not find alt_scaffolds in ${assemblyStructure}";; while (my $line = <FH>) { chomp $line; ### printf "%s\n", $line; readAltPlacement($line, \%accToChr); } @@ -483,31 +483,31 @@ printf STDERR "%s\n", $agpFile; } close (FH); close (AGP); # process the FASTA files, writing the fastaOut file ###################### my $fastaOut = "$runDir/$asmId.alt.fa.gz"; open (FA, "|gzip -c > $fastaOut") or die "can not write to $fastaOut"; open (FH, "find -L '${assemblyStructure}' -type f | grep '/alt_scaffolds/FASTA/alt.scaf.fna.gz'|") or die "can not find alt.scaf.fna.gz in ${assemblyStructure}";; while (my $faFile = <FH>) { chomp $faFile; processAltFasta($faFile, \%accToChr, \*FA); printf STDERR "%s\n", $faFile; } close (FH); close (FA); -} +} # sub altSequences($) ######################################################################### # process NCBI unplaced AGP file, perhaps translate into UCSC naming scheme # the agpNames result file is a naming correspondence file for later use # optional chrPrefix can be "chrUn_" for assemblies that have other chr names sub unplacedAgp($$$$) { my ($agpFile, $agpOutput, $agpNames, $chrPrefix) = @_; open (AGP, "|gzip -c >$agpOutput") or die "can not write to $agpOutput"; open (NAMES, "|sort -u >$agpNames") or die "can not write to $agpNames"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; while (my $line = <FH>) { if ($line =~ m/^#/) { print AGP $line; } else { @@ -776,31 +776,42 @@ export newTotal=`ave -col=2 ../\$asmId.chrom.sizes | grep "^total"` export oldTotal=`ave -col=2 source.\$asmId.chrom.sizes | grep "^total"` if [ "\$newTotal" != "\$oldTotal" ]; then printf "# ERROR: sequence construction error: not same totals source vs. new:\n" 1>&2 printf "# \$newTotal != \$oldTotal\n" 1>&2 exit 255 fi rm source.\$asmId.chrom.sizes export checkAgp=`checkAgpAndFa ../\$asmId.agp.gz ../\$asmId.2bit 2>&1 | tail -1` if [ "\$checkAgp" != "All AGP and FASTA entries agree - both files are valid" ]; then printf "# ERROR: checkAgpAndFa \$asmId.agp.gz \$asmId.2bit failing\n" 1>&2 exit 255 fi join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$3,\$2,\$1,\$2}' > \$asmId.ncbiToUcsc.lift join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$1,\$2,\$3,\$2}' > \$asmId.ucscToNcbi.lift - +export c0=`cat \$asmId.ncbiToUcsc.lift | wc -l` +export c1=`cat \$asmId.ucscToNcbi.lift | wc -l` +export c2=`cat ../\$asmId.chrom.sizes | wc -l` +# verify all names are accounted for +if [ "\$c0" -ne "\$c2 ]; then + printf "# ERROR: not all names accounted for in \$asmId.ncbiToUcsc.lift" 1>&2 + exit 255 +fi +if [ "\$c1" -ne "\$c2 ]; then + printf "# ERROR: not all names accounted for in \$asmId.ucscToNcbi.lift" 1>&2 + exit 255 +fi twoBitToFa ../\$asmId.2bit stdout | faCount stdin | gzip -c > \$asmId.faCount.txt.gz touch -r ../\$asmId.2bit \$asmId.faCount.txt.gz zgrep -P "^total\t" \$asmId.faCount.txt.gz > \$asmId.faCount.signature.txt touch -r ../\$asmId.2bit \$asmId.faCount.signature.txt _EOF_ ); $bossScript->execute(); } # doSequence ######################################################################### # * step: assemblyGap [workhorse] sub doAssemblyGap { my $runDir = "$buildDir/trackData/assemblyGap"; &HgAutomate::mustMkdir($runDir);