249f1c6c7231ede3da49dd949eaffb491e5d48cf hiram Fri Dec 23 08:39:41 2022 -0800 correct processing of alternatie and patch sequences no redmine diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index 1a77c0f..289b29e 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -457,85 +457,96 @@ } } close(FH); close(FA); } # sub unlocalizedFasta($$$) ######################################################################### # read alt_scaffold_placement file, return name correspondence in # given hash pointer sub readAltPlacement($$) { my ($altPlacementFile, $accToChr) = @_; open (AP, "<$altPlacementFile") or die "can not read $altPlacementFile"; while (my $line = ) { chomp $line; next if ($line =~ m/^#/); + my $fixAlt = "alt"; + $fixAlt = "fix" if ($altPlacementFile =~ m/PATCH/); my ($alt_asm_name, $prim_asm_name, $alt_scaf_name, $alt_scaf_acc, $parent_type, $parent_name, $parent_acc, $region_name, $ori, $alt_scaf_start, $alt_scaf_stop, $parent_start, $parent_stop, $alt_start_tail, $alt_stop_tail) = split('\t', $line); my $acc = $alt_scaf_acc; $alt_scaf_acc = $acc; my $ucscName = $acc; - if ($ucscNames) { + # should always be calculating UCSC names here, the resulting names + # can be used for either naming scheme later in AGP and FA processing $alt_scaf_acc =~ s/\./v/; - $ucscName = sprintf("chr%s_%s_alt", $parent_name, $alt_scaf_acc); + $ucscName = sprintf("chr%s_%s_%s", $parent_name, $alt_scaf_acc, $fixAlt); if ( $prim_asm_name ne "Primary Assembly" ) { - $ucscName = sprintf("%s_alt", $alt_scaf_acc); - } + $ucscName = sprintf("%s_%s", $alt_scaf_acc, $fixAlt); } $accToChr->{$acc} = $ucscName; printf STDERR "# warning: name longer than 31 characters: '%s'\n# in: '%s'\n", $ucscName, $altPlacementFile if (length($ucscName) > 31); } close (AP); } # sub readAltPlacement($$) ######################################################################### ### process one of the alternate AGP files, changing names via the nameHash ### and writing to the given fileHandle (fh) sub processAltAgp($$$) { my ($agpFile, $nameHash, $fh) = @_; open (AG, "zcat $agpFile|") or die "can not read $agpFile"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my (@a) = split('\t', $line); my $ncbiAcc = $a[0]; next if (exists($dupAccessionList{$ncbiAcc})); my $ucscName = $nameHash->{$ncbiAcc}; + if ($ucscNames) { printf $fh "%s", $ucscName; # begin AGP line with accession nam + } else { + printf $fh "%s", $ncbiAcc; # begin AGP line with accession nam + } for (my $i = 1; $i < scalar(@a); ++$i) { # the reset of the AGP line printf $fh "\t%s", $a[$i]; } printf $fh "\n"; } close (AG); } # sub processAltAgp($$$) ######################################################################### ### process one of the alternate FASTA files, changing names via the nameHash ### and writing to the given fileHandle (fh) sub processAltFasta($$$) { my ($faFile, $nameHash, $fh) = @_; open (FF, "zcat $faFile|") or die "can not read $faFile"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; - $line =~ s/^>//; - $line =~ s/ .*//; - die "ERROR: can not find accession $line in nameHash" if (! exists($nameHash->{$line})); - my $ucscName = $nameHash->{$line}; + my $ncbiAcc = $line; + $ncbiAcc =~ s/^>//; + $ncbiAcc =~ s/ .*//; + die "ERROR: can not find accession $ncbiAcc in nameHash" if (! exists($nameHash->{$ncbiAcc})); + my $ucscName = $nameHash->{$ncbiAcc}; + if ($ucscNames) { printf $fh ">%s\n", $ucscName; } else { + printf $fh ">%s\n", $ncbiAcc; + } + } else { print $fh $line; } } close (FF); } # sub processAltFasta($$$) ######################################################################### # there are alternate sequences, process their multiple AGP and FASTA files # into single AGP and FASTA files sub altSequences($) { my $runDir = "$buildDir/sequence"; my ($assemblyStructure) = @_; # construct the name correspondence hash ################################# my %accToChr; # hash for accession name to browser chromosome name open (FH, "find -L '${assemblyStructure}' -type f | grep '/alt_scaffolds/alt_scaffold_placement.txt'|") or die "can not find alt_scaffolds in ${assemblyStructure}";;