5308f161d60d310ffe5a80e7755e835ff7ea7a9f hiram Tue Sep 13 12:47:32 2022 -0700 OK to allow duplicate names no redmine diff --git src/hg/utils/automation/asmHubChromAlias.pl src/hg/utils/automation/asmHubChromAlias.pl index 0f5318d..6fbdfba 100755 --- src/hg/utils/automation/asmHubChromAlias.pl +++ src/hg/utils/automation/asmHubChromAlias.pl @@ -96,32 +96,36 @@ foreach my $seqName (sort keys %chromIndex) { printf "%s\t%s\n", $seqName, $chromIndex{$seqName}; } } # given an alias and a sequence name, add to result or verify identical # to previous add sub addAlias($$$) { my ($source, $alias, $sequence) = @_; if ($alias eq "na") { return; } if ($sequence eq "na") { return; } + # it is OK to allow duplicate names, different naming authorities could + # have the same name, found for example in GCF_006542625.1_Asia_NLE_v1 + # which has UCSC names identical to 'assembly' names and the hub has been + # built with UCSC names # do not need to add the sequence name itself - return if ($alias eq $sequence); + # return if ($alias eq $sequence); if (!defined($aliasOut{$source})) { my %h; # hash: key: alias name, value 'native' chrom name $aliasOut{$source} = \%h; # printf STDERR "# creating aliasOut{'%s'}\n", $source; } my $hashPtr = $aliasOut{$source}; # already done, verify it is equivalent to previous request if (defined($hashPtr->{$alias})) { if ($sequence ne $hashPtr->{$alias}) { printf STDERR "ERROR: additional alias '%s:%s' does not match previous '%s'\n", $alias, $sequence, $hashPtr->{$alias}; exit 255; } return; } $hashPtr->{$alias} = $sequence; @@ -296,42 +300,48 @@ chomp $line; ++$dbgCount; my ($asmName, $gbkName, $refSeqName) = split('\t', $line); $asmName =~ s/ /_/g; # some assemblies have spaces in chr names ... 
$asmName =~ s/:/_/g; # one assembly had : in chr name if (defined($dupToSequence{$asmName})) { # avoid duplicates printf STDERR "# skipping duplicate name $asmName\n"; next; } elsif (defined($dupToSequence{$gbkName})) { # avoid duplicates printf STDERR "# skipping duplicate name $gbkName\n"; next; } elsif (defined($dupToSequence{$refSeqName})) { # avoid duplicates printf STDERR "# skipping duplicate name $refSeqName\n"; next; } - printf STDERR "# asmRpt: '%s'\t'%s'\t'%s'\n", $asmName, $gbkName, $refSeqName if ($dbgCount < 5); # next if ($refSeqName eq "na"); # may not be any RefSeq name # next if ($gbkName eq "na"); # may not be any GenBank name # fill in ncbiToUcsc for potentially the 'other' NCBI name if (defined($ncbiToUcsc{$refSeqName}) && !defined($ncbiToUcsc{$gbkName})) { $ncbiToUcsc{$gbkName} = $ncbiToUcsc{$refSeqName}; $ucscToNcbi{$ncbiToUcsc{$refSeqName}} = $gbkName; } if (defined($ncbiToUcsc{$gbkName}) && !defined($ncbiToUcsc{$refSeqName})) { $ncbiToUcsc{$refSeqName} = $ncbiToUcsc{$gbkName}; $ucscToNcbi{$ncbiToUcsc{$gbkName}} = $refSeqName; } + if (defined($ncbiToUcsc{$gbkName})) { + printf STDERR "# asmRpt: '%s'\t'%s'\t'%s'\t'%s'\n", $asmName, $gbkName, $refSeqName, $ncbiToUcsc{$gbkName} if ($dbgCount < 5); + } elsif (defined($ncbiToUcsc{$refSeqName})) { + printf STDERR "# asmRpt: '%s'\t'%s'\t'%s'\t'%s'\n", $asmName, $gbkName, $refSeqName, $ncbiToUcsc{$refSeqName} if ($dbgCount < 5); + } else { + printf STDERR "# asmRpt: '%s'\t'%s'\t'%s'\tno UCSC name\n", $asmName, $gbkName, $refSeqName if ($dbgCount < 5); + } if ($refSeqName ne "na") { my $seqName = $refSeqName; if (! $isRefSeq) { $seqName = $gbkName; } if ($ucscNames) { $seqName = $ncbiToUcsc{$seqName}; } if (!defined($seqName)) { if (defined($aliasOut{"refseq"})) { if (defined($aliasOut{"refseq"}->{$refSeqName})) { $seqName = $aliasOut{"refseq"}->{$refSeqName}; } } }