90555f803fa25829e2bf91e2faa913433999896f hiram Tue Apr 5 11:16:39 2022 -0700 new style chromAlias refs #29109 diff --git src/hg/utils/automation/asmHubChromAlias.pl src/hg/utils/automation/asmHubChromAlias.pl index 798f776..99c7af2 100755 --- src/hg/utils/automation/asmHubChromAlias.pl +++ src/hg/utils/automation/asmHubChromAlias.pl @@ -90,30 +90,36 @@ } else { $chromIndex{$sequence} = ""; } } } } foreach my $seqName (sort keys %chromIndex) { printf "%s\t%s\n", $seqName, $chromIndex{$seqName}; } } # given an alias and a sequence name, add to result or verify identical # to previous add sub addAlias($$$) { my ($source, $alias, $sequence) = @_; + if ($alias eq "na") { + return; + } + if ($sequence eq "na") { + return; + } # do not need to add the sequence name itself return if ($alias eq $sequence); if (!defined($aliasOut{$source})) { my %h; # hash: key: alias name, value 'native' chrom name $aliasOut{$source} = \%h; # printf STDERR "# creating aliasOut{'%s'}\n", $source; } my $hashPtr = $aliasOut{$source}; # already done, verify it is equivalent to previous request if (defined($hashPtr->{$alias})) { if ($sequence ne $hashPtr->{$alias}) { printf STDERR "ERROR: additional alias '%s:%s' does not match previous '%s'\n", $alias, $sequence, $hashPtr->{$alias}; exit 255; } return; @@ -153,82 +159,98 @@ while (my $line = <FH>) { chomp $line; my ($ucscName, $seqName) = split('\s+', $line); $ncbiToUcsc{$seqName} = $ucscName; $ucscToNcbi{$ucscName} = $seqName; ++$nameCount; $ucscNames = 1 if (defined($sequenceSizes{$ucscName})); if ($isRefSeq) { $ucscToRefSeq{$ucscName} = $seqName; } else { $ucscToGenbank{$ucscName} = $seqName; } } close (FH); -# when not a UCSC named assembly, add the UCSC names as aliases -if (! $ucscNames) { - if ($isRefSeq) { - foreach my $ucscName (sort keys %ucscToRefSeq) { - addAlias("ucsc", $ucscName, $ucscToRefSeq{$ucscName}); - } - } else { - foreach my $ucscName (sort keys %ucscToGenbank) { - addAlias("ucsc", $ucscName, $ucscToGenbank{$ucscName}); - } - } -} - my $dupsNotFound = 0; my $dupsList = "../../download/$asmId.dups.txt.gz"; if ( -s "$dupsList" ) { open (FH, "zcat $dupsList | awk '{print \$1, \$3}'|") or die "can not read $dupsList"; while (my $line = <FH>) { chomp $line; my ($dupAlias, $dupTarget) = split('\s+', $line); $dupToSequence{$dupAlias} = $dupTarget; if ($ucscNames) { if (!defined($ncbiToUcsc{$dupTarget})) { printf STDERR "# ERROR: can not find dupTarget: $dupTarget in ncbiToUcsc for dupAlias: $dupAlias\n"; $dupsNotFound += 1; } else { - addAlias("ucsc", $dupAlias, $ncbiToUcsc{$dupTarget}); + printf STDERR "# addAlias(\"ucsc\", $dupAlias, $ncbiToUcsc{$dupTarget});\n"; } } else { - addAlias($asmSource, $dupAlias, $dupTarget); + printf STDERR "# addAlias($asmSource, $dupAlias, $dupTarget);\n"; } printf STDERR "# adding duplicate alias %s\n", $dupAlias; ++$dupCount; } close (FH); } if ($dupsNotFound > 0) { printf STDERR "ERROR: can not find %d duplicate names\n", $dupsNotFound; exit 255; } +if ($dupCount > 0) { + printf STDERR "# processed %d duplicates\n", $dupCount; +} + +# when not a UCSC named assembly, add the UCSC names as aliases +if (! $ucscNames) { + if ($isRefSeq) { + foreach my $ucscName (sort keys %ucscToRefSeq) { + if (defined($dupToSequence{$ucscToRefSeq{$ucscName}})) { # avoid duplicates + printf STDERR "# skipping duplicate name $ucscToRefSeq{$ucscName}\n"; + } else { + addAlias("ucsc", $ucscName, $ucscToRefSeq{$ucscName}); + } + } + } else { + foreach my $ucscName (sort keys %ucscToGenbank) { + if (defined($dupToSequence{$ucscToGenbank{$ucscName}})) { # avoid duplicates + printf STDERR "# skipping duplicate name $ucscToGenbank{$ucscName}\n"; + } else { + addAlias("ucsc", $ucscName, $ucscToGenbank{$ucscName}); + } + } + } +} + if ($sequenceCount != $nameCount) { printf STDERR "ERROR: do not find the same name count in sequence vs. names files\n"; printf STDERR "sequenceCount %d != %d names count - %d duplicates\n", $sequenceCount, $nameCount, $dupCount; exit 255; } # printf STDERR "# initial set of %d UCSC names\n", $sequenceCount; ### first set of names is the UCSC to NCBI sequence names foreach my $sequence (sort keys %sequenceSizes) { my $seqName = $sequence; + if (defined($dupToSequence{$seqName})) { # avoid duplicates + printf STDERR "# skipping duplicate name $seqName\n"; + next; + } my $alias = $sequence; if ($ucscNames) { $alias = $ucscToNcbi{$seqName}; } else { $alias = $ncbiToUcsc{$seqName}; } if (!defined($alias)) { printf STDERR "ERROR: can not find alias name for '%s' sequence\n", $seqName; exit 255; } addAlias($ucscNames ? "ucsc" : $asmSource, $alias, $seqName); } # foreach my $sequence (sort keys %chrNames) { # printf "%s\t%s\n", $chrNames{$sequence}, $sequence; @@ -239,75 +261,97 @@ my %chr2acc; # key is sequence name, value is NCBI chromosome name my $asmStructCount = `ls ../../download/${asmId}_assembly_structure/*/*/chr2acc 2> /dev/null | wc -l`; chomp $asmStructCount; if ( $asmStructCount > 0 ) { # printf STDERR "# second name set processing chr2acc files\n"; open (FH, "grep -h -v '^#' ../../download/${asmId}_assembly_structure/*/*/chr2acc|") or die "can not grep chr2acc files"; while (my $line = <FH>) { chomp $line; my ($alias, $seqName) = split('\t', $line); $alias =~ s/ /_/g; # some assemblies have spaces in chr names ... $alias =~ s/:/_/g; # one assembly had : in a chr name $chr2acc{$seqName} = $alias; if ($ucscNames) { $seqName = $ncbiToUcsc{$seqName}; } + if (defined($dupToSequence{$seqName})) { # avoid duplicates + printf STDERR "# skipping duplicate name $seqName\n"; + } else { addAlias("ncbi", $alias, $seqName); } + } close (FH); } # foreach my $sequence (sort keys %chr2acc) { # printf "%s\t%s\n", $chr2acc{$sequence}, $sequence; # } my $dbgCount = 0; # printf STDERR "# third set processing assembly_report\n"; # column 1 is the 'assembly' name # column 5 is the GenBank-Accn, column 7 is the RefSeq-Accn open (FH, "grep -v '^#' ../../download/${asmId}_assembly_report.txt | cut -d\$'\t' -f1,5,7|") or die "can not grep assembly_report"; while (my $line = <FH>) { chomp $line; ++$dbgCount; my ($asmName, $gbkName, $refSeqName) = split('\s+', $line); + if (defined($dupToSequence{$asmName})) { # avoid duplicates + printf STDERR "# skipping duplicate name $asmName\n"; + next; + } elsif (defined($dupToSequence{$gbkName})) { # avoid duplicates + printf STDERR "# skipping duplicate name $gbkName\n"; + next; + } elsif (defined($dupToSequence{$refSeqName})) { # avoid duplicates + printf STDERR "# skipping duplicate name $refSeqName\n"; + next; + } printf STDERR "# '%s'\t'%s'\t'%s'\n", $asmName, $gbkName, $refSeqName if ($dbgCount < 5); # next if ($refSeqName eq "na"); # may not be any RefSeq name # next if ($gbkName eq "na"); # may not be any GenBank name if ($refSeqName ne "na") { my $seqName = $refSeqName; if (! $isRefSeq) { $seqName = $gbkName; } if ($ucscNames) { $seqName = $ncbiToUcsc{$seqName}; } if (!defined($seqName)) { if (defined($aliasOut{"refseq"})) { if (defined($aliasOut{"refseq"}->{$refSeqName})) { $seqName = $aliasOut{"refseq"}->{$refSeqName}; } } } if (defined($seqName)) { if (defined($dupToSequence{$seqName})) { + if (defined($dupToSequence{$seqName})) { # avoid duplicates + printf STDERR "# skipping duplicate name $dupToSequence{$seqName}\n"; + } else { addAlias("refseq", $refSeqName, $dupToSequence{$seqName}); addAlias("assembly", $asmName, $dupToSequence{$seqName}); + } + } else { + if (defined($dupToSequence{$seqName})) { # avoid duplicates + printf STDERR "# skipping duplicate name $seqName\n"; } else { addAlias("refseq", $refSeqName, $seqName); addAlias("assembly", $asmName, $seqName); } } + } } # if ($refSeqName ne "na") if ($gbkName ne "na") { my $seqName = $gbkName; if ($isRefSeq) { $seqName = $refSeqName; } if ($ucscNames) { $seqName = $ncbiToUcsc{$seqName}; } if (!defined($seqName)) { if (defined($aliasOut{"genbank"})) { if (defined($aliasOut{"genbank"}->{$gbkName})) { $seqName = $aliasOut{"genbank"}->{$gbkName}; }