e0232de768e8056705fd15b8bf908d99d3ea74e8 hiram Sat Sep 24 18:37:56 2022 -0700 do not fill in names for RefSeq or GenBank if either of those is not defined refs #29982 diff --git src/hg/utils/automation/asmHubChromAlias.pl src/hg/utils/automation/asmHubChromAlias.pl index 64fe006..524146a 100755 --- src/hg/utils/automation/asmHubChromAlias.pl +++ src/hg/utils/automation/asmHubChromAlias.pl @@ -19,30 +19,33 @@ my %ucscToRefSeq; # key is UCSC sequence name, value is RefSeq name my %ucscToGenbank; # key is UCSC sequence name, value is GenBank name my $ucscNames = 0; # == 1 if sequence is UCSC names, == 0 if NCBI names my $dupCount = 0; my %dupToSequence; # key is duplicate name, value is target sequence name # to manage duplicates coming in from assembly report my $sequenceCount = 0; my %sequenceSizes; # key is sequence name, value is sequence size my $asmId = shift; my $ncbiAsmId = shift; my %aliasOut; # key is source name, value is a hash pointer where # key is alias name and value is chrom name # typical source names: ucsc, refseq, genbank, ensembl, assembly, ncbi +my %aliasDone; # key is source name, value is hash pointer where + # key is chrom name value is alias, to check + # for duplicate incoming # the argument is the name of the 'native' source name # where 'native' are the names in the assembly itself, thus the # assembly names. The other sources are aliases to that. sub showAlias($) { my ($native) = @_; # second line is the column headers printf "# %s", $native; foreach my $source (sort keys %aliasOut) { printf "\t%s", $source if ($source ne $native); } printf "\n"; # need to account for all sequence names for all sources # create a list of sequence names for each source that does *not* have @@ -85,65 +88,70 @@ if (defined($noAlias{$source})) { $hashPtr = $noAlias{$source}; foreach my $sequence (sort keys %sequenceSizes) { next if (!defined($hashPtr->{$sequence})); if (defined($chromIndex{$sequence})) { $chromIndex{$sequence} .= "\t" . ""; } else { $chromIndex{$sequence} = ""; } } } } foreach my $seqName (sort keys %chromIndex) { printf "%s\t%s\n", $seqName, $chromIndex{$seqName}; } -} +} # sub showAlias($) # given an alias and a sequence name, add to result or verify identical # to previous add sub addAlias($$$) { my ($source, $alias, $sequence) = @_; if ($alias eq "na") { return; } if ($sequence eq "na") { return; } # it is OK to allow duplicate names, different naming authorities could # have the same name, found for example in GCF_006542625.1_Asia_NLE_v1 # which has UCSC names identical to 'assembly' names and the hub has been # build with UCSC names - # do not need to add the sequence name itself - # return if ($alias eq $sequence); if (!defined($aliasOut{$source})) { my %h; # hash: key: alias name, value 'native' chrom name $aliasOut{$source} = \%h; -# printf STDERR "# creating aliasOut{'%s'}\n", $source; + my %t; # hash: key: native sequence name, value alias + $aliasDone{$source} = \%t; } my $hashPtr = $aliasOut{$source}; + my $donePtr = $aliasDone{$source}; # already done, verify it is equivalent to previous request if (defined($hashPtr->{$alias})) { if ($sequence ne $hashPtr->{$alias}) { printf STDERR "ERROR: additional alias '%s:%s' does not match previous '%s'\n", $alias, $sequence, $hashPtr->{$alias}; exit 255; } return; } + if (defined($donePtr->{$sequence})) { + printf STDERR "# WARNING multiple inputs for source '%s.%s' was '%s' new '%s'\n", $source, $sequence, $donePtr->{$sequence}, $alias; + } else { + $donePtr->{$sequence} = $alias; $hashPtr->{$alias} = $sequence; - return; } + return; +} # sub addAlias($$$) # asmSource - is this a genbank or refseq assembly my $asmSource = "genbank"; my $isRefSeq = 0; # == 0 for Genbank assembly, == 1 for RefSeq assembly if ($ncbiAsmId =~ m/^GCF/) { # printf STDERR "# processing a RefSeq assembly\n"; $isRefSeq = 1; $asmSource = "refseq"; } else { # printf STDERR "# processing a GenBank assembly\n"; } my $twoBit = "../../$asmId.2bit"; open (FH, "twoBitInfo $twoBit stdout|") or die "can not twoBitInfo $twoBit stdout"; @@ -185,50 +193,53 @@ while (my $line = <FH>) { chomp $line; my ($ucscName, $seqName) = split('\s+', $line); $ncbiToUcsc{$seqName} = $ucscName; $ucscToNcbi{$ucscName} = $seqName; ++$nameCount; $ucscNames = 1 if (defined($sequenceSizes{$ucscName})); if ($isRefSeq) { $ucscToRefSeq{$ucscName} = $seqName; } else { $ucscToGenbank{$ucscName} = $seqName; } } close (FH); +# with the new style system, can not have multiple aliases for +# the same sequence name. The older system did do this. +# Need to record them here so they can be skipped later. my $dupsNotFound = 0; my $dupsList = `ls ../../download/*.dups.txt.gz 2> /dev/null|head -1`; chomp $dupsList if (defined($dupsList)); if ( -s "$dupsList" ) { open (FH, "zcat $dupsList | awk '{print \$1, \$3}'|") or die "can not read $dupsList"; while (my $line = <FH>) { chomp $line; my ($dupAlias, $dupTarget) = split('\s+', $line); $dupToSequence{$dupAlias} = $dupTarget; if ($ucscNames) { if (!defined($ncbiToUcsc{$dupTarget})) { printf STDERR "# ERROR: can not find dupTarget: $dupTarget in ncbiToUcsc for dupAlias: $dupAlias\n"; $dupsNotFound += 1; - } else { - printf STDERR "# addAlias(\"ucsc\", $dupAlias, $ncbiToUcsc{$dupTarget});\n"; +# } else { +# printf STDERR "# dupList addAlias(\"ucsc\", $dupAlias, $ncbiToUcsc{$dupTarget});\n"; } - } else { - printf STDERR "# addAlias($asmSource, $dupAlias, $dupTarget);\n"; +# } else { +# printf STDERR "# dupList addAlias($asmSource, $dupAlias, $dupTarget);\n"; } - printf STDERR "# adding duplicate alias %s\n", $dupAlias; + printf STDERR "# would have added duplicate alias %s for %s\n", $dupAlias, $dupTarget; ++$dupCount; } close (FH); } if ($dupsNotFound > 0) { printf STDERR "ERROR: can not find %d duplicate names\n", $dupsNotFound; exit 255; } if ($dupCount > 0) { printf STDERR "# processed %d duplicates\n", $dupCount; } # when not a UCSC named assembly, add the UCSC names as aliases if (! $ucscNames) { @@ -325,38 +336,40 @@ $asmName =~ s/ /_/g; # some assemblies have spaces in chr names ... $asmName =~ s/:/_/g; # one assembly had : in chr name if (defined($dupToSequence{$asmName})) { # avoid duplicates printf STDERR "# skipping duplicate name $asmName\n"; next; } elsif (defined($dupToSequence{$gbkName})) { # avoid duplicates printf STDERR "# skipping duplicate name $gbkName\n"; next; } elsif (defined($dupToSequence{$refSeqName})) { # avoid duplicates printf STDERR "# skipping duplicate name $refSeqName\n"; next; } # next if ($refSeqName eq "na"); # may not be any RefSeq name # next if ($gbkName eq "na"); # may not be any GenBank name # fill in ncbiToUcsc for potentially the 'other' NCBI name + if ($refSeqName ne "na" && $gbkName ne "na") { if (defined($ncbiToUcsc{$refSeqName}) && !defined($ncbiToUcsc{$gbkName})) { $ncbiToUcsc{$gbkName} = $ncbiToUcsc{$refSeqName}; $ucscToNcbi{$ncbiToUcsc{$refSeqName}} = $gbkName; } if (defined($ncbiToUcsc{$gbkName}) && !defined($ncbiToUcsc{$refSeqName})) { $ncbiToUcsc{$refSeqName} = $ncbiToUcsc{$gbkName}; $ucscToNcbi{$ncbiToUcsc{$gbkName}} = $refSeqName; } + } if (defined($ncbiToUcsc{$gbkName})) { printf STDERR "# asmRpt: '%s'\t'%s'\t'%s'\t'%s'\n", $asmName, $gbkName, $refSeqName, $ncbiToUcsc{$gbkName} if ($dbgCount < 5); } elsif (defined($ncbiToUcsc{$refSeqName})) { printf STDERR "# asmRpt: '%s'\t'%s'\t'%s'\t'%s'\n", $asmName, $gbkName, $refSeqName, $ncbiToUcsc{$refSeqName} if ($dbgCount < 5); } else { printf STDERR "# asmRpt: '%s'\t'%s'\t'%s'\tno UCSC name\n", $asmName, $gbkName, $refSeqName if ($dbgCount < 5); } if ($refSeqName ne "na") { my $seqName = $refSeqName; if (! $isRefSeq) { $seqName = $gbkName; } if ($ucscNames) { $seqName = $ncbiToUcsc{$seqName}; }