0a7c7349c6768dfdd3433781d087a44cfe03cd74 hiram Sun Nov 29 00:48:48 2020 -0800 correctly doing chromAlias for refSeq not UCSC names with duplicates refs #24396 diff --git src/hg/utils/automation/asmHubChromAlias.pl src/hg/utils/automation/asmHubChromAlias.pl index f2c6de9..db4c464 100755 --- src/hg/utils/automation/asmHubChromAlias.pl +++ src/hg/utils/automation/asmHubChromAlias.pl @@ -8,30 +8,33 @@ printf STDERR "usage: asmHubChromAlias.pl asmId > asmId.chromAlias.tab\n\n"; printf STDERR "where asmId is something like: GCF_000001735.4_TAIR10.1\n"; printf STDERR "Outputs a tab file suitable for processing with ixIxx to\n"; printf STDERR "create an index file to use in an assembly hub.\n"; printf STDERR "This command assumes it is in a work directory in the\n"; printf STDERR "assembly hub: .../asmId/trackData/chromAlias/\n"; printf STDERR "and .../asmId/downloads/ and .../asmId/sequence/ have\n"; printf STDERR "been created and processed for the hub build.\n"; exit 255; } my %ucscToRefSeq; # key is UCSC sequence name, value is RefSeq name my %ucscToGenbank; # key is UCSC sequence name, value is GenBank name my $ucscNames = 0; # == 1 if sequence is UCSC names, == 0 if NCBI names my $dupCount = 0; +my %dupToSequence; # key is duplicate name, value is target sequence name + # to manage duplicates coming in from assembly report + my $asmId = shift; my %aliasOut; # key is alias name, value is sequence name in this assembly sub showAlias() { printf "# sequenceName\talias names\tassembly: %s\n", $asmId; my %chromIndex; # key is sequence name in assembly, value # is a tab separated list of aliases foreach my $alias (sort keys %aliasOut) { my $name = $aliasOut{$alias}; next if ($alias eq "na"); if (defined($chromIndex{$name})) { $chromIndex{$name} .= "\t" . $alias; } else { $chromIndex{$name} = $alias; @@ -91,43 +94,40 @@ if ($refSeq) { $ucscToRefSeq{$ucscName} = $seqName; } else { $ucscToGenbank{$ucscName} = $seqName; } } close (FH); my $dupsNotFound = 0; my $dupsList = "../../download/$asmId.dups.txt.gz"; if ( -s "$dupsList" ) { open (FH, "zcat $dupsList | awk '{print \$1, \$3}'|") or die "can not read $dupsList"; while (my $line = <FH>) { chomp $line; my ($dupAlias, $dupTarget) = split('\s+', $line); -### early version my ($dupTarget, $dupAlias) = split('\s+', $line); + $dupToSequence{$dupAlias} = $dupTarget; if ($ucscNames) { if (!defined($ncbiToUcsc{$dupTarget})) { printf STDERR "# ERROR: can not find dupTarget: $dupTarget in ncbiToUcsc for dupAlias: $dupAlias\n"; $dupsNotFound += 1; } else { addAlias($dupAlias, $ncbiToUcsc{$dupTarget}); } - } elsif (defined($ncbiToUcsc{$dupTarget})) { - addAlias($dupAlias, $ncbiToUcsc{$dupTarget}); } else { - printf STDERR "# ERROR: can not find duplicate name $dupAlias for sequence $dupTarget\n"; - $dupsNotFound += 1; + addAlias($dupAlias, $dupTarget); } ++$dupCount; } close (FH); } if ($dupsNotFound > 0) { printf STDERR "ERROR: can not find %d duplicate names\n", $dupsNotFound; exit 255; } if ($sequenceCount != $nameCount) { printf STDERR "ERROR: do not find the same name count in sequence vs. names files\n"; printf STDERR "sequenceCount %d != %d names count - %d duplicates\n", $sequenceCount, $nameCount, $dupCount; exit 255; @@ -186,32 +186,40 @@ chomp $line; my ($gbkName, $refSeqName) = split('\s+', $line); next if ($refSeqName eq "na"); # may not be any RefSeq name next if ($gbkName eq "na"); # may not be any GenBank name $gbk2acc{$gbkName} = $refSeqName; if ($refSeq) { my $seqName = $refSeqName; if ($ucscNames) { $seqName = $ncbiToUcsc{$seqName}; } if (!defined($seqName)) { if (defined($aliasOut{$refSeqName})) { $seqName = $aliasOut{$refSeqName}; } } + if ($dupToSequence{$seqName}) { + addAlias($gbkName, $dupToSequence{$seqName}); + } else { addAlias($gbkName, $seqName); + } } else { my $seqName = $gbkName; if ($ucscNames) { $seqName = $ncbiToUcsc{$seqName}; } if (!defined($seqName)) { if (defined($aliasOut{$gbkName})) { $seqName = $aliasOut{$gbkName}; } } + if ($dupToSequence{$seqName}) { + addAlias($refSeqName, $dupToSequence{$seqName}); + } else { addAlias($refSeqName, $seqName); } } +} close (FH); showAlias();