0a7c7349c6768dfdd3433781d087a44cfe03cd74
hiram
  Sun Nov 29 00:48:48 2020 -0800
correctly doing chromAlias for refSeq not UCSC names with duplicates refs #24396

diff --git src/hg/utils/automation/asmHubChromAlias.pl src/hg/utils/automation/asmHubChromAlias.pl
index f2c6de9..db4c464 100755
--- src/hg/utils/automation/asmHubChromAlias.pl
+++ src/hg/utils/automation/asmHubChromAlias.pl
@@ -8,30 +8,33 @@
   printf STDERR "usage: asmHubChromAlias.pl asmId > asmId.chromAlias.tab\n\n";
   printf STDERR "where asmId is something like: GCF_000001735.4_TAIR10.1\n";
   printf STDERR "Outputs a tab file suitable for processing with ixIxx to\n";
   printf STDERR "create an index file to use in an assembly hub.\n";
   printf STDERR "This command assumes it is in a work directory in the\n";
   printf STDERR "assembly hub: .../asmId/trackData/chromAlias/\n";
   printf STDERR "and .../asmId/downloads/ and .../asmId/sequence/ have\n";
   printf STDERR "been created and processed for the hub build.\n";
   exit 255;
 }
 
 my %ucscToRefSeq;	# key is UCSC sequence name, value is RefSeq name
 my %ucscToGenbank;	# key is UCSC sequence name, value is GenBank name
 my $ucscNames = 0;	# == 1 if sequence is UCSC names, == 0 if NCBI names
 my $dupCount = 0;
+my %dupToSequence;	# key is duplicate name, value is target sequence name
+			# to manage duplicates coming in from assembly report
+
 my $asmId = shift;
 
 my %aliasOut;	# key is alias name, value is sequence name in this assembly
 
 sub showAlias() {
   printf "# sequenceName\talias names\tassembly: %s\n", $asmId;
   my %chromIndex;	# key is sequence name in assembly, value
                         # is a tab separated list of aliases
   foreach my $alias (sort keys %aliasOut) {
     my $name = $aliasOut{$alias};
     next if ($alias eq "na");
     if (defined($chromIndex{$name})) {
       $chromIndex{$name} .= "\t" . $alias;
     } else {
       $chromIndex{$name} = $alias;
@@ -91,43 +94,40 @@
   if ($refSeq) {
     $ucscToRefSeq{$ucscName} = $seqName;
   } else {
     $ucscToGenbank{$ucscName} = $seqName;
   }
 }
 close (FH);
 
 my $dupsNotFound = 0;
 my $dupsList = "../../download/$asmId.dups.txt.gz";
 if ( -s "$dupsList" ) {
   open (FH, "zcat $dupsList | awk '{print \$1, \$3}'|") or die "can not read $dupsList";
   while (my $line = <FH>) {
     chomp $line;
     my ($dupAlias, $dupTarget) = split('\s+', $line);
-### early version    my ($dupTarget, $dupAlias) = split('\s+', $line);
+    $dupToSequence{$dupAlias} = $dupTarget;
     if ($ucscNames) {
       if (!defined($ncbiToUcsc{$dupTarget})) {
        printf STDERR "# ERROR: can not find dupTarget: $dupTarget in ncbiToUcsc for dupAlias: $dupAlias\n";
          $dupsNotFound += 1;
       } else {
         addAlias($dupAlias, $ncbiToUcsc{$dupTarget});
       }
-    } elsif (defined($ncbiToUcsc{$dupTarget})) {
-        addAlias($dupAlias, $ncbiToUcsc{$dupTarget});
     } else {
-      printf STDERR "# ERROR: can not find duplicate name $dupAlias for sequence $dupTarget\n";
-      $dupsNotFound += 1;
+        addAlias($dupAlias, $dupTarget);
     }
     ++$dupCount;
   }
   close (FH);
 }
 
 if ($dupsNotFound > 0) {
   printf STDERR "ERROR: can not find %d duplicate names\n", $dupsNotFound;
   exit 255;
 }
 
 if ($sequenceCount != $nameCount) {
   printf STDERR "ERROR: do not find the same name count in sequence vs. names files\n";
   printf STDERR "sequenceCount %d != %d names count - %d duplicates\n", $sequenceCount, $nameCount, $dupCount;
   exit 255;
@@ -186,32 +186,40 @@
   chomp $line;
   my ($gbkName, $refSeqName) = split('\s+', $line);
   next if ($refSeqName eq "na");	# may not be any RefSeq name
   next if ($gbkName eq "na");	# may not be any GenBank name
   $gbk2acc{$gbkName} = $refSeqName;
   if ($refSeq) {
     my $seqName = $refSeqName;
     if ($ucscNames) {
        $seqName = $ncbiToUcsc{$seqName};
     }
     if (!defined($seqName)) {
        if (defined($aliasOut{$refSeqName})) {
           $seqName = $aliasOut{$refSeqName};
        }
     }
+    if ($dupToSequence{$seqName}) {
+      addAlias($gbkName, $dupToSequence{$seqName});
+    } else {
       addAlias($gbkName, $seqName);
+    }
   } else {
     my $seqName = $gbkName;
     if ($ucscNames) {
        $seqName = $ncbiToUcsc{$seqName};
     }
     if (!defined($seqName)) {
        if (defined($aliasOut{$gbkName})) {
           $seqName = $aliasOut{$gbkName};
        }
     }
+    if ($dupToSequence{$seqName}) {
+      addAlias($refSeqName, $dupToSequence{$seqName});
+    } else {
       addAlias($refSeqName, $seqName);
     }
   }
+}
 close (FH);
 
 showAlias();