90555f803fa25829e2bf91e2faa913433999896f
hiram
  Tue Apr 5 11:16:39 2022 -0700
new style chromAlias refs #29109

diff --git src/hg/utils/automation/asmHubChromAlias.pl src/hg/utils/automation/asmHubChromAlias.pl
index 798f776..99c7af2 100755
--- src/hg/utils/automation/asmHubChromAlias.pl
+++ src/hg/utils/automation/asmHubChromAlias.pl
@@ -90,30 +90,36 @@
         } else {
           $chromIndex{$sequence} = "";
         }
       }
     }
   }
   foreach my $seqName (sort keys %chromIndex) {
     printf "%s\t%s\n", $seqName, $chromIndex{$seqName};
   }
 }
 
 #  given an alias and a sequence name, add to result or verify identical
 #  to previous add
 sub addAlias($$$) {
   my ($source, $alias, $sequence) = @_;
+  if ($alias eq "na") {
+    return;
+  }
+  if ($sequence eq "na") {
+    return;
+  }
   # do not need to add the sequence name itself
   return if ($alias eq $sequence);
   if (!defined($aliasOut{$source})) {
      my %h;	# hash: key: alias name, value 'native' chrom name
      $aliasOut{$source} = \%h;
 #     printf STDERR "# creating aliasOut{'%s'}\n", $source;
   }
   my $hashPtr = $aliasOut{$source};
   # already done, verify it is equivalent to previous request
   if (defined($hashPtr->{$alias})) {
      if ($sequence ne $hashPtr->{$alias}) {
         printf STDERR "ERROR: additional alias '%s:%s' does not match previous '%s'\n", $alias, $sequence, $hashPtr->{$alias};
         exit 255;
      }
      return;
@@ -153,82 +159,98 @@
 while (my $line = <FH>) {
   chomp $line;
   my ($ucscName, $seqName) = split('\s+', $line);
   $ncbiToUcsc{$seqName} = $ucscName;
   $ucscToNcbi{$ucscName} = $seqName;
   ++$nameCount;
   $ucscNames = 1 if (defined($sequenceSizes{$ucscName}));
   if ($isRefSeq) {
     $ucscToRefSeq{$ucscName} = $seqName;
   } else {
     $ucscToGenbank{$ucscName} = $seqName;
   }
 }
 close (FH);
 
-# when not a UCSC named assembly, add the UCSC names as aliases
-if (! $ucscNames) {
-  if ($isRefSeq) {
-    foreach my $ucscName (sort keys %ucscToRefSeq) {
-       addAlias("ucsc", $ucscName, $ucscToRefSeq{$ucscName});
-    }
-  } else {
-    foreach my $ucscName (sort keys %ucscToGenbank) {
-       addAlias("ucsc", $ucscName, $ucscToGenbank{$ucscName});
-    }
-  }
-}
-
 my $dupsNotFound = 0;
 my $dupsList = "../../download/$asmId.dups.txt.gz";
 if ( -s "$dupsList" ) {
   open (FH, "zcat $dupsList | awk '{print \$1, \$3}'|") or die "can not read $dupsList";
   while (my $line = <FH>) {
     chomp $line;
     my ($dupAlias, $dupTarget) = split('\s+', $line);
     $dupToSequence{$dupAlias} = $dupTarget;
     if ($ucscNames) {
       if (!defined($ncbiToUcsc{$dupTarget})) {
        printf STDERR "# ERROR: can not find dupTarget: $dupTarget in ncbiToUcsc for dupAlias: $dupAlias\n";
          $dupsNotFound += 1;
       } else {
-        addAlias("ucsc", $dupAlias, $ncbiToUcsc{$dupTarget});
+        printf STDERR "# addAlias(\"ucsc\", $dupAlias, $ncbiToUcsc{$dupTarget});\n";
       }
     } else {
-        addAlias($asmSource, $dupAlias, $dupTarget);
+        printf STDERR "# addAlias($asmSource, $dupAlias, $dupTarget);\n";
     }
     printf STDERR "# adding duplicate alias %s\n", $dupAlias;
     ++$dupCount;
   }
   close (FH);
 }
 
 if ($dupsNotFound > 0) {
   printf STDERR "ERROR: can not find %d duplicate names\n", $dupsNotFound;
   exit 255;
 }
+if ($dupCount > 0) {
+  printf STDERR "# processed %d duplicates\n", $dupCount;
+}
+
+# when not a UCSC named assembly, add the UCSC names as aliases
+if (! $ucscNames) {
+  if ($isRefSeq) {
+    foreach my $ucscName (sort keys %ucscToRefSeq) {
+       if (defined($dupToSequence{$ucscToRefSeq{$ucscName}})) {   # avoid duplicates
+         printf STDERR "# skipping duplicate name $ucscToRefSeq{$ucscName}\n";
+       } else {
+         addAlias("ucsc", $ucscName, $ucscToRefSeq{$ucscName});
+       }
+    }
+  } else {
+    foreach my $ucscName (sort keys %ucscToGenbank) {
+       if (defined($dupToSequence{$ucscToGenbank{$ucscName}})) {   # avoid duplicates
+         printf STDERR "# skipping duplicate name $ucscToGenbank{$ucscName}\n";
+       } else {
+         addAlias("ucsc", $ucscName, $ucscToGenbank{$ucscName});
+       }
+    }
+  }
+}
+
 
 if ($sequenceCount != $nameCount) {
   printf STDERR "ERROR: do not find the same name count in sequence vs. names files\n";
   printf STDERR "sequenceCount %d != %d names count - %d duplicates\n", $sequenceCount, $nameCount, $dupCount;
   exit 255;
 }
 
 # printf STDERR "# initial set of %d UCSC names\n", $sequenceCount;
 ### first set of names is the UCSC to NCBI sequence names
 foreach my $sequence (sort keys %sequenceSizes) {
   my $seqName = $sequence;
+  if (defined($dupToSequence{$seqName})) {   # avoid duplicates
+    printf STDERR "# skipping duplicate name $seqName\n";
+    next;
+  }
   my $alias = $sequence;
   if ($ucscNames) {
      $alias = $ucscToNcbi{$seqName};
   } else {
      $alias = $ncbiToUcsc{$seqName};
   } 
   if (!defined($alias)) {
     printf STDERR "ERROR: can not find alias name for '%s' sequence\n", $seqName;
     exit 255;
   }
   addAlias($ucscNames ? "ucsc" : $asmSource, $alias, $seqName);
 }
 
 # foreach my $sequence (sort keys %chrNames) {
 #   printf "%s\t%s\n",  $chrNames{$sequence}, $sequence;
@@ -239,75 +261,97 @@
 my %chr2acc;	# key is sequence name, value is NCBI chromosome name
 my $asmStructCount = `ls  ../../download/${asmId}_assembly_structure/*/*/chr2acc 2> /dev/null | wc -l`;
 chomp $asmStructCount;
 if ( $asmStructCount > 0 ) {
 #  printf STDERR "# second name set processing chr2acc files\n";
   open (FH, "grep -h -v '^#' ../../download/${asmId}_assembly_structure/*/*/chr2acc|") or die "can not grep chr2acc files";
   while (my $line = <FH>) {
     chomp $line;
     my ($alias, $seqName) = split('\t', $line);
     $alias =~ s/ /_/g;	# some assemblies have spaces in chr names ...
     $alias =~ s/:/_/g;	# one assembly had : in a chr name
     $chr2acc{$seqName} = $alias;
     if ($ucscNames) {
        $seqName = $ncbiToUcsc{$seqName};
     }
+    if (defined($dupToSequence{$seqName})) {   # avoid duplicates
+      printf STDERR "# skipping duplicate name $seqName\n";
+    } else {
       addAlias("ncbi", $alias, $seqName);
     }
+  }
   close (FH);
 }
 
 # foreach my $sequence (sort keys %chr2acc) {
 #   printf "%s\t%s\n",  $chr2acc{$sequence}, $sequence;
 # }
 
 my $dbgCount = 0;
 # printf STDERR "# third set processing assembly_report\n";
 # column 1 is the 'assembly' name
 # column 5 is the GenBank-Accn, column 7 is the RefSeq-Accn
 open (FH, "grep -v '^#' ../../download/${asmId}_assembly_report.txt | cut -d\$'\t' -f1,5,7|") or die "can not grep assembly_report";
 while (my $line = <FH>) {
   chomp $line;
   ++$dbgCount;
   my ($asmName, $gbkName, $refSeqName) = split('\s+', $line);
+  if (defined($dupToSequence{$asmName})) {   # avoid duplicates
+     printf STDERR "# skipping duplicate name $asmName\n";
+     next;
+  } elsif (defined($dupToSequence{$gbkName})) {   # avoid duplicates
+     printf STDERR "# skipping duplicate name $gbkName\n";
+     next;
+  } elsif (defined($dupToSequence{$refSeqName})) {   # avoid duplicates
+     printf STDERR "# skipping duplicate name $refSeqName\n";
+     next;
+  }
   printf STDERR "# '%s'\t'%s'\t'%s'\n", $asmName, $gbkName, $refSeqName if ($dbgCount < 5);
 #  next if ($refSeqName eq "na");	# may not be any RefSeq name
 #  next if ($gbkName eq "na");	# may not be any GenBank name
   if ($refSeqName ne "na") {
     my $seqName = $refSeqName;
     if (! $isRefSeq) {
       $seqName = $gbkName;
     }
     if ($ucscNames) {
        $seqName = $ncbiToUcsc{$seqName};
     }
     if (!defined($seqName)) {
        if (defined($aliasOut{"refseq"})) {
          if (defined($aliasOut{"refseq"}->{$refSeqName})) {
             $seqName = $aliasOut{"refseq"}->{$refSeqName};
          }
        }
     }
     if (defined($seqName)) {
       if (defined($dupToSequence{$seqName})) {
+        if (defined($dupToSequence{$seqName})) {   # avoid duplicates
+          printf STDERR "# skipping duplicate name $dupToSequence{$seqName}\n";
+        } else {
           addAlias("refseq", $refSeqName, $dupToSequence{$seqName});
           addAlias("assembly", $asmName, $dupToSequence{$seqName});
+        }
+      } else {
+        if (defined($dupToSequence{$seqName})) {   # avoid duplicates
+          printf STDERR "# skipping duplicate name $seqName\n";
         } else {
           addAlias("refseq", $refSeqName, $seqName);
           addAlias("assembly", $asmName, $seqName);
         }
       }
+    }
   }	#	if ($refSeqName ne "na")
 
   if ($gbkName ne "na") {
     my $seqName = $gbkName;
     if ($isRefSeq) {
       $seqName = $refSeqName;
     }
     if ($ucscNames) {
        $seqName = $ncbiToUcsc{$seqName};
     }
     if (!defined($seqName)) {
        if (defined($aliasOut{"genbank"})) {
          if (defined($aliasOut{"genbank"}->{$gbkName})) {
             $seqName = $aliasOut{"genbank"}->{$gbkName};
          }