e0232de768e8056705fd15b8bf908d99d3ea74e8
hiram
  Sat Sep 24 18:37:56 2022 -0700
do not fill in names for RefSeq or GenBank if either of those is not defined refs #29982

diff --git src/hg/utils/automation/asmHubChromAlias.pl src/hg/utils/automation/asmHubChromAlias.pl
index 64fe006..524146a 100755
--- src/hg/utils/automation/asmHubChromAlias.pl
+++ src/hg/utils/automation/asmHubChromAlias.pl
@@ -19,30 +19,33 @@
 my %ucscToRefSeq;	# key is UCSC sequence name, value is RefSeq name
 my %ucscToGenbank;	# key is UCSC sequence name, value is GenBank name
 my $ucscNames = 0;	# == 1 if sequence is UCSC names, == 0 if NCBI names
 my $dupCount = 0;
 my %dupToSequence;	# key is duplicate name, value is target sequence name
 			# to manage duplicates coming in from assembly report
 my $sequenceCount = 0;
 my %sequenceSizes;	# key is sequence name, value is sequence size
 
 my $asmId = shift;
 my $ncbiAsmId = shift;
 
 my %aliasOut;	# key is source name, value is a hash pointer where
 		# key is alias name and value is chrom name
 	# typical source names: ucsc, refseq, genbank, ensembl, assembly, ncbi
+my %aliasDone;	# key is source name, value is hash pointer where
+		# key is chrom name value is alias, to check
+                # for duplicate incoming
 
 # the argument is the name of the 'native' source name
 # where 'native' are the names in the assembly itself, thus the
 # assembly names.  The other sources are aliases to that.
 sub showAlias($) {
   my ($native) = @_;
   # second line is the column headers
   printf "# %s", $native;
   foreach my $source (sort keys %aliasOut) {
     printf "\t%s", $source if ($source ne $native);
   }
   printf "\n";
 
   # need to account for all sequence names for all sources
   # create a list of sequence names for each source that does *not* have
@@ -85,65 +88,70 @@
     if (defined($noAlias{$source})) {
       $hashPtr = $noAlias{$source};
       foreach my $sequence (sort keys %sequenceSizes) {
         next if (!defined($hashPtr->{$sequence}));
         if (defined($chromIndex{$sequence})) {
           $chromIndex{$sequence} .= "\t" . "";
         } else {
           $chromIndex{$sequence} = "";
         }
       }
     }
   }
   foreach my $seqName (sort keys %chromIndex) {
     printf "%s\t%s\n", $seqName, $chromIndex{$seqName};
   }
-}
+}	#	sub showAlias($)
 
 #  given an alias and a sequence name, add to result or verify identical
 #  to previous add
 sub addAlias($$$) {
   my ($source, $alias, $sequence) = @_;
   if ($alias eq "na") {
     return;
   }
   if ($sequence eq "na") {
     return;
   }
   # it is OK to allow duplicate names, different naming authorities could
   # have the same name, found for example in GCF_006542625.1_Asia_NLE_v1
   # which has UCSC names identical to 'assembly' names and the hub has been
   # build with UCSC names
-  # do not need to add the sequence name itself
-  #  return if ($alias eq $sequence);
   if (!defined($aliasOut{$source})) {
      my %h;	# hash: key: alias name, value 'native' chrom name
      $aliasOut{$source} = \%h;
-#     printf STDERR "# creating aliasOut{'%s'}\n", $source;
+     my %t;	# hash: key: native sequence name, value alias
+     $aliasDone{$source} = \%t;
   }
   my $hashPtr = $aliasOut{$source};
+  my $donePtr = $aliasDone{$source};
   # already done, verify it is equivalent to previous request
   if (defined($hashPtr->{$alias})) {
      if ($sequence ne $hashPtr->{$alias}) {
         printf STDERR "ERROR: additional alias '%s:%s' does not match previous '%s'\n", $alias, $sequence, $hashPtr->{$alias};
         exit 255;
      }
      return;
   }
+  if (defined($donePtr->{$sequence})) {
+    printf STDERR "# WARNING multiple inputs for source '%s.%s' was '%s' new '%s'\n", $source, $sequence, $donePtr->{$sequence}, $alias;
+  } else {
+    $donePtr->{$sequence} = $alias;
     $hashPtr->{$alias} = $sequence;
-  return;
   }
+  return;
+}	#	sub addAlias($$$)
 
 # asmSource - is this a genbank or refseq assembly
 my $asmSource = "genbank";
 my $isRefSeq = 0; #       == 0 for Genbank assembly, == 1 for RefSeq assembly
 if ($ncbiAsmId =~ m/^GCF/) {
 #  printf STDERR "# processing a RefSeq assembly\n";
   $isRefSeq = 1;
   $asmSource = "refseq";
 } else {
 #  printf STDERR "# processing a GenBank assembly\n";
 }
 
 my $twoBit = "../../$asmId.2bit";
 
 open (FH, "twoBitInfo $twoBit stdout|") or die "can not twoBitInfo $twoBit stdout";
@@ -185,50 +193,53 @@
 while (my $line = <FH>) {
   chomp $line;
   my ($ucscName, $seqName) = split('\s+', $line);
   $ncbiToUcsc{$seqName} = $ucscName;
   $ucscToNcbi{$ucscName} = $seqName;
   ++$nameCount;
   $ucscNames = 1 if (defined($sequenceSizes{$ucscName}));
   if ($isRefSeq) {
     $ucscToRefSeq{$ucscName} = $seqName;
   } else {
     $ucscToGenbank{$ucscName} = $seqName;
   }
 }
 close (FH);
 
+# with the new style system, can not have multiple aliases for
+# the same sequence name.  The older system did do this.
+# Need to record them here so they can be skipped later.
 my $dupsNotFound = 0;
 my $dupsList = `ls ../../download/*.dups.txt.gz 2> /dev/null|head -1`;
 chomp $dupsList if (defined($dupsList));
 if ( -s "$dupsList" ) {
   open (FH, "zcat $dupsList | awk '{print \$1, \$3}'|") or die "can not read $dupsList";
   while (my $line = <FH>) {
     chomp $line;
     my ($dupAlias, $dupTarget) = split('\s+', $line);
     $dupToSequence{$dupAlias} = $dupTarget;
     if ($ucscNames) {
       if (!defined($ncbiToUcsc{$dupTarget})) {
        printf STDERR "# ERROR: can not find dupTarget: $dupTarget in ncbiToUcsc for dupAlias: $dupAlias\n";
          $dupsNotFound += 1;
-      } else {
-        printf STDERR "# addAlias(\"ucsc\", $dupAlias, $ncbiToUcsc{$dupTarget});\n";
+#      } else {
+#        printf STDERR "# dupList addAlias(\"ucsc\", $dupAlias, $ncbiToUcsc{$dupTarget});\n";
       }
-    } else {
-        printf STDERR "# addAlias($asmSource, $dupAlias, $dupTarget);\n";
+#    } else {
+#        printf STDERR "# dupList addAlias($asmSource, $dupAlias, $dupTarget);\n";
     }
-    printf STDERR "# adding duplicate alias %s\n", $dupAlias;
+    printf STDERR "# would have added duplicate alias %s for %s\n", $dupAlias, $dupTarget;
     ++$dupCount;
   }
   close (FH);
 }
 
 if ($dupsNotFound > 0) {
   printf STDERR "ERROR: can not find %d duplicate names\n", $dupsNotFound;
   exit 255;
 }
 if ($dupCount > 0) {
   printf STDERR "# processed %d duplicates\n", $dupCount;
 }
 
 # when not a UCSC named assembly, add the UCSC names as aliases
 if (! $ucscNames) {
@@ -325,38 +336,40 @@
   $asmName =~ s/ /_/g;	# some assemblies have spaces in chr names ...
   $asmName =~ s/:/_/g;	# one assembly had : in chr name
   if (defined($dupToSequence{$asmName})) {   # avoid duplicates
      printf STDERR "# skipping duplicate name $asmName\n";
      next;
   } elsif (defined($dupToSequence{$gbkName})) {   # avoid duplicates
      printf STDERR "# skipping duplicate name $gbkName\n";
      next;
   } elsif (defined($dupToSequence{$refSeqName})) {   # avoid duplicates
      printf STDERR "# skipping duplicate name $refSeqName\n";
      next;
   }
 #  next if ($refSeqName eq "na");	# may not be any RefSeq name
 #  next if ($gbkName eq "na");	# may not be any GenBank name
   # fill in ncbiToUcsc for potentially the 'other' NCBI name
+  if ($refSeqName ne "na"  && $gbkName ne "na") {
     if (defined($ncbiToUcsc{$refSeqName}) && !defined($ncbiToUcsc{$gbkName})) {
       $ncbiToUcsc{$gbkName} = $ncbiToUcsc{$refSeqName};
       $ucscToNcbi{$ncbiToUcsc{$refSeqName}} = $gbkName;
       }
     if (defined($ncbiToUcsc{$gbkName}) && !defined($ncbiToUcsc{$refSeqName})) {
       $ncbiToUcsc{$refSeqName} = $ncbiToUcsc{$gbkName};
       $ucscToNcbi{$ncbiToUcsc{$gbkName}} = $refSeqName;
     }
+  }
   if (defined($ncbiToUcsc{$gbkName})) {
      printf STDERR "# asmRpt: '%s'\t'%s'\t'%s'\t'%s'\n", $asmName, $gbkName, $refSeqName, $ncbiToUcsc{$gbkName} if ($dbgCount < 5);
   } elsif (defined($ncbiToUcsc{$refSeqName})) {
      printf STDERR "# asmRpt: '%s'\t'%s'\t'%s'\t'%s'\n", $asmName, $gbkName, $refSeqName, $ncbiToUcsc{$refSeqName} if ($dbgCount < 5);
   } else {
      printf STDERR "# asmRpt: '%s'\t'%s'\t'%s'\tno UCSC name\n", $asmName, $gbkName, $refSeqName if ($dbgCount < 5);
   }
   if ($refSeqName ne "na") {
     my $seqName = $refSeqName;
     if (! $isRefSeq) {
       $seqName = $gbkName;
     }
     if ($ucscNames) {
        $seqName = $ncbiToUcsc{$seqName};
     }