b9032792e3f2d0e704ecaf56b1fac45457acaa37
hiram
  Fri Feb 7 14:54:23 2020 -0800
eliminate : in chr names and correctly read the dup list where needed refs #23891

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index 0b27f9a..cdd137e 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -232,30 +232,31 @@
      }
   }
   return 1;
 }
 
 #########################################################################
 # read chr2acc file, return name correspondence in given hash pointer
 sub readChr2Acc($$) {
   my ($chr2acc, $accToChr) = @_;
   open (FH, "<$chr2acc") or die "can not read $chr2acc";
   while (my $line = <FH>) {
     next if ($line =~ m/^#/);
     chomp $line;
     my ($chrN, $acc) = split('\t', $line);
     $chrN =~ s/ /_/g;   # some assemblies have spaces in chr names ...
+    $chrN =~ s/:/_/g;   # one assembly GCF_002910315.2 had : in a chr name
     $accToChr->{$acc} = $chrN;
   }
   close (FH);
 }
 
 #########################################################################
 # process NCBI AGP file into UCSC naming scheme
 #   the agpNames result file is a naming correspondence file for later use
 sub compositeAgp($$$$) {
   my ($chr2acc, $agpSource, $agpOutput, $agpNames) = @_;
   my %accToChr;
   readChr2Acc($chr2acc, \%accToChr);
 
   open (AGP, "|gzip -c >$agpOutput") or die "can not write to $agpOutput";
   open (NAMES, "|sort -u >$agpNames") or die "can not write to $agpNames";
@@ -648,50 +649,50 @@
     faSomeRecords -exclude \${asmId}_genomic.fna.dups.gz \\
       \$asmId.remove.dups.list stdout | gzip -c > \${asmId}_genomic.fna.gz
     rm -f \$asmId.2bit
     faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit
   fi
   gzip -f \$asmId.dups.txt
   touch -r \${asmId}_genomic.fna.gz \$asmId.2bit
 else
   printf "# download step previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_
   );
   $bossScript->execute();
 
-  readDupsList();
-
 } # doDownload
 
 
 #########################################################################
 # * step: sequence [workhorse]
 sub doSequence {
   my $runDir = "$buildDir/sequence";
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "process source files into 2bit sequence and agp";
   my $bossScript = newBash HgRemoteScript("$runDir/doSequence.bash", $workhorse,
 				      $runDir, $whatItDoes);
 
   my $twoBitFile = "$buildDir/download/$asmId.2bit";
   my $otherChrParts = 0;  # to see if this is unplaced scaffolds only
   my $primaryAssembly = "$buildDir/download/${asmId}_assembly_structure/Primary_Assembly";
   my $partsDone = 0;
 
+  readDupsList();
+
   ###########  Assembled chromosomes  ################
   my $chr2acc = "$primaryAssembly/assembled_chromosomes/chr2acc";
   if ( -s $chr2acc ) {
     ++$otherChrParts;
     my $agpSource = "$primaryAssembly/assembled_chromosomes/AGP";
     my $agpOutput = "$runDir/$asmId.chr.agp.gz";
     my $agpNames = "$runDir/$asmId.chr.names";
     my $fastaOut = "$runDir/$asmId.chr.fa.gz";
     $partsDone += 1;
     if (needsUpdate($chr2acc, $agpOutput)) {
       compositeAgp($chr2acc, $agpSource, $agpOutput, $agpNames);
       `touch -r $chr2acc $agpOutput`;
     }
     if (needsUpdate($twoBitFile, $fastaOut)) {
       compositeFasta($chr2acc, $twoBitFile, $fastaOut);