b2b24fc1972e32973020518288369e2582a7ffcf
hiram
  Wed Sep 30 12:39:12 2020 -0700
fixup errors in building a hub without UCSC names refs #24386

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index 0e93e6c..2a6f7b4 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -358,34 +358,39 @@
     my $chrN = $accToChr{$acc};
     next if (exists($chrNDone{$chrN}));
     $chrNDone{$chrN} = 1;
     my $agpFile =  "$agpSource/chr$chrN.unlocalized.scaf.agp.gz";
     open (FH, "zcat $agpFile|") or die "can not read $agpFile";
     while (my $line = <FH>) {
       if ($line !~ m/^#/) {
          chomp $line;
          my (@a) = split('\t', $line);
          my $ncbiAcc = $a[0];
          next if (exists($dupAccessionList{$ncbiAcc}));
          my $acc = $ncbiAcc;
          $acc =~ s/\./v/ if ($ucscNames);
          die "ERROR: chrN $chrN not correct for $acc"
              if ($accToChr{$acc} ne $chrN);
-         my $ucscName = "${acc}";
-         $ucscName = "chr${chrN}_${acc}_random" if ($ucscNames);
+         my $ucscName = "$ncbiAcc";
+         $ucscName = "chr${chrN}_${acc}_random";
+         $ucscName =~ s/\./v/;
          printf NAMES "%s\t%s\n", $ucscName, $ncbiAcc;
+         if ($ucscNames) {
             printf AGP "%s", $ucscName;    # begin AGP line with accession name
+         } else {
+            printf AGP "%s", $ncbiAcc;    # begin AGP line with accession name
+         }
          for (my $i = 1; $i < scalar(@a); ++$i) {   # the rest of the AGP line
              printf AGP "\t%s", $a[$i];
          }
          printf AGP "\n";
       }
     }
     close (FH);
   }
   close (AGP);
   close (NAMES);
 }	# sub unlocalizedAgp($$$$)
 
 #########################################################################
 # process unlocalized NCBI fasta sequence, use UCSC chrom names if requested
 sub unlocalizedFasta($$$) {
@@ -547,37 +552,39 @@
 # process NCBI unplaced AGP file, perhaps translate into UCSC naming scheme
 #   the agpNames result file is a naming correspondence file for later use
 #   optional chrPrefix can be "chrUn_" for assemblies that have other chr names
 sub unplacedAgp($$$$) {
   my ($agpFile, $agpOutput, $agpNames, $chrPrefix) = @_;
 
   open (AGP, "|gzip -c >$agpOutput") or die "can not write to $agpOutput";
   open (NAMES, "|sort -u >$agpNames") or die "can not write to $agpNames";
   open (FH, "zcat $agpFile|") or die "can not read $agpFile";
   while (my $line = <FH>) {
     if ($line =~ m/^#/) {
         print AGP $line;
     } else {
         my ($ncbiAcc, undef) = split('\s+', $line, 2);
         next if (exists($dupAccessionList{$ncbiAcc}));
-        if ($ucscNames) {
         my $ucscAcc = $ncbiAcc;
         $ucscAcc =~ s/\./v/;
         printf NAMES "%s%s\t%s\n", $chrPrefix, $ucscAcc, $ncbiAcc;
+        if ($ucscNames) {
           $line =~ s/\./v/;
-        }
           printf AGP "%s%s", $chrPrefix, $line;
+        } else {
+          printf AGP "%s", $line;
+        }
     }
   }
   close (FH);
   close (NAMES);
   close (AGP);
 }	# sub unplacedAgp($$$$)
 
 #########################################################################
 # process NCBI unplaced FASTA file, perhaps translate into UCSC naming scheme
 #   optional chrPrefix can be "chrUn_" for assemblies that have other chr names
 sub unplacedFasta($$$$) {
   my ($agpFile, $twoBitFile, $chrPrefix, $fastaOut) = @_;
 
   my %contigName;  # key is NCBI contig name, value is UCSC contig name
   open (FH, "zcat $agpFile|") or die "can not read $agpFile";
@@ -595,32 +602,36 @@
   foreach my $ctg (sort keys %contigName) {
      if (! exists($dupAccessionList{$ctg})) {
        printf $fh "%s\n", $ctg;
      }
   }
   close $fh;
 
   open (FA, "|gzip -c > $fastaOut") or die "can not write to $fastaOut";
   open (FH, "twoBitToFa -seqList=$tmpFile $twoBitFile stdout|") or die "can not twoBitToFa $twoBitFile";
   while (my $line = <FH>) {
     if ($line =~ m/^>/) {
       chomp $line;
       $line =~ s/^>//;
       $line =~ s/ .*//;
       die "ERROR: twoBitToFa $twoBitFile returns unknown acc $line" if (! exists($contigName{$line}));
+      if ($ucscNames) {
         printf FA ">%s%s\n", $chrPrefix, $contigName{$line};
       } else {
+        printf FA ">%s\n", $contigName{$line};
+      }
+    } else {
       print FA $line;
     }
   }
   close(FH);
   close(FA);
 }	# sub unplacedFasta($$$$)
 
 #########################################################################
 #########################################################################
 # do Steps section
 #########################################################################
 #########################################################################
 # * step: download [workhorse]
 sub doDownload {
   my $runDir = "$buildDir/download";
@@ -731,31 +742,31 @@
 
   ###########  alternate sequences  ##############
   my $assemblyStructure = "$buildDir/download/${asmId}_assembly_structure";
   my $altCount = `find -L "${assemblyStructure}" -type f | grep "/alt_scaffolds/alt_scaffold_placement.txt" | wc -l`;
   chomp $altCount;
   if ($altCount > 0) {
     $partsDone += 1;
     ++$otherChrParts;
     altSequences($assemblyStructure);
   }
 
   ###########  unplaced sequence  ################
   my $unplacedScafAgp = "$primaryAssembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
   if ( -s $unplacedScafAgp ) {
     my $chrPrefix = "";   # no prefix if no other chrom parts
-    $chrPrefix = "chrUn_" if ($otherChrParts);
+    $chrPrefix = "chrUn_" if ($otherChrParts && ! $ucscNames);
     my $agpOutput = "$runDir/$asmId.unplaced.agp.gz";
     my $agpNames = "$runDir/$asmId.unplaced.names";
     $partsDone += 1;
 
     if (needsUpdate($unplacedScafAgp, $agpOutput)) {
       unplacedAgp($unplacedScafAgp, $agpOutput, $agpNames, $chrPrefix);
       `touch -r $unplacedScafAgp $agpOutput`;
     }
     my $fastaOut = "$runDir/$asmId.unplaced.fa.gz";
     if (needsUpdate($twoBitFile, $fastaOut)) {
       unplacedFasta($unplacedScafAgp, $twoBitFile, $chrPrefix, $fastaOut);
       `touch -r $twoBitFile $fastaOut`;
     }
   }
 
@@ -1791,30 +1802,36 @@
   $bossScript->add(<<_EOF_
 printf "to be done\\n" 1>&2
 _EOF_
   );
   $bossScript->execute();
 } # doCleanup
 
 #########################################################################
 # main
 
 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();
 
 # Make sure we have valid options and exactly 1 argument:
 &checkOptions();
+if (scalar(@ARGV) != 1) {
+  printf STDERR "ERROR: can not find 1 argument in ARGV, instead: %d\n", scalar(@ARGV);
+  for (my $i = 0; $i < scalar(@ARGV); ++$i) {
+    printf "# ARGV[%d] : '%s'\n", $i, $ARGV[$i];
+  }
+}
 &usage(1) if (scalar(@ARGV) != 1);
 $secondsStart = `date "+%s"`;
 chomp $secondsStart;
 
 # expected command line arguments after options are processed
 ($asmId) = @ARGV;
 # yes, there can be more than two fields separated by _
 # but in this case, we only care about the first two:
 # GC[AF]_123456789.3_assembly_Name
 #   0         1         2      3 ....
 my @partNames = split('_', $asmId);
 $ftpDir = sprintf("%s/%s/%s/%s/%s", $partNames[0],
    substr($partNames[1],0,3), substr($partNames[1],3,3),
    substr($partNames[1],6,3), $asmId);
 
@@ -1854,30 +1871,32 @@
 $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub;
 $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub;
 $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer;
 $asmHubName = $opt_asmHubName ? $opt_asmHubName : $asmHubName;
 $ncbiRmsk = $opt_ncbiRmsk ? 1 : 0;
 
 die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource);
 printf STDERR "# buildDir: %s\n", $buildDir;
 printf STDERR "# sourceDir %s\n", $sourceDir;
 printf STDERR "# augustusSpecies %s\n", $augustusSpecies;
 printf STDERR "# xenoRefSeq %s\n", $xenoRefSeq;
 printf STDERR "# assemblySource: %s\n", $assemblySource;
 printf STDERR "# asmHubName %s\n", $asmHubName;
 printf STDERR "# rmskSpecies %s\n", $rmskSpecies;
 printf STDERR "# augustusSpecies %s\n", $augustusSpecies;
+printf STDERR "# ncbiRmsk %s\n", $ncbiRmsk ? "TRUE" : "FALSE";
+printf STDERR "# ucscNames %s\n", $ucscNames ? "TRUE" : "FALSE";
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";
 
 $secondsEnd = `date "+%s"`;
 chomp $secondsEnd;
 my $elapsedSeconds = $secondsEnd - $secondsStart;
 my $elapsedMinutes = int($elapsedSeconds/60);
 $elapsedSeconds -= $elapsedMinutes * 60;