f9ce722d011fb87c9f61a4d892b9140ef7fa9047
hiram
  Mon Sep 12 20:38:56 2022 -0700
now working through step chromAlias and fixup that procedure to work correctly on ucsc names refs #29811

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index 637607e..650231e 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -68,30 +68,31 @@
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $sourceDir = "/hive/data/outside/ncbi/genomes";
 my $species = "";       # usually found in asmId_assembly_report.txt
 my $ftpDir = "";	# will be determined from given asmId
 my $rmskSpecies = "";
 my $noRmsk = 0;	# when RepeatMasker is not possible, such as bacteria
 my $ncbiRmsk = 0;	# when =1 call doRepeatMasker.pl
                         # with -ncbiRmsk=path.out.gz and -liftSpec=...
 my $augustusSpecies = "human";
 my $xenoRefSeq = "/hive/data/genomes/asmHubs/xenoRefSeq";
 my $noAugustus = 0;     # bacteria do *not* create an augustus track
 my $noXenoRefSeq = 0;	# bacteria do *not* create a xenoRefSeq track
 my $ucscNames = 0;  # default 'FALSE' (== 0)
 my $dbName = "";  # default uses NCBI asmId for name, can specify: abcDef1
+my $defaultName = "";  # will be asmId or dbName if present
 my $workhorse = "hgwdev";  # default workhorse when none chosen
 my $fileServer = "hgwdev";  # default when none chosen
 my $bigClusterHub = "ku";  # default when none chosen
 my $smallClusterHub = "ku";  # default when none chosen
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 # key is original accession name from the remove.dups.list, value is 1
 my %dupAccessionList;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
@@ -182,31 +183,31 @@
 2. $HgAutomate::clusterData/\$db/chrom.sizes contains all sequence names and sizes from
    \$db.2bit.
 3. The \$db.2bit files have already been distributed to cluster-scratch
    (/scratch/hg or /iscratch, /san etc).
 4. template?
 " if ($detailed);
   print "\n";
   exit $status;
 }
 
 
 # Globals:
 # Command line args: asmId
 my ( $asmId);
 # Other:
-my ($buildDir, $secondsStart, $secondsEnd, $assemblySource);
+my ($buildDir, $secondsStart, $secondsEnd, $assemblySource, $asmReport);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'buildDir=s',
 		      'sourceDir=s',
 		      'rmskSpecies=s',
 		      'noRmsk',
 		      'ncbiRmsk',
 		      'augustusSpecies=s',
 		      'xenoRefSeq=s',
 		      'dbName=s',
 		      'noXenoRefSeq',
 		      'noAugustus',
 		      'ucscNames',
@@ -846,33 +847,30 @@
     my $agpSource = "$nonNucAsm/unlocalized_scaffolds/AGP";
     my $agpOutput = "$runDir/$asmId.nonNucUnlocalized.agp.gz";
     my $agpNames = "$runDir/$asmId.nonNucUnlocalized.names";
     my $fastaOut = "$runDir/$asmId.nonNucUnlocalized.fa.gz";
     $partsDone += 1;
     if (needsUpdate($nonNucChr2scaf, $agpOutput)) {
       unlocalizedAgp($nonNucChr2scaf, $agpSource, $agpOutput, $agpNames);
       `touch -r $nonNucChr2scaf $agpOutput`;
     }
     if (needsUpdate($twoBitFile, $fastaOut)) {
       unlocalizedFasta($nonNucChr2scaf, $twoBitFile, $fastaOut);
       `touch -r $twoBitFile $fastaOut`;
     }
   }
 
-  my $defaultName = $asmId;
-  $defaultName = $dbName if (length($dbName));
-
   $bossScript->add(<<_EOF_
 export asmId="$asmId"
 export dbName="$defaultName"
 
 if [ -s ../\$dbName.chrom.sizes ]; then
   printf "sequence step previously completed\\n" 1>&2
   exit 0
 fi
 
 _EOF_
   );
 
 ### construct sequence when no AGP files exist
   if (0 == $partsDone) {
     printf STDERR "# partsDone is zero, creating fake AGP\n";
@@ -965,31 +963,31 @@
   );
   $bossScript->execute();
 } # doSequence
 
 #########################################################################
 # * step: assemblyGap [workhorse]
 sub doAssemblyGap {
   my $runDir = "$buildDir/trackData/assemblyGap";
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "construct assembly and gap tracks from AGP file";
   my $bossScript = newBash HgRemoteScript("$runDir/doAssemblyGap.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
-export asmId=$asmId
+export asmId="$defaultName"
 
 if [ ../../\$asmId.agp.gz -nt \$asmId.assembly.bb ]; then
 
 zcat ../../\$asmId.agp.gz | grep -v "^#" | awk '\$5 != "N" && \$5 != "U"' \\
    | awk '{printf "%s\\t%d\\t%d\\t%s\\t0\\t%s\\n", \$1, \$2-1, \$3, \$6, \$9}' \\
       | sort -k1,1 -k2,2n > \$asmId.assembly.bed
 
 zcat ../../\$asmId.agp.gz | grep -v "^#" | awk '\$5 == "N" || \$5 == "U"' \\
    | awk '{printf "%s\\t%d\\t%d\\t%s\\n", \$1, \$2-1, \$3, \$8}' \\
       | sort -k1,1 -k2,2n > \$asmId.gap.bed
 
 bedToBigBed -extraIndex=name -verbose=0 \$asmId.assembly.bed \\
     ../../\$asmId.chrom.sizes \$asmId.assembly.bb
 touch -r ../../\$asmId.agp.gz \$asmId.assembly.bb
 \$HOME/kent/src/hg/utils/automation/genbank/nameToIx.pl \\
@@ -1015,31 +1013,31 @@
   $bossScript->execute();
 } # assemblyGap
 
 #########################################################################
 # * step: chromAlias [workhorse]
 sub doChromAlias {
   my $runDir = "$buildDir/trackData/chromAlias";
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "construct asmId.chromAlias.txt for alias name recognition";
   my $bossScript = newBash HgRemoteScript("$runDir/doChromAlias.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export buildDir=$buildDir
-export asmId=$asmId
+export asmId="$defaultName"
 
 \$HOME/kent/src/hg/utils/automation/asmHubChromAlias.pl \\
     \${asmId} | sort > \${asmId}.chromAlias.txt
 
 \$HOME/kent/src/hg/utils/automation/aliasTextToBed.pl \\
   -chromSizes=\$buildDir/\$asmId.chrom.sizes \\
     -aliasText=\${asmId}.chromAlias.txt \\
       -aliasBed=\${asmId}.chromAlias.bed \\
         -aliasAs=\${asmId}.chromAlias.as \\
         -aliasBigBed=\${asmId}.chromAlias.bb
 
 bigBedToBed -header \${asmId}.chromAlias.bb test.chromAlias.bed
 \$HOME/kent/src/hg/utils/automation/aliasBedToCt.pl \\
   test.chromAlias.bed \${asmId}.chromAlias.bb .
 
@@ -1076,34 +1074,34 @@
   my $photoJpg = "noPhoto";
   my $photoCredit = "noPhoto";
   my $photoLink = "";
   my $speciesNoBlank = $species;
   $speciesNoBlank =~ s/ /_/g;
   if ( -s "$runDir/../photo/$speciesNoBlank.jpg" ) {
      $photoJpg = "../photo/${speciesNoBlank}.jpg";
      $photoCredit = "../photo/photoCredits.txt";
      $photoLink = "rm -f ${speciesNoBlank}.jpg; ln -s ../photo/${speciesNoBlank}.jpg ."
   } else {
      printf STDERR "# gatewayPage: warning: no photograph available\n";
   }
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
+export asmReport="$asmReport"
 
 \$HOME/kent/src/hg/utils/automation/asmHubGatewayPage.pl \\
-     ../download/\${asmId}_assembly_report.txt \\
-       ../\${asmId}.chrom.sizes \\
+     \${asmReport} ../\${asmId}.chrom.sizes \\
          $photoJpg $photoCredit \\
            > \$asmId.description.html 2> \$asmId.names.tab
 \$HOME/kent/src/hg/utils/automation/genbank/buildStats.pl \\
        ../\$asmId.chrom.sizes 2> \$asmId.build.stats.txt
 $photoLink
 _EOF_
   );
   $bossScript->execute();
 } # gatewayPage
 
 #########################################################################
 # * step: cytoBand [workhorse]
 sub doCytoBand {
   my $runDir = "$buildDir/trackData/cytoBand";
   &HgAutomate::mustMkdir($runDir);
@@ -2012,61 +2010,64 @@
 my @partNames = split('_', $asmId);
 $ftpDir = sprintf("%s/%s/%s/%s/%s", $partNames[0],
    substr($partNames[1],0,3), substr($partNames[1],3,3),
    substr($partNames[1],6,3), $asmId);
 
 # Force debug and verbose until this is looking pretty solid:
 # $opt_debug = 1;
 # $opt_verbose = 3 if ($opt_verbose < 3);
 
 # Establish what directory we will work in.
 $buildDir = $opt_buildDir ? $opt_buildDir :
   "$HgAutomate::clusterData/asmHubs/refseqBuild/$ftpDir";
 
 $sourceDir = $opt_sourceDir ? $opt_sourceDir : $sourceDir;
 $assemblySource = $opt_sourceDir ? "$opt_sourceDir" : "$sourceDir/$ftpDir";
-my $asmReport = "$assemblySource/${asmId}_assembly_report.txt";
+$asmReport = "$assemblySource/${asmId}_assembly_report.txt";
 
 $species = $opt_species ? $opt_species : $species;
 
 if (length($species) < 1) {
   if (-s "$asmReport") {
      $species = `grep -i "organism name:" $asmReport`;
      chomp $species;
      $species =~ s/.*organism\s+name:\s+//i;
      $species =~ s/\s+\(.*//;
   } else {
      die "ERROR: no -species specified and can not find $asmReport";
   }
   if (length($species) < 1) {
      die "ERROR: no -species specified and can not find Organism name: in $asmReport";
   }
 }
 
 $dbName = $opt_dbName ? $opt_dbName : $dbName;
 $rmskSpecies = $opt_rmskSpecies ? $opt_rmskSpecies : $species;
 $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies;
 $xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq;
 $ucscNames = $opt_ucscNames ? 1 : $ucscNames;   # '1' == 'TRUE'
 $noAugustus = $opt_noAugustus ? 1 : $noAugustus;   # '1' == 'TRUE'
 $noXenoRefSeq = $opt_noXenoRefSeq ? 1 : $noXenoRefSeq;   # '1' == 'TRUE'
 $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse;
 $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub;
 $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub;
 $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer;
 $ncbiRmsk = $opt_ncbiRmsk ? 1 : 0;
 $noRmsk = $opt_noRmsk ? 1 : 0;
+$defaultName = $asmId;
+$defaultName = $dbName if (length($dbName));
+
 
 die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource);
 printf STDERR "# buildDir: %s\n", $buildDir;
 printf STDERR "# sourceDir %s\n", $sourceDir;
 if (length($dbName)) {
 printf STDERR "# dbName %s - building in /hive/data/genomes/%s\n", $dbName, $dbName;
 }
 printf STDERR "# augustusSpecies %s\n", $augustusSpecies;
 printf STDERR "# xenoRefSeq %s\n", $xenoRefSeq;
 printf STDERR "# assemblySource: %s\n", $assemblySource;
 printf STDERR "# rmskSpecies %s\n", $rmskSpecies;
 printf STDERR "# augustusSpecies %s\n", $augustusSpecies;
 printf STDERR "# ncbiRmsk %s\n", $ncbiRmsk ? "TRUE" : "FALSE";
 printf STDERR "# ucscNames %s\n", $ucscNames ? "TRUE" : "FALSE";
 printf STDERR "# noAugustus %s\n", $noAugustus ? "TRUE" : "FALSE";