249f1c6c7231ede3da49dd949eaffb491e5d48cf
hiram
  Fri Dec 23 08:39:41 2022 -0800
correct processing of alternatie and patch sequences no redmine

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index 1a77c0f..289b29e 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -457,85 +457,96 @@
     }
   }
   close(FH);
   close(FA);
 }	# sub unlocalizedFasta($$$)
 
 #########################################################################
 # read alt_scaffold_placement file, return name correspondence in
 # given hash pointer
 sub readAltPlacement($$) {
   my ($altPlacementFile, $accToChr) = @_;
   open (AP, "<$altPlacementFile") or die "can not read $altPlacementFile";
   while (my $line = <AP>) {
     chomp $line;
     next if ($line =~ m/^#/);
+    my $fixAlt = "alt";
+    $fixAlt = "fix" if ($altPlacementFile =~ m/PATCH/);
     my ($alt_asm_name, $prim_asm_name, $alt_scaf_name, $alt_scaf_acc, $parent_type, $parent_name, $parent_acc, $region_name, $ori, $alt_scaf_start, $alt_scaf_stop, $parent_start, $parent_stop, $alt_start_tail, $alt_stop_tail) = split('\t', $line);
     my $acc = $alt_scaf_acc;
     $alt_scaf_acc = $acc;
     my $ucscName = $acc;
-    if ($ucscNames) {
+    # should always be calculating UCSC names here, the resulting names
+    # can be used for either naming scheme later in AGP and FA processing
     $alt_scaf_acc =~ s/\./v/;
-      $ucscName = sprintf("chr%s_%s_alt", $parent_name, $alt_scaf_acc);
+    $ucscName = sprintf("chr%s_%s_%s", $parent_name, $alt_scaf_acc, $fixAlt);
     if ( $prim_asm_name ne "Primary Assembly" ) {
-        $ucscName = sprintf("%s_alt", $alt_scaf_acc);
-      }
+      $ucscName = sprintf("%s_%s", $alt_scaf_acc, $fixAlt);
     }
     $accToChr->{$acc} = $ucscName;
     printf STDERR "# warning: name longer than 31 characters: '%s'\n# in: '%s'\n", $ucscName, $altPlacementFile if (length($ucscName) > 31);
   }
   close (AP);
 }	#	sub readAltPlacement($$)
 
 #########################################################################
 ### process one of the alternate AGP files, changing names via the nameHash
 ### and writing to the given fileHandle (fh)
 sub processAltAgp($$$) {
   my ($agpFile, $nameHash, $fh) = @_;
   open (AG, "zcat $agpFile|") or die "can not read $agpFile";
   while (my $line = <AG>) {
     next if ($line =~ m/^#/);
     chomp $line;
     my (@a) = split('\t', $line);
     my $ncbiAcc = $a[0];
     next if (exists($dupAccessionList{$ncbiAcc}));
     my $ucscName = $nameHash->{$ncbiAcc};
+    if ($ucscNames) {
       printf $fh "%s", $ucscName;  # begin AGP line with accession nam
+    } else {
+      printf $fh "%s", $ncbiAcc;  # begin AGP line with accession nam
+    }
     for (my $i = 1; $i < scalar(@a); ++$i) {  # the reset of the AGP line
       printf $fh "\t%s", $a[$i];
     }
     printf $fh "\n";
   }
   close (AG);
 }	#	sub processAltAgp($$$)
 
 #########################################################################
 ### process one of the alternate FASTA files, changing names via the nameHash
 ### and writing to the given fileHandle (fh)
 
 sub processAltFasta($$$) {
   my ($faFile, $nameHash, $fh) = @_;
   open (FF, "zcat $faFile|") or die "can not read $faFile";
   while (my $line = <FF>) {
     if ($line =~ m/^>/) {
       chomp $line;
-      $line =~ s/^>//;
-      $line =~ s/ .*//;
-      die "ERROR: can not find accession $line in nameHash" if (! exists($nameHash->{$line}));
-      my $ucscName = $nameHash->{$line};
+      my $ncbiAcc = $line;
+      $ncbiAcc =~ s/^>//;
+      $ncbiAcc =~ s/ .*//;
+      die "ERROR: can not find accession $ncbiAcc in nameHash" if (! exists($nameHash->{$ncbiAcc}));
+      my $ucscName = $nameHash->{$ncbiAcc};
+      if ($ucscNames) {
         printf $fh ">%s\n", $ucscName;
       } else {
+        printf $fh ">%s\n", $ncbiAcc;
+      }
+    } else {
       print $fh $line;
     }
   }
   close (FF);
 }	#	sub processAltFasta($$$)
 
 #########################################################################
 # there are alternate sequences, process their multiple AGP and FASTA files
 # into single AGP and FASTA files
 sub altSequences($) {
   my $runDir = "$buildDir/sequence";
   my ($assemblyStructure) = @_;
   # construct the name correspondence hash #################################
   my %accToChr;	# hash for accession name to browser chromosome name
   open (FH, "find -L '${assemblyStructure}' -type f | grep '/alt_scaffolds/alt_scaffold_placement.txt'|") or die "can not find alt_scaffolds in ${assemblyStructure}";;