663d3c8c2095473aeb96b0d24849a2b793795d40
hiram
  Wed Dec 11 14:00:57 2019 -0800
now correctly processin alternate haplotype and patch sequences refs #23891

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index a6a243b..5c33ed6 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -371,31 +371,143 @@
       die "ERROR: twoBitToFa $twoBitFile returns unknown acc $line" if (! exists($accToChr{$line}));
       my $chrN = $accToChr{$line};
       my $acc = $line;
       $acc =~ s/\./v/ if ($ucscNames);
       my $ucscName = "${acc}";
       $ucscName = "chr${chrN}_${acc}_random" if ($ucscNames);
       printf FA ">%s\n", $ucscName;
     } else {
       print FA $line;
     }
   }
   close(FH);
   close(FA);
 }	# sub unlocalizedFasta($$$)
 
-### XXX TO BE DONE - process alternate haplotypes via alt.scaf.agp.gz
+#########################################################################
+# read alt_scaffold_placement file, return name correspondence in
+# given hash pointer
+sub readAltPlacement($$) {
+  my ($altPlacementFile, $accToChr) = @_;
+  open (AP, "<$altPlacementFile") or die "can not read $altPlacementFile";
+  while (my $line = <AP>) {
+    chomp $line;
+    next if ($line =~ m/^#/);
+    my ($alt_asm_name, $prim_asm_name, $alt_scaf_name, $alt_scaf_acc, $parent_type, $parent_name, $parent_acc, $region_name, $ori, $alt_scaf_start, $alt_scaf_stop, $parent_start, $parent_stop, $alt_start_tail, $alt_stop_tail) = split('\t', $line);
+    my $acc = $alt_scaf_acc;
+    $alt_scaf_acc = $acc;
+    my $ucscName = $acc;
+    if ($ucscNames) {
+      $alt_scaf_acc =~ s/\./v/;
+      $ucscName = sprintf("chr%s_%s_alt", $parent_name, $alt_scaf_acc);
+      if ( $prim_asm_name ne "Primary Assembly" ) {
+        $ucscName = sprintf("%s_alt", $alt_scaf_acc);
+      }
+    }
+    $accToChr->{$acc} = $ucscName;
+    printf STDERR "# warning: name longer than 31 characters: '%s'\n# in: '%s'\n", $ucscName, $altPlacementFile if (length($ucscName) > 31);
+  }
+  close (AP);
+}
+
+#########################################################################
+### process one of the alternate AGP files, changing names via the nameHash
+### and writing to the given fileHandle (fh)
+sub processAltAgp($$$) {
+  my ($agpFile, $nameHash, $fh) = @_;
+  open (AG, "zcat $agpFile|") or die "can not read $agpFile";
+  while (my $line = <AG>) {
+    next if ($line =~ m/^#/);
+    chomp $line;
+    my (@a) = split('\t', $line);
+    my $ncbiAcc = $a[0];
+    my $ucscName = $nameHash->{$ncbiAcc};
+    printf $fh "%s", $ucscName;  # begin AGP line with accession nam
+    for (my $i = 1; $i < scalar(@a); ++$i) {  # the reset of the AGP line
+      printf $fh "\t%s", $a[$i];
+    }
+    printf $fh "\n";
+  }
+  close (AG);
+}
+
+#########################################################################
+### process one of the alternate FASTA files, changing names via the nameHash
+### and writing to the given fileHandle (fh)
+
+sub processAltFasta($$$) {
+  my ($faFile, $nameHash, $fh) = @_;
+  open (FF, "zcat $faFile|") or die "can not read $faFile";
+  while (my $line = <FF>) {
+    if ($line =~ m/^>/) {
+      chomp $line;
+      $line =~ s/^>//;
+      $line =~ s/ .*//;
+      die "ERROR: can not find accession $line in nameHash" if (! exists($nameHash->{$line}));
+      my $ucscName = $nameHash->{$line};
+      printf $fh ">%s\n", $ucscName;
+    } else {
+      print $fh $line;
+    }
+  }
+  close (FF);
+}
+
+#########################################################################
+# there are alternate sequences, process their multiple AGP and FASTA files
+# into single AGP and FASTA files
+sub altSequences($) {
+  my $runDir = "$buildDir/sequence";
+  my ($assemblyStructure) = @_;
+  # construct the name correspondence hash #################################
+  my %accToChr;	# hash for accession name to browser chromosome name
+  open (FH, "find -L '${assemblyStructure}' -type f | grep '/alt_scaffolds/alt_scaffold_placement.txt'|") or die "can not find alt_scaffolds in ${assemblyStructure}";;
+  while (my $line = <FH>) {
+   chomp $line;
+###   printf "%s\n", $line;
+   readAltPlacement($line, \%accToChr);
+  }
+  close (FH);
+  # and write to alt.names
+  open (AN, ">$runDir/$asmId.alt.names");
+  foreach my $acc (sort keys %accToChr) {
+    printf AN "%s\t%s\n", $accToChr{$acc}, $acc;
+  }
+  close (AN);
+  # process the AGP files, writing the agpOutput file ######################
+  my $agpOutput = "$runDir/$asmId.alt.agp.gz";
+  open (AGP, "|gzip -c > $agpOutput") or die "can not write to $agpOutput";
+  open (FH, "find -L '${assemblyStructure}' -type f | grep '/alt_scaffolds/AGP/alt.scaf.agp.gz'|") or die "can not find alt.scaf.agp.gz in ${assemblyStructure}";;
+  while (my $agpFile = <FH>) {
+    chomp $agpFile;
+    processAltAgp($agpFile, \%accToChr, \*AGP);
+    printf STDERR "%s\n", $agpFile;
+  }
+  close (FH);
+  close (AGP);
+  # process the FASTA files, writing the fastaOut file ######################
+  my $fastaOut = "$runDir/$asmId.alt.fa.gz";
+  open (FA, "|gzip -c > $fastaOut") or die "can not write to $fastaOut";
+  open (FH, "find -L '${assemblyStructure}' -type f | grep '/alt_scaffolds/FASTA/alt.scaf.fna.gz'|") or die "can not find alt.scaf.fna.gz in ${assemblyStructure}";;
+  while (my $faFile = <FH>) {
+    chomp $faFile;
+    processAltFasta($faFile, \%accToChr, \*FA);
+    printf STDERR "%s\n", $faFile;
+  }
+  close (FH);
+  close (FA);
+}
 
 #########################################################################
 # process NCBI unplaced AGP file, perhaps translate into UCSC naming scheme
 #   the agpNames result file is a naming correspondence file for later use
 #   optional chrPrefix can be "chrUn_" for assemblies that have other chr names
 sub unplacedAgp($$$$) {
   my ($agpFile, $agpOutput, $agpNames, $chrPrefix) = @_;
 
   open (AGP, "|gzip -c >$agpOutput") or die "can not write to $agpOutput";
   open (NAMES, "|sort -u >$agpNames") or die "can not write to $agpNames";
   open (FH, "zcat $agpFile|") or die "can not read $agpFile";
   while (my $line = <FH>) {
     if ($line =~ m/^#/) {
         print AGP $line;
     } else {
@@ -538,30 +650,40 @@
     my $agpSource = "$primaryAssembly/unlocalized_scaffolds/AGP";
     my $agpOutput = "$runDir/$asmId.unlocalized.agp.gz";
     my $agpNames = "$runDir/$asmId.unlocalized.names";
     my $fastaOut = "$runDir/$asmId.unlocalized.fa.gz";
     $partsDone += 1;
     if (needsUpdate($chr2scaf, $agpOutput)) {
       unlocalizedAgp($chr2scaf, $agpSource, $agpOutput, $agpNames);
       `touch -r $chr2scaf $agpOutput`;
     }
     if (needsUpdate($twoBitFile, $fastaOut)) {
       unlocalizedFasta($chr2scaf, $twoBitFile, $fastaOut);
       `touch -r $twoBitFile $fastaOut`;
     }
   }
 
+  ###########  alternate sequences  ##############
+  my $assemblyStructure = "$buildDir/download/${asmId}_assembly_structure";
+  my $altCount = `find -L "${assemblyStructure}" -type f | grep "/alt_scaffolds/alt_scaffold_placement.txt" | wc -l`;
+  chomp $altCount;
+  if ($altCount > 0) {
+    $partsDone += 1;
+    ++$otherChrParts;
+    altSequences($assemblyStructure);
+  }
+
   ###########  unplaced sequence  ################
   my $unplacedScafAgp = "$primaryAssembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
   if ( -s $unplacedScafAgp ) {
     my $chrPrefix = "";   # no prefix if no other chrom parts
     $chrPrefix = "chrUn_" if ($otherChrParts);
     my $agpOutput = "$runDir/$asmId.unplaced.agp.gz";
     my $agpNames = "$runDir/$asmId.unplaced.names";
     $partsDone += 1;
 
     if (needsUpdate($unplacedScafAgp, $agpOutput)) {
       unplacedAgp($unplacedScafAgp, $agpOutput, $agpNames, $chrPrefix);
       `touch -r $unplacedScafAgp $agpOutput`;
     }
     my $fastaOut = "$runDir/$asmId.unplaced.fa.gz";
     if (needsUpdate($twoBitFile, $fastaOut)) {