dbdb4c9fb976266d03316b80d53c54a65784f1b9
hiram
  Thu Dec 12 13:06:05 2019 -0800
can verify the name lists have the correct number refs #23891

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index 5c33ed6..7ca3a40 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -395,74 +395,74 @@
     my ($alt_asm_name, $prim_asm_name, $alt_scaf_name, $alt_scaf_acc, $parent_type, $parent_name, $parent_acc, $region_name, $ori, $alt_scaf_start, $alt_scaf_stop, $parent_start, $parent_stop, $alt_start_tail, $alt_stop_tail) = split('\t', $line);
     my $acc = $alt_scaf_acc;
     $alt_scaf_acc = $acc;
     my $ucscName = $acc;
     if ($ucscNames) {
       $alt_scaf_acc =~ s/\./v/;
       $ucscName = sprintf("chr%s_%s_alt", $parent_name, $alt_scaf_acc);
       if ( $prim_asm_name ne "Primary Assembly" ) {
         $ucscName = sprintf("%s_alt", $alt_scaf_acc);
       }
     }
     $accToChr->{$acc} = $ucscName;
     printf STDERR "# warning: name longer than 31 characters: '%s'\n# in: '%s'\n", $ucscName, $altPlacementFile if (length($ucscName) > 31);
   }
   close (AP);
-}
+}	#	sub readAltPlacement($$)
 
 #########################################################################
 ### process one of the alternate AGP files, changing names via the nameHash
 ### and writing to the given fileHandle (fh)
 sub processAltAgp($$$) {
   my ($agpFile, $nameHash, $fh) = @_;
   open (AG, "zcat $agpFile|") or die "can not read $agpFile";
   while (my $line = <AG>) {
     next if ($line =~ m/^#/);
     chomp $line;
     my (@a) = split('\t', $line);
     my $ncbiAcc = $a[0];
     my $ucscName = $nameHash->{$ncbiAcc};
     printf $fh "%s", $ucscName;  # begin AGP line with accession nam
     for (my $i = 1; $i < scalar(@a); ++$i) {  # the reset of the AGP line
       printf $fh "\t%s", $a[$i];
     }
     printf $fh "\n";
   }
   close (AG);
-}
+}	#	sub processAltAgp($$$)
 
 #########################################################################
 ### process one of the alternate FASTA files, changing names via the nameHash
 ### and writing to the given fileHandle (fh)
 
 sub processAltFasta($$$) {
   my ($faFile, $nameHash, $fh) = @_;
   open (FF, "zcat $faFile|") or die "can not read $faFile";
   while (my $line = <FF>) {
     if ($line =~ m/^>/) {
       chomp $line;
       $line =~ s/^>//;
       $line =~ s/ .*//;
       die "ERROR: can not find accession $line in nameHash" if (! exists($nameHash->{$line}));
       my $ucscName = $nameHash->{$line};
       printf $fh ">%s\n", $ucscName;
     } else {
       print $fh $line;
     }
   }
   close (FF);
-}
+}	#	sub processAltFasta($$$)
 
 #########################################################################
 # there are alternate sequences, process their multiple AGP and FASTA files
 # into single AGP and FASTA files
 sub altSequences($) {
   my $runDir = "$buildDir/sequence";
   my ($assemblyStructure) = @_;
   # construct the name correspondence hash #################################
   my %accToChr;	# hash for accession name to browser chromosome name
   open (FH, "find -L '${assemblyStructure}' -type f | grep '/alt_scaffolds/alt_scaffold_placement.txt'|") or die "can not find alt_scaffolds in ${assemblyStructure}";;
   while (my $line = <FH>) {
    chomp $line;
 ###   printf "%s\n", $line;
    readAltPlacement($line, \%accToChr);
   }
@@ -483,31 +483,31 @@
     printf STDERR "%s\n", $agpFile;
   }
   close (FH);
   close (AGP);
   # process the FASTA files, writing the fastaOut file ######################
   my $fastaOut = "$runDir/$asmId.alt.fa.gz";
   open (FA, "|gzip -c > $fastaOut") or die "can not write to $fastaOut";
   open (FH, "find -L '${assemblyStructure}' -type f | grep '/alt_scaffolds/FASTA/alt.scaf.fna.gz'|") or die "can not find alt.scaf.fna.gz in ${assemblyStructure}";;
   while (my $faFile = <FH>) {
     chomp $faFile;
     processAltFasta($faFile, \%accToChr, \*FA);
     printf STDERR "%s\n", $faFile;
   }
   close (FH);
   close (FA);
-}
+}	#	sub altSequences($)
 
 #########################################################################
 # process NCBI unplaced AGP file, perhaps translate into UCSC naming scheme
 #   the agpNames result file is a naming correspondence file for later use
 #   optional chrPrefix can be "chrUn_" for assemblies that have other chr names
 sub unplacedAgp($$$$) {
   my ($agpFile, $agpOutput, $agpNames, $chrPrefix) = @_;
 
   open (AGP, "|gzip -c >$agpOutput") or die "can not write to $agpOutput";
   open (NAMES, "|sort -u >$agpNames") or die "can not write to $agpNames";
   open (FH, "zcat $agpFile|") or die "can not read $agpFile";
   while (my $line = <FH>) {
     if ($line =~ m/^#/) {
         print AGP $line;
     } else {
@@ -776,31 +776,42 @@
 export newTotal=`ave -col=2 ../\$asmId.chrom.sizes | grep "^total"`
 export oldTotal=`ave -col=2 source.\$asmId.chrom.sizes | grep "^total"`
 if [ "\$newTotal" != "\$oldTotal" ]; then
   printf "# ERROR: sequence construction error: not same totals source vs. new:\n" 1>&2
   printf "# \$newTotal != \$oldTotal\n" 1>&2
   exit 255
 fi
 rm source.\$asmId.chrom.sizes
 export checkAgp=`checkAgpAndFa ../\$asmId.agp.gz ../\$asmId.2bit 2>&1 | tail -1`
 if [ "\$checkAgp" != "All AGP and FASTA entries agree - both files are valid" ]; then
   printf "# ERROR: checkAgpAndFa \$asmId.agp.gz \$asmId.2bit failing\n" 1>&2
   exit 255
 fi
 join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$3,\$2,\$1,\$2}' > \$asmId.ncbiToUcsc.lift
 join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$1,\$2,\$3,\$2}' > \$asmId.ucscToNcbi.lift
-
+export c0=`cat \$asmId.ncbiToUcsc.lift | wc -l`
+export c1=`cat \$asmId.ucscToNcbi.lift | wc -l`
+export c2=`cat ../\$asmId.chrom.sizes | wc -l`
+# verify all names are accounted for
+if [ "\$c0" -ne "\$c2 ]; then
+  printf "# ERROR: not all names accounted for in \$asmId.ncbiToUcsc.lift" 1>&2
+  exit 255
+fi
+if [ "\$c1" -ne "\$c2 ]; then
+  printf "# ERROR: not all names accounted for in \$asmId.ucscToNcbi.lift" 1>&2
+  exit 255
+fi
 twoBitToFa ../\$asmId.2bit stdout | faCount stdin | gzip -c > \$asmId.faCount.txt.gz
 touch -r ../\$asmId.2bit \$asmId.faCount.txt.gz
 zgrep -P "^total\t" \$asmId.faCount.txt.gz > \$asmId.faCount.signature.txt
 touch -r ../\$asmId.2bit \$asmId.faCount.signature.txt
 _EOF_
   );
   $bossScript->execute();
 } # doSequence
 
 #########################################################################
 # * step: assemblyGap [workhorse]
 sub doAssemblyGap {
   my $runDir = "$buildDir/trackData/assemblyGap";
   &HgAutomate::mustMkdir($runDir);