c2d8f10c6d0ce5ebb68e8a76c5d9a2e94ab3dcd8
hiram
  Tue Dec 28 15:51:34 2021 -0800
correct ncbiGene procedure and correctly run up unplaced contig assembly when no AGP present no redmine

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index 74da44a..cb93b70 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -585,72 +585,100 @@
           $line =~ s/\./v/;
           printf AGP "%s%s", $chrPrefix, $line;
         } else {
           printf AGP "%s", $line;
         }
     }
   }
   close (FH);
   close (NAMES);
   close (AGP);
 }	# sub unplacedAgp($$$$)
 
 #########################################################################
 # process NCBI unplaced FASTA file, perhaps translate into UCSC naming scheme
 #   optional chrPrefix can be "chrUn_" for assemblies that have other chr names
-sub unplacedFasta($$$$) {
-  my ($agpFile, $twoBitFile, $chrPrefix, $fastaOut) = @_;
+sub unplacedFasta($$$$$$$) {
+  my ($agpFile, $faFile, $twoBitFile, $chrPrefix, $fastaOut, $agpOutput, $agpNames) = @_;
 
   my %contigName;  # key is NCBI contig name, value is UCSC contig name
+  if ( -s $agpFile ) {
     open (FH, "zcat $agpFile|") or die "can not read $agpFile";
     while (my $line = <FH>) {
       if ($line !~ m/^#/) {
           my ($ncbiAcc, undef) = split('\s+', $line, 2);
           my $ucscAcc = $ncbiAcc;
           $ucscAcc =~ s/\./v/ if ($ucscNames);
           $contigName{$ncbiAcc} = $ucscAcc;
       }
     }
     close (FH);
+  } else {	# no AGP file, get the contig names from the fasta file
+    open (FH, "zgrep '^>' $faFile | cut -d' ' -f1 | tr -d '>'|") or die "can not read $faFile";
+    while (my $ncbiAcc = <FH>) {
+      chomp $ncbiAcc;
+      my $ucscAcc = $ncbiAcc;
+      $ucscAcc =~ s/\./v/ if ($ucscNames);
+      $contigName{$ncbiAcc} = $ucscAcc;
+    }
+    close (FH);
+  }
 
   my ($fh, $tmpFile) = tempfile("seqList_XXXX", DIR=>'/dev/shm', SUFFIX=>'.txt', UNLINK=>1);
   foreach my $ctg (sort keys %contigName) {
      if (! exists($dupAccessionList{$ctg})) {
        printf $fh "%s\n", $ctg;
      }
   }
   close $fh;
 
   open (FA, "|gzip -c > $fastaOut") or die "can not write to $fastaOut";
   open (FH, "twoBitToFa -seqList=$tmpFile $twoBitFile stdout|") or die "can not twoBitToFa $twoBitFile";
   while (my $line = <FH>) {
     if ($line =~ m/^>/) {
       chomp $line;
       $line =~ s/^>//;
       $line =~ s/ .*//;
       die "ERROR: twoBitToFa $twoBitFile returns unknown acc $line" if (! exists($contigName{$line}));
       if ($ucscNames) {
         printf FA ">%s%s\n", $chrPrefix, $contigName{$line};
       } else {
         printf FA ">%s\n", $contigName{$line};
       }
     } else {
       print FA $line;
     }
   }
   close(FH);
   close(FA);
+  if ( ! -s $agpFile ) {	# there was no AGP file, make a fake one
+     printf STDERR "# no AGP for unplaced sequence, making up a fake AGP\n";
+     print `hgFakeAgp -singleContigs -minContigGap=1 -minScaffoldGap=50000 $fastaOut stdout | gzip -c > $agpOutput`;
+     open (NAMES, "|sort -u >$agpNames") or die "can not write to $agpNames";
+     open (FH, "zcat $agpOutput|") or die "can not read $agpOutput";
+     while (my $line = <FH>) {
+       if ($line !~ m/^#/) {
+          my ($ncbiAcc, undef) = split('\s+', $line, 2);
+          next if (exists($dupAccessionList{$ncbiAcc}));
+          my $ucscAcc = $ncbiAcc;
+          $ucscAcc =~ s/\./v/;
+          printf NAMES "%s%s\t%s\n", $chrPrefix, $ucscAcc, $ncbiAcc;
+       }
+     }
+    close (FH);
+    close (NAMES);
+  }
 }	# sub unplacedFasta($$$$)
 
 #########################################################################
 #########################################################################
 # do Steps section
 #########################################################################
 #########################################################################
 # * step: download [workhorse]
 sub doDownload {
   my $runDir = "$buildDir/download";
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "setup work hierarchy of sym links to source files in\n\t$runDir/";
   my $bossScript = newBash HgRemoteScript("$runDir/doDownload.bash", $workhorse,
 				      $runDir, $whatItDoes);
@@ -758,47 +786,52 @@
     }
   }
 
   ###########  alternate sequences  ##############
   my $assemblyStructure = "$buildDir/download/${asmId}_assembly_structure";
   my $altCount = `find -L "${assemblyStructure}" -type f | grep "/alt_scaffolds/alt_scaffold_placement.txt" | wc -l`;
   chomp $altCount;
   if ($altCount > 0) {
     $partsDone += 1;
     ++$otherChrParts;
     altSequences($assemblyStructure);
   }
 
   ###########  unplaced sequence  ################
   my $unplacedScafAgp = "$primaryAssembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
-  if ( -s $unplacedScafAgp ) {
+  my $unplacedScafFa = "$primaryAssembly/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz";
+  if ( -s $unplacedScafAgp || -s $unplacedScafFa) {
     my $chrPrefix = "";   # no prefix if no other chrom parts
     $chrPrefix = "chrUn_" if ($otherChrParts && ! $ucscNames);
     my $agpOutput = "$runDir/$asmId.unplaced.agp.gz";
     my $agpNames = "$runDir/$asmId.unplaced.names";
     $partsDone += 1;
 
+    if ( -s $unplacedScafAgp ) {
       if (needsUpdate($unplacedScafAgp, $agpOutput)) {
         unplacedAgp($unplacedScafAgp, $agpOutput, $agpNames, $chrPrefix);
         `touch -r $unplacedScafAgp $agpOutput`;
       }
+    }
+    if ( -s $unplacedScafFa ) {	# will make an AGP file if there was none
       my $fastaOut = "$runDir/$asmId.unplaced.fa.gz";
       if (needsUpdate($twoBitFile, $fastaOut)) {
-      unplacedFasta($unplacedScafAgp, $twoBitFile, $chrPrefix, $fastaOut);
+        unplacedFasta($unplacedScafAgp, $unplacedScafFa, $twoBitFile, $chrPrefix, $fastaOut, $agpOutput, $agpNames);
         `touch -r $twoBitFile $fastaOut`;
       }
     }
+  }
 
   ###########  non-nuclear chromosome sequence  ################
   my $nonNucAsm = "$buildDir/download/${asmId}_assembly_structure/non-nuclear";
   my $nonNucChr2acc = "$nonNucAsm/assembled_chromosomes/chr2acc";
   if ( -s $nonNucChr2acc ) {
     my $agpSource = "$nonNucAsm/assembled_chromosomes/AGP";
     my $agpOutput = "$runDir/$asmId.nonNucChr.agp.gz";
     my $agpNames = "$runDir/$asmId.nonNucChr.names";
     my $fastaOut = "$runDir/$asmId.nonNucChr.fa.gz";
     $partsDone += 1;
     if (needsUpdate($nonNucChr2acc, $agpOutput)) {
       compositeAgp($nonNucChr2acc, $agpSource, $agpOutput, $agpNames);
       `touch -r $nonNucChr2acc $agpOutput`;
     }
     if (needsUpdate($twoBitFile, $fastaOut)) {
@@ -827,42 +860,42 @@
 
   $bossScript->add(<<_EOF_
 export asmId="$asmId"
 
 
 if [ -s ../\$asmId.chrom.sizes ]; then
   printf "sequence step previously completed\\n" 1>&2
   exit 0
 fi
 
 _EOF_
   );
 
 ### construct sequence when no AGP files exist
   if (0 == $partsDone) {
-printf STDERR "creating fake AGP\n";
+    printf STDERR "# partsDone is zero, creating fake AGP\n";
     if ($ucscNames) {
       $bossScript->add(<<_EOF_
 twoBitToFa ../download/\$asmId.2bit stdout | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" | gzip -c > \$asmId.fa.gz
-hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz
+hgFakeAgp -singleContigs -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz
 zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names
 _EOF_
       );
     } else {
       $bossScript->add(<<_EOF_
 twoBitToFa ../download/\$asmId.2bit stdout | gzip -c > \$asmId.fa.gz
-hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz
+hgFakeAgp -singleContigs -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz
 twoBitInfo ../download/\$asmId.2bit stdout | cut -f1 \\
   | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" \\
     | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\$\\)/.\\1/;' \\
       | awk '{printf "%s\\t%s\\n", \$1, \$2}' | sort > \$asmId.fake.names
 _EOF_
       );
     }
 } else {
 printf STDERR "partsDone: %d\n", $partsDone;
   }
 
   $bossScript->add(<<_EOF_
 zcat *.agp.gz | gzip > ../\$asmId.agp.gz
 faToTwoBit *.fa.gz ../\$asmId.2bit
 faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit
@@ -1639,31 +1672,31 @@
      }
   }
   if (! -s "$buildDir/$asmId.faSize.txt") {
     &HgAutomate::verbose(1, "# step ncbiGene: can not find faSize.txt at:\n#  $buildDir/$asmId.faSize.txt\n");
     exit 255;
   }
 
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "translate NCBI GFF3 gene definitions into a track";
   my $bossScript = newBash HgRemoteScript("$runDir/doNcbiGene.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   my $dupList = "";
   if ( -s "${buildDir}/download/${asmId}.remove.dups.list" ) {
-    $dupList = " | grep -v -f \"${buildDir}/download/${asmId}.remove.dups.list\"  || true";
+    $dupList = " | (grep -v -f \"${buildDir}/download/${asmId}.remove.dups.list\"  || true)";
   }
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 export gffFile=$gffFile
 
 function cleanUp() {
   rm -f \$asmId.ncbiGene.genePred.gz \$asmId.ncbiGene.genePred
   rm -f \$asmId.geneAttrs.ncbi.txt
 }
 
 if [ \$gffFile -nt \$asmId.ncbiGene.bb ]; then
   (gff3ToGenePred -warnAndContinue -useName \\
     -attrsOut=\$asmId.geneAttrs.ncbi.txt \$gffFile stdout \\
       2>> \$asmId.ncbiGene.log.txt || true) | genePredFilter \\