9c0a3a2843c69011bd7061bc61a73c04033a8af0
hiram
  Fri Dec 6 15:20:47 2019 -0800
now building a fake AGP when there are none in the source no redmine

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index fa6f160..a6a243b 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -498,131 +498,152 @@
 
 
 #########################################################################
 # * step: sequence [workhorse]
 sub doSequence {
   my $runDir = "$buildDir/sequence";
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "process source files into 2bit sequence and agp";
   my $bossScript = newBash HgRemoteScript("$runDir/doSequence.bash", $workhorse,
 				      $runDir, $whatItDoes);
 
   my $twoBitFile = "$buildDir/download/$asmId.2bit";
   my $otherChrParts = 0;  # to see if this is unplaced scaffolds only
   my $primaryAssembly = "$buildDir/download/${asmId}_assembly_structure/Primary_Assembly";
+  my $partsDone = 0;
 
   ###########  Assembled chromosomes  ################
   my $chr2acc = "$primaryAssembly/assembled_chromosomes/chr2acc";
   if ( -s $chr2acc ) {
     ++$otherChrParts;
     my $agpSource = "$primaryAssembly/assembled_chromosomes/AGP";
     my $agpOutput = "$runDir/$asmId.chr.agp.gz";
     my $agpNames = "$runDir/$asmId.chr.names";
     my $fastaOut = "$runDir/$asmId.chr.fa.gz";
+    $partsDone += 1;
     if (needsUpdate($chr2acc, $agpOutput)) {
       compositeAgp($chr2acc, $agpSource, $agpOutput, $agpNames);
       `touch -r $chr2acc $agpOutput`;
     }
     if (needsUpdate($twoBitFile, $fastaOut)) {
       compositeFasta($chr2acc, $twoBitFile, $fastaOut);
       `touch -r $twoBitFile $fastaOut`;
     }
   }
 
   ###########  unlocalized sequence  ################
   my $chr2scaf = "$primaryAssembly/unlocalized_scaffolds/unlocalized.chr2scaf";
   if ( -s $chr2scaf ) {
     ++$otherChrParts;
     my $agpSource = "$primaryAssembly/unlocalized_scaffolds/AGP";
     my $agpOutput = "$runDir/$asmId.unlocalized.agp.gz";
     my $agpNames = "$runDir/$asmId.unlocalized.names";
     my $fastaOut = "$runDir/$asmId.unlocalized.fa.gz";
+    $partsDone += 1;
     if (needsUpdate($chr2scaf, $agpOutput)) {
       unlocalizedAgp($chr2scaf, $agpSource, $agpOutput, $agpNames);
       `touch -r $chr2scaf $agpOutput`;
     }
     if (needsUpdate($twoBitFile, $fastaOut)) {
       unlocalizedFasta($chr2scaf, $twoBitFile, $fastaOut);
       `touch -r $twoBitFile $fastaOut`;
     }
   }
 
   ###########  unplaced sequence  ################
   my $unplacedScafAgp = "$primaryAssembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
   if ( -s $unplacedScafAgp ) {
     my $chrPrefix = "";   # no prefix if no other chrom parts
     $chrPrefix = "chrUn_" if ($otherChrParts);
     my $agpOutput = "$runDir/$asmId.unplaced.agp.gz";
     my $agpNames = "$runDir/$asmId.unplaced.names";
+    $partsDone += 1;
 
     if (needsUpdate($unplacedScafAgp, $agpOutput)) {
       unplacedAgp($unplacedScafAgp, $agpOutput, $agpNames, $chrPrefix);
       `touch -r $unplacedScafAgp $agpOutput`;
     }
     my $fastaOut = "$runDir/$asmId.unplaced.fa.gz";
     if (needsUpdate($twoBitFile, $fastaOut)) {
       unplacedFasta($unplacedScafAgp, $twoBitFile, $chrPrefix, $fastaOut);
       `touch -r $twoBitFile $fastaOut`;
     }
   }
 
   ###########  non-nuclear chromosome sequence  ################
   my $nonNucAsm = "$buildDir/download/${asmId}_assembly_structure/non-nuclear";
   my $nonNucChr2acc = "$nonNucAsm/assembled_chromosomes/chr2acc";
   my $agpSource = "$nonNucAsm/assembled_chromosomes/AGP";
   if ( -s $nonNucChr2acc ) {
     my $agpOutput = "$runDir/$asmId.nonNucChr.agp.gz";
     my $agpNames = "$runDir/$asmId.nonNucChr.names";
     my $fastaOut = "$runDir/$asmId.nonNucChr.fa.gz";
+    $partsDone += 1;
     if (needsUpdate($nonNucChr2acc, $agpOutput)) {
       compositeAgp($nonNucChr2acc, $agpSource, $agpOutput, $agpNames);
       `touch -r $nonNucChr2acc $agpOutput`;
     }
     if (needsUpdate($twoBitFile, $fastaOut)) {
       compositeFasta($nonNucChr2acc, $twoBitFile, $fastaOut);
       `touch -r $twoBitFile $fastaOut`;
     }
   }
 
   ###########  non-nuclear scaffold unlocalized sequence  ################
   my $nonNucChr2scaf = "$nonNucAsm/unlocalized_scaffolds/unlocalized.chr2scaf";
   if ( -s $nonNucChr2scaf ) {
     my $agpOutput = "$runDir/$asmId.nonNucUnlocalized.agp.gz";
     my $agpNames = "$runDir/$asmId.nonNucUnlocalized.names";
     my $fastaOut = "$runDir/$asmId.nonNucUnlocalized.fa.gz";
+    $partsDone += 1;
     if (needsUpdate($nonNucChr2scaf, $agpOutput)) {
       compositeAgp($nonNucChr2scaf, $agpSource, $agpOutput, $agpNames);
       `touch -r $nonNucChr2scaf $agpOutput`;
     }
     if (needsUpdate($twoBitFile, $fastaOut)) {
       compositeFasta($nonNucChr2scaf, $twoBitFile, $fastaOut);
       `touch -r $twoBitFile $fastaOut`;
     }
   }
 
-### XXX TO BE DONE - construct sequence when no AGP files exist
-
   $bossScript->add(<<_EOF_
 export asmId="$asmId"
 
 
 if [ -s ../\$asmId.chrom.sizes ]; then
   printf "sequence step previously completed\\n" 1>&2
   exit 0
 fi
 
+_EOF_
+  );
+
+### construct sequence when no AGP files exist
+  if (0 == $partsDone) {
+printf STDERR "creating fake AGP\n";
+    $bossScript->add(<<_EOF_
+twoBitToFa ../download/\$asmId.2bit stdout | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" | gzip -c > \$asmId.fa.gz
+hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz
+zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names
+_EOF_
+    );
+} else {
+printf STDERR "partsDone: %d\n", $partsDone;
+  }
+
+  $bossScript->add(<<_EOF_
 zcat *.agp.gz | gzip > ../\$asmId.agp.gz
 faToTwoBit *.fa.gz ../\$asmId.2bit
 faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit
 twoBitDup -keyList=stdout ../\$asmId.unmasked.2bit > \$asmId.dupCheck.txt
 (grep "are identical" \$asmId.dupCheck.txt || true) > \$asmId.dups.txt
 if [ -s "\$asmId.dups.txt" ]; then
   printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\n" 1>&2
   grep "are identical" \$asmId.dupCheck.txt 1>&2
   exit 255
 else
   rm -f \$asmId.dups.txt
 fi
 touch -r ../download/\$asmId.2bit ../\$asmId.2bit
 touch -r ../download/\$asmId.2bit ../\$asmId.unmasked.2bit
 touch -r ../download/\$asmId.2bit ../\$asmId.agp.gz
@@ -880,30 +901,33 @@
 # * step: allGaps [workhorse]
 sub doAllGaps {
   my $runDir = "$buildDir/trackData/allGaps";
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "construct 'all' gap track data";
   my $bossScript = newBash HgRemoteScript("$runDir/doAllGaps.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 export buildDir=$buildDir
 
 if [ \$buildDir/\$asmId.2bit -nt \$asmId.allGaps.bb ]; then
   twoBitInfo -nBed ../../\$asmId.2bit stdout | awk '{printf "%s\\t%d\\t%d\\t%d\\t%d\\t+\\n", \$1, \$2, \$3, NR, \$3-\$2}' > \$asmId.allGaps.bed
+  if [ ! -s \$asmId.allGaps.bed ]; then
+    exit 0
+  fi
   if [ -s ../assemblyGap/\$asmId.gap.bb ]; then
     bigBedToBed ../assemblyGap/\$asmId.gap.bb \$asmId.gap.bed
     # verify the 'all' gaps should include the gap track items
     bedIntersect -minCoverage=0.0000000014 \$asmId.allGaps.bed \$asmId.gap.bed \\
       \$asmId.verify.annotated.gap.bed
     gapTrackCoverage=`awk '{print \$3-\$2}' \$asmId.gap.bed \\
       | ave stdin | grep "^total" | sed -e 's/.000000//;'`
     intersectCoverage=`ave -col=5 \$asmId.verify.annotated.gap.bed \\
       | grep "^total" | sed -e 's/.000000//;'`
     if [ \$gapTrackCoverage -ne \$intersectCoverage ]; then
       printf "ERROR: 'all' gaps does not include gap track coverage\\n" 1>&2
       printf "gapTrackCoverage: \$gapTrackCoverage != \$intersectCoverage intersection\\n" 1>&2
       exit 255
     fi
   else