06d7be056190c14b85e71bc12523f18ea6815b5e
markd
  Mon Dec 7 00:50:29 2020 -0800
BLAT mmap index support merge with master

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index b021e16..c0042d1 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -195,30 +195,32 @@
 my ( $asmId);
 # Other:
 my ($buildDir, $secondsStart, $secondsEnd, $assemblySource);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'buildDir=s',
 		      'sourceDir=s',
 		      'rmskSpecies=s',
 		      'noRmsk',
 		      'ncbiRmsk',
 		      'augustusSpecies=s',
 		      'xenoRefSeq=s',
 		      'asmHubName=s',
+		      'noXenoRefSeq',
+		      'noAugustus',
 		      'ucscNames',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
 }
 
 #########################################################################
 #########################################################################
 #  assistant subroutines here.  The 'do' steps follow this section
 #########################################################################
@@ -837,31 +839,31 @@
 printf STDERR "creating fake AGP\n";
     if ($ucscNames) {
       $bossScript->add(<<_EOF_
 twoBitToFa ../download/\$asmId.2bit stdout | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" | gzip -c > \$asmId.fa.gz
 hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz
 zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names
 _EOF_
       );
     } else {
       $bossScript->add(<<_EOF_
 twoBitToFa ../download/\$asmId.2bit stdout | gzip -c > \$asmId.fa.gz
 hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz
 twoBitInfo ../download/\$asmId.2bit stdout | cut -f1 \\
   | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" \\
     | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\$\\)/.\\1/;' \\
-      | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names
+      | awk '{printf "%s\\t%s\\n", \$1, \$2}' | sort > \$asmId.fake.names
 _EOF_
       );
     }
 } else {
 printf STDERR "partsDone: %d\n", $partsDone;
   }
 
   $bossScript->add(<<_EOF_
 zcat *.agp.gz | gzip > ../\$asmId.agp.gz
 faToTwoBit *.fa.gz ../\$asmId.2bit
 faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit
 twoBitDup ../\$asmId.unmasked.2bit > \$asmId.dups.txt
 if [ -s "\$asmId.dups.txt" ]; then
   printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\\n" 1>&2
   cat \$asmId.dups.txt 1>&2
@@ -982,31 +984,31 @@
 
   my $whatItDoes = "construct asmId.chromAlias.txt for alias name recognition";
   my $bossScript = newBash HgRemoteScript("$runDir/doChromAlias.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export buildDir=$buildDir
 export asmId=$asmId
 
 \$HOME/kent/src/hg/utils/automation/asmHubChromAlias.pl \\
     \${asmId} | sort > \${asmId}.chromAlias.txt
 # verify each sequence name has an alias
 export sizeCount=`cat ../../\${asmId}.chrom.sizes | wc -l`
 export aliasCount=`grep -v "^#" \${asmId}.chromAlias.txt | wc -l`
 if [ "\${sizeCount}" -ne "\${aliasCount}" ]; then
-  printf "ERROR: chromAlias: incorrect number of aliases %d != %d\\n" "\${sizeCount}" "\${aliasCount}" 1>&2
+  printf "ERROR: chromAlias: incorrect number of aliases chromSizes %d > %d aliasCount\\n" "\${sizeCount}" "\${aliasCount}" 1>&2
   exit 255
 fi
 
 exit 0
 
 _EOF_
   );
   $bossScript->execute();
 } # chromAlias
 
 #########################################################################
 # * step: gatewayPage [workhorse]
 sub doGatewayPage {
   if ($asmHubName eq "n/a") {
     printf STDERR "ERROR: step gatewayPage needs argument -asmHubName <name>\n";
@@ -1126,30 +1128,34 @@
 	"\nERROR: step repeatmasker may be running\n");
        exit 255;
      }
   }
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "construct repeatMasker track data";
   my $bossScript = newBash HgRemoteScript("$runDir/doRepeatMasker.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   my $rmskOpts = "";
   if ($ncbiRmsk) {
      if ( -s "$buildDir/download/${asmId}_rm.out.gz" ) {
        $rmskOpts = " \\
   -ncbiRmsk=\"$buildDir/download/${asmId}_rm.out.gz\" ";
+       if ( -s "${buildDir}/download/${asmId}.remove.dups.list" ) {
+         $rmskOpts .= " \\
+  -dupList=\"${buildDir}/download/${asmId}.remove.dups.list\" ";
+       }
        if ($ucscNames) {
          $rmskOpts .= " \\
   -liftSpec=\"$buildDir/sequence/$asmId.ncbiToUcsc.lift\"";
        }
      }
   }
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
 if [ $buildDir/\$asmId.2bit -nt faSize.rmsk.txt ]; then
 export species=`echo $rmskSpecies | sed -e 's/_/ /g;'`
 
 rm -f versionInfo.txt
 
@@ -1595,44 +1601,49 @@
        &HgAutomate::verbose(1, "# ncbiGene step previously completed\n");
        return;
      }
   }
   if (! -s "$buildDir/$asmId.faSize.txt") {
     &HgAutomate::verbose(1, "# step ncbiGene: can not find faSize.txt at:\n#  $buildDir/$asmId.faSize.txt\n");
     exit 255;
   }
 
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "translate NCBI GFF3 gene definitions into a track";
   my $bossScript = newBash HgRemoteScript("$runDir/doNcbiGene.bash",
                     $workhorse, $runDir, $whatItDoes);
 
+  my $dupList = "";
+  if ( -s "${buildDir}/download/${asmId}.remove.dups.list" ) {
+    $dupList = " | grep -v -f \"${buildDir}/download/${asmId}.remove.dups.list\" ";
+  }
+
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 export gffFile=$gffFile
 
 function cleanUp() {
   rm -f \$asmId.ncbiGene.genePred.gz \$asmId.ncbiGene.genePred
   rm -f \$asmId.geneAttrs.ncbi.txt
 }
 
 if [ \$gffFile -nt \$asmId.ncbiGene.bb ]; then
   (gff3ToGenePred -warnAndContinue -useName \\
     -attrsOut=\$asmId.geneAttrs.ncbi.txt \$gffFile stdout \\
       2>> \$asmId.ncbiGene.log.txt || true) | genePredFilter stdin stdout \\
-        | gzip -c > \$asmId.ncbiGene.genePred.gz
+        $dupList | gzip -c > \$asmId.ncbiGene.genePred.gz
   genePredCheck \$asmId.ncbiGene.genePred.gz
   export howMany=`genePredCheck \$asmId.ncbiGene.genePred.gz 2>&1 | grep "^checked" | awk '{print \$2}'`
   if [ "\${howMany}" -eq 0 ]; then
      printf "# ncbiGene: no gene definitions found in \$gffFile\n";
      cleanUp
      exit 0
   fi
   export ncbiGenePred="\$asmId.ncbiGene.genePred.gz"
 _EOF_
   );
   if ($ucscNames) {
     $bossScript->add(<<_EOF_
   liftUp -extGenePred -type=.gp stdout \\
     ../../sequence/\$asmId.ncbiToUcsc.lift warn \\
       \$asmId.ncbiGene.genePred.gz | gzip -c \\