4da1a14630708d887c3481e3b16df5044c86c59a
hiram
  Tue Oct 13 11:16:59 2020 -0700
better batch size for xenoRefGene and fixup to handle assemblies without repeat masker and RM can not run on those species refs #26347

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index 2a6f7b4..b64d4bd 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -17,30 +17,31 @@
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
     $opt_sourceDir
     $opt_species
     $opt_rmskSpecies
     $opt_ncbiRmsk
+    $opt_noRmsk
     $opt_augustusSpecies
     $opt_xenoRefSeq
     $opt_ucscNames
     $opt_asmHubName
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'download',   func => \&doDownload },
       { name => 'sequence',   func => \&doSequence },
       { name => 'assemblyGap',   func => \&doAssemblyGap },
       { name => 'chromAlias',   func => \&doChromAlias },
       { name => 'gatewayPage',   func => \&doGatewayPage },
       { name => 'cytoBand',   func => \&doCytoBand },
       { name => 'gc5Base',   func => \&doGc5Base },
@@ -56,34 +57,35 @@
       { name => 'ncbiGene',   func => \&doNcbiGene },
       { name => 'ncbiRefSeq',   func => \&doNcbiRefSeq },
       { name => 'xenoRefGene',   func => \&doXenoRefGene },
       { name => 'augustus',   func => \&doAugustus },
       { name => 'trackDb',   func => \&doTrackDb },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $sourceDir = "/hive/data/outside/ncbi/genomes";
 my $species = "";       # usually found in asmId_assembly_report.txt
 my $ftpDir = "";	# will be determined from given asmId
 my $rmskSpecies = "";
+my $noRmsk = 0;	# when RepeatMasker is not possible, such as bacteria
 my $ncbiRmsk = 0;	# when =1 call doRepeatMasker.pl
                         # with -ncbiRmsk=path.out.gz and -liftSpec=...
 my $augustusSpecies = "human";
-my $xenoRefSeq = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq";
+my $xenoRefSeq = "/hive/data/genomes/asmHubs/xenoRefSeq";
 my $ucscNames = 0;  # default 'FALSE' (== 0)
 my $asmHubName = "n/a";  # directory name in: /gbdb/hubs/asmHubName
 my $workhorse = "hgwdev";  # default workhorse when none chosen
 my $fileServer = "hgwdev";  # default when none chosen
 my $bigClusterHub = "ku";  # default when none chosen
 my $smallClusterHub = "ku";  # default when none chosen
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 # key is original accession name from the remove.dups.list, value is 1
 my %dupAccessionList;
 
 sub usage {
   # Usage / help / self-documentation:
@@ -100,30 +102,31 @@
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir     Construct assembly hub in dir instead of default
        $HgAutomate::clusterData/asmHubs/refseqBuild/GC[AF]/123/456/789/asmId/
     -sourceDir dir    Find assembly in dir instead of default:
        $sourceDir/GC[AF]/123/456/789/asmId
     -ucscNames        Translate NCBI/INSDC/RefSeq names to UCSC names
                       default is to use the given NCBI/INSDC/RefSeq names
     -asmHubName <name>  directory name in: /gbdb/hubs/asmHubName
     -species <name>   use this species designation if there is no
                       asmId_assembly_report.txt with an
                       'Organism name:' entry to obtain species
     -rmskSpecies <name> to override default 'species' name for repeat masker
                       the default is found in the asmId_asssembly_report.txt
                       e.g. -rmskSpecies=viruses
+    -noRmsk           when RepeatMasker is not possible, such as bacteria
     -ncbiRmsk         use NCBI rm.out.gz file instead of local cluster run
                       for repeat masking
     -augustusSpecies <human|chicken|zebrafish> default 'human'
     -xenoRefSeq </path/to/xenoRefSeqMrna> - location of xenoRefMrna.fa.gz
                 expanded directory of mrnas/ and xenoRefMrna.sizes, default
                 $xenoRefSeq
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => $workhorse,
 						'fileServer' => $fileServer,
 						'bigClusterHub' => $bigClusterHub,
 						'smallClusterHub' => $smallClusterHub);
   print STDERR "
 Automates build of assembly hub.  Steps:
@@ -181,30 +184,31 @@
 }
 
 
 # Globals:
 # Command line args: asmId
 my ( $asmId);
 # Other:
 my ($buildDir, $secondsStart, $secondsEnd, $assemblySource);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'buildDir=s',
 		      'sourceDir=s',
 		      'rmskSpecies=s',
+		      'noRmsk',
 		      'ncbiRmsk',
 		      'augustusSpecies=s',
 		      'xenoRefSeq=s',
 		      'asmHubName=s',
 		      'ucscNames',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
 }
 
@@ -643,32 +647,36 @@
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
 if [ ! -s \${asmId}.2bit -o \${asmId}_genomic.fna.gz -nt \$asmId.2bit ]; then
   rm -f \${asmId}_genomic.fna.gz \\
     \${asmId}_genomic.fna.dups.gz \\
     \${asmId}_assembly_report.txt \\
     \${asmId}_rm.out.gz \\
     \${asmId}_rm.run \\
     \${asmId}_assembly_structure \\
     \$asmId.2bit
 
   ln -s $assemblySource/\${asmId}_genomic.fna.gz .
   ln -s $assemblySource/\${asmId}_assembly_report.txt .
+  if [ -s $assemblySource/\${asmId}_rm.out.gz ]; then
     ln -s $assemblySource/\${asmId}_rm.out.gz .
+  fi
+  if [ -s $assemblySource/\${asmId}_rm.run ]; then
     ln -s $assemblySource/\${asmId}_rm.run .
+  fi
   if [ -d $assemblySource/\${asmId}_assembly_structure ]; then
     ln -s $assemblySource/\${asmId}_assembly_structure .
   fi
   faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit
   twoBitDup \$asmId.2bit > \$asmId.dups.txt
   if [ -s "\$asmId.dups.txt" ]; then
     printf "WARNING duplicate sequences found in \$asmId.2bit\\n" 1>&2
     cat \$asmId.dups.txt 1>&2
     awk '{print \$1}' \$asmId.dups.txt > \$asmId.remove.dups.list
     mv \${asmId}_genomic.fna.gz \${asmId}_genomic.fna.dups.gz
     faSomeRecords -exclude \${asmId}_genomic.fna.dups.gz \\
       \$asmId.remove.dups.list stdout | gzip -c > \${asmId}_genomic.fna.gz
     rm -f \$asmId.2bit
     faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit
   fi
@@ -809,37 +817,46 @@
   $bossScript->add(<<_EOF_
 export asmId="$asmId"
 
 
 if [ -s ../\$asmId.chrom.sizes ]; then
   printf "sequence step previously completed\\n" 1>&2
   exit 0
 fi
 
 _EOF_
   );
 
 ### construct sequence when no AGP files exist
   if (0 == $partsDone) {
 printf STDERR "creating fake AGP\n";
+    if ($ucscNames) {
       $bossScript->add(<<_EOF_
 twoBitToFa ../download/\$asmId.2bit stdout | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" | gzip -c > \$asmId.fa.gz
 hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz
 zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names
 _EOF_
       );
     } else {
+      $bossScript->add(<<_EOF_
+twoBitToFa ../download/\$asmId.2bit stdout | gzip -c > \$asmId.fa.gz
+hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz
+zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names
+_EOF_
+      );
+    }
+} else {
 printf STDERR "partsDone: %d\n", $partsDone;
   }
 
   $bossScript->add(<<_EOF_
 zcat *.agp.gz | gzip > ../\$asmId.agp.gz
 faToTwoBit *.fa.gz ../\$asmId.2bit
 faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit
 twoBitDup ../\$asmId.unmasked.2bit > \$asmId.dups.txt
 if [ -s "\$asmId.dups.txt" ]; then
   printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\\n" 1>&2
   cat \$asmId.dups.txt 1>&2
   awk '{print \$1}' \$asmId.dups.txt > \$asmId.remove.dups.list
   mv ../\$asmId.unmasked.2bit ../\$asmId.unmasked.dups.2bit
   twoBitToFa ../\$asmId.unmasked.dups.2bit stdout | faSomeRecords -exclude \\
     stdin \$asmId.remove.dups.list stdout | gzip -c > \$asmId.noDups.fasta.gz
@@ -1077,53 +1094,59 @@
   wigToBigWig \$asmId.wigVarStep.gz ../../\$asmId.chrom.sizes \$asmId.gc5Base.bw
   rm -f \$asmId.wigVarStep.gz
   touch -r ../../\$asmId.2bit \$asmId.gc5Base.bw
 else
   printf "# gc5Base step previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_
   );
   $bossScript->execute();
 } # gc5Base
 
 #########################################################################
 # * step: repeatMasker [workhorse]
 sub doRepeatMasker {
+  if ($noRmsk) {
+       &HgAutomate::verbose(1, "# -noRmsk == RepeatMasker not being run\n");
+	return;
+  }
   my $runDir = "$buildDir/trackData/repeatMasker";
   if ( -d "$buildDir/trackData/repeatMasker/run.cluster" ) {
      if ( ! -s "$buildDir/trackData/repeatMasker/faSize.rmsk.txt" ) {
        &HgAutomate::verbose(1,
 	"\nERROR: step repeatmasker may be running\n");
        exit 255;
      }
   }
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "construct repeatMasker track data";
   my $bossScript = newBash HgRemoteScript("$runDir/doRepeatMasker.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   my $rmskOpts = "";
   if ($ncbiRmsk) {
+     if ( -s "$buildDir/download/${asmId}_rm.out.gz" ) {
        $rmskOpts = " \\
   -ncbiRmsk=\"$buildDir/download/${asmId}_rm.out.gz\" ";
        if ($ucscNames) {
          $rmskOpts .= " \\
   -liftSpec=\"$buildDir/sequence/$asmId.ncbiToUcsc.lift\"";
        }
      }
+  }
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
 if [ $buildDir/\$asmId.2bit -nt faSize.rmsk.txt ]; then
 export species=`echo $rmskSpecies | sed -e 's/_/ /g;'`
 
 rm -f versionInfo.txt
 
 doRepeatMasker.pl -stop=mask -buildDir=`pwd` -unmaskedSeq=$buildDir/\$asmId.2bit $rmskOpts \\
   -bigClusterHub=$bigClusterHub -workhorse=$workhorse -species="\$species" \$asmId
 
 if [ -s "\$asmId.fa.out" ]; then
   gzip \$asmId.fa.out
 fi
@@ -1300,60 +1323,65 @@
 else
   printf "# idKeys step previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_
   );
   $bossScript->execute();
 } # doIdKeys
 
 #########################################################################
 # * step: addMask [workhorse]
 sub doAddMask {
   my $runDir = "$buildDir/trackData/addMask";
 
   my $goNoGo = 0;
+  if (! $noRmsk) {
     if ( ! -s "$buildDir/trackData/repeatMasker/$asmId.rmsk.2bit" ) {
        printf STDERR "ERROR: repeatMasker step not completed\n";
        printf STDERR "can not find: $buildDir/trackData/repeatMasker/$asmId.rmsk.2bit\n";
        $goNoGo = 1;
     }
+  }
   if ( ! -s "$buildDir/trackData/windowMasker/$asmId.cleanWMSdust.2bit" ) {
       printf STDERR "ERROR: windowMasker step not completed\n";
       printf STDERR "can not find: $buildDir/trackData/windowMasker/$asmId.cleanWMSdust.2bit\n";
       $goNoGo = 1;
   }
   if ( ! -s "$buildDir/trackData/simpleRepeat/trfMask.bed.gz" ) {
       printf STDERR "ERROR: simpleRepeat step not completed\n";
       printf STDERR "can not find: $buildDir/trackData/simpleRepeat/trfMask.bed.gz\n";
       $goNoGo = 1;
   }
   if ($goNoGo) {
       printf STDERR "ERROR: must complete repeatMasker, windowMasker and simpleRepeat before addMask\n";
       exit 255;
   }
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "add together (windowMasker or repeatMasker) and trf/simpleRepeats to construct masked 2bit file";
   my $bossScript = newBash HgRemoteScript("$runDir/doAddMask.bash",
                     $workhorse, $runDir, $whatItDoes);
 
   my $wmMasked=`grep "masked total" $buildDir/trackData/windowMasker/faSize.$asmId.cleanWMSdust.txt | awk '{print \$1}' | sed -e 's/%//;'`;
-  my $rmMasked=`grep "masked total" $buildDir/trackData/repeatMasker/faSize.rmsk.txt | awk '{print \$1}' | sed -e 's/%//;'`;
+  my $rmMasked = 0;
+  if (! $noRmsk) {
+    $rmMasked=`grep "masked total" $buildDir/trackData/repeatMasker/faSize.rmsk.txt | awk '{print \$1}' | sed -e 's/%//;'`;
+  }
 
   my $src2BitToMask = "../repeatMasker/$asmId.rmsk.2bit";
-  if ($wmMasked > $rmMasked) {
+  if ($noRmsk || ($wmMasked > $rmMasked)) {
     $src2BitToMask = "../windowMasker/$asmId.cleanWMSdust.2bit";
   }
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
 if [ ../simpleRepeat/trfMask.bed.gz -nt \$asmId.masked.faSize.txt ]; then
   twoBitMask $src2BitToMask -type=.bed \\
      -add ../simpleRepeat/trfMask.bed.gz \$asmId.masked.2bit
   twoBitToFa \$asmId.masked.2bit stdout | faSize stdin > \$asmId.masked.faSize.txt
   touch -r \$asmId.masked.2bit \$asmId.masked.faSize.txt
   cp -p \$asmId.masked.faSize.txt ../../\$asmId.faSize.txt
 else
   printf "# addMask step previously completed\\n" 1>&2
   exit 0
@@ -1378,36 +1406,40 @@
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
 ### if [ ../../\$asmId.unmasked.2bit -nt fb.\$asmId.rmsk.windowmaskerSdust.txt ]; then
 if [ ../../\$asmId.unmasked.2bit -nt faSize.\$asmId.cleanWMSdust.txt ]; then
   \$HOME/kent/src/hg/utils/automation/doWindowMasker.pl -stop=twobit -buildDir=`pwd` -dbHost=$dbHost \\
     -workhorse=$workhorse -unmaskedSeq=$buildDir/\$asmId.unmasked.2bit \$asmId
   bedInvert.pl ../../\$asmId.chrom.sizes ../allGaps/\$asmId.allGaps.bed.gz \\
     > not.gap.bed
   bedIntersect -minCoverage=0.0000000014 windowmasker.sdust.bed \\
      not.gap.bed stdout | sort -k1,1  -k2,2n > cleanWMask.bed
   twoBitMask $buildDir/\$asmId.unmasked.2bit cleanWMask.bed \\
     \$asmId.cleanWMSdust.2bit
   twoBitToFa \$asmId.cleanWMSdust.2bit stdout \\
     | faSize stdin > faSize.\$asmId.cleanWMSdust.txt
+  if [ -s ../repeatMasker/\$asmId.sorted.fa.out.gz ]; then
     zcat ../repeatMasker/\$asmId.sorted.fa.out.gz | sed -e 's/^  *//; /^\$/d;' \\
       | egrep -v "^SW|^score" | awk '{printf "%s\\t%d\\t%d\\n", \$5, \$6-1, \$7}' \\
         | bedSingleCover.pl stdin > rmsk.bed
     intersectRmskWM=`bedIntersect -minCoverage=0.0000000014 cleanWMask.bed \\
       rmsk.bed stdout | bedSingleCover.pl stdin | ave -col=4 stdin \\
        | grep "^total" | awk '{print \$2}' | sed -e 's/.000000//;'`
+  else
+    intersectRmskWM=0
+  fi
   chromSize=`ave -col=2 ../../\$asmId.chrom.sizes \\
      | grep "^total" | awk '{print \$2}' | sed -e 's/.000000//;'`
   echo \$intersectRmskWM \$chromSize | \\
      awk '{ printf "%d bases of %d (%.3f%%) in intersection\\n", \$1, \$2, 100.0*\$1/\$2}' \\
       > fb.\$asmId.rmsk.windowmaskerSdust.txt
   rm -f not.gap.bed rmsk.bed
   bedToBigBed -type=bed3 cleanWMask.bed ../../\$asmId.chrom.sizes \$asmId.windowMasker.bb
   gzip cleanWMask.bed
   \$HOME/kent/src/hg/utils/automation/doWindowMasker.pl -continue=cleanup -stop=cleanup -buildDir=`pwd` -dbHost=$dbHost \\
     -workhorse=$workhorse -unmaskedSeq=$buildDir/\$asmId.unmasked.2bit \$asmId
 else
   printf "# windowMasker step previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_
@@ -1861,30 +1893,31 @@
   if (length($species) < 1) {
      die "no -species specified and can not find Organism name: in $asmReport";
   }
 }
 
 $rmskSpecies = $opt_rmskSpecies ? $opt_rmskSpecies : $species;
 $augustusSpecies = $opt_augustusSpecies ? $opt_augustusSpecies : $augustusSpecies;
 $xenoRefSeq = $opt_xenoRefSeq ? $opt_xenoRefSeq : $xenoRefSeq;
 $ucscNames = $opt_ucscNames ? 1 : $ucscNames;   # '1' == 'TRUE'
 $workhorse = $opt_workhorse ? $opt_workhorse : $workhorse;
 $bigClusterHub = $opt_bigClusterHub ? $opt_bigClusterHub : $bigClusterHub;
 $smallClusterHub = $opt_smallClusterHub ? $opt_smallClusterHub : $smallClusterHub;
 $fileServer = $opt_fileServer ? $opt_fileServer : $fileServer;
 $asmHubName = $opt_asmHubName ? $opt_asmHubName : $asmHubName;
 $ncbiRmsk = $opt_ncbiRmsk ? 1 : 0;
+$noRmsk = $opt_noRmsk ? 1 : 0;
 
 die "can not find assembly source directory\n$assemblySource" if ( ! -d $assemblySource);
 printf STDERR "# buildDir: %s\n", $buildDir;
 printf STDERR "# sourceDir %s\n", $sourceDir;
 printf STDERR "# augustusSpecies %s\n", $augustusSpecies;
 printf STDERR "# xenoRefSeq %s\n", $xenoRefSeq;
 printf STDERR "# assemblySource: %s\n", $assemblySource;
 printf STDERR "# asmHubName %s\n", $asmHubName;
 printf STDERR "# rmskSpecies %s\n", $rmskSpecies;
 printf STDERR "# augustusSpecies %s\n", $augustusSpecies;
 printf STDERR "# ncbiRmsk %s\n", $ncbiRmsk ? "TRUE" : "FALSE";
 printf STDERR "# ucscNames %s\n", $ucscNames ? "TRUE" : "FALSE";
 
 # Do everything.
 $stepper->execute();