60499556c04cfbfd9197f76d1e1d0152a8d12614
markd
  Tue Mar 2 21:58:17 2021 -0800
added option to build chrispr for hubs, doing this removed the no longer used gene track options

diff --git src/hg/utils/automation/doCrispr.pl src/hg/utils/automation/doCrispr.pl
index e400ce9..883b498 100755
--- src/hg/utils/automation/doCrispr.pl
+++ src/hg/utils/automation/doCrispr.pl
@@ -5,32 +5,32 @@
 
 use Getopt::Long;
 use warnings;
 use strict;
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
+    $opt_forHub
     $opt_twoBit
-    $opt_shoulder
     $opt_tableName
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'indexFa',   func => \&doIndexFa },
       { name => 'ranges',   func => \&doRanges },
       { name => 'guides',   func => \&doGuides },
       { name => 'specScoreJobList',   func => \&doSpecScoreJobList },
       { name => 'specScores',   func => \&doSpecScores },
       { name => 'effScores',   func => \&doEffScores },
       { name => 'offTargets',   func => \&doOffTargets },
       { name => 'load',   func => \&doLoad },
       { name => 'cleanup', func => \&doCleanup },
     ]
@@ -39,159 +39,167 @@
 # external database of crispr data:
 my $crisporSrc = "/hive/data/outside/crisprTrack/crispor";
 # path to the crispr track pipeline scripts
 # look at ~/kent/src/hg/makeDb/crisprTrack/README.txt for
 # more info about these scripts in there
 my $crisprScripts = "/hive/data/outside/crisprTrack/scripts";
 
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $bigClusterHub = 'ku';
 my $smallClusterHub = 'hgwdev-101';
 my $fileServer = 'hgwdev';
 my $workhorse = 'hgwdev';
 my $defaultWorkhorse = 'hgwdev';
 my $twoBit = "$HgAutomate::clusterData/\$db/\$db.2bit";
-my $shoulder = 10000;	# default shoulder around exons
 my $tableName = "crispr10K";	# table name in database to load
 my $python = "/cluster/software/bin/python";
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
-usage: $base db geneTrack
+usage: $base db
 options:
     db - UCSC database name
-    geneTrack - name of track to use for source of genes, valid names:
-                knownGene, ensGene, xenoRefGene
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir         Use dir instead of default:
                               $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/crispr
+    -forHub               This is being build for a hub, remove assumptions about
+                              database-based genome
     -twoBit seq.2bit      Use seq.2bit as the input sequence instead
                               of default: ($twoBit).
-    -shoulder N           Use N as 'shoulder' extra on each side of an exon,
-                              default: $shoulder bases.
     -tableName name       Use name as gbdb path name and base name for db
-i                             table name, default: $tableName(Ranges|Targets)
+                              table name, default: $tableName(Ranges|Targets)
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => $defaultWorkhorse,
 						'bigClusterHub' => $bigClusterHub,
 						'fileServer' => $fileServer,
 						'smallClusterHub' => $smallClusterHub);
   print STDERR "
 Automates UCSC's crispr procedure.  Steps:
     indexFa Construct bwa index for source fasta, will be skipped when already
             available (workhorse)
-    ranges: Construct ranges to work on around exons (workhorse)
+    ranges: Construct ranges to work on, avoiding gaps (workhorse)
     guides: Extract guide sequences from ranges (smallClusterHub)
     specScoreJobList: Construct spec score jobList (workhorse)
     specScores: Compute spec scores (bigClusterHub)
     effScores: Compute the efficiency scores (bigClusterHub)
     offTargets: Converting off-targets (bigClusterHub)
     load: Construct bigBed file of results, link into /gbdb/<db>/<tableName>
           and load into database if database is present (dbHost)
     cleanup: Removes or compresses intermediate files. (fileServer)
 All operations are performed in the build directory which is
 $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/crispr unless -buildDir is given.
 ";
   # Detailed help (-help):
   print STDERR "
 Assumptions:
 1. $HgAutomate::clusterData/\$db/\$db.2bit contains RepeatMasked sequence for
    database/assembly \$db.
 2. $HgAutomate::clusterData/\$db/chrom.sizes contains all sequence names and sizes from
    \$db.2bit.
 3. The \$db.2bit files have already been distributed to cluster-scratch
    (/scratch/hg or /iscratch, /san etc).
 " if ($detailed);
   print "\n";
   exit $status;
 }
 
 
 # Globals:
 # Command line args: db
-my ($db, $geneTrack);
+my ($db);
 # Other:
-my ($buildDir, $secondsStart, $secondsEnd, $dbExists, $genomeFname,
-    $chromSizes, $exonShoulder);
+my ($buildDir, $forHub, $secondsStart, $secondsEnd, $dbExists, $crisporGenomesDir, $genomeFname,
+    $chromSizes);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'buildDir=s',
 		      'twoBit=s',
-		      'shoulder=s',
 		      'tableName=s',
+                      'forHub',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
 }
 
 #########################################################################
 # * step: indexFa [workhorse]
 sub doIndexFa {
   my $runDir = "$buildDir/indexFa";
-  my $resultDir = "$crisporSrc/genomes/$db";
-  my $testDone = "${resultDir}/genomeInfo.tab";
+  my $testDone = "${crisporGenomesDir}/${db}/genomeInfo.tab";
 
   &HgAutomate::mustMkdir($runDir);
   my $whatItDoes = "Construct bwa indexes for the new genome fasta.";
   my $bossScript = newBash HgRemoteScript("$runDir/runIndexFa.bash",
 		$workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export PATH=$crisporSrc/tools/usrLocalBin:\$PATH
 export TMPDIR=/dev/shm
 
-if [ ! -s "$testDone" ]; then
-  time ($crisporSrc/tools/crisprAddGenome \\
-      ucscLocal $db --geneTable=$geneTrack --baseDir \\
-          $crisporSrc/genomes) > createIndex.log 2>&1 &
+if [ $forHub = 1 -a ! -s "$chromSizes" ] ; then
+  twoBitInfo $twoBit $chromSizes
+fi
+if [ ! -s "$db.fa" ] ; then
+  twoBitToFa -noMask $twoBit $db.fa
 fi
 
 if [ ! -s "$db.fa.fai" ]; then
-  twoBitToFa -noMask $twoBit $db.fa
   /cluster/bin/samtools-0.1.19/samtools faidx $db.fa &
 fi
 
+if [ ! -s "$testDone" ]; then
+  if [ $forHub = 1 ] ; then
+    time ($crisporSrc/tools/crisprAddGenome \\
+        fasta $db.fa --baseDir $crisporGenomesDir \\
+        --desc='$db|$db|$db|$db') > createIndex.log 2>&1 &
+  else
+    time ($crisporSrc/tools/crisprAddGenome \\
+        ucscLocal $db --baseDir $crisporSrc/genomes) > createIndex.log 2>&1 &
+  fi
+fi
+
 wait
+
 if [ ! -s "$db.fa.fai" ]; then
   printf "ERROR: step indexFa: samtools index of $db.fa has failed\\n" 1>&2
   exit 255
 fi
-
 if [ ! -s "$testDone" ]; then
    printf "ERROR: bwa index not created correctly\\n" 1>&2
    printf "result file does not exist:\\n" 1>&2
    printf "%s\\n" "$testDone"
    exit 255
 fi
+
 _EOF_
   );
   $bossScript->execute();
   if ( -s "${testDone}" ) {
      &HgAutomate::verbose(1,
          "# step indesFa is already completed, continuing...\n");
      return;
   }
 } # doIndexFa
 
 #########################################################################
 # * step: ranges [workhorse]
 sub doRanges {
   my $runDir = "$buildDir/ranges";
   my $inFaDir = "$buildDir/ranges/tmp";
@@ -207,50 +215,32 @@
         "or determine the failure to complete this step.\n";
    } else {
      &HgAutomate::verbose(1,
          "# step ranges is already completed, continuing...\n");
      return;
    }
   }
 
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "Construct ranges to work on around exons.";
   my $bossScript = newBash HgRemoteScript("$runDir/runRanges.bash",
 		$workhorse, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
-printf "# getting genes\\n" 1>&2
-hgsql $db -NB -e 'select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from '$geneTrack' where chrom not like "%_alt" and chrom not like "%hap%"; '  > genes.gp
-printf "# Number of transcripts: %d\\n" "`cat genes.gp | wc -l`" 1>&2
-printf "# break genes into exons and add $exonShoulder bp on each side\\n" 1>&2
-genePredToBed genes.gp stdout | grep -v hap | grep -v chrUn \\
-    | bedToExons stdin stdout \\
-    | awk '{\$2=\$2-$exonShoulder; \$3=\$3+$exonShoulder; \$6="+"; print}' \\
-    | bedClip -truncate  stdin $chromSizes stdout | grep -v _alt \\
-    | grep -i -v hap > ranges.bed
-
 printf "# Get sequence without any gaps.\\n" 1>&2
-# featureBits $db -not gap -bed=notGap.bed
-hgsql -N -e 'select chrom,chromStart,chromEnd from gap;' $db \\
-   | bedInvert.pl $chromSizes stdin > notGap.bed
-
-# the minCoverage means 1 base in 100 billion will be recognized properly
-# the awk '\$4 > 19' selects items 20 bases and larger sized items
-bedIntersect -minCoverage=0.00000000001 ranges.bed notGap.bed \\
-   stdout | bedSingleCover.pl stdin | awk '\$4 > 19' \\
-      > crisprRanges.bed
+twoBitInfo -nBed $twoBit stdout | bedInvert.pl $chromSizes stdin > crisprRanges.bed
 
 twoBitToFa -noMask -bedPos -bed=crisprRanges.bed $twoBit ranges.fa
 
 printf "# split the sequence file into pieces for the cluster\\n" 1>&2
 mkdir -p tmp
 faSplit sequence ranges.fa 100 tmp/
 _EOF_
   );
   $bossScript->execute();
 } # doRanges
 
 #########################################################################
 sub doGuides {
 # * step: guides [bigClusterHub]
   my $paraHub = $bigClusterHub;
@@ -281,32 +271,32 @@
 		     "{check out exists tmp/\$(root1).fa}" );
   &HgAutomate::makeGsub($runDir, $templateCmd);
 
   my $whatItDoes = "Construct guides for alignments.";
   my $bossScript = newBash HgRemoteScript("$runDir/runGuides.bash",
 		$paraHub, $runDir, $whatItDoes);
 
   my $paraRun = &HgAutomate::paraRun();
   my $gensub2 = &HgAutomate::gensub2();
   `touch "$runDir/para_hub_$paraHub"`;
   $bossScript->add(<<_EOF_
 mkdir -p tmp
 find $inFaDir -type f | grep "\.fa\$" > fa.list
 $gensub2 fa.list single gsub jobList
 $paraRun
-catDir -suffix=.fa tmp | grep -v "^>" > ../allGuides.txt
-catDir -suffix=.bed tmp > ../allGuides.bed
+find tmp -name '*.fa' | xargs cat | grep -v "^>" > ../allGuides.txt
+find tmp -name '*.bed' | xargs cat > ../allGuides.bed
 _EOF_
   );
   $bossScript->execute();
 } # doGuides
 
 #########################################################################
 sub doSpecScoreJobList {
 # * step: specScoreJobList [workhorse]
   my $paraHub = $smallClusterHub;
   my $runDir = "$buildDir/specScores";
   my $indexDir = "$buildDir/indexFa";
 
   # First, make sure we're starting clean, or if done already, let
   #   it continue.
   if ( ! $opt_debug && ( -d "$runDir" ) ) {
@@ -526,37 +516,42 @@
      &HgAutomate::verbose(1,
          "# step load is already completed, continuing...\n");
      return;
     }
   }
 
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "Construct bigBed file and load into database.";
   my $bossScript = newBash HgRemoteScript("$runDir/loadUp.bash",
 		$dbHost, $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 # creating the bigBed file
 $python $crisprScripts/createBigBed.py $db allGuides.bed specScores.tab effScores.tab offtargets.offsets.tab
+_EOF_
+  );
+  if (!$forHub) {
+    $bossScript->add(<<_EOF_
 # now link it into gbdb
 mkdir -p /gbdb/$db/$tableName/
 ln -sf `pwd`/crispr.bb /gbdb/$db/$tableName/crispr.bb
 ln -sf `pwd`/crisprDetails.tab /gbdb/$db/$tableName/crisprDetails.tab
 _EOF_
      );
-  if ($dbExists) {
+  }
+  if ($dbExists && !$forHub) {
     $bossScript->add(<<_EOF_
 hgBbiDbLink $db ${tableName}Targets /gbdb/$db/$tableName/crispr.bb
 hgLoadBed $db ${tableName}Ranges ranges/crisprRanges.bed
 _EOF_
     );
   }
   $bossScript->execute();
 } # doLoad
 
 #########################################################################
 # * step: cleanup [fileServer]
 sub doCleanup {
   my $runDir = "$buildDir";
   my $whatItDoes = "It cleans up or compresses intermediate files.";
   my $bossScript = newBash HgRemoteScript("$runDir/doCleanup.bash", $fileServer,
@@ -605,61 +600,66 @@
 printf "#\tdisk space after cleaning\\n"
 df -h .
 _EOF_
   );
   $bossScript->execute();
 } # doCleanup
 
 #########################################################################
 # main
 
 # Prevent "Suspended (tty input)" hanging:
 &HgAutomate::closeStdin();
 
 # Make sure we have valid options and exactly 1 argument:
 &checkOptions();
-&usage(1) if (scalar(@ARGV) != 2);
+&usage(1) if (scalar(@ARGV) != 1);
 $workhorse = $opt_workhorse if ($opt_workhorse);
 $bigClusterHub = $opt_bigClusterHub if ($opt_bigClusterHub);
 $smallClusterHub = $opt_smallClusterHub if ($opt_smallClusterHub);
 $fileServer = $opt_fileServer if ($opt_fileServer);
 $dbHost = $opt_dbHost if ($opt_dbHost);
 &HgAutomate::verbose(1, "# bigClusterHub $bigClusterHub\n");
 &HgAutomate::verbose(1, "# smallClusterHub $smallClusterHub\n");
 &HgAutomate::verbose(1, "# fileServer $fileServer\n");
 &HgAutomate::verbose(1, "# dbHost $dbHost\n");
 &HgAutomate::verbose(1, "# workhorse $workhorse\n");
 $secondsStart = `date "+%s"`;
 chomp $secondsStart;
 
 # required command line arguments:
-($db, $geneTrack) = @ARGV;
+($db) = @ARGV;
 
 # Establish directory to work in:
 $buildDir = $opt_buildDir ? $opt_buildDir :
     "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/crispr";
+$forHub = $opt_forHub ? 1 : 0;
 $twoBit = $opt_twoBit ? $opt_twoBit : "$HgAutomate::clusterData/$db/$db.2bit";
 &HgAutomate::verbose(1, "# 2bit file: $twoBit\n");
 die "can not find 2bit file: '$twoBit'" if ( ! -s $twoBit);
-$shoulder = $opt_shoulder ? $opt_shoulder : $shoulder;
 $tableName = $opt_tableName ? $opt_tableName : $tableName;
-die "illegal value for shoulder: $shoulder, must be >= 30" if ($shoulder < 30);
-&HgAutomate::verbose(1, "# shoulder: $shoulder bases\n");
 &HgAutomate::verbose(1, "# tableName $tableName\n");
-$genomeFname = "$crisporSrc/genomes/$db/$db.fa.bwt";
+
+if ($forHub) {
+    $crisporGenomesDir  = "$buildDir/genomes";
+    $chromSizes = "$buildDir/indexFa/chrom.sizes";
+} else {
+    $crisporGenomesDir = "$crisporSrc/genomes";
     $chromSizes = "/hive/data/genomes/$db/chrom.sizes";
-$exonShoulder = $shoulder;
+}
+$genomeFname = "$crisporGenomesDir/$db/$db.fa.bwt";
+
 
 # may be working on a 2bit file that does not have a database browser
 $dbExists = 0;
 $dbExists = 1 if (&HgAutomate::databaseExists($dbHost, $db));
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";
 
 $secondsEnd = `date "+%s"`;
 chomp $secondsEnd;