122806164798753fbfbce3cc77411c911a4d9d1e
angie
  Tue Oct 1 13:15:36 2019 -0700
Adding automation script doBigDbSnp.pl that splits previously downloaded dbSNP JSON files, does a cluster run of dbSnpJsonToTab, aggregates results, runs checkBigDbSnp & installs track files.  refs #23283

diff --git src/hg/utils/automation/doBigDbSnp.pl src/hg/utils/automation/doBigDbSnp.pl
new file mode 100755
index 0000000..e57d99e
--- /dev/null
+++ src/hg/utils/automation/doBigDbSnp.pl
@@ -0,0 +1,644 @@
+#!/usr/bin/env perl
+
+# DO NOT EDIT the /cluster/bin/scripts copy of this file --
+# edit ~/kent/src/hg/utils/automation/doBigDbSnp.pl instead.
+
+# Copyright (C) 2019 The Regents of the University of California
+
+use Getopt::Long;
+use warnings;
+use strict;
+use FindBin qw($Bin);
+use lib "$Bin";
+use HgAutomate;
+use HgRemoteScript;
+use HgStepManager;
+
+# Option variable names, both common and peculiar to this script:
+use vars @HgAutomate::commonOptionVars;
+use vars @HgStepManager::optionVars;
+use vars qw/
+    $opt_assemblyList
+    $opt_buildDir
+    /;
+
+# Specify the steps supported with -continue / -stop:
+my $stepper = new HgStepManager(
+    [ { name => 'split',   func => \&doSplit },
+      { name => 'convert',   func => \&doConvert },
+      { name => 'mergeToChrom',   func => \&doMergeToChrom },
+      { name => 'mergeChroms',   func => \&doMergeChroms },
+      { name => 'fixHg19ChrM',   func => \&doFixHg19ChrM },
+      { name => 'check',   func => \&doCheck },
+      { name => 'bigBed',   func => \&doBigBed },
+      { name => 'install',   func => \&doInstall },
+      { name => 'cleanup', func => \&doCleanup },
+    ]
+				);
+
+# Files that must exist in $topDir:
+my $refSeqToUcsc = 'refSeqToUcsc.tab';
+my $equivRegions = 'equivRegions.tab';
+
+# Option defaults:
+my $assemblyList = 'GRCh37.p13,GRCh38.p12';
+my $dbHost = 'hgwdev';
+my $bigClusterHub = 'ku';
+my $smallClusterHub = 'hgwdev';
+my $workhorse = 'hgwdev';
+
+my $outRoot = 'dbSnp';
+
+my $base = $0;
+$base =~ s/^(.*\/)?//;
+
+sub usage {
+  # Usage / help / self-documentation:
+  my ($status, $detailed) = @_;
+  # Basic help (for incorrect usage):
+  print STDERR "
+usage: $base topDir buildId freqSourceOrder
+options:
+";
+  print STDERR $stepper->getOptionHelp();
+  print STDERR <<_EOF_
+    -assemblyList list    Comma-separated list of assemblies used by dbSNP
+                          default: $assemblyList
+    -buildDir dir         Use dir instead of default topDir/bigDbSnp.\$date
+                          (necessary when continuing at a later date).
+_EOF_
+  ;
+  print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
+						'workhorse' => $workhorse,
+						'fileServer' => '',
+						'bigClusterHub' => $bigClusterHub,
+						'smallClusterHub' => $smallClusterHub);
+  print STDERR "
+Convert dbSNP JSON into bigDbSnp and associated track files.
+
+topDir is usually /hive/data/outside/dbSNP/NNN where NNN is 152 or greater.
+topDir is expected to have a subdirectory json in which refsnp-*.json.bz2
+files have already been downloaded, as well as files $refSeqToUcsc and $equivRegions
+(see usage statement for dbSnpJsonToTab).
+
+buildId is usually NNN where NNN is 152 or greater, same as topDir; it can also have a
+suffix to distinguish it, e.g. 152Test.  The names of all result files contain $outRoot\$buildId.
+
+freqSourceOrder is a comma-separated list of projects that submit frequency data to dbSNP
+(see usage statement for dbSnpJsonToTab).
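+
+Example (the frequency source names below are illustrative only; see the usage
+statement for dbSnpJsonToTab for the project names actually present in the JSON):
+    $base /hive/data/outside/dbSNP/153 153 1000Genomes,TOPMED,GnomAD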
+
+Steps:
+    split: splits refsnp-*.json.bz2 files into chunks of 100,000 lines.
+    convert: runs dbSnpJsonToTab on chunks.
+    mergeToChrom: merges chunk result files into per-chrom results files.
+    mergeChroms: merges per-chrom results files.
+    fixHg19ChrM: if annotations on hg19 are included, then liftOver NC_012920 to hg19 chrM.
+    check: runs checkBigDbSnp to add ucscNotes about overlapping items and clustering anomalies.
+    bigBed: converts BED4+ .bigDbSnp files into bigBed.
+    install: installs links to files in /gbdb.
+    cleanup: removes or compresses intermediate files.
+All operations are performed in the build directory which is
+topDir/bigDbSnp.\$date unless -buildDir is given.
+";
+  # Detailed help (-help):
+  print STDERR "
+Assumptions:
+1. $HgAutomate::clusterData/\$db/\$db.2bit contains sequence for \$db.
+2. topDir/json/ contains downloaded files refsnp-*.json.bz2
+3. topDir/ contains files refSeqToUcsc.tab and equivRegions.tab - see dbSnpJsonToTab usage
+" if ($detailed);
+  print "\n";
+  exit $status;
+}
+
+
+# Globals:
+# Command line args: topDir, buildId, freqSourceOrder
+my ($topDir, $buildId, $freqSourceOrder);
+# Other:
+my ($buildDir, $jsonDir, @dbList, $secondsStart, $secondsEnd);
+
+sub checkOptions {
+  # Make sure command line options are valid/supported.
+  my $ok = GetOptions(@HgStepManager::optionSpec,
+                      'assemblyList=s',
+                      'buildDir=s',
+                      @HgAutomate::commonOptionSpec,
+                      );
+  usage(1) if (!$ok);
+  usage(0, 1) if ($opt_help);
+  if ($opt_assemblyList) {
+    $assemblyList= $opt_assemblyList;
+  }
+  # buildDir default depends on topDir (undetermined at this point) and is handled in main
+  HgAutomate::processCommonOptions();
+  my $err = $stepper->processOptions();
+  usage(1) if ($err);
+  $dbHost = $opt_dbHost if ($opt_dbHost);
+}
+
+sub grcToDb($) {
+  # dbSNP is only ever going to produce JSON for various patch levels of GRCh38 and 37.
+  my ($grc) = @_;
+  my $db;
+  if ($grc =~ /^GRCh38/) {
+    $db = 'hg38';
+  } elsif ($grc =~ /^GRCh37/) {
+    $db = 'hg19';
+  } else {
+    die "Expected GRC assembly to start with 'GRCh37' or 'GRCh38' but got '$grc'";
+  }
+  return $db;
+}
+
+#########################################################################
+# * step: split [smallCluster]
+sub doSplit {
+  my $runDir = "$buildDir/run.split";
+  HgAutomate::mustMkdir($runDir);
+  my $outDir = "$buildDir/split";
+  HgAutomate::mustMkdir($outDir);
+
+  my $splitScript = "$runDir/splitJson.sh";
+  my $fh = HgAutomate::mustOpen(">$splitScript");
+  print $fh <<EOF
+#!/bin/bash
+set -beEu -o pipefail
+jsonIn=\$1
+N=100000
+prefix=$outDir/\$(basename \$jsonIn .json.bz2)
+
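+# GNU split sets \$FILE to each output chunk's name (\$prefix followed by aa, ab, ...);
+# the --filter recompresses each 100,000-line chunk with bzip2 as it is written.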
+bzcat \$jsonIn | split -l \$N --filter='bzip2 > \$FILE.bz2' - \$prefix
+EOF
+    ;
+  close($fh);
+  system("chmod a+x $splitScript") == 0 || die "Unable to chmod $splitScript";
+  HgAutomate::makeGsub($runDir, "$splitScript {check in exists+ \$(path1)}");
+
+  my $whatItDoes = "It splits per-chrom JSON files into 100,000 line chunks.";
+  my $bossScript = new HgRemoteScript("$runDir/doSplit.csh", $smallClusterHub,
+				      $runDir, $whatItDoes);
+  my $paraRun = HgAutomate::paraRun();
+  my $gensub2 = HgAutomate::gensub2();
+  $bossScript->add(<<_EOF_
+ls -1S $jsonDir/refsnp-{chr*,other}.json.bz2 > jsonList
+$gensub2 jsonList single gsub jobList
+$paraRun
+_EOF_
+    );
+  $bossScript->execute();
+} # doSplit
+
+
+#########################################################################
+# * step: convert [bigClusterHub]
+sub doConvert {
+  my $runDir = "$buildDir/run.convert";
+  HgAutomate::mustMkdir($runDir);
+  my $outDir = "$buildDir/splitProcessed";
+  HgAutomate::mustMkdir($outDir);
+
+  my $convertScript = "$runDir/jsonToTab.sh";
+  my $fh = HgAutomate::mustOpen(">$convertScript");
+  print $fh <<EOF
+#!/bin/bash
+set -beEu -o pipefail
+# jsonIn needs to be absolute path
+jsonIn=\$1
+
+tmpDir=\$(mktemp -d /dev/shm/dbSnpJsonToTab.XXXXXXXX)
+pushd \$tmpDir
+
+outRoot=\$(basename \$jsonIn .bz2)
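+# Strip the two-character chunk suffix that split added (aa, ab, ...) so that all
+# chunks from the same input file land in the same per-chrom output directory.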
+chromOutDir=$outDir/\$(echo \$outRoot | sed -e 's/..\$//;')
+
+bzcat \$jsonIn \\
+| dbSnpJsonToTab -freqSourceOrder=$freqSourceOrder \\
+    -equivRegions=$topDir/$equivRegions \\
+    $assemblyList $topDir/$refSeqToUcsc stdin \$outRoot
+
+# For sorting.  I expected that this would be set already from my shell, but apparently not:
+export LC_COLLATE=C
+
+# Discard the last two bigDbSnp columns -- they only have 0s.  The real values will be added
+# later by bedJoinTabOffset.
+EOF
+    ;
+  foreach my $grc (split(',', $assemblyList)) {
+    my $db = grcToDb($grc);
+    print $fh <<EOF
+cut -f1-15  \$outRoot.$grc.bigDbSnp \\
+| sort -k1,1 -k2n,2n \\
+| bzip2 \\
+  > \$outRoot.$db.sorted.bigDbSnp.bz2
+EOF
+      ;
+  }
+  print $fh <<EOF
+
+sort \${outRoot}Details.tab | bzip2 > \${outRoot}Details.tab.bz2
+sort -u \${outRoot}Failed.json | bzip2 > \${outRoot}Failed.json.bz2
+sort \${outRoot}Errors.tab | bzip2 > \${outRoot}Errors.tab.bz2
+sort \${outRoot}Merged.tab | bzip2 > \${outRoot}Merged.tab.bz2
+
+popd
+
+mkdir -p \$chromOutDir
+cp -p \$tmpDir/\$outRoot*.bz2 \$chromOutDir/
+
+rm -rf \$tmpDir
+EOF
+    ;
+  close($fh);
+  system("chmod a+x $convertScript") == 0 || die "Unable to chmod $convertScript";
+
+  my $whatItDoes = "It converts dbSNP JSON to bigDbSnp, dbSnpDetails and other files.";
+  my $bossScript = new HgRemoteScript("$runDir/doConvert.csh", $bigClusterHub,
+				      $runDir, $whatItDoes);
+
+  HgAutomate::makeGsub($runDir, "$convertScript {check in exists+ \$(path1)}");
+  my $paraRun = HgAutomate::paraRun();
+  my $gensub2 = HgAutomate::gensub2();
+  $bossScript->add(<<_EOF_
+ls -1S $buildDir/split/ref*.bz2 > splitList
+$gensub2 splitList single gsub jobList
+$paraRun
+_EOF_
+  );
+  $bossScript->execute();
+} # doConvert
+
+
+#########################################################################
+# * step: mergeToChrom [smallClusterHub]
+sub doMergeToChrom {
+  my $runDir = "$buildDir/run.mergeToChrom";
+  HgAutomate::mustMkdir($runDir);
+  my $outDir = "$buildDir/mergedToChrom";
+  HgAutomate::mustMkdir($outDir);
+
+  my $sortMergeBzBedScript = "$runDir/sortMergeBzBed.sh";
+  my $fh = HgAutomate::mustOpen(">$sortMergeBzBedScript");
+  print $fh <<EOF
+#!/bin/bash
+set -beEu -o pipefail
+bzBedList=\$1
+outFile=\$2
+
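+# Decompress each chunk (already sorted by chrom,chromStart) and merge-sort the
+# chunks into a single sorted per-chrom bigDbSnp file.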
+tmpDir=\$(mktemp -d /dev/shm/dbSnpMergeSortBed.XXXXXXXX)
+pushd \$tmpDir
+
+cp /dev/null bedList
+for bz in \$(cat \$bzBedList); do
+    bed=\$(basename \$bz .bz2)
+    bzcat \$bz > \$bed
+    echo \$bed >> bedList
+done
+
+export LC_COLLATE=C
+sort --merge -k1,1 -k2n,2n \$(cat bedList) > \$outFile
+
+popd
+rm -rf \$tmpDir
+EOF
+    ;
+  close($fh);
+  system("chmod a+x $sortMergeBzBedScript") == 0 || die "Unable to chmod $sortMergeBzBedScript";
+
+  my $sortMergeBzScript = "$runDir/sortMergeBz.sh";
+  $fh = HgAutomate::mustOpen(">$sortMergeBzScript");
+  print $fh <<EOF
+#!/bin/bash
+set -beEu -o pipefail
+bzList=\$1
+outFile=\$2
+
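+# Decompress each chunk (already sorted) and merge-sort the chunks, dropping
+# duplicate lines, into a single output file.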
+tmpDir=\$(mktemp -d /dev/shm/dbSnpMergeSort.XXXXXXXX)
+pushd \$tmpDir
+
+cp /dev/null txtList
+for bz in \$(cat \$bzList); do
+    txt=\$(basename \$bz .bz2)
+    bzcat \$bz > \$txt
+    echo \$txt >> txtList
+done
+
+export LC_COLLATE=C
+sort --merge -u \$(cat txtList) > \$outFile
+
+popd
+rm -rf \$tmpDir
+EOF
+    ;
+  close($fh);
+  system("chmod a+x $sortMergeBzScript") == 0 || die "Unable to chmod $sortMergeBzScript";
+
+  my $uniqBzScript = "$runDir/uniqBz.sh";
+  $fh = HgAutomate::mustOpen(">$uniqBzScript");
+  print $fh <<EOF
+#!/bin/bash
+set -beEu -o pipefail
+bzList=\$1
+outFile=\$2
+
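+# Concatenate the chunks (each already sort -u'd) and collapse adjacent duplicate
+# lines, keeping the result bzip2-compressed.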
+bzcat \$(cat \$bzList) | uniq | bzip2 > \$outFile
+EOF
+    ;
+  close($fh);
+  system("chmod a+x $uniqBzScript") == 0 || die "Unable to chmod $uniqBzScript";
+
+  my $whatItDoes = "It merge-sorts the results from split-up JSON files into per-chromosome files.";
+  my $bossScript = newBash HgRemoteScript("$runDir/doMergeToChrom.sh", $smallClusterHub,
+                                          $runDir, $whatItDoes);
+
+  my $paraRun = HgAutomate::paraRun();
+  $bossScript->add(<<_EOF_
+# One merge per "chrom" per type of dbSnpJsonToTab output
+for jsonFile in \$(ls -1S $jsonDir/refsnp-{chr*,other}.json.bz2); do
+    prefix=\$(basename \$jsonFile .json.bz2)
+    echo \$prefix
+    ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??.hg19.* > \$prefix.hg19.list
+    ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??.hg38.* > \$prefix.hg38.list
+    ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Details.* > \$prefix.details.list
+    ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Errors.* > \$prefix.errors.list
+    ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Failed.* > \$prefix.failed.list
+    ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Merged.* > \$prefix.merged.list
+done
+
+cp /dev/null jobList
+for list in *.{hg19,hg38}.list; do
+    prefix=\$(basename \$list .list)
+    echo "./sortMergeBzBed.sh {check in line+ \$PWD/\$list} {check out line+ $outDir/\$prefix.bigDbSnp}" >> jobList
+done
+for list in *.details.list; do
+    prefix=\$(basename \$list .list)
+    echo "./sortMergeBz.sh {check in line+ \$PWD/\$list} {check out line+ $outDir/\$prefix.tab}" >> jobList
+done
+# OK for these to be empty (check out line instead of line+):
+for list in *.errors.list; do
+    prefix=\$(basename \$list .list)
+    echo "./sortMergeBz.sh {check in line+ \$PWD/\$list} {check out line $outDir/\$prefix.tab}" >> jobList
+done
+for list in *.merged.list; do
+    prefix=\$(basename \$list .list)
+    echo "./sortMergeBz.sh {check in line+ \$PWD/\$list} {check out line $outDir/\$prefix.tab}" >> jobList
+done
+for list in *.failed.list; do
+    prefix=\$(basename \$list .list)
+    echo "./uniqBz.sh {check in line+ \$PWD/\$list} {check out exists+ $outDir/\$prefix.json.bz2}" >> jobList
+done
+$paraRun;
+_EOF_
+  );
+  $bossScript->execute();
+} # doMergeToChrom
+
+
+#########################################################################
+# * step: mergeChroms [workhorse]
+sub doMergeChroms {
+  my $runDir = $buildDir;
+  my $inDir = "mergedToChrom";
+  HgAutomate::mustMkdir("$runDir/joined");
+
+  my $whatItDoes = "It merges chrom-level result files.";
+  my $bossScript = newBash HgRemoteScript("$runDir/doMergeChroms.sh", $workhorse,
+                                          $runDir, $whatItDoes);
+
+  $bossScript->add(<<_EOF_
+# Merge all chroms' *Merged.tab to the final Merged.tab file in background
+time sort --merge -u $inDir/*.merged.tab > ${outRoot}Merged.tab &
+
+# Merge all chroms' *Details.tab to the final Details.tab file
+time sort --merge -u $inDir/*.details.tab > ${outRoot}Details.tab
+
+# Compress Details.tab with bgzip in background.  For now, leave original file uncompressed.
+time bgzip -iI ${outRoot}Details.tab.gz.gzi -c ${outRoot}Details.tab > ${outRoot}Details.tab.gz &
+
+# parallel job of bedJoinTabOffset on each chrom's .bigDbSnp and ${outRoot}Details.tab
+# bedJoinTabOffset builds a massive hash in memory (file offsets of >650M lines of Details),
+# so limit the number of concurrent processes to 10.
+time (ls -1S $inDir/refsnp-*.*.bigDbSnp |
+      parallel --max-procs 10 --ungroup \\
+        bedJoinTabOffset -verbose=2 ${outRoot}Details.tab {} joined/{/})
+
+# Now mergeSort all chrom's data together.  Don't use sort -u because with -k it only
+# compares keys, not the whole line.
+pids=""
+_EOF_
+  );
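+  # One background merge-sort per assembly; collect the pids so we can wait for
+  # all of them below and fail if any one fails.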
+  foreach my $db (@dbList) {
+    $bossScript->add(<<_EOF_
+(time sort --merge -k1,1 -k2n,2n joined/*.$db.bigDbSnp | uniq > $db.$outRoot.bigDbSnp) &
+echo \$!
+pids+=" \$!"
+_EOF_
+                    );
+  }
+  $bossScript->add(<<_EOF_
+for pid in \$pids; do
+  if wait \$pid; then
+    echo pid \$pid done
+  else
+    echo pid \$pid FAILED
+    exit 1
+  fi
+done
+_EOF_
+                  );
+  $bossScript->execute();
+} # doMergeChroms
+
+
+#########################################################################
+# * step: fixHg19ChrM [workhorse]
+sub doFixHg19ChrM {
+  my $runDir = $buildDir;
+  if (grep(/hg19/, @dbList)) {
+    my $whatItDoes = "It does a liftOver from NC_012920.1 to hg19 chrM.";
+    my $bossScript = newBash HgRemoteScript("$runDir/doFixHg19ChrM.sh", $workhorse,
+                                            $runDir, $whatItDoes);
+    $bossScript->add(<<_EOF_
+# For hg19, liftOver NC_012920.1 annotations to hg19 chrM.
+sed -e 's/NC_012920 /NC_012920.1 /' \\
+  /hive/data/outside/dbSNP/131/human/NC_012920ToChrM.over.chain \\
+  > hg19.mitoLiftover.chain
+# liftOver can't handle zero-length items (chromStart == chromEnd, e.g. insertions),
+# so add 1 to every item's chromEnd before lifting and subtract it again afterward.
+mv hg19.$outRoot.bigDbSnp hg19.preChrMFix.$outRoot.bigDbSnp
+time (grep ^NC_012920 hg19.preChrMFix.$outRoot.bigDbSnp \\
+      | awk -F"\t" 'BEGIN{OFS="\t";} {\$3 += 1; print;}' \\
+      | liftOver -tab -bedPlus=3 stdin \\
+          hg19.mitoLiftover.chain stdout chrM.unmapped \\
+      | awk -F"\t" 'BEGIN{OFS="\t";} {\$3 -= 1; print;}' \\
+      | sort -k2n,2n \\
+        > hg19.chrM.$outRoot.bigDbSnp)
+wc -l hg19.chrM.$outRoot.bigDbSnp chrM.unmapped
+time grep -v ^NC_012920 hg19.preChrMFix.$outRoot.bigDbSnp \\
+     | sort --merge -k1,1 -k2n,2n - hg19.chrM.$outRoot.bigDbSnp \\
+       > hg19.$outRoot.bigDbSnp
+_EOF_
+                    );
+    $bossScript->execute();
+  }
+} # doFixHg19ChrM
+
+
+#########################################################################
+# * step: check [workhorse]
+sub doCheck {
+  my $runDir = $buildDir;
+
+  my $whatItDoes = "It runs checkBigDbSnp on merged bigDbSnp files.";
+  my $bossScript = newBash HgRemoteScript("$runDir/doCheck.sh", $workhorse,
+                                          $runDir, $whatItDoes);
+
+  $bossScript->add(<<_EOF_
+pids=""
+_EOF_
+                  );
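+  # One checkBigDbSnp per assembly, run in the background; collect the pids so we
+  # can wait for all of them below and fail if any one fails.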
+  foreach my $db (@dbList) {
+    $bossScript->add(<<_EOF_
+time checkBigDbSnp $db.$outRoot.bigDbSnp $HgAutomate::clusterData/$db/$db.2bit $db.$outRoot.checked.bigDbSnp &
+echo \$!
+pids+=" \$!"
+_EOF_
+                    );
+  }
+  $bossScript->add(<<_EOF_
+for pid in \$pids; do
+  if wait \$pid; then
+    echo pid \$pid done
+  else
+    echo pid \$pid FAILED
+    exit 1
+  fi
+done
+_EOF_
+                  );
+  $bossScript->execute();
+} # doCheck
+
+
+#########################################################################
+# * step: bigBed [workhorse]
+sub doBigBed {
+  my $runDir = $buildDir;
+
+  my $whatItDoes = "It runs bedToBigBed on merged & checked bigDbSnp files.";
+  my $bossScript = newBash HgRemoteScript("$runDir/doBigBed.sh", $workhorse,
+                                          $runDir, $whatItDoes);
+
+  $bossScript->add(<<_EOF_
+pids=""
+_EOF_
+                  );
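+  # One bedToBigBed per assembly in the background; -extraIndex=name builds a name
+  # index in the bigBed so items can be looked up by rs ID.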
+  foreach my $db (@dbList) {
+    $bossScript->add(<<_EOF_
+time bedToBigBed -tab -as=\$HOME/kent/src/hg/lib/bigDbSnp.as -type=bed4+ -extraIndex=name \\
+            $db.$outRoot.checked.bigDbSnp /hive/data/genomes/$db/chrom.sizes $db.$outRoot.bb &
+
+echo \$!
+pids+=" \$!"
+_EOF_
+                    );
+  }
+  $bossScript->add(<<_EOF_
+for pid in \$pids; do
+  if wait \$pid; then
+    echo pid \$pid done
+  else
+    echo pid \$pid FAILED
+    exit 1
+  fi
+done
+_EOF_
+                  );
+  $bossScript->execute();
+} # doBigBed
+
+
+#########################################################################
+# * step: install [dbHost]
+sub doInstall {
+  my $runDir = $buildDir;
+
+  my $whatItDoes = "It installs files in /gbdb.";
+  my $bossScript = new HgRemoteScript("$runDir/doInstall.csh", $workhorse,
+				      $runDir, $whatItDoes);
+
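+  # Symlink each assembly's bigBed into /gbdb/$db/snp/, and the shared Details
+  # file (plus its bgzip index) into /gbdb/hgFixed/dbSnp/.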
+  foreach my $db (@dbList) {
+    $bossScript->add(<<_EOF_
+ln -sf $buildDir/$db.$outRoot.bb /gbdb/$db/snp/$outRoot.bb
+_EOF_
+                    );
+  }
+  $bossScript->add(<<_EOF_
+mkdir -p /gbdb/hgFixed/dbSnp
+ln -sf $buildDir/${outRoot}Details.tab* /gbdb/hgFixed/dbSnp/
+_EOF_
+  );
+  $bossScript->execute();
+} # doInstall
+
+
+#########################################################################
+# * step: cleanup [workhorse]
+sub doCleanup {
+  my $runDir = "$buildDir";
+  my $whatItDoes = "It cleans up or compresses intermediate files.";
+  my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $workhorse,
+				      $runDir, $whatItDoes);
+  $bossScript->add(<<_EOF_
+bzip2 *.bigDbSnp
+rm -rf mergedToChrom splitProcessed joined
+_EOF_
+  );
+  $bossScript->execute();
+} # doCleanup
+
+
+#########################################################################
+# main
+
+# Prevent "Suspended (tty input)" hanging:
+HgAutomate::closeStdin();
+
+# Make sure we have valid options and exactly 3 arguments:
+checkOptions();
+usage(1) if (scalar(@ARGV) != 3);
+$secondsStart = `date "+%s"`;
+chomp $secondsStart;
+($topDir, $buildId, $freqSourceOrder) = @ARGV;
+
+# Establish what directory we will work in.
+my $date = `date +%Y-%m-%d`;
+chomp $date;
+$buildDir = $opt_buildDir ? $opt_buildDir : "$topDir/bigDbSnp.$date";
+
+$outRoot .= $buildId;
+$jsonDir = "$topDir/json";
+@dbList = map { grcToDb($_); } split(',', $assemblyList);
+
+# Do everything.
+$stepper->execute();
+
+# Tell the user anything they should know.
+my $stopStep = $stepper->getStopStep();
+my $upThrough = ($stopStep eq 'cleanup') ? "" :
+  "  (through the '$stopStep' step)";
+
+$secondsEnd = `date "+%s"`;
+chomp $secondsEnd;
+my $elapsedSeconds = $secondsEnd - $secondsStart;
+my $elapsedMinutes = int($elapsedSeconds/60);
+$elapsedSeconds -= $elapsedMinutes * 60;
+
+HgAutomate::verbose(1,
+	"\n *** All done !$upThrough  Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s\n");
+HgAutomate::verbose(1,
+	" *** Steps were performed in $buildDir\n");
+HgAutomate::verbose(1, "\n");
+