ec726ffa6050e6b7b65f5ed885789c850ce87e60
galt
  Wed Jan 26 03:20:35 2022 -0800
Tweaking the doBigDbSnp step fixHg19ChrM since NC_012920 is now just called chrMT in dbSNP data.

diff --git src/hg/utils/automation/doBigDbSnp.pl src/hg/utils/automation/doBigDbSnp.pl
index 447dda5..6b55025 100755
--- src/hg/utils/automation/doBigDbSnp.pl
+++ src/hg/utils/automation/doBigDbSnp.pl
@@ -1,709 +1,709 @@
 #!/usr/bin/env perl
 
 # DO NOT EDIT the /cluster/bin/scripts copy of this file --
 # edit ~/kent/src/hg/utils/automation/doBigDbSnp.pl instead.
 
 # Copyright (C) 2019 The Regents of the University of California
 
 use Getopt::Long;
 use warnings;
 use strict;
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_assemblyList
     $opt_buildDir
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'split',   func => \&doSplit },
       { name => 'convert',   func => \&doConvert },
       { name => 'mergeToChrom',   func => \&doMergeToChrom },
       { name => 'mergeChroms',   func => \&doMergeChroms },
       { name => 'fixHg19ChrM',   func => \&doFixHg19ChrM },
       { name => 'check',   func => \&doCheck },
       { name => 'bigBed',   func => \&doBigBed },
       { name => 'install',   func => \&doInstall },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Files that must exist in $topDir:
 my $refSeqToUcsc = 'refSeqToUcsc.tab';
 my $equivRegions = 'equivRegions.tab';
 
 # Option defaults:
 my $assemblyList = 'GRCh37.p13,GRCh38.p13';
 my $dbHost = 'hgwdev';
 my $bigClusterHub = 'ku';
 my $smallClusterHub = 'hgwdev';
 my $workhorse = 'hgwdev';
 
 my $outRoot = 'dbSnp';
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base topDir buildId freqSourceOrder
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -assemblyList list    Comma-separated list of assemblies used by dbSNP
                           default: $assemblyList
     -buildDir dir         Use dir instead of default topDir/bigDbSnp.\$date
                           (necessary when continuing at a later date).
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => $workhorse,
 						'fileServer' => '',
 						'bigClusterHub' => $bigClusterHub,
 						'smallClusterHub' => $smallClusterHub);
   print STDERR "
 Convert dbSNP JSON into bigDbSnp and associated track files.
 
 topDir is usually /hive/data/outside/dbSNP/NNN where NNN is 152 or greater.
 topDir is expected to have a subdirectory json in which refsnp-*.json.bz2
 files have already been downloaded, as well as files $refSeqToUcsc and $equivRegions
 (see usage statement for dbSnpJsonToTab).
 
 buildId is usually NNN where NNN is 152 or greater, same as topDir; it can also have a
 suffix to distinguish it, e.g. 152Test.  The names of all result files contain $outRoot\$buildId.
 
 freqSourceOrder is a comma-separated list of projects that submit frequency data to dbSNP
 (see usage statement for dbSnpJsonToTab).
 
 Steps:
     split: splits refsnp-*.json.bz2 files into chunks of 100,000 lines.
     convert: runs dbSnpJsonToTab on chunks.
     mergeToChrom: merges chunk result files into per-chrom results files.
     mergeChroms: merges per-chrom results files.
-    fixHg19ChrM: if annotations on hg19 are included, then liftOver NC_012920 to hg19 chrM.
+    fixHg19ChrM: if annotations on hg19 are included, then liftOver chrMT (NC_012920) to hg19 chrM.
     check: runs checkBigDbSnp to add ucscNotes about overlapping items and clustering anomalies.
     bigBed: Converts BED4+ .bigDbSnp files into bigBed.
     install: installs links to files in /gbdb.
     cleanup: Removes or compresses intermediate files.
 All operations are performed in the build directory which is
 topDir/bigDbSnp.\$date unless -buildDir is given.
 ";
   # Detailed help (-help):
   print STDERR "
 Assumptions:
 1. $HgAutomate::clusterData/\$db/\$db.2bit contains sequence for \$db.
 2. topDir/json/ contains downloaded files refsnp-*.json.bz2
 3. topDir/ contains files refSeqToUcsc.tab and equivRegions.tab - see dbSnpJsonToTab usage
 " if ($detailed);
   print "\n";
   exit $status;
 }
 
 
 # Globals:
 # Command line args: db
 my ($topDir, $buildId, $freqSourceOrder);
 # Other:
 my ($buildDir, $jsonDir, @dbList, $secondsStart, $secondsEnd);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
                       'assemblyList=s',
 		      'buildDir=s',
                       'buildId=s',
                       'freqSourceOrder=s',
 		      @HgAutomate::commonOptionSpec,
 		      );
   usage(1) if (!$ok);
   usage(0, 1) if ($opt_help);
   if ($opt_assemblyList) {
     $assemblyList= $opt_assemblyList;
   }
   # buildDir default depends on topDir (undetermined at this point) and is handled in main
   HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
 }
 
 sub grcToDb($) {
   # dbSNP is only ever going to produce JSON for various patch levels of GRCh38 and 37.
   my ($grc) = @_;
   my $db;
   if ($grc =~ /^GRCh38/) {
     $db = 'hg38';
   } elsif ($grc =~ /^GRCh37/) {
     $db = 'hg19';
   } else {
     die "Expected GRC assembly to start with 'GRCh37' or 'GRCh38' but got '$grc'";
   }
   return $db;
 }
 
 #########################################################################
 # * step: split [smallCluster]
 sub doSplit {
   my $runDir = "$buildDir/run.split";
   HgAutomate::mustMkdir($runDir);
   my $outDir = "$buildDir/split";
   HgAutomate::mustMkdir($outDir);
 
   my $splitScript = "$runDir/splitJson.sh";
   my $fh = HgAutomate::mustOpen(">$splitScript");
   print $fh <<EOF
 #!/bin/bash
 set -beEu -o pipefail
 jsonIn=\$1
 N=100000
 prefix=$outDir/\$(basename \$jsonIn .json.bz2)
 
 bzcat \$jsonIn | split -l \$N --filter='bzip2 > \$FILE.bz2' - \$prefix
 EOF
     ;
   close($fh);
   system("chmod a+x $splitScript") == 0 || die "Unable to chmod $splitScript";
   HgAutomate::makeGsub($runDir, "$splitScript {check in exists+ \$(path1)}");
 
   my $whatItDoes = "It splits per-chrom JSON files into 100,000 line chunks.";
   my $bossScript = new HgRemoteScript("$runDir/doSplit.csh", $smallClusterHub,
 				      $runDir, $whatItDoes);
   my $paraRun = HgAutomate::paraRun();
   my $gensub2 = HgAutomate::gensub2();
   $bossScript->add(<<_EOF_
 ls -1S $jsonDir/refsnp-{chr*,other}.json.bz2 > jsonList
 $gensub2 jsonList single gsub jobList
 $paraRun
 _EOF_
     );
   $bossScript->execute();
 } # doSplit
 
 
 #########################################################################
 # * step: convert [bigClusterHub]
 sub doConvert {
   my $runDir = "$buildDir/run.convert";
   HgAutomate::mustMkdir($runDir);
   my $outDir = "$buildDir/splitProcessed";
   HgAutomate::mustMkdir($outDir);
 
   my $convertScript = "$runDir/jsonToTab.sh";
   my $fh = HgAutomate::mustOpen(">$convertScript");
   print $fh <<EOF
 #!/bin/bash
 set -beEu -o pipefail
 # jsonIn needs to be absolute path
 jsonIn=\$1
 
 tmpDir=\$(mktemp -d /dev/shm/dbSnpJsonToTab.XXXXXXXX)
 pushd \$tmpDir
 
 outRoot=\$(basename \$jsonIn .bz2)
 chromOutDir=$outDir/\$(echo \$outRoot | sed -e 's/..\$//;')
 
 bzcat \$jsonIn \\
 | dbSnpJsonToTab -freqSourceOrder=$freqSourceOrder \\
     -equivRegions=$topDir/$equivRegions \\
     $assemblyList $topDir/$refSeqToUcsc stdin \$outRoot
 
 # For sorting.  I expected that this would be set already from my shell, but apparently not:
 export LC_COLLATE=C
 
 # Discard the last two bigDbSnp columns -- they only have 0s.  The real values will be added
 # later by bedJoinTabOffset.
 EOF
     ;
   foreach my $grc (split(',', $assemblyList)) {
     my $db = grcToDb($grc);
     print $fh <<EOF
 cut -f1-15  \$outRoot.$grc.bigDbSnp \\
 | sort -k1,1 -k2n,2n \\
 | bzip2 \\
   > \$outRoot.$db.sorted.bigDbSnp.bz2
 sort -k1,1 -k2n,2n \$outRoot.$grc.badCoords.bed \\
 | bzip2 \\
   > \$outRoot.$db.sorted.badCoords.bed.bz2
 EOF
       ;
   }
   print $fh <<EOF
 
 sort \${outRoot}Details.tab | bzip2 > \${outRoot}Details.tab.bz2
 sort \${outRoot}Errors.tab | bzip2 > \${outRoot}Errors.tab.bz2
 sort \${outRoot}Merged.tab | bzip2 > \${outRoot}Merged.tab.bz2
 sort \${outRoot}Warnings.tab | bzip2 > \${outRoot}Warnings.tab.bz2
 
 popd
 
 mkdir -p \$chromOutDir
 cp -p \$tmpDir/\$outRoot*.bz2 \$chromOutDir/
 
 rm -rf \$tmpDir
 EOF
     ;
   close($fh);
   system("chmod a+x $convertScript") == 0 || die "Unable to chmod $convertScript";
 
   my $whatItDoes = "It converts dbSNP JSON to bigDbSnp, dbSnpDetails and other files.";
   my $bossScript = new HgRemoteScript("$runDir/doConvert.csh", $bigClusterHub,
 				      $runDir, $whatItDoes);
 
   HgAutomate::makeGsub($runDir, "$convertScript {check in exists+ \$(path1)}");
   my $paraRun = HgAutomate::paraRun();
   my $gensub2 = HgAutomate::gensub2();
   $bossScript->add(<<_EOF_
 ls -1S $buildDir/split/ref*.bz2 > splitList
 $gensub2 splitList single gsub jobList
 $paraRun
 _EOF_
   );
   $bossScript->execute();
 } # doConvert
 
 
 #########################################################################
 # * step: mergeToChrom [smallClusterHub]
 sub doMergeToChrom {
   my $runDir = "$buildDir/run.mergeToChrom";
   HgAutomate::mustMkdir($runDir);
   my $outDir = "$buildDir/mergedToChrom";
   HgAutomate::mustMkdir($outDir);
 
   my $sortMergeBzBedScript = "$runDir/sortMergeBzBed.sh";
   my $fh = HgAutomate::mustOpen(">$sortMergeBzBedScript");
   print $fh <<EOF
 #!/bin/bash
 set -beEu -o pipefail
 bzBedList=\$1
 outFile=\$2
 
 tmpDir=\$(mktemp -d /dev/shm/dbSnpMergeSortBed.XXXXXXXX)
 pushd \$tmpDir
 
 cp /dev/null bedList
 for bz in \$(cat \$bzBedList); do
     bed=\$(basename \$bz .bz2)
     bzcat \$bz > \$bed
     echo \$bed >> bedList
 done
 
 export LC_COLLATE=C
 sort --merge -k1,1 -k2n,2n \$(cat bedList) > \$outFile
 
 popd
 rm -rf \$tmpDir
 EOF
     ;
   close($fh);
   system("chmod a+x $sortMergeBzBedScript") == 0 || die "Unable to chmod $sortMergeBzBedScript";
 
   my $sortMergeBzScript = "$runDir/sortMergeBz.sh";
   $fh = HgAutomate::mustOpen(">$sortMergeBzScript");
   print $fh <<EOF
 #!/bin/bash
 set -beEu -o pipefail
 bzList=\$1
 outFile=\$2
 
 tmpDir=\$(mktemp -d /dev/shm/dbSnpMergeSort.XXXXXXXX)
 pushd \$tmpDir
 
 cp /dev/null txtList
 for bz in \$(cat \$bzList); do
     txt=\$(basename \$bz .bz2)
     bzcat \$bz > \$txt
     echo \$txt >> txtList
 done
 
 export LC_COLLATE=C
 sort --merge -u \$(cat txtList) > \$outFile
 
 popd
 rm -rf \$tmpDir
 EOF
     ;
   close($fh);
   system("chmod a+x $sortMergeBzScript") == 0 || die "Unable to chmod $sortMergeBzScript";
 
   my $whatItDoes = "It merge-sorts the results from split-up JSON files into per-chromosome files.";
   my $bossScript = newBash HgRemoteScript("$runDir/doMergeToChrom.sh", $smallClusterHub,
                                           $runDir, $whatItDoes);
 
   my $paraRun = HgAutomate::paraRun();
   $bossScript->add(<<_EOF_
 # One merge per "chrom" per type of dbSnpJsonToTab output
 for jsonFile in \$(ls -1S $jsonDir/refsnp-{chr*,other}.json.bz2); do
     prefix=\$(basename \$jsonFile .json.bz2)
     echo \$prefix
 _EOF_
                   );
   foreach my $db (@dbList) {
     $bossScript->add(<<_EOF_
     ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??.$db.*bigDbSnp* > \$prefix.$db.bigDbSnp.list
     ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??.$db.*badCoords* > \$prefix.$db.badCoords.list
 _EOF_
                     );
   }
   my $dbListStr = join(',', @dbList);
   $bossScript->add(<<_EOF_
     ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Details.* > \$prefix.details.list
     ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Errors.* > \$prefix.errors.list
     ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Merged.* > \$prefix.merged.list
     ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Warnings.* > \$prefix.warnings.list
 done
 
 cp /dev/null jobList
 for list in *.{$dbListStr}.bigDbSnp.list; do
     prefix=\$(basename \$list .bigDbSnp.list)
     echo "./sortMergeBzBed.sh {check in line+ \$PWD/\$list} {check out line+ $outDir/\$prefix.bigDbSnp}" >> jobList
 done
 for list in *.details.list; do
     prefix=\$(basename \$list .list)
     echo "./sortMergeBz.sh {check in line+ \$PWD/\$list} {check out line+ $outDir/\$prefix.tab}" >> jobList
 done
 # OK for these to be empty (check out line instead of line+):
 for list in *.{$dbListStr}.badCoords.list; do
     prefix=\$(basename \$list .badCoords.list)
     echo "./sortMergeBzBed.sh {check in line+ \$PWD/\$list} {check out line $outDir/\$prefix.badCoords.bed}" >> jobList
 done
 for list in *.errors.list *.merged.list *.warnings.list; do
     prefix=\$(basename \$list .list)
     echo "./sortMergeBz.sh {check in line+ \$PWD/\$list} {check out line $outDir/\$prefix.tab}" >> jobList
 done
 $paraRun;
 _EOF_
   );
   $bossScript->execute();
 } # doMergeToChrom
 
 
 #########################################################################
 # * step: mergeChroms [workhorse]
 sub doMergeChroms {
   my $runDir = $buildDir;
   my $inDir = "mergedToChrom";
   HgAutomate::mustMkdir("$runDir/joined");
 
   my $whatItDoes = "It merges chrom-level result files.";
   my $bossScript = newBash HgRemoteScript("$runDir/doMergeChroms.sh", $workhorse,
                                           $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 # Merge all chroms' *Merged.tab to the final Merged.tab file in background,
 # likewise for errors, warnings, and badCoords which should all be relatively small and quick.
 pids=""
 time sort --merge -u $inDir/*.merged.tab > ${outRoot}Merged.tab &
 pids+=" \$!"
 time sort --merge -u $inDir/*.errors.tab > ${outRoot}Errors.tab &
 pids+=" \$!"
 time sort --merge -u $inDir/*.warnings.tab > ${outRoot}Warnings.tab &
 pids+=" \$!"
 _EOF_
   );
   foreach my $db (@dbList) {
     $bossScript->add(<<_EOF_
 (time sort --merge -k1,1 -k2n,2n $inDir/*.$db.badCoords.bed | uniq > $db.$outRoot.badCoords.bed) &
 pids+=" \$!"
 _EOF_
                     );
   }
   $bossScript->add(<<_EOF_
 
 # Merge all chroms' *Details.tab to the final Details.tab file
 time sort --merge -u $inDir/*.details.tab > ${outRoot}Details.tab
 
 for pid in \$pids; do
   if wait \$pid; then
     echo pid \$pid done
   else
     echo pid \$pid FAILED
     exit 1
   fi
 done
 
 # Compress & index Details.tab with bgzip in background.  Leave original file uncompressed for
 # bedJoinTabOffset.
 time bgzip -iI ${outRoot}Details.tab.gz.gzi -c ${outRoot}Details.tab > ${outRoot}Details.tab.gz &
 pids=\$!
 
 # parallel job of bedJoinTabOffset on each chrom's .bigDbSnp and ${outRoot}Details.tab
 # bedJoinTabOffset builds a massive hash in memory (file offsets of >650M lines of Details),
 # so limit the number of concurrent processes to 10.
 time (ls -1S $inDir/refsnp-*.*.bigDbSnp |
       parallel --max-procs 10 --ungroup \\
         bedJoinTabOffset -verbose=2 ${outRoot}Details.tab {} joined/{/})
 
 # Now mergeSort all chrom's data together.  Don't use sort -u because with -k it only
 # compares keys, not the whole line.
 _EOF_
   );
   foreach my $db (@dbList) {
     $bossScript->add(<<_EOF_
 (time sort --merge -k1,1 -k2n,2n joined/*.$db.bigDbSnp | uniq > $db.$outRoot.bigDbSnp) &
 pids+=" \$!"
 _EOF_
                     );
   }
   $bossScript->add(<<_EOF_
 for pid in \$pids; do
   if wait \$pid; then
     echo pid \$pid done
   else
     echo pid \$pid FAILED
     exit 1
   fi
 done
 _EOF_
                   );
   $bossScript->execute();
 } # doMergeChroms
 
 
 #########################################################################
 # * step: fixHg19ChrM [workhorse]
 sub doFixHg19ChrM {
   my $runDir = $buildDir;
   if (grep(/hg19/, @dbList)) {
-    my $whatItDoes = "It does a liftOver from NC_012920.1 to hg19 chrM.";
+    my $whatItDoes = "It does a liftOver from chrMT (old name NC_012920) to hg19 chrM.";
     my $bossScript = newBash HgRemoteScript("$runDir/doFixHg19ChrM.sh", $workhorse,
                                             $runDir, $whatItDoes);
     $bossScript->add(<<_EOF_
-# For hg19, liftOver NC_012920.1 annotations to hg19 chrM.
-sed -e 's/NC_012920 /NC_012920.1 /' \\
+# For hg19, liftOver chrMT annotations to hg19 chrM.
+sed -e 's/NC_012920 /chrMT /' \\
   /hive/data/outside/dbSNP/131/human/NC_012920ToChrM.over.chain \\
   > hg19.mitoLiftover.chain
 # For liftOver, convert 0-base fully-closed to 0-based half-open because liftOver
 # doesn't deal with 0-base items.
 mv hg19.$outRoot.bigDbSnp hg19.preChrMFix.$outRoot.bigDbSnp
-time (grep ^NC_012920 hg19.preChrMFix.$outRoot.bigDbSnp \\
+time (grep ^chrMT hg19.preChrMFix.$outRoot.bigDbSnp \\
       | awk -F"\t" 'BEGIN{OFS="\t";} {\$3 += 1; print;}' \\
       | liftOver -tab -bedPlus=3 stdin \\
           hg19.mitoLiftover.chain stdout chrM.unmapped \\
       | awk -F"\t" 'BEGIN{OFS="\t";} {\$3 -= 1; print;}' \\
       | sort -k2n,2n \\
         > hg19.chrM.$outRoot.bigDbSnp)
 wc -l hg19.chrM.$outRoot.bigDbSnp chrM.unmapped
-time grep -v ^NC_012920 hg19.preChrMFix.$outRoot.bigDbSnp \\
+time grep -v ^chrMT hg19.preChrMFix.$outRoot.bigDbSnp \\
      | sort --merge -k1,1 -k2n,2n - hg19.chrM.$outRoot.bigDbSnp \\
        > hg19.$outRoot.bigDbSnp
 _EOF_
                     );
     $bossScript->execute()
   };
 } # doFixHg19ChrM
 
 
 #########################################################################
 # * step: check [workhorse]
 sub doCheck {
   my $runDir = $buildDir;
 
   my $whatItDoes = "It runs checkBigDbSnp on merged bigDbSnp files.";
   my $bossScript = newBash HgRemoteScript("$runDir/doCheck.sh", $workhorse,
                                           $runDir, $whatItDoes);
 
   foreach my $db (@dbList) {
     $bossScript->add(<<_EOF_
 cut -f 4 $db.$outRoot.badCoords.bed | sort -u > $db.badCoords.ids.txt
 _EOF_
                     );
   }
   $bossScript->add(<<_EOF_
 pids=""
 _EOF_
                   );
   foreach my $db (@dbList) {
     $bossScript->add(<<_EOF_
 time checkBigDbSnp -mapErrIds=$db.badCoords.ids.txt \\
        $db.$outRoot.bigDbSnp $HgAutomate::clusterData/$db/$db.2bit $db.$outRoot.checked.bigDbSnp &
 echo \$!
 pids+=" \$!"
 _EOF_
                     );
   }
   $bossScript->add(<<_EOF_
 for pid in \$pids; do
   if wait \$pid; then
     echo pid \$pid done
   else
     echo pid \$pid FAILED
     exit 1
   fi
 done
 _EOF_
                   );
   $bossScript->execute();
 } # doCheck
 
 
 #########################################################################
 # * step: bigBed [workhorse]
 sub doBigBed {
   my $runDir = $buildDir;
 
   # Helper script to make Mult, Common and ClinVar subsets and convert to bigBed for one db.
   my $makeSubsetsScript = "$runDir/makeSubsets.sh";
   my $fh = HgAutomate::mustOpen(">$makeSubsetsScript");
   print $fh <<_EOF_
 #!/bin/bash
 set -beEu -o pipefail
 db=\$1
 time $Bin/categorizeBigDbSnp.pl \$db \$db.$outRoot.checked.bigDbSnp
 pids=""
 for subset in Mult Common ClinVar; do
   time bedToBigBed -tab -as=\$HOME/kent/src/hg/lib/bigDbSnp.as -type=bed4+ -extraIndex=name \\
             \$db.\$subset.bigDbSnp /hive/data/genomes/\$db/chrom.sizes \$db.$outRoot.\$subset.bb &
   pids+=" \$!";
 done
 for pid in \$pids; do
   if wait \$pid; then
     echo pid \$pid done
   else
     echo pid \$pid FAILED
     exit 1
   fi
 done
 _EOF_
     ;
   close($fh);
   system("chmod a+x $makeSubsetsScript") == 0 || die "Unable to chmod $makeSubsetsScript";
 
   my $whatItDoes = "It runs bedToBigBed on merged & checked bigDbSnp files and makes ".
         "Mult, Common and ClinVar subsets.";
   my $bossScript = newBash HgRemoteScript("$runDir/doBigBed.sh", $workhorse,
                                           $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 pids=""
 _EOF_
                   );
   # Increase max memory allocation from default (on 64-bit machines) of 16GB (exceeded b154):
   my $maxAlloc = 64 * 1024 * 1024 * 1024;
   foreach my $db (@dbList) {
     $bossScript->add(<<_EOF_
 time bedToBigBed -tab -as=\$HOME/kent/src/hg/lib/bigDbSnp.as -type=bed4+ -extraIndex=name \\
             -maxAlloc=$maxAlloc \\
             $db.$outRoot.checked.bigDbSnp /hive/data/genomes/$db/chrom.sizes $db.$outRoot.bb &
 pids+=" \$!"
 time bedToBigBed -tab -type=bed4 -extraIndex=name \\
             $db.$outRoot.badCoords.bed /hive/data/genomes/$db/chrom.sizes $db.${outRoot}BadCoords.bb &
 pids+=" \$!"
 $makeSubsetsScript $db &
 pids+=" \$!"
 _EOF_
                     );
   }
   $bossScript->add(<<_EOF_
 for pid in \$pids; do
   if wait \$pid; then
     echo pid \$pid done
   else
     echo pid \$pid FAILED
     exit 1
   fi
 done
 _EOF_
                   );
   $bossScript->execute();
 } # doBigBed
 
 
 #########################################################################
 # * step: install [dbHost]
 sub doInstall {
   my $runDir = $buildDir;
 
   my $whatItDoes = "It installs files in /gbdb.";
   my $bossScript = newBash HgRemoteScript("$runDir/doInstall.sh", $workhorse,
 				      $runDir, $whatItDoes);
 
   foreach my $db (@dbList) {
     $bossScript->add(<<_EOF_
 ln -sf $buildDir/$db.$outRoot.bb /gbdb/$db/snp/$outRoot.bb
 for subset in Mult Common ClinVar; do
   ln -sf $buildDir/$db.$outRoot.\$subset.bb /gbdb/$db/snp/${outRoot}\$subset.bb
 done
 ln -sf $buildDir/$db.${outRoot}BadCoords.bb /gbdb/$db/snp/${outRoot}BadCoords.bb
 _EOF_
                     );
   }
   $bossScript->add(<<_EOF_
 mkdir -p /gbdb/hgFixed/dbSnp
 ln -sf $buildDir/${outRoot}Details.tab* /gbdb/hgFixed/dbSnp/
 _EOF_
   );
   $bossScript->execute();
 } # doInstall
 
 
 #########################################################################
 # * step: cleanup [workhorse]
 sub doCleanup {
   my $runDir = "$buildDir";
   my $whatItDoes = "It cleans up or compresses intermediate files.";
   my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $workhorse,
 				      $runDir, $whatItDoes);
   $bossScript->add(<<_EOF_
 bzip2 *.bigDbSnp
 rm -rf merged splitProcessed joined
 _EOF_
   );
   $bossScript->execute();
 } # doCleanup
 
 
 #########################################################################
 # main
 
 # Prevent "Suspended (tty input)" hanging:
 HgAutomate::closeStdin();
 
 # Make sure we have valid options and exactly 1 argument:
 checkOptions();
 usage(1) if (scalar(@ARGV) != 3);
 $secondsStart = `date "+%s"`;
 chomp $secondsStart;
 ($topDir, $buildId, $freqSourceOrder) = @ARGV;
 
 # Establish what directory we will work in.
 my $date = `date +%Y-%m-%d`;
 chomp $date;
 $buildDir = $opt_buildDir ? $opt_buildDir : "$topDir/bigDbSnp.$date";
 
 $outRoot .= $buildId;
 $jsonDir = "$topDir/json";
 @dbList = map { grcToDb($_); } split(',', $assemblyList);
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";
 
 $secondsEnd = `date "+%s"`;
 chomp $secondsEnd;
 my $elapsedSeconds = $secondsEnd - $secondsStart;
 my $elapsedMinutes = int($elapsedSeconds/60);
 $elapsedSeconds -= $elapsedMinutes * 60;
 
 HgAutomate::verbose(1,
 	"\n *** All done !$upThrough  Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s\n");
 HgAutomate::verbose(1,
 	" *** Steps were performed in $buildDir\n");
 HgAutomate::verbose(1, "\n");