src/hg/utils/automation/doBigDbSnp.pl ca3f7e9e7bebb3ea6f35844fc03c90a988eb1330

ca3f7e9e7bebb3ea6f35844fc03c90a988eb1330
angie
  Fri Oct 18 15:02:51 2019 -0700
bigDbSnp: add Common, ClinVar, Mult subset tracks to bigBed and install steps.  refs #23283

diff --git src/hg/utils/automation/doBigDbSnp.pl src/hg/utils/automation/doBigDbSnp.pl
index e57d99e..9a6f744 100755
--- src/hg/utils/automation/doBigDbSnp.pl
+++ src/hg/utils/automation/doBigDbSnp.pl
@@ -1,644 +1,675 @@
 #!/usr/bin/env perl
 
 # DO NOT EDIT the /cluster/bin/scripts copy of this file --
 # edit ~/kent/src/hg/utils/automation/doBigDbSnp.pl instead.
 
 # Copyright (C) 2019 The Regents of the University of California
 
 use Getopt::Long;
 use warnings;
 use strict;
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_assemblyList
     $opt_buildDir
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'split',   func => \&doSplit },
       { name => 'convert',   func => \&doConvert },
       { name => 'mergeToChrom',   func => \&doMergeToChrom },
       { name => 'mergeChroms',   func => \&doMergeChroms },
       { name => 'fixHg19ChrM',   func => \&doFixHg19ChrM },
       { name => 'check',   func => \&doCheck },
       { name => 'bigBed',   func => \&doBigBed },
       { name => 'install',   func => \&doInstall },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Files that must exist in $topDir:
 my $refSeqToUcsc = 'refSeqToUcsc.tab';
 my $equivRegions = 'equivRegions.tab';
 
 # Option defaults:
 my $assemblyList = 'GRCh37.p13,GRCh38.p12';
 my $dbHost = 'hgwdev';
 my $bigClusterHub = 'ku';
 my $smallClusterHub = 'hgwdev';
 my $workhorse = 'hgwdev';
 
 my $outRoot = 'dbSnp';
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base topDir buildId freqSourceOrder
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -assemblyList list    Comma-separated list of assemblies used by dbSNP
                           default: $assemblyList
     -buildDir dir         Use dir instead of default topDir/bigDbSnp.\$date
                           (necessary when continuing at a later date).
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => $workhorse,
 						'fileServer' => '',
 						'bigClusterHub' => $bigClusterHub,
 						'smallClusterHub' => $smallClusterHub);
   print STDERR "
 Convert dbSNP JSON into bigDbSnp and associated track files.
 
 topDir is usually /hive/data/outside/dbSNP/NNN where NNN is 152 or greater.
 topDir is expected to have a subdirectory json in which refsnp-*.json.bz2
 files have already been downloaded, as well as files $refSeqToUcsc and $equivRegions
 (see usage statement for dbSnpJsonToTab).
 
 buildId is usually NNN where NNN is 152 or greater, same as topDir; it can also have a
 suffix to distinguish it, e.g. 152Test.  The names of all result files contain $outRoot\$buildId.
 
 freqSourceOrder is a comma-separated list of projects that submit frequency data to dbSNP
 (see usage statement for dbSnpJsonToTab).
 
 Steps:
     split: splits refsnp-*.json.bz2 files into chunks of 100,000 lines.
     convert: runs dbSnpJsonToTab on chunks.
     mergeToChrom: merges chunk result files into per-chrom results files.
     mergeChroms: merges per-chrom results files.
     fixHg19ChrM: if annotations on hg19 are included, then liftOver NC_012920 to hg19 chrM.
     check: runs checkBigDbSnp to add ucscNotes about overlapping items and clustering anomalies.
     bigBed: Converts BED4+ .bigDbSnp files into bigBed.
     install: installs links to files in /gbdb.
     cleanup: Removes or compresses intermediate files.
 All operations are performed in the build directory which is
 topDir/bigDbSnp.\$date unless -buildDir is given.
 ";
   # Detailed help (-help):
   print STDERR "
 Assumptions:
 1. $HgAutomate::clusterData/\$db/\$db.2bit contains sequence for \$db.
 2. topDir/json/ contains downloaded files refsnp-*.json.bz2
 3. topDir/ contains files refSeqToUcsc.tab and equivRegions.tab - see dbSnpJsonToTab usage
 " if ($detailed);
   print "\n";
   exit $status;
 }
 
 
 # Globals:
 # Command line args: db
 my ($topDir, $buildId, $freqSourceOrder);
 # Other:
 my ($buildDir, $jsonDir, @dbList, $secondsStart, $secondsEnd);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
                       'assemblyList=s',
 		      'buildDir=s',
                       'buildId=s',
                       'freqSourceOrder=s',
 		      @HgAutomate::commonOptionSpec,
 		      );
   usage(1) if (!$ok);
   usage(0, 1) if ($opt_help);
   if ($opt_assemblyList) {
     $assemblyList= $opt_assemblyList;
   }
   # buildDir default depends on topDir (undetermined at this point) and is handled in main
   HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
 }
 
 sub grcToDb($) {
   # dbSNP is only ever going to produce JSON for various patch levels of GRCh38 and 37.
   my ($grc) = @_;
   my $db;
   if ($grc =~ /^GRCh38/) {
     $db = 'hg38';
   } elsif ($grc =~ /^GRCh37/) {
     $db = 'hg19';
   } else {
     die "Expected GRC assembly to start with 'GRCh37' or 'GRCh38' but got '$grc'";
   }
   return $db;
 }
 
 #########################################################################
 # * step: split [smallCluster]
 sub doSplit {
   my $runDir = "$buildDir/run.split";
   HgAutomate::mustMkdir($runDir);
   my $outDir = "$buildDir/split";
   HgAutomate::mustMkdir($outDir);
 
   my $splitScript = "$runDir/splitJson.sh";
   my $fh = HgAutomate::mustOpen(">$splitScript");
   print $fh <<EOF
 #!/bin/bash
 set -beEu -o pipefail
 jsonIn=\$1
 N=100000
 prefix=$outDir/\$(basename \$jsonIn .json.bz2)
 
 bzcat \$jsonIn | split -l \$N --filter='bzip2 > \$FILE.bz2' - \$prefix
 EOF
     ;
   close($fh);
   system("chmod a+x $splitScript") == 0 || die "Unable to chmod $splitScript";
   HgAutomate::makeGsub($runDir, "$splitScript {check in exists+ \$(path1)}");
 
   my $whatItDoes = "It splits per-chrom JSON files into 100,000 line chunks.";
   my $bossScript = new HgRemoteScript("$runDir/doSplit.csh", $smallClusterHub,
 				      $runDir, $whatItDoes);
   my $paraRun = HgAutomate::paraRun();
   my $gensub2 = HgAutomate::gensub2();
   $bossScript->add(<<_EOF_
 ls -1S $jsonDir/refsnp-{chr*,other}.json.bz2 > jsonList
 $gensub2 jsonList single gsub jobList
 $paraRun
 _EOF_
     );
   $bossScript->execute();
 } # doSplit
 
 
 #########################################################################
 # * step: convert [bigClusterHub]
 sub doConvert {
   my $runDir = "$buildDir/run.convert";
   HgAutomate::mustMkdir($runDir);
   my $outDir = "$buildDir/splitProcessed";
   HgAutomate::mustMkdir($outDir);
 
   my $convertScript = "$runDir/jsonToTab.sh";
   my $fh = HgAutomate::mustOpen(">$convertScript");
   print $fh <<EOF
 #!/bin/bash
 set -beEu -o pipefail
 # jsonIn needs to be absolute path
 jsonIn=\$1
 
 tmpDir=\$(mktemp -d /dev/shm/dbSnpJsonToTab.XXXXXXXX)
 pushd \$tmpDir
 
 outRoot=\$(basename \$jsonIn .bz2)
 chromOutDir=$outDir/\$(echo \$outRoot | sed -e 's/..\$//;')
 
 bzcat \$jsonIn \\
 | dbSnpJsonToTab -freqSourceOrder=$freqSourceOrder \\
     -equivRegions=$topDir/$equivRegions \\
     $assemblyList $topDir/$refSeqToUcsc stdin \$outRoot
 
 # For sorting.  I expected that this would be set already from my shell, but apparently not:
 export LC_COLLATE=C
 
 # Discard the last two bigDbSnp columns -- they only have 0s.  The real values will be added
 # later by bedJoinTabOffset.
 EOF
     ;
   foreach my $grc (split(',', $assemblyList)) {
     my $db = grcToDb($grc);
     print $fh <<EOF
 cut -f1-15  \$outRoot.$grc.bigDbSnp \\
 | sort -k1,1 -k2n,2n \\
 | bzip2 \\
   > \$outRoot.$db.sorted.bigDbSnp.bz2
 EOF
       ;
   }
   print $fh <<EOF
 
 sort \${outRoot}Details.tab | bzip2 > \${outRoot}Details.tab.bz2
 sort -u \${outRoot}Failed.json | bzip2 > \${outRoot}Failed.json.bz2
 sort \${outRoot}Errors.tab | bzip2 > \${outRoot}Errors.tab.bz2
 sort \${outRoot}Merged.tab | bzip2 > \${outRoot}Merged.tab.bz2
 
 popd
 
 mkdir -p \$chromOutDir
 cp -p \$tmpDir/\$outRoot*.bz2 \$chromOutDir/
 
 rm -rf \$tmpDir
 EOF
     ;
   close($fh);
   system("chmod a+x $convertScript") == 0 || die "Unable to chmod $convertScript";
 
   my $whatItDoes = "It converts dbSNP JSON to bigDbSnp, dbSnpDetails and other files.";
   my $bossScript = new HgRemoteScript("$runDir/doConvert.csh", $bigClusterHub,
 				      $runDir, $whatItDoes);
 
   HgAutomate::makeGsub($runDir, "$convertScript {check in exists+ \$(path1)}");
   my $paraRun = HgAutomate::paraRun();
   my $gensub2 = HgAutomate::gensub2();
   $bossScript->add(<<_EOF_
 ls -1S $buildDir/split/ref*.bz2 > splitList
 $gensub2 splitList single gsub jobList
 $paraRun
 _EOF_
   );
   $bossScript->execute();
 } # doConvert
 
 
 #########################################################################
 # * step: mergeToChrom [smallClusterHub]
 sub doMergeToChrom {
   my $runDir = "$buildDir/run.mergeToChrom";
   HgAutomate::mustMkdir($runDir);
   my $outDir = "$buildDir/mergedToChrom";
   HgAutomate::mustMkdir($outDir);
 
   my $sortMergeBzBedScript = "$runDir/sortMergeBzBed.sh";
   my $fh = HgAutomate::mustOpen(">$sortMergeBzBedScript");
   print $fh <<EOF
 #!/bin/bash
 set -beEu -o pipefail
 bzBedList=\$1
 outFile=\$2
 
 tmpDir=\$(mktemp -d /dev/shm/dbSnpMergeSortBed.XXXXXXXX)
 pushd \$tmpDir
 
 cp /dev/null bedList
 for bz in \$(cat \$bzBedList); do
     bed=\$(basename \$bz .bz2)
     bzcat \$bz > \$bed
     echo \$bed >> bedList
 done
 
 export LC_COLLATE=C
 sort --merge -k1,1 -k2n,2n \$(cat bedList) > \$outFile
 
 popd
 rm -rf \$tmpDir
 EOF
     ;
   close($fh);
   system("chmod a+x $sortMergeBzBedScript") == 0 || die "Unable to chmod $sortMergeBzBedScript";
 
   my $sortMergeBzScript = "$runDir/sortMergeBz.sh";
   $fh = HgAutomate::mustOpen(">$sortMergeBzScript");
   print $fh <<EOF
 #!/bin/bash
 set -beEu -o pipefail
 bzList=\$1
 outFile=\$2
 
 tmpDir=\$(mktemp -d /dev/shm/dbSnpMergeSort.XXXXXXXX)
 pushd \$tmpDir
 
 cp /dev/null txtList
 for bz in \$(cat \$bzList); do
     txt=\$(basename \$bz .bz2)
     bzcat \$bz > \$txt
     echo \$txt >> txtList
 done
 
 export LC_COLLATE=C
 sort --merge -u \$(cat txtList) > \$outFile
 
 popd
 rm -rf \$tmpDir
 EOF
     ;
   close($fh);
   system("chmod a+x $sortMergeBzScript") == 0 || die "Unable to chmod $sortMergeBzScript";
 
   my $uniqBzScript = "$runDir/uniqBz.sh";
   $fh = HgAutomate::mustOpen(">$uniqBzScript");
   print $fh <<EOF
 #!/bin/bash
 set -beEu -o pipefail
 bzList=\$1
 outFile=\$2
 
 bzcat \$(cat \$bzList) | uniq | bzip2 > \$outFile
 EOF
     ;
   close($fh);
   system("chmod a+x $uniqBzScript") == 0 || die "Unable to chmod $uniqBzScript";
 
   my $whatItDoes = "It merge-sorts the results from split-up JSON files into per-chromosome files.";
   my $bossScript = newBash HgRemoteScript("$runDir/doMergeToChrom.sh", $smallClusterHub,
                                           $runDir, $whatItDoes);
 
   my $paraRun = HgAutomate::paraRun();
   $bossScript->add(<<_EOF_
 # One merge per "chrom" per type of dbSnpJsonToTab output
 for jsonFile in \$(ls -1S $jsonDir/refsnp-{chr*,other}.json.bz2); do
     prefix=\$(basename \$jsonFile .json.bz2)
     echo \$prefix
     ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??.hg19.* > \$prefix.hg19.list
     ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??.hg38.* > \$prefix.hg38.list
     ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Details.* > \$prefix.details.list
     ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Errors.* > \$prefix.errors.list
     ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Failed.* > \$prefix.failed.list
     ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Merged.* > \$prefix.merged.list
 done
 
 cp /dev/null jobList
 for list in *.{hg19,hg38}.list; do
     prefix=\$(basename \$list .list)
     echo "./sortMergeBzBed.sh {check in line+ \$PWD/\$list} {check out line+ $outDir/\$prefix.bigDbSnp}" >> jobList
 done
 for list in *.details.list; do
     prefix=\$(basename \$list .list)
     echo "./sortMergeBz.sh {check in line+ \$PWD/\$list} {check out line+ $outDir/\$prefix.tab}" >> jobList
 done
 # OK for these to be empty (check out line instead of line+):
 for list in *.errors.list; do
     prefix=\$(basename \$list .list)
     echo "./sortMergeBz.sh {check in line+ \$PWD/\$list} {check out line $outDir/\$prefix.tab}" >> jobList
 done
 for list in *.merged.list; do
     prefix=\$(basename \$list .list)
     echo "./sortMergeBz.sh {check in line+ \$PWD/\$list} {check out line $outDir/\$prefix.tab}" >> jobList
 done
 for list in *.failed.list; do
     prefix=\$(basename \$list .list)
     echo "./uniqBz.sh {check in line+ \$PWD/\$list} {check out exists+ $outDir/\$prefix.json.bz2}" >> jobList
 done
 $paraRun;
 _EOF_
   );
   $bossScript->execute();
 } # doMergeToChrom
 
 
 #########################################################################
 # * step: mergeChroms [workhorse]
 sub doMergeChroms {
   my $runDir = $buildDir;
   my $inDir = "mergedToChrom";
   HgAutomate::mustMkdir("$runDir/joined");
 
   my $whatItDoes = "It merges chrom-level result files.";
   my $bossScript = newBash HgRemoteScript("$runDir/doMergeChroms.sh", $workhorse,
                                           $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 # Merge all chroms' *Merged.tab to the final Merged.tab file in background
 time sort --merge -u $inDir/*.merged.tab > ${outRoot}Merged.tab &
 
 # Merge all chroms' *Details.tab to the final Details.tab file
 time sort --merge -u $inDir/*.details.tab > ${outRoot}Details.tab
 
 # Compress Details.tab with bgzip in background.  For now, leave original file uncompressed.
 time bgzip -iI ${outRoot}Details.tab.gz.gzi -c ${outRoot}Details.tab > ${outRoot}Details.tab.gz &
 
 # parallel job of bedJoinTabOffset on each chrom's .bigDbSnp and ${outRoot}Details.tab
 # bedJoinTabOffset builds a massive hash in memory (file offsets of >650M lines of Details),
 # so limit the number of concurrent processes to 10.
 time (ls -1S $inDir/refsnp-*.*.bigDbSnp |
       parallel --max-procs 10 --ungroup \\
         bedJoinTabOffset -verbose=2 ${outRoot}Details.tab {} joined/{/})
 
 # Now mergeSort all chrom's data together.  Don't use sort -u because with -k it only
 # compares keys, not the whole line.
 pids=""
 _EOF_
   );
   foreach my $db (@dbList) {
     $bossScript->add(<<_EOF_
 (time sort --merge -k1,1 -k2n,2n joined/*.$db.bigDbSnp | uniq > $db.$outRoot.bigDbSnp) &
 echo \$!
 pids+=" \$!"
 _EOF_
                     );
   }
   $bossScript->add(<<_EOF_
 for pid in \$pids; do
   if wait \$pid; then
     echo pid \$pid done
   else
     echo pid \$pid FAILED
     exit 1
   fi
 done
 _EOF_
                   );
   $bossScript->execute();
 } # doMergeChroms
 
 
 #########################################################################
 # * step: fixHg19ChrM [workhorse]
 sub doFixHg19ChrM {
   my $runDir = $buildDir;
   if (grep(/hg19/, @dbList)) {
     my $whatItDoes = "It does a liftOver from NC_012920.1 to hg19 chrM.";
     my $bossScript = newBash HgRemoteScript("$runDir/doFixHg19ChrM.sh", $workhorse,
                                             $runDir, $whatItDoes);
     $bossScript->add(<<_EOF_
 # For hg19, liftOver NC_012920.1 annotations to hg19 chrM.
 sed -e 's/NC_012920 /NC_012920.1 /' \\
   /hive/data/outside/dbSNP/131/human/NC_012920ToChrM.over.chain \\
   > hg19.mitoLiftover.chain
 # For liftOver, convert 0-base fully-closed to 0-based half-open because liftOver
 # doesn't deal with 0-base items.
 mv hg19.$outRoot.bigDbSnp hg19.preChrMFix.$outRoot.bigDbSnp
 time (grep ^NC_012920 hg19.preChrMFix.$outRoot.bigDbSnp \\
       | awk -F"\t" 'BEGIN{OFS="\t";} {\$3 += 1; print;}' \\
       | liftOver -tab -bedPlus=3 stdin \\
           hg19.mitoLiftover.chain stdout chrM.unmapped \\
       | awk -F"\t" 'BEGIN{OFS="\t";} {\$3 -= 1; print;}' \\
       | sort -k2n,2n \\
         > hg19.chrM.$outRoot.bigDbSnp)
 wc -l hg19.chrM.$outRoot.bigDbSnp chrM.unmapped
 time grep -v ^NC_012920 hg19.preChrMFix.$outRoot.bigDbSnp \\
      | sort --merge -k1,1 -k2n,2n - hg19.chrM.$outRoot.bigDbSnp \\
        > hg19.$outRoot.bigDbSnp
 _EOF_
                     );
     $bossScript->execute()
   };
 } # doFixHg19ChrM
 
 
 #########################################################################
 # * step: check [workhorse]
 sub doCheck {
   my $runDir = $buildDir;
 
   my $whatItDoes = "It runs checkBigDbSnp on merged bigDbSnp files.";
   my $bossScript = newBash HgRemoteScript("$runDir/doCheck.sh", $workhorse,
                                           $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 pids=""
 _EOF_
                   );
   foreach my $db (@dbList) {
     $bossScript->add(<<_EOF_
 time checkBigDbSnp $db.$outRoot.bigDbSnp $HgAutomate::clusterData/$db/$db.2bit $db.$outRoot.checked.bigDbSnp &
 echo \$!
 pids+=" \$!"
 _EOF_
                     );
   }
   $bossScript->add(<<_EOF_
 for pid in \$pids; do
   if wait \$pid; then
     echo pid \$pid done
   else
     echo pid \$pid FAILED
     exit 1
   fi
 done
 _EOF_
                   );
   $bossScript->execute();
-} # doBigBed
+} # doCheck
 
 
 #########################################################################
 # * step: bigBed [workhorse]
 sub doBigBed {
   my $runDir = $buildDir;
 
-  my $whatItDoes = "It runs bedToBigBed on merged & checked bigDbSnp files.";
+  # Helper script to make Mult, Common and ClinVar subsets and convert to bigBed for one db.
+  my $makeSubsetsScript = "$runDir/makeSubsets.sh";
+  my $fh = HgAutomate::mustOpen(">$makeSubsetsScript");
+  print $fh <<_EOF_
+#!/bin/bash
+set -beEu -o pipefail
+db=\$1
+time $Bin/categorizeBigDbSnp.pl \$db \$db.$outRoot.checked.bigDbSnp
+pids=""
+for subset in Mult Common ClinVar; do
+  time bedToBigBed -tab -as=\$HOME/kent/src/hg/lib/bigDbSnp.as -type=bed4+ -extraIndex=name \\
+            \$db.\$subset.bigDbSnp /hive/data/genomes/\$db/chrom.sizes \$db.$outRoot.\$subset.bb &
+  pids+=" \$!";
+done
+for pid in \$pids; do
+  if wait \$pid; then
+    echo pid \$pid done
+  else
+    echo pid \$pid FAILED
+    exit 1
+  fi
+done
+_EOF_
+    ;
+  close($fh);
+  system("chmod a+x $makeSubsetsScript") == 0 || die "Unable to chmod $makeSubsetsScript";
+
+  my $whatItDoes = "It runs bedToBigBed on merged & checked bigDbSnp files and makes ".
+        "Mult, Common and ClinVar subsets.";
   my $bossScript = newBash HgRemoteScript("$runDir/doBigBed.sh", $workhorse,
                                           $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 pids=""
 _EOF_
                   );
   foreach my $db (@dbList) {
     $bossScript->add(<<_EOF_
 time bedToBigBed -tab -as=\$HOME/kent/src/hg/lib/bigDbSnp.as -type=bed4+ -extraIndex=name \\
             $db.$outRoot.checked.bigDbSnp /hive/data/genomes/$db/chrom.sizes $db.$outRoot.bb &
-
-echo \$!
+pids+=" \$!"
+$makeSubsetsScript $db &
 pids+=" \$!"
 _EOF_
                     );
   }
   $bossScript->add(<<_EOF_
 for pid in \$pids; do
   if wait \$pid; then
     echo pid \$pid done
   else
     echo pid \$pid FAILED
     exit 1
   fi
 done
 _EOF_
                   );
   $bossScript->execute();
 } # doBigBed
 
 
 #########################################################################
 # * step: install [dbHost]
 sub doInstall {
   my $runDir = $buildDir;
 
   my $whatItDoes = "It installs files in /gbdb.";
-  my $bossScript = new HgRemoteScript("$runDir/doInstall.csh", $workhorse,
+  my $bossScript = newBash HgRemoteScript("$runDir/doInstall.sh", $workhorse,
 				      $runDir, $whatItDoes);
 
   foreach my $db (@dbList) {
     $bossScript->add(<<_EOF_
 ln -sf $buildDir/$db.$outRoot.bb /gbdb/$db/snp/$outRoot.bb
+for subset in Mult Common ClinVar; do
+  ln -sf $buildDir/$db.$outRoot.\$subset.bb /gbdb/$db/snp/${outRoot}\$subset.bb
+done
 _EOF_
                     );
   }
   $bossScript->add(<<_EOF_
 mkdir -p /gbdb/hgFixed/dbSnp
 ln -sf $buildDir/${outRoot}Details.tab* /gbdb/hgFixed/dbSnp/
 _EOF_
   );
   $bossScript->execute();
 } # doInstall
 
 
 #########################################################################
 # * step: cleanup [workhorse]
 sub doCleanup {
   my $runDir = "$buildDir";
   my $whatItDoes = "It cleans up or compresses intermediate files.";
   my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $workhorse,
 				      $runDir, $whatItDoes);
   $bossScript->add(<<_EOF_
 bzip2 *.bigDbSnp
 rm -rf merged splitProcessed joined
 _EOF_
   );
   $bossScript->execute();
 } # doCleanup
 
 
 #########################################################################
 # main
 
 # Prevent "Suspended (tty input)" hanging:
 HgAutomate::closeStdin();
 
 # Make sure we have valid options and exactly 1 argument:
 checkOptions();
 usage(1) if (scalar(@ARGV) != 3);
 $secondsStart = `date "+%s"`;
 chomp $secondsStart;
 ($topDir, $buildId, $freqSourceOrder) = @ARGV;
 
 # Establish what directory we will work in.
 my $date = `date +%Y-%m-%d`;
 chomp $date;
 $buildDir = $opt_buildDir ? $opt_buildDir : "$topDir/bigDbSnp.$date";
 
 $outRoot .= $buildId;
 $jsonDir = "$topDir/json";
 @dbList = map { grcToDb($_); } split(',', $assemblyList);
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";
 
 $secondsEnd = `date "+%s"`;
 chomp $secondsEnd;
 my $elapsedSeconds = $secondsEnd - $secondsStart;
 my $elapsedMinutes = int($elapsedSeconds/60);
 $elapsedSeconds -= $elapsedMinutes * 60;
 
 HgAutomate::verbose(1,
 	"\n *** All done !$upThrough  Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s\n");
 HgAutomate::verbose(1,
 	" *** Steps were performed in $buildDir\n");
 HgAutomate::verbose(1, "\n");