src/hg/utils/automation/doBigDbSnp.pl b49e61be4ad54a46e01be904fa8a8985e9850f0d

b49e61be4ad54a46e01be904fa8a8985e9850f0d
angie
  Tue Nov 12 12:27:30 2019 -0800
dbSnp153: add a bigBed4 subtrack of coordinate ranges for mappings that we dropped due to inconsistent SPDI.  refs #23283
Overall counts increased because we used to bail on an entire variant when we discovered an inconsistent SPDI,
losing some valid mappings.  Now we go through all mappings, and the bad ones are stored instead of dropped.

diff --git src/hg/utils/automation/doBigDbSnp.pl src/hg/utils/automation/doBigDbSnp.pl
index 9a6f744..f3d1e5d 100755
--- src/hg/utils/automation/doBigDbSnp.pl
+++ src/hg/utils/automation/doBigDbSnp.pl
@@ -1,675 +1,698 @@
 #!/usr/bin/env perl
 
 # DO NOT EDIT the /cluster/bin/scripts copy of this file --
 # edit ~/kent/src/hg/utils/automation/doBigDbSnp.pl instead.
 
 # Copyright (C) 2019 The Regents of the University of California
 
 use Getopt::Long;
 use warnings;
 use strict;
 use FindBin qw($Bin);
 use lib "$Bin";
 use HgAutomate;
 use HgRemoteScript;
 use HgStepManager;
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_assemblyList
     $opt_buildDir
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'split',   func => \&doSplit },
       { name => 'convert',   func => \&doConvert },
       { name => 'mergeToChrom',   func => \&doMergeToChrom },
       { name => 'mergeChroms',   func => \&doMergeChroms },
       { name => 'fixHg19ChrM',   func => \&doFixHg19ChrM },
       { name => 'check',   func => \&doCheck },
       { name => 'bigBed',   func => \&doBigBed },
       { name => 'install',   func => \&doInstall },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Files that must exist in $topDir:
 my $refSeqToUcsc = 'refSeqToUcsc.tab';
 my $equivRegions = 'equivRegions.tab';
 
 # Option defaults:
 my $assemblyList = 'GRCh37.p13,GRCh38.p12';
 my $dbHost = 'hgwdev';
 my $bigClusterHub = 'ku';
 my $smallClusterHub = 'hgwdev';
 my $workhorse = 'hgwdev';
 
 my $outRoot = 'dbSnp';
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base topDir buildId freqSourceOrder
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -assemblyList list    Comma-separated list of assemblies used by dbSNP
                           default: $assemblyList
     -buildDir dir         Use dir instead of default topDir/bigDbSnp.\$date
                           (necessary when continuing at a later date).
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => $workhorse,
 						'fileServer' => '',
 						'bigClusterHub' => $bigClusterHub,
 						'smallClusterHub' => $smallClusterHub);
   print STDERR "
 Convert dbSNP JSON into bigDbSnp and associated track files.
 
 topDir is usually /hive/data/outside/dbSNP/NNN where NNN is 152 or greater.
 topDir is expected to have a subdirectory json in which refsnp-*.json.bz2
 files have already been downloaded, as well as files $refSeqToUcsc and $equivRegions
 (see usage statement for dbSnpJsonToTab).
 
 buildId is usually NNN where NNN is 152 or greater, same as topDir; it can also have a
 suffix to distinguish it, e.g. 152Test.  The names of all result files contain $outRoot\$buildId.
 
 freqSourceOrder is a comma-separated list of projects that submit frequency data to dbSNP
 (see usage statement for dbSnpJsonToTab).
 
 Steps:
     split: splits refsnp-*.json.bz2 files into chunks of 100,000 lines.
     convert: runs dbSnpJsonToTab on chunks.
     mergeToChrom: merges chunk result files into per-chrom results files.
     mergeChroms: merges per-chrom results files.
     fixHg19ChrM: if annotations on hg19 are included, then liftOver NC_012920 to hg19 chrM.
     check: runs checkBigDbSnp to add ucscNotes about overlapping items and clustering anomalies.
     bigBed: Converts BED4+ .bigDbSnp files into bigBed.
     install: installs links to files in /gbdb.
     cleanup: Removes or compresses intermediate files.
 All operations are performed in the build directory which is
 topDir/bigDbSnp.\$date unless -buildDir is given.
 ";
   # Detailed help (-help):
   print STDERR "
 Assumptions:
 1. $HgAutomate::clusterData/\$db/\$db.2bit contains sequence for \$db.
 2. topDir/json/ contains downloaded files refsnp-*.json.bz2
 3. topDir/ contains files refSeqToUcsc.tab and equivRegions.tab - see dbSnpJsonToTab usage
 " if ($detailed);
   print "\n";
   exit $status;
 }
 
 
 # Globals:
 # Command line args: db
 my ($topDir, $buildId, $freqSourceOrder);
 # Other:
 my ($buildDir, $jsonDir, @dbList, $secondsStart, $secondsEnd);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
                       'assemblyList=s',
 		      'buildDir=s',
                       'buildId=s',
                       'freqSourceOrder=s',
 		      @HgAutomate::commonOptionSpec,
 		      );
   usage(1) if (!$ok);
   usage(0, 1) if ($opt_help);
   if ($opt_assemblyList) {
     $assemblyList= $opt_assemblyList;
   }
   # buildDir default depends on topDir (undetermined at this point) and is handled in main
   HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
 }
 
 sub grcToDb($) {
   # dbSNP is only ever going to produce JSON for various patch levels of GRCh38 and 37.
   my ($grc) = @_;
   my $db;
   if ($grc =~ /^GRCh38/) {
     $db = 'hg38';
   } elsif ($grc =~ /^GRCh37/) {
     $db = 'hg19';
   } else {
     die "Expected GRC assembly to start with 'GRCh37' or 'GRCh38' but got '$grc'";
   }
   return $db;
 }
 
 #########################################################################
 # * step: split [smallCluster]
 sub doSplit {
   my $runDir = "$buildDir/run.split";
   HgAutomate::mustMkdir($runDir);
   my $outDir = "$buildDir/split";
   HgAutomate::mustMkdir($outDir);
 
   my $splitScript = "$runDir/splitJson.sh";
   my $fh = HgAutomate::mustOpen(">$splitScript");
   print $fh <<EOF
 #!/bin/bash
 set -beEu -o pipefail
 jsonIn=\$1
 N=100000
 prefix=$outDir/\$(basename \$jsonIn .json.bz2)
 
 bzcat \$jsonIn | split -l \$N --filter='bzip2 > \$FILE.bz2' - \$prefix
 EOF
     ;
   close($fh);
   system("chmod a+x $splitScript") == 0 || die "Unable to chmod $splitScript";
   HgAutomate::makeGsub($runDir, "$splitScript {check in exists+ \$(path1)}");
 
   my $whatItDoes = "It splits per-chrom JSON files into 100,000 line chunks.";
   my $bossScript = new HgRemoteScript("$runDir/doSplit.csh", $smallClusterHub,
 				      $runDir, $whatItDoes);
   my $paraRun = HgAutomate::paraRun();
   my $gensub2 = HgAutomate::gensub2();
   $bossScript->add(<<_EOF_
 ls -1S $jsonDir/refsnp-{chr*,other}.json.bz2 > jsonList
 $gensub2 jsonList single gsub jobList
 $paraRun
 _EOF_
     );
   $bossScript->execute();
 } # doSplit
 
 
 #########################################################################
 # * step: convert [bigClusterHub]
 sub doConvert {
   my $runDir = "$buildDir/run.convert";
   HgAutomate::mustMkdir($runDir);
   my $outDir = "$buildDir/splitProcessed";
   HgAutomate::mustMkdir($outDir);
 
   my $convertScript = "$runDir/jsonToTab.sh";
   my $fh = HgAutomate::mustOpen(">$convertScript");
   print $fh <<EOF
 #!/bin/bash
 set -beEu -o pipefail
 # jsonIn needs to be absolute path
 jsonIn=\$1
 
 tmpDir=\$(mktemp -d /dev/shm/dbSnpJsonToTab.XXXXXXXX)
 pushd \$tmpDir
 
 outRoot=\$(basename \$jsonIn .bz2)
 chromOutDir=$outDir/\$(echo \$outRoot | sed -e 's/..\$//;')
 
 bzcat \$jsonIn \\
 | dbSnpJsonToTab -freqSourceOrder=$freqSourceOrder \\
     -equivRegions=$topDir/$equivRegions \\
     $assemblyList $topDir/$refSeqToUcsc stdin \$outRoot
 
 # For sorting.  I expected that this would be set already from my shell, but apparently not:
 export LC_COLLATE=C
 
 # Discard the last two bigDbSnp columns -- they only have 0s.  The real values will be added
 # later by bedJoinTabOffset.
 EOF
     ;
   foreach my $grc (split(',', $assemblyList)) {
     my $db = grcToDb($grc);
     print $fh <<EOF
 cut -f1-15  \$outRoot.$grc.bigDbSnp \\
 | sort -k1,1 -k2n,2n \\
 | bzip2 \\
   > \$outRoot.$db.sorted.bigDbSnp.bz2
+sort -k1,1 -k2n,2n \$outRoot.$grc.badCoords.bed \\
+| bzip2 \\
+  > \$outRoot.$db.sorted.badCoords.bed.bz2
 EOF
       ;
   }
   print $fh <<EOF
 
 sort \${outRoot}Details.tab | bzip2 > \${outRoot}Details.tab.bz2
-sort -u \${outRoot}Failed.json | bzip2 > \${outRoot}Failed.json.bz2
 sort \${outRoot}Errors.tab | bzip2 > \${outRoot}Errors.tab.bz2
 sort \${outRoot}Merged.tab | bzip2 > \${outRoot}Merged.tab.bz2
+sort \${outRoot}Warnings.tab | bzip2 > \${outRoot}Warnings.tab.bz2
 
 popd
 
 mkdir -p \$chromOutDir
 cp -p \$tmpDir/\$outRoot*.bz2 \$chromOutDir/
 
 rm -rf \$tmpDir
 EOF
     ;
   close($fh);
   system("chmod a+x $convertScript") == 0 || die "Unable to chmod $convertScript";
 
   my $whatItDoes = "It converts dbSNP JSON to bigDbSnp, dbSnpDetails and other files.";
   my $bossScript = new HgRemoteScript("$runDir/doConvert.csh", $bigClusterHub,
 				      $runDir, $whatItDoes);
 
   HgAutomate::makeGsub($runDir, "$convertScript {check in exists+ \$(path1)}");
   my $paraRun = HgAutomate::paraRun();
   my $gensub2 = HgAutomate::gensub2();
   $bossScript->add(<<_EOF_
 ls -1S $buildDir/split/ref*.bz2 > splitList
 $gensub2 splitList single gsub jobList
 $paraRun
 _EOF_
   );
   $bossScript->execute();
 } # doConvert
 
 
 #########################################################################
 # * step: mergeToChrom [smallClusterHub]
 sub doMergeToChrom {
   my $runDir = "$buildDir/run.mergeToChrom";
   HgAutomate::mustMkdir($runDir);
   my $outDir = "$buildDir/mergedToChrom";
   HgAutomate::mustMkdir($outDir);
 
   my $sortMergeBzBedScript = "$runDir/sortMergeBzBed.sh";
   my $fh = HgAutomate::mustOpen(">$sortMergeBzBedScript");
   print $fh <<EOF
 #!/bin/bash
 set -beEu -o pipefail
 bzBedList=\$1
 outFile=\$2
 
 tmpDir=\$(mktemp -d /dev/shm/dbSnpMergeSortBed.XXXXXXXX)
 pushd \$tmpDir
 
 cp /dev/null bedList
 for bz in \$(cat \$bzBedList); do
     bed=\$(basename \$bz .bz2)
     bzcat \$bz > \$bed
     echo \$bed >> bedList
 done
 
 export LC_COLLATE=C
 sort --merge -k1,1 -k2n,2n \$(cat bedList) > \$outFile
 
 popd
 rm -rf \$tmpDir
 EOF
     ;
   close($fh);
   system("chmod a+x $sortMergeBzBedScript") == 0 || die "Unable to chmod $sortMergeBzBedScript";
 
   my $sortMergeBzScript = "$runDir/sortMergeBz.sh";
   $fh = HgAutomate::mustOpen(">$sortMergeBzScript");
   print $fh <<EOF
 #!/bin/bash
 set -beEu -o pipefail
 bzList=\$1
 outFile=\$2
 
 tmpDir=\$(mktemp -d /dev/shm/dbSnpMergeSort.XXXXXXXX)
 pushd \$tmpDir
 
 cp /dev/null txtList
 for bz in \$(cat \$bzList); do
     txt=\$(basename \$bz .bz2)
     bzcat \$bz > \$txt
     echo \$txt >> txtList
 done
 
 export LC_COLLATE=C
 sort --merge -u \$(cat txtList) > \$outFile
 
 popd
 rm -rf \$tmpDir
 EOF
     ;
   close($fh);
   system("chmod a+x $sortMergeBzScript") == 0 || die "Unable to chmod $sortMergeBzScript";
 
-  my $uniqBzScript = "$runDir/uniqBz.sh";
-  $fh = HgAutomate::mustOpen(">$uniqBzScript");
-  print $fh <<EOF
-#!/bin/bash
-set -beEu -o pipefail
-bzList=\$1
-outFile=\$2
-
-bzcat \$(cat \$bzList) | uniq | bzip2 > \$outFile
-EOF
-    ;
-  close($fh);
-  system("chmod a+x $uniqBzScript") == 0 || die "Unable to chmod $uniqBzScript";
-
   my $whatItDoes = "It merge-sorts the results from split-up JSON files into per-chromosome files.";
   my $bossScript = newBash HgRemoteScript("$runDir/doMergeToChrom.sh", $smallClusterHub,
                                           $runDir, $whatItDoes);
 
   my $paraRun = HgAutomate::paraRun();
   $bossScript->add(<<_EOF_
 # One merge per "chrom" per type of dbSnpJsonToTab output
 for jsonFile in \$(ls -1S $jsonDir/refsnp-{chr*,other}.json.bz2); do
     prefix=\$(basename \$jsonFile .json.bz2)
     echo \$prefix
-    ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??.hg19.* > \$prefix.hg19.list
-    ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??.hg38.* > \$prefix.hg38.list
+_EOF_
+                  );
+  foreach my $db (@dbList) {
+    $bossScript->add(<<_EOF_
+    ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??.$db.*bigDbSnp* > \$prefix.$db.bigDbSnp.list
+    ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??.$db.*badCoords* > \$prefix.$db.badCoords.list
+_EOF_
+                    );
+  }
+  my $dbListStr = join(',', @dbList);
+  $bossScript->add(<<_EOF_
     ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Details.* > \$prefix.details.list
     ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Errors.* > \$prefix.errors.list
-    ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Failed.* > \$prefix.failed.list
     ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Merged.* > \$prefix.merged.list
+    ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Warnings.* > \$prefix.warnings.list
 done
 
 cp /dev/null jobList
-for list in *.{hg19,hg38}.list; do
-    prefix=\$(basename \$list .list)
+for list in *.{$dbListStr}.bigDbSnp.list; do
+    prefix=\$(basename \$list .bigDbSnp.list)
     echo "./sortMergeBzBed.sh {check in line+ \$PWD/\$list} {check out line+ $outDir/\$prefix.bigDbSnp}" >> jobList
 done
 for list in *.details.list; do
     prefix=\$(basename \$list .list)
     echo "./sortMergeBz.sh {check in line+ \$PWD/\$list} {check out line+ $outDir/\$prefix.tab}" >> jobList
 done
 # OK for these to be empty (check out line instead of line+):
-for list in *.errors.list; do
-    prefix=\$(basename \$list .list)
-    echo "./sortMergeBz.sh {check in line+ \$PWD/\$list} {check out line $outDir/\$prefix.tab}" >> jobList
+for list in *.{$dbListStr}.badCoords.list; do
+    prefix=\$(basename \$list .badCoords.list)
+    echo "./sortMergeBzBed.sh {check in line+ \$PWD/\$list} {check out line $outDir/\$prefix.badCoords.bed}" >> jobList
 done
-for list in *.merged.list; do
+for list in *.errors.list *.merged.list *.warnings.list; do
     prefix=\$(basename \$list .list)
     echo "./sortMergeBz.sh {check in line+ \$PWD/\$list} {check out line $outDir/\$prefix.tab}" >> jobList
 done
-for list in *.failed.list; do
-    prefix=\$(basename \$list .list)
-    echo "./uniqBz.sh {check in line+ \$PWD/\$list} {check out exists+ $outDir/\$prefix.json.bz2}" >> jobList
-done
 $paraRun;
 _EOF_
   );
   $bossScript->execute();
 } # doMergeToChrom
 
 
 #########################################################################
 # * step: mergeChroms [workhorse]
 sub doMergeChroms {
   my $runDir = $buildDir;
   my $inDir = "mergedToChrom";
   HgAutomate::mustMkdir("$runDir/joined");
 
   my $whatItDoes = "It merges chrom-level result files.";
   my $bossScript = newBash HgRemoteScript("$runDir/doMergeChroms.sh", $workhorse,
                                           $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
-# Merge all chroms' *Merged.tab to the final Merged.tab file in background
+# Merge all chroms' *Merged.tab to the final Merged.tab file in background,
+# likewise for errors, warnings, and badCoords which should all be relatively small and quick.
+pids=""
 time sort --merge -u $inDir/*.merged.tab > ${outRoot}Merged.tab &
+pids+=" \$!"
+time sort --merge -u $inDir/*.errors.tab > ${outRoot}Errors.tab &
+pids+=" \$!"
+time sort --merge -u $inDir/*.warnings.tab > ${outRoot}Warnings.tab &
+pids+=" \$!"
+_EOF_
+  );
+  foreach my $db (@dbList) {
+    $bossScript->add(<<_EOF_
+(time sort --merge -k1,1 -k2n,2n $inDir/*.$db.badCoords.bed | uniq > $db.$outRoot.badCoords.bed) &
+pids+=" \$!"
+_EOF_
+                    );
+  }
+  $bossScript->add(<<_EOF_
 
 # Merge all chroms' *Details.tab to the final Details.tab file
 time sort --merge -u $inDir/*.details.tab > ${outRoot}Details.tab
 
-# Compress Details.tab with bgzip in background.  For now, leave original file uncompressed.
+for pid in \$pids; do
+  if wait \$pid; then
+    echo pid \$pid done
+  else
+    echo pid \$pid FAILED
+    exit 1
+  fi
+done
+
+# Compress & index Details.tab with bgzip in background.  Leave original file uncompressed for
+# bedJoinTabOffset.
 time bgzip -iI ${outRoot}Details.tab.gz.gzi -c ${outRoot}Details.tab > ${outRoot}Details.tab.gz &
+pids=\$!
 
 # parallel job of bedJoinTabOffset on each chrom's .bigDbSnp and ${outRoot}Details.tab
 # bedJoinTabOffset builds a massive hash in memory (file offsets of >650M lines of Details),
 # so limit the number of concurrent processes to 10.
 time (ls -1S $inDir/refsnp-*.*.bigDbSnp |
       parallel --max-procs 10 --ungroup \\
         bedJoinTabOffset -verbose=2 ${outRoot}Details.tab {} joined/{/})
 
 # Now mergeSort all chrom's data together.  Don't use sort -u because with -k it only
 # compares keys, not the whole line.
-pids=""
 _EOF_
   );
   foreach my $db (@dbList) {
     $bossScript->add(<<_EOF_
 (time sort --merge -k1,1 -k2n,2n joined/*.$db.bigDbSnp | uniq > $db.$outRoot.bigDbSnp) &
-echo \$!
 pids+=" \$!"
 _EOF_
                     );
   }
   $bossScript->add(<<_EOF_
 for pid in \$pids; do
   if wait \$pid; then
     echo pid \$pid done
   else
     echo pid \$pid FAILED
     exit 1
   fi
 done
 _EOF_
                   );
   $bossScript->execute();
 } # doMergeChroms
 
 
 #########################################################################
 # * step: fixHg19ChrM [workhorse]
 sub doFixHg19ChrM {
   my $runDir = $buildDir;
   if (grep(/hg19/, @dbList)) {
     my $whatItDoes = "It does a liftOver from NC_012920.1 to hg19 chrM.";
     my $bossScript = newBash HgRemoteScript("$runDir/doFixHg19ChrM.sh", $workhorse,
                                             $runDir, $whatItDoes);
     $bossScript->add(<<_EOF_
 # For hg19, liftOver NC_012920.1 annotations to hg19 chrM.
 sed -e 's/NC_012920 /NC_012920.1 /' \\
   /hive/data/outside/dbSNP/131/human/NC_012920ToChrM.over.chain \\
   > hg19.mitoLiftover.chain
 # For liftOver, convert 0-base fully-closed to 0-based half-open because liftOver
 # doesn't deal with 0-base items.
 mv hg19.$outRoot.bigDbSnp hg19.preChrMFix.$outRoot.bigDbSnp
 time (grep ^NC_012920 hg19.preChrMFix.$outRoot.bigDbSnp \\
       | awk -F"\t" 'BEGIN{OFS="\t";} {\$3 += 1; print;}' \\
       | liftOver -tab -bedPlus=3 stdin \\
           hg19.mitoLiftover.chain stdout chrM.unmapped \\
       | awk -F"\t" 'BEGIN{OFS="\t";} {\$3 -= 1; print;}' \\
       | sort -k2n,2n \\
         > hg19.chrM.$outRoot.bigDbSnp)
 wc -l hg19.chrM.$outRoot.bigDbSnp chrM.unmapped
 time grep -v ^NC_012920 hg19.preChrMFix.$outRoot.bigDbSnp \\
      | sort --merge -k1,1 -k2n,2n - hg19.chrM.$outRoot.bigDbSnp \\
        > hg19.$outRoot.bigDbSnp
 _EOF_
                     );
     $bossScript->execute()
   };
 } # doFixHg19ChrM
 
 
 #########################################################################
 # * step: check [workhorse]
 sub doCheck {
   my $runDir = $buildDir;
 
   my $whatItDoes = "It runs checkBigDbSnp on merged bigDbSnp files.";
   my $bossScript = newBash HgRemoteScript("$runDir/doCheck.sh", $workhorse,
                                           $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 pids=""
 _EOF_
                   );
   foreach my $db (@dbList) {
     $bossScript->add(<<_EOF_
 time checkBigDbSnp $db.$outRoot.bigDbSnp $HgAutomate::clusterData/$db/$db.2bit $db.$outRoot.checked.bigDbSnp &
 echo \$!
 pids+=" \$!"
 _EOF_
                     );
   }
   $bossScript->add(<<_EOF_
 for pid in \$pids; do
   if wait \$pid; then
     echo pid \$pid done
   else
     echo pid \$pid FAILED
     exit 1
   fi
 done
 _EOF_
                   );
   $bossScript->execute();
 } # doCheck
 
 
 #########################################################################
 # * step: bigBed [workhorse]
 sub doBigBed {
   my $runDir = $buildDir;
 
   # Helper script to make Mult, Common and ClinVar subsets and convert to bigBed for one db.
   my $makeSubsetsScript = "$runDir/makeSubsets.sh";
   my $fh = HgAutomate::mustOpen(">$makeSubsetsScript");
   print $fh <<_EOF_
 #!/bin/bash
 set -beEu -o pipefail
 db=\$1
 time $Bin/categorizeBigDbSnp.pl \$db \$db.$outRoot.checked.bigDbSnp
 pids=""
 for subset in Mult Common ClinVar; do
   time bedToBigBed -tab -as=\$HOME/kent/src/hg/lib/bigDbSnp.as -type=bed4+ -extraIndex=name \\
             \$db.\$subset.bigDbSnp /hive/data/genomes/\$db/chrom.sizes \$db.$outRoot.\$subset.bb &
   pids+=" \$!";
 done
 for pid in \$pids; do
   if wait \$pid; then
     echo pid \$pid done
   else
     echo pid \$pid FAILED
     exit 1
   fi
 done
 _EOF_
     ;
   close($fh);
   system("chmod a+x $makeSubsetsScript") == 0 || die "Unable to chmod $makeSubsetsScript";
 
   my $whatItDoes = "It runs bedToBigBed on merged & checked bigDbSnp files and makes ".
         "Mult, Common and ClinVar subsets.";
   my $bossScript = newBash HgRemoteScript("$runDir/doBigBed.sh", $workhorse,
                                           $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 pids=""
 _EOF_
                   );
   foreach my $db (@dbList) {
     $bossScript->add(<<_EOF_
 time bedToBigBed -tab -as=\$HOME/kent/src/hg/lib/bigDbSnp.as -type=bed4+ -extraIndex=name \\
             $db.$outRoot.checked.bigDbSnp /hive/data/genomes/$db/chrom.sizes $db.$outRoot.bb &
+time bedToBigBed -tab -type=bed4 -extraIndex=name \\
+            $db.$outRoot.badCoords.bed /hive/data/genomes/$db/chrom.sizes $db.${outRoot}BadCoords.bb &
 pids+=" \$!"
 $makeSubsetsScript $db &
 pids+=" \$!"
 _EOF_
                     );
   }
   $bossScript->add(<<_EOF_
 for pid in \$pids; do
   if wait \$pid; then
     echo pid \$pid done
   else
     echo pid \$pid FAILED
     exit 1
   fi
 done
 _EOF_
                   );
   $bossScript->execute();
 } # doBigBed
 
 
 #########################################################################
 # * step: install [dbHost]
 sub doInstall {
   my $runDir = $buildDir;
 
   my $whatItDoes = "It installs files in /gbdb.";
   my $bossScript = newBash HgRemoteScript("$runDir/doInstall.sh", $workhorse,
 				      $runDir, $whatItDoes);
 
   foreach my $db (@dbList) {
     $bossScript->add(<<_EOF_
 ln -sf $buildDir/$db.$outRoot.bb /gbdb/$db/snp/$outRoot.bb
 for subset in Mult Common ClinVar; do
   ln -sf $buildDir/$db.$outRoot.\$subset.bb /gbdb/$db/snp/${outRoot}\$subset.bb
 done
+ln -sf $buildDir/$db.${outRoot}BadCoords.bb /gbdb/$db/snp/${outRoot}BadCoords.bb
 _EOF_
                     );
   }
   $bossScript->add(<<_EOF_
 mkdir -p /gbdb/hgFixed/dbSnp
 ln -sf $buildDir/${outRoot}Details.tab* /gbdb/hgFixed/dbSnp/
 _EOF_
   );
   $bossScript->execute();
 } # doInstall
 
 
 #########################################################################
 # * step: cleanup [workhorse]
 sub doCleanup {
   my $runDir = "$buildDir";
   my $whatItDoes = "It cleans up or compresses intermediate files.";
   my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $workhorse,
 				      $runDir, $whatItDoes);
   $bossScript->add(<<_EOF_
 bzip2 *.bigDbSnp
 rm -rf merged splitProcessed joined
 _EOF_
   );
   $bossScript->execute();
 } # doCleanup
 
 
 #########################################################################
 # main
 
 # Prevent "Suspended (tty input)" hanging:
 HgAutomate::closeStdin();
 
 # Make sure we have valid options and exactly 1 argument:
 checkOptions();
 usage(1) if (scalar(@ARGV) != 3);
 $secondsStart = `date "+%s"`;
 chomp $secondsStart;
 ($topDir, $buildId, $freqSourceOrder) = @ARGV;
 
 # Establish what directory we will work in.
 my $date = `date +%Y-%m-%d`;
 chomp $date;
 $buildDir = $opt_buildDir ? $opt_buildDir : "$topDir/bigDbSnp.$date";
 
 $outRoot .= $buildId;
 $jsonDir = "$topDir/json";
 @dbList = map { grcToDb($_); } split(',', $assemblyList);
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";
 
 $secondsEnd = `date "+%s"`;
 chomp $secondsEnd;
 my $elapsedSeconds = $secondsEnd - $secondsStart;
 my $elapsedMinutes = int($elapsedSeconds/60);
 $elapsedSeconds -= $elapsedMinutes * 60;
 
 HgAutomate::verbose(1,
 	"\n *** All done !$upThrough  Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s\n");
 HgAutomate::verbose(1,
 	" *** Steps were performed in $buildDir\n");
 HgAutomate::verbose(1, "\n");