122806164798753fbfbce3cc77411c911a4d9d1e angie Tue Oct 1 13:15:36 2019 -0700 Adding automation script doBigDbSnp.pl that downloads dbSNP JSON files, does a cluster run of dbSnpJsonToTab, aggregates results, runs checkBigDbSnp & installs track files. refs #23283 diff --git src/hg/utils/automation/doBigDbSnp.pl src/hg/utils/automation/doBigDbSnp.pl new file mode 100755 index 0000000..e57d99e --- /dev/null +++ src/hg/utils/automation/doBigDbSnp.pl @@ -0,0 +1,644 @@ +#!/usr/bin/env perl + +# DO NOT EDIT the /cluster/bin/scripts copy of this file -- +# edit ~/kent/src/hg/utils/automation/doBigDbSnp.pl instead. + +# Copyright (C) 2019 The Regents of the University of California + +use Getopt::Long; +use warnings; +use strict; +use FindBin qw($Bin); +use lib "$Bin"; +use HgAutomate; +use HgRemoteScript; +use HgStepManager; + +# Option variable names, both common and peculiar to this script: +use vars @HgAutomate::commonOptionVars; +use vars @HgStepManager::optionVars; +use vars qw/ + $opt_assemblyList + $opt_buildDir + /; + +# Specify the steps supported with -continue / -stop: +my $stepper = new HgStepManager( + [ { name => 'split', func => \&doSplit }, + { name => 'convert', func => \&doConvert }, + { name => 'mergeToChrom', func => \&doMergeToChrom }, + { name => 'mergeChroms', func => \&doMergeChroms }, + { name => 'fixHg19ChrM', func => \&doFixHg19ChrM }, + { name => 'check', func => \&doCheck }, + { name => 'bigBed', func => \&doBigBed }, + { name => 'install', func => \&doInstall }, + { name => 'cleanup', func => \&doCleanup }, + ] + ); + +# Files that must exist in $topDir: +my $refSeqToUcsc = 'refSeqToUcsc.tab'; +my $equivRegions = 'equivRegions.tab'; + +# Option defaults: +my $assemblyList = 'GRCh37.p13,GRCh38.p12'; +my $dbHost = 'hgwdev'; +my $bigClusterHub = 'ku'; +my $smallClusterHub = 'hgwdev'; +my $workhorse = 'hgwdev'; + +my $outRoot = 'dbSnp'; + +my $base = $0; +$base =~ s/^(.*\/)?//; + +sub usage { + # Usage / help / self-documentation: + my ($status, $detailed) = @_; + # Basic help (for incorrect usage): + print STDERR " +usage: $base topDir buildId freqSourceOrder +options: +"; + print STDERR $stepper->getOptionHelp(); + print STDERR <<_EOF_ + -assemblyList list Comma-separated list of assemblies used by dbSNP + default: $assemblyList + -buildDir dir Use dir instead of default topDir/bigDbSnp.\$date + (necessary when continuing at a later date). +_EOF_ + ; + print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost, + 'workhorse' => $workhorse, + 'fileServer' => '', + 'bigClusterHub' => $bigClusterHub, + 'smallClusterHub' => $smallClusterHub); + print STDERR " +Convert dbSNP JSON into bigDbSnp and associated track files. + +topDir is usually /hive/data/outside/dbSNP/NNN where NNN is 152 or greater. +topDir is expected to have a subdirectory json in which refsnp-*.json.bz2 +files have already been downloaded, as well as files $refSeqToUcsc and $equivRegions +(see usage statement for dbSnpJsonToTab). + +buildId is usually NNN where NNN is 152 or greater, same as topDir; it can also have a +suffix to distinguish it, e.g. 152Test. The names of all result files contain $outRoot\$buildId. + +freqSourceOrder is a comma-separated list of projects that submit frequency data to dbSNP +(see usage statement for dbSnpJsonToTab). + +Steps: + split: splits refsnp-*.json.bz2 files into chunks of 100,000 lines. + convert: runs dbSnpJsonToTab on chunks. + mergeToChrom: merges chunk result files into per-chrom results files. + mergeChroms: merges per-chrom results files. + fixHg19ChrM: if annotations on hg19 are included, then liftOver NC_012920 to hg19 chrM. + check: runs checkBigDbSnp to add ucscNotes about overlapping items and clustering anomalies. + bigBed: Converts BED4+ .bigDbSnp files into bigBed. + install: installs links to files in /gbdb. + cleanup: Removes or compresses intermediate files. +All operations are performed in the build directory which is +topDir/bigDbSnp.\$date unless -buildDir is given. +"; + # Detailed help (-help): + print STDERR " +Assumptions: +1. $HgAutomate::clusterData/\$db/\$db.2bit contains sequence for \$db. +2. topDir/json/ contains downloaded files refsnp-*.json.bz2 +3. topDir/ contains files refSeqToUcsc.tab and equivRegions.tab - see dbSnpJsonToTab usage +" if ($detailed); + print "\n"; + exit $status; +} + + +# Globals: +# Command line args: db +my ($topDir, $buildId, $freqSourceOrder); +# Other: +my ($buildDir, $jsonDir, @dbList, $secondsStart, $secondsEnd); + +sub checkOptions { + # Make sure command line options are valid/supported. + my $ok = GetOptions(@HgStepManager::optionSpec, + 'assemblyList=s', + 'buildDir=s', + 'buildId=s', + 'freqSourceOrder=s', + @HgAutomate::commonOptionSpec, + ); + usage(1) if (!$ok); + usage(0, 1) if ($opt_help); + if ($opt_assemblyList) { + $assemblyList= $opt_assemblyList; + } + # buildDir default depends on topDir (undetermined at this point) and is handled in main + HgAutomate::processCommonOptions(); + my $err = $stepper->processOptions(); + usage(1) if ($err); + $dbHost = $opt_dbHost if ($opt_dbHost); +} + +sub grcToDb($) { + # dbSNP is only ever going to produce JSON for various patch levels of GRCh38 and 37. + my ($grc) = @_; + my $db; + if ($grc =~ /^GRCh38/) { + $db = 'hg38'; + } elsif ($grc =~ /^GRCh37/) { + $db = 'hg19'; + } else { + die "Expected GRC assembly to start with 'GRCh37' or 'GRCh38' but got '$grc'"; + } + return $db; +} + +######################################################################### +# * step: split [smallCluster] +sub doSplit { + my $runDir = "$buildDir/run.split"; + HgAutomate::mustMkdir($runDir); + my $outDir = "$buildDir/split"; + HgAutomate::mustMkdir($outDir); + + my $splitScript = "$runDir/splitJson.sh"; + my $fh = HgAutomate::mustOpen(">$splitScript"); + print $fh <add(<<_EOF_ +ls -1S $jsonDir/refsnp-{chr*,other}.json.bz2 > jsonList +$gensub2 jsonList single gsub jobList +$paraRun +_EOF_ + ); + $bossScript->execute(); +} # doSplit + + +######################################################################### +# * step: convert [bigClusterHub] +sub doConvert { + my $runDir = "$buildDir/run.convert"; + HgAutomate::mustMkdir($runDir); + my $outDir = "$buildDir/splitProcessed"; + HgAutomate::mustMkdir($outDir); + + my $convertScript = "$runDir/jsonToTab.sh"; + my $fh = HgAutomate::mustOpen(">$convertScript"); + print $fh < \$outRoot.$db.sorted.bigDbSnp.bz2 +EOF + ; + } + print $fh < \${outRoot}Details.tab.bz2 +sort -u \${outRoot}Failed.json | bzip2 > \${outRoot}Failed.json.bz2 +sort \${outRoot}Errors.tab | bzip2 > \${outRoot}Errors.tab.bz2 +sort \${outRoot}Merged.tab | bzip2 > \${outRoot}Merged.tab.bz2 + +popd + +mkdir -p \$chromOutDir +cp -p \$tmpDir/\$outRoot*.bz2 \$chromOutDir/ + +rm -rf \$tmpDir +EOF + ; + close($fh); + system("chmod a+x $convertScript") == 0 || die "Unable to chmod $convertScript"; + + my $whatItDoes = "It converts dbSNP JSON to bigDbSnp, dbSnpDetails and other files."; + my $bossScript = new HgRemoteScript("$runDir/doConvert.csh", $bigClusterHub, + $runDir, $whatItDoes); + + HgAutomate::makeGsub($runDir, "$convertScript {check in exists+ \$(path1)}"); + my $paraRun = HgAutomate::paraRun(); + my $gensub2 = HgAutomate::gensub2(); + $bossScript->add(<<_EOF_ +ls -1S $buildDir/split/ref*.bz2 > splitList +$gensub2 splitList single gsub jobList +$paraRun +_EOF_ + ); + $bossScript->execute(); +} # doConvert + + +######################################################################### +# * step: mergeToChrom [smallClusterHub] +sub doMergeToChrom { + my $runDir = "$buildDir/run.mergeToChrom"; + HgAutomate::mustMkdir($runDir); + my $outDir = "$buildDir/mergedToChrom"; + HgAutomate::mustMkdir($outDir); + + my $sortMergeBzBedScript = "$runDir/sortMergeBzBed.sh"; + my $fh = HgAutomate::mustOpen(">$sortMergeBzBedScript"); + print $fh < \$bed + echo \$bed >> bedList +done + +export LC_COLLATE=C +sort --merge -k1,1 -k2n,2n \$(cat bedList) > \$outFile + +popd +rm -rf \$tmpDir +EOF + ; + close($fh); + system("chmod a+x $sortMergeBzBedScript") == 0 || die "Unable to chmod $sortMergeBzBedScript"; + + my $sortMergeBzScript = "$runDir/sortMergeBz.sh"; + $fh = HgAutomate::mustOpen(">$sortMergeBzScript"); + print $fh < \$txt + echo \$txt >> txtList +done + +export LC_COLLATE=C +sort --merge -u \$(cat txtList) > \$outFile + +popd +rm -rf \$tmpDir +EOF + ; + close($fh); + system("chmod a+x $sortMergeBzScript") == 0 || die "Unable to chmod $sortMergeBzScript"; + + my $uniqBzScript = "$runDir/uniqBz.sh"; + $fh = HgAutomate::mustOpen(">$uniqBzScript"); + print $fh < \$outFile +EOF + ; + close($fh); + system("chmod a+x $uniqBzScript") == 0 || die "Unable to chmod $uniqBzScript"; + + my $whatItDoes = "It merge-sorts the results from split-up JSON files into per-chromosome files."; + my $bossScript = newBash HgRemoteScript("$runDir/doMergeToChrom.sh", $smallClusterHub, + $runDir, $whatItDoes); + + my $paraRun = HgAutomate::paraRun(); + $bossScript->add(<<_EOF_ +# One merge per "chrom" per type of dbSnpJsonToTab output +for jsonFile in \$(ls -1S $jsonDir/refsnp-{chr*,other}.json.bz2); do + prefix=\$(basename \$jsonFile .json.bz2) + echo \$prefix + ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??.hg19.* > \$prefix.hg19.list + ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??.hg38.* > \$prefix.hg38.list + ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Details.* > \$prefix.details.list + ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Errors.* > \$prefix.errors.list + ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Failed.* > \$prefix.failed.list + ls -1S $buildDir/splitProcessed/\$prefix/\$prefix??Merged.* > \$prefix.merged.list +done + +cp /dev/null jobList +for list in *.{hg19,hg38}.list; do + prefix=\$(basename \$list .list) + echo "./sortMergeBzBed.sh {check in line+ \$PWD/\$list} {check out line+ $outDir/\$prefix.bigDbSnp}" >> jobList +done +for list in *.details.list; do + prefix=\$(basename \$list .list) + echo "./sortMergeBz.sh {check in line+ \$PWD/\$list} {check out line+ $outDir/\$prefix.tab}" >> jobList +done +# OK for these to be empty (check out line instead of line+): +for list in *.errors.list; do + prefix=\$(basename \$list .list) + echo "./sortMergeBz.sh {check in line+ \$PWD/\$list} {check out line $outDir/\$prefix.tab}" >> jobList +done +for list in *.merged.list; do + prefix=\$(basename \$list .list) + echo "./sortMergeBz.sh {check in line+ \$PWD/\$list} {check out line $outDir/\$prefix.tab}" >> jobList +done +for list in *.failed.list; do + prefix=\$(basename \$list .list) + echo "./uniqBz.sh {check in line+ \$PWD/\$list} {check out exists+ $outDir/\$prefix.json.bz2}" >> jobList +done +$paraRun; +_EOF_ + ); + $bossScript->execute(); +} # doMergeToChrom + + +######################################################################### +# * step: mergeChroms [workhorse] +sub doMergeChroms { + my $runDir = $buildDir; + my $inDir = "mergedToChrom"; + HgAutomate::mustMkdir("$runDir/joined"); + + my $whatItDoes = "It merges chrom-level result files."; + my $bossScript = newBash HgRemoteScript("$runDir/doMergeChroms.sh", $workhorse, + $runDir, $whatItDoes); + + $bossScript->add(<<_EOF_ +# Merge all chroms' *Merged.tab to the final Merged.tab file in background +time sort --merge -u $inDir/*.merged.tab > ${outRoot}Merged.tab & + +# Merge all chroms' *Details.tab to the final Details.tab file +time sort --merge -u $inDir/*.details.tab > ${outRoot}Details.tab + +# Compress Details.tab with bgzip in background. For now, leave original file uncompressed. +time bgzip -iI ${outRoot}Details.tab.gz.gzi -c ${outRoot}Details.tab > ${outRoot}Details.tab.gz & + +# parallel job of bedJoinTabOffset on each chrom's .bigDbSnp and ${outRoot}Details.tab +# bedJoinTabOffset builds a massive hash in memory (file offsets of >650M lines of Details), +# so limit the number of concurrent processes to 10. +time (ls -1S $inDir/refsnp-*.*.bigDbSnp | + parallel --max-procs 10 --ungroup \\ + bedJoinTabOffset -verbose=2 ${outRoot}Details.tab {} joined/{/}) + +# Now mergeSort all chrom's data together. Don't use sort -u because with -k it only +# compares keys, not the whole line. +pids="" +_EOF_ + ); + foreach my $db (@dbList) { + $bossScript->add(<<_EOF_ +(time sort --merge -k1,1 -k2n,2n joined/*.$db.bigDbSnp | uniq > $db.$outRoot.bigDbSnp) & +echo \$! +pids+=" \$!" +_EOF_ + ); + } + $bossScript->add(<<_EOF_ +for pid in \$pids; do + if wait \$pid; then + echo pid \$pid done + else + echo pid \$pid FAILED + exit 1 + fi +done +_EOF_ + ); + $bossScript->execute(); +} # doMergeChroms + + +######################################################################### +# * step: fixHg19ChrM [workhorse] +sub doFixHg19ChrM { + my $runDir = $buildDir; + if (grep(/hg19/, @dbList)) { + my $whatItDoes = "It does a liftOver from NC_012920.1 to hg19 chrM."; + my $bossScript = newBash HgRemoteScript("$runDir/doFixHg19ChrM.sh", $workhorse, + $runDir, $whatItDoes); + $bossScript->add(<<_EOF_ +# For hg19, liftOver NC_012920.1 annotations to hg19 chrM. +sed -e 's/NC_012920 /NC_012920.1 /' \\ + /hive/data/outside/dbSNP/131/human/NC_012920ToChrM.over.chain \\ + > hg19.mitoLiftover.chain +# For liftOver, convert 0-base fully-closed to 0-based half-open because liftOver +# doesn't deal with 0-base items. +mv hg19.$outRoot.bigDbSnp hg19.preChrMFix.$outRoot.bigDbSnp +time (grep ^NC_012920 hg19.preChrMFix.$outRoot.bigDbSnp \\ + | awk -F"\t" 'BEGIN{OFS="\t";} {\$3 += 1; print;}' \\ + | liftOver -tab -bedPlus=3 stdin \\ + hg19.mitoLiftover.chain stdout chrM.unmapped \\ + | awk -F"\t" 'BEGIN{OFS="\t";} {\$3 -= 1; print;}' \\ + | sort -k2n,2n \\ + > hg19.chrM.$outRoot.bigDbSnp) +wc -l hg19.chrM.$outRoot.bigDbSnp chrM.unmapped +time grep -v ^NC_012920 hg19.preChrMFix.$outRoot.bigDbSnp \\ + | sort --merge -k1,1 -k2n,2n - hg19.chrM.$outRoot.bigDbSnp \\ + > hg19.$outRoot.bigDbSnp +_EOF_ + ); + $bossScript->execute() + }; +} # doFixHg19ChrM + + +######################################################################### +# * step: check [workhorse] +sub doCheck { + my $runDir = $buildDir; + + my $whatItDoes = "It runs checkBigDbSnp on merged bigDbSnp files."; + my $bossScript = newBash HgRemoteScript("$runDir/doCheck.sh", $workhorse, + $runDir, $whatItDoes); + + $bossScript->add(<<_EOF_ +pids="" +_EOF_ + ); + foreach my $db (@dbList) { + $bossScript->add(<<_EOF_ +time checkBigDbSnp $db.$outRoot.bigDbSnp $HgAutomate::clusterData/$db/$db.2bit $db.$outRoot.checked.bigDbSnp & +echo \$! +pids+=" \$!" +_EOF_ + ); + } + $bossScript->add(<<_EOF_ +for pid in \$pids; do + if wait \$pid; then + echo pid \$pid done + else + echo pid \$pid FAILED + exit 1 + fi +done +_EOF_ + ); + $bossScript->execute(); +} # doBigBed + + +######################################################################### +# * step: bigBed [workhorse] +sub doBigBed { + my $runDir = $buildDir; + + my $whatItDoes = "It runs bedToBigBed on merged & checked bigDbSnp files."; + my $bossScript = newBash HgRemoteScript("$runDir/doBigBed.sh", $workhorse, + $runDir, $whatItDoes); + + $bossScript->add(<<_EOF_ +pids="" +_EOF_ + ); + foreach my $db (@dbList) { + $bossScript->add(<<_EOF_ +time bedToBigBed -tab -as=\$HOME/kent/src/hg/lib/bigDbSnp.as -type=bed4+ -extraIndex=name \\ + $db.$outRoot.checked.bigDbSnp /hive/data/genomes/$db/chrom.sizes $db.$outRoot.bb & + +echo \$! +pids+=" \$!" +_EOF_ + ); + } + $bossScript->add(<<_EOF_ +for pid in \$pids; do + if wait \$pid; then + echo pid \$pid done + else + echo pid \$pid FAILED + exit 1 + fi +done +_EOF_ + ); + $bossScript->execute(); +} # doBigBed + + +######################################################################### +# * step: install [dbHost] +sub doInstall { + my $runDir = $buildDir; + + my $whatItDoes = "It installs files in /gbdb."; + my $bossScript = new HgRemoteScript("$runDir/doInstall.csh", $workhorse, + $runDir, $whatItDoes); + + foreach my $db (@dbList) { + $bossScript->add(<<_EOF_ +ln -sf $buildDir/$db.$outRoot.bb /gbdb/$db/snp/$outRoot.bb +_EOF_ + ); + } + $bossScript->add(<<_EOF_ +mkdir -p /gbdb/hgFixed/dbSnp +ln -sf $buildDir/${outRoot}Details.tab* /gbdb/hgFixed/dbSnp/ +_EOF_ + ); + $bossScript->execute(); +} # doInstall + + +######################################################################### +# * step: cleanup [workhorse] +sub doCleanup { + my $runDir = "$buildDir"; + my $whatItDoes = "It cleans up or compresses intermediate files."; + my $bossScript = new HgRemoteScript("$runDir/doCleanup.csh", $workhorse, + $runDir, $whatItDoes); + $bossScript->add(<<_EOF_ +bzip2 *.bigDbSnp +rm -rf merged splitProcessed joined +_EOF_ + ); + $bossScript->execute(); +} # doCleanup + + +######################################################################### +# main + +# Prevent "Suspended (tty input)" hanging: +HgAutomate::closeStdin(); + +# Make sure we have valid options and exactly 1 argument: +checkOptions(); +usage(1) if (scalar(@ARGV) != 3); +$secondsStart = `date "+%s"`; +chomp $secondsStart; +($topDir, $buildId, $freqSourceOrder) = @ARGV; + +# Establish what directory we will work in. +my $date = `date +%Y-%m-%d`; +chomp $date; +$buildDir = $opt_buildDir ? $opt_buildDir : "$topDir/bigDbSnp.$date"; + +$outRoot .= $buildId; +$jsonDir = "$topDir/json"; +@dbList = map { grcToDb($_); } split(',', $assemblyList); + +# Do everything. +$stepper->execute(); + +# Tell the user anything they should know. +my $stopStep = $stepper->getStopStep(); +my $upThrough = ($stopStep eq 'cleanup') ? "" : + " (through the '$stopStep' step)"; + +$secondsEnd = `date "+%s"`; +chomp $secondsEnd; +my $elapsedSeconds = $secondsEnd - $secondsStart; +my $elapsedMinutes = int($elapsedSeconds/60); +$elapsedSeconds -= $elapsedMinutes * 60; + +HgAutomate::verbose(1, + "\n *** All done !$upThrough Elapsed time: ${elapsedMinutes}m${elapsedSeconds}s\n"); +HgAutomate::verbose(1, + " *** Steps were performed in $buildDir\n"); +HgAutomate::verbose(1, "\n"); +