ec726ffa6050e6b7b65f5ed885789c850ce87e60 galt Wed Jan 26 03:20:35 2022 -0800 Tweaking the doBigDbSnp step fixHg19ChrM since NC_012920 is now just called chrMT in dbSNP data. diff --git src/hg/utils/automation/doBigDbSnp.pl src/hg/utils/automation/doBigDbSnp.pl index 447dda5..6b55025 100755 --- src/hg/utils/automation/doBigDbSnp.pl +++ src/hg/utils/automation/doBigDbSnp.pl @@ -80,31 +80,31 @@ topDir is expected to have a subdirectory json in which refsnp-*.json.bz2 files have already been downloaded, as well as files $refSeqToUcsc and $equivRegions (see usage statement for dbSnpJsonToTab). buildId is usually NNN where NNN is 152 or greater, same as topDir; it can also have a suffix to distinguish it, e.g. 152Test. The names of all result files contain $outRoot\$buildId. freqSourceOrder is a comma-separated list of projects that submit frequency data to dbSNP (see usage statement for dbSnpJsonToTab). Steps: split: splits refsnp-*.json.bz2 files into chunks of 100,000 lines. convert: runs dbSnpJsonToTab on chunks. mergeToChrom: merges chunk result files into per-chrom results files. mergeChroms: merges per-chrom results files. - fixHg19ChrM: if annotations on hg19 are included, then liftOver NC_012920 to hg19 chrM. + fixHg19ChrM: if annotations on hg19 are included, then liftOver chrMT (NC_012920) to hg19 chrM. check: runs checkBigDbSnp to add ucscNotes about overlapping items and clustering anomalies. bigBed: Converts BED4+ .bigDbSnp files into bigBed. install: installs links to files in /gbdb. cleanup: Removes or compresses intermediate files. All operations are performed in the build directory which is topDir/bigDbSnp.\$date unless -buildDir is given. "; # Detailed help (-help): print STDERR " Assumptions: 1. $HgAutomate::clusterData/\$db/\$db.2bit contains sequence for \$db. 2. topDir/json/ contains downloaded files refsnp-*.json.bz2 3. 
topDir/ contains files refSeqToUcsc.tab and equivRegions.tab - see dbSnpJsonToTab usage " if ($detailed); print "\n"; @@ -465,50 +465,50 @@ echo pid \$pid FAILED exit 1 fi done _EOF_ ); $bossScript->execute(); } # doMergeChroms ######################################################################### # * step: fixHg19ChrM [workhorse] sub doFixHg19ChrM { my $runDir = $buildDir; if (grep(/hg19/, @dbList)) { - my $whatItDoes = "It does a liftOver from NC_012920.1 to hg19 chrM."; + my $whatItDoes = "It does a liftOver from chrMT (old name NC_012920) to hg19 chrM."; my $bossScript = newBash HgRemoteScript("$runDir/doFixHg19ChrM.sh", $workhorse, $runDir, $whatItDoes); $bossScript->add(<<_EOF_ -# For hg19, liftOver NC_012920.1 annotations to hg19 chrM. -sed -e 's/NC_012920 /NC_012920.1 /' \\ +# For hg19, liftOver chrMT annotations to hg19 chrM. +sed -e 's/NC_012920 /chrMT /' \\ /hive/data/outside/dbSNP/131/human/NC_012920ToChrM.over.chain \\ > hg19.mitoLiftover.chain # For liftOver, convert 0-base fully-closed to 0-based half-open because liftOver # doesn't deal with 0-base items. 
mv hg19.$outRoot.bigDbSnp hg19.preChrMFix.$outRoot.bigDbSnp -time (grep ^NC_012920 hg19.preChrMFix.$outRoot.bigDbSnp \\ +time (grep ^chrMT hg19.preChrMFix.$outRoot.bigDbSnp \\ | awk -F"\t" 'BEGIN{OFS="\t";} {\$3 += 1; print;}' \\ | liftOver -tab -bedPlus=3 stdin \\ hg19.mitoLiftover.chain stdout chrM.unmapped \\ | awk -F"\t" 'BEGIN{OFS="\t";} {\$3 -= 1; print;}' \\ | sort -k2n,2n \\ > hg19.chrM.$outRoot.bigDbSnp) wc -l hg19.chrM.$outRoot.bigDbSnp chrM.unmapped -time grep -v ^NC_012920 hg19.preChrMFix.$outRoot.bigDbSnp \\ +time grep -v ^chrMT hg19.preChrMFix.$outRoot.bigDbSnp \\ | sort --merge -k1,1 -k2n,2n - hg19.chrM.$outRoot.bigDbSnp \\ > hg19.$outRoot.bigDbSnp _EOF_ ); $bossScript->execute() }; } # doFixHg19ChrM ######################################################################### # * step: check [workhorse] sub doCheck { my $runDir = $buildDir; my $whatItDoes = "It runs checkBigDbSnp on merged bigDbSnp files.";