91814318ba5e9eec945f690b2598e390f407c7a0
angie
  Fri May 17 17:24:12 2024 -0700
Update dbSnp153 to fix comma-formatting problem as Galt did for dbSnp155.  refs #33070

diff --git src/hg/makeDb/doc/bigDbSnp.txt src/hg/makeDb/doc/bigDbSnp.txt
index 5c5a323..a3c20b8 100644
--- src/hg/makeDb/doc/bigDbSnp.txt
+++ src/hg/makeDb/doc/bigDbSnp.txt
@@ -488,30 +488,131 @@
 #690160687 rareSome
 #      111 refIsAmbiguous
 #  3360435 refIsMinor
 #   160827 refIsRare
 #    50927 refIsSingleton
 #       33 refMismatch
 #  4532511 revStrand
 #real    36m36.972s
 #user    49m41.817s
 #sys     4m43.806s
 
     # Check count of rs's with at least one bad mapping:
     grep otherMapErr hg38.dbSnp153.checked.bigDbSnp | cut -f 4 | sort -u | wc -l
 #86636
 
+
+# Partial redo 2024-03-08 similar to Galt's dbSnp155 partial redo 2023-03-26 below, to fix the same
+# comma-parsing problem with clinicalSignificance -- see RM #33070.  Finished 2024-03-14.
+    topDir=/hive/data/outside/dbSNP/153
+    freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,PAGE_STUDY,GnomAD,GoESP,Estonian,ALSPAC,TWINSUK,NorthernSweden,Vietnamese
+    # Run doBigDbSnp.pl (first with -debug to make runDir):
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \
+        -assemblyList=GRCh37.p13,GRCh38.p12 -debug
+
+# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2024-03-08
+
+    cd /hive/data/outside/dbSNP/153/bigDbSnp.2024-03-08
+
+    # Link split and splitProcessed to ../bigDbSnp.2019-08-07/ and run with -continue convert to avoid re-splitting (the slowest part of the process):
+    rmdir split
+    ln -s ../bigDbSnp.2019-08-07/split split
+    rmdir splitProcessed
+    ln -s ../bigDbSnp.2019-08-07/splitProcessed splitProcessed
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \
+        -assemblyList=GRCh37.p13,GRCh38.p12 \
+        -buildDir=`pwd` -continue convert -stop install \
+      >& do.log &
+    tail -f do.log
+    # It failed at the fixHg19ChrM step because the script expects chrom to be "chrMT" but instead
+    # it's "NC_012920.1".  Maybe things changed after b153?  Anyway, run these modified commands
+    # in place of doFixHg19ChrM.sh:
+# For hg19, liftOver chrMT (named NC_012920.1 in this build's files) annotations to hg19 chrM.
+sed -e 's/NC_012920 /NC_012920.1 /' \
+  /hive/data/outside/dbSNP/131/human/NC_012920ToChrM.over.chain \
+  > hg19.mitoLiftover.chain
+# For liftOver, convert 0-based fully-closed to 0-based half-open because liftOver
+# doesn't deal with 0-length items.
+# already done: mv hg19.dbSnp153.bigDbSnp hg19.preChrMFix.dbSnp153.bigDbSnp
+time (grep ^NC_012920.1 hg19.preChrMFix.dbSnp153.bigDbSnp \
+      | awk -F$'\t' 'BEGIN{OFS="'$'\t''";} {$3 += 1; print;}' \
+      | liftOver -tab -bedPlus=3 stdin \
+          hg19.mitoLiftover.chain stdout chrM.unmapped \
+      | awk -F$'\t' 'BEGIN{OFS="'$'\t''";} {$3 -= 1; print;}' \
+      | sort -k2n,2n \
+        > hg19.chrM.dbSnp153.bigDbSnp)
+#real    4m20.207s
+wc -l hg19.chrM.dbSnp153.bigDbSnp chrM.unmapped
+#  3312 hg19.chrM.dbSnp153.bigDbSnp
+#    14 chrM.unmapped
+# 7 "Partially deleted in new" items
+time grep -v ^NC_012920.1 hg19.preChrMFix.dbSnp153.bigDbSnp \
+     | sort --merge -k1,1 -k2n,2n - hg19.chrM.dbSnp153.bigDbSnp \
+       > hg19.dbSnp153.bigDbSnp
+#real    5m21.835s
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \
+        -assemblyList=GRCh37.p13,GRCh38.p12 \
+        -buildDir=`pwd` -continue check -stop install \
+      >& do2.log &
+    tail -f do2.log
+# *** All done !  (through the 'install' step)  Elapsed time: 177m56s
+# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2024-03-08
+
+    # Ah, but the compare-bot, Gerardo and Lou found that dbSnp153Common lost some items --
+    # it turned out that no indels made it into dbSnp153Common due to a change for 155 in
+    # dbSnpJsonToTab.c that broke something else for 153.  Run again (but repeat the manual
+    # version of the fixHg19ChrM step).
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \
+        -assemblyList=GRCh37.p13,GRCh38.p12 \
+        -buildDir=`pwd` -continue convert -stop mergeChroms \
+      >& do.log &
+    tail -f do.log
+# *** All done !  (through the 'mergeChroms' step)  Elapsed time: 164m12s
+# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2024-03-08
+
+# For hg19, liftOver chrMT (named NC_012920.1 in this build's files) annotations to hg19 chrM.
+sed -e 's/NC_012920 /NC_012920.1 /' \
+  /hive/data/outside/dbSNP/131/human/NC_012920ToChrM.over.chain \
+  > hg19.mitoLiftover.chain
+# For liftOver, convert 0-based fully-closed to 0-based half-open because liftOver
+# doesn't deal with 0-length items.
+mv hg19.dbSnp153.bigDbSnp hg19.preChrMFix.dbSnp153.bigDbSnp
+time (grep ^NC_012920.1 hg19.preChrMFix.dbSnp153.bigDbSnp \
+      | awk -F$'\t' 'BEGIN{OFS="'$'\t''";} {$3 += 1; print;}' \
+      | liftOver -tab -bedPlus=3 stdin \
+          hg19.mitoLiftover.chain stdout chrM.unmapped \
+      | awk -F$'\t' 'BEGIN{OFS="'$'\t''";} {$3 -= 1; print;}' \
+      | sort -k2n,2n \
+        > hg19.chrM.dbSnp153.bigDbSnp)
+#real    4m44.075s
+wc -l hg19.chrM.dbSnp153.bigDbSnp chrM.unmapped
+#  3312 hg19.chrM.dbSnp153.bigDbSnp
+#    14 chrM.unmapped
+# 7 "Partially deleted in new" items
+time grep -v ^NC_012920.1 hg19.preChrMFix.dbSnp153.bigDbSnp \
+     | sort --merge -k1,1 -k2n,2n - hg19.chrM.dbSnp153.bigDbSnp \
+       > hg19.dbSnp153.bigDbSnp
+#real    5m58.364s
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \
+        -assemblyList=GRCh37.p13,GRCh38.p12 \
+        -buildDir=`pwd` -continue check -stop install \
+      >& do2.log &
+    tail -f do2.log
+# *** All done !  (through the 'install' step)  Elapsed time: 176m58s
+# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2024-03-08
+
+
 ##############################################################################
 # dbSnp155: dbSNP build 155 (IN-PROGRESS 11/25/19 galt)
 
     topDir=/hive/data/outside/dbSNP/155
     mkdir -p $topDir/json
     cd $topDir/json
 
     wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\*
     md5sum -c CHECKSUMS
 #refsnp-chr10.json.bz2: OK
 #...
 #refsnp-withdrawn.json.bz2: OK