aa6795a2461f1356b09ce782be1f979f7edbab9b
angie
  Thu Oct 10 14:15:07 2019 -0700
dbSnp152, dbSnp153: Count up how many distinct rsIDs got frequency counts from each source; also how many times each ucscNote was added.  refs #23283

diff --git src/hg/makeDb/doc/bigDbSnp.txt src/hg/makeDb/doc/bigDbSnp.txt
index fd7c647..8fca709 100644
--- src/hg/makeDb/doc/bigDbSnp.txt
+++ src/hg/makeDb/doc/bigDbSnp.txt
@@ -117,30 +117,93 @@
     # 9/17/19: re-run from dbSnpJsonToTab onward after lots of changes
     topDir=/hive/data/outside/dbSNP/152
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -stop install -debug
 # *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
     cd /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
     # Link to ../split, -continue convert to avoid re-splitting (the slowest part of the process):
     rm split
     ln -s ../split split
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 152 $freqSourceOrder \
       -buildDir=`pwd` -continue convert -stop install \
       >& do.log &
     tail -f do.log
 # *** All done !  (through the 'install' step)  Elapsed time: 449m6s
 # *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
 
+    # 10/8/19: count how many variants have frequency counts from each project (column 4 is split on commas, one slot per project):
+    cut -f 4 dbSnp152Details.tab \
+    | perl -wne 'chomp; next unless $_; @w = split ",";
+        if ($w[0]) { print "1000Genomes\n" }
+        if ($w[1]) { print "GnomAD_exomes\n"; }
+        if ($w[2]) { print "TOPMED\n" }
+        if ($w[3]) { print "ExAC\n" }
+        if ($w[4]) { print "GnomAD\n" }
+        if ($w[5]) { print "GoESP\n" }
+        if ($w[6]) { print "ALSPAC\n" }
+        if ($w[7]) { print "TWINSUK\n" }
+        if ($w[8]) { print "Estonian\n" }' \
+    | sort | uniq -c | sort -nr
+#437624857 TOPMED
+#234158623 GnomAD
+#84743526 1000Genomes
+#44887599 TWINSUK
+#44887599 ALSPAC
+#31397792 Estonian
+#11721224 GnomAD_exomes
+#8854021 ExAC
+#1973787 GoESP
+
+    # 10/11/19: count how many times each type of ucscNote occurs (column 15, comma-separated; 'g' is presumably a grep alias, so "g ." drops empty lines):
+    cut -f 15 hg19.dbSnp152.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
+#   10680 altIsAmbiguous
+#    4808 classMismatch
+#  409132 clinvar
+#  106941 clusterError
+#12757487 commonAll
+#18901486 commonSome
+#  823424 diffMajor
+#    7635 freqIsAmbiguous
+#   23027 freqNotRefAlt
+#  555144 multiMap
+#99618012 overlapDiffClass
+#14790469 overlapSameClass
+#     101 refIsAmbiguous
+# 2933684 refIsMinor
+#  150892 refIsRare
+#   45618 refIsSingleton
+#       4 refMismatch
+# 3761191 revStrand
+    cut -f 15 hg38.dbSnp152.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
+#    10807 altIsAmbiguous
+#     5103 classMismatch
+#   408665 clinvar
+#    94310 clusterError
+# 13027110 commonAll
+# 19258751 commonSome
+#   836327 diffMajor
+#     7736 freqIsAmbiguous
+#    36306 freqNotRefAlt
+#   130175 multiMap
+#102260850 overlapDiffClass
+# 15075710 overlapSameClass
+#      110 refIsAmbiguous
+#  3033691 refIsMinor
+#   189809 refIsRare
+#    63804 refIsSingleton
+#       33 refMismatch
+#  4439534 revStrand
+
 
 ##############################################################################
 # dbSnp153: dbSNP build 153 (DONE 9/19/19 angie)
 
     topDir=/hive/data/outside/dbSNP/153
     mkdir -p $topDir/json
     cd $topDir/json
     wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\*
     md5sum -c CHECKSUMS
 #refsnp-chr10.json.bz2: OK
 #...
 #refsnp-withdrawn.json.bz2: OK
 
     # jsonQuery commands to figure out what assemblies, SO terms and frequency sources are in there,
     # by sampling first 10,000 variants on an arbitrary chrom:
@@ -263,17 +326,86 @@
     # 9/19/19: and again after changing doBigDbSnp.pl to have args & wait on specific pids:
     # Run doBigDbSnp.pl (first with -debug to make runDir):
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder -debug
 # *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-09-19
     cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-09-19
     # Link to ../bigDbSnp.2019-08-07/split, -continue convert to avoid re-splitting (the slowest part of the process):
     rmdir split
     ln -s ../bigDbSnp.2019-08-07/split split
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \
       -buildDir=`pwd` -continue convert -stop install \
       >& do.log &
     tail -f do.log
 # *** All done !  (through the 'install' step)  Elapsed time: 491m30s
 # *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-09-19
 
+    # 10/8/19: count how many variants have frequency counts from each project (column 4 is split on commas, one slot per project):
+    cut -f 4 dbSnp153Details.tab \
+    | perl -wne 'chomp; next unless $_; @w = split ",";
+        if ($w[0]) { print "1000Genomes\n" }
+        if ($w[1]) { print "GnomAD_exomes\n"; }
+        if ($w[2]) { print "TOPMED\n" }
+        if ($w[3]) { print "ExAC\n" }
+        if ($w[4]) { print "PAGE_STUDY\n" }
+        if ($w[5]) { print "GnomAD\n" }
+        if ($w[6]) { print "GoESP\n" }
+        if ($w[7]) { print "Estonian\n" }
+        if ($w[8]) { print "ALSPAC\n" }
+        if ($w[9]) { print "TWINSUK\n" }
+        if ($w[10]) { print "NorthernSweden\n" }
+        if ($w[11]) { print "Vietnamese\n" }' \
+    | sort | uniq -c | sort -nr
+#437625009 TOPMED
+#211192420 GnomAD
+#84744375 1000Genomes
+#44888383 TWINSUK
+#44888383 ALSPAC
+#31397940 Estonian
+#16351632 NorthernSweden
+#12283940 GnomAD_exomes
+#10004052 Vietnamese
+#8854128 ExAC
+#1973841 GoESP
+#1323033 PAGE_STUDY
+
+    # 10/11/19: count how many times each type of ucscNote occurs (column 15, comma-separated; 'g' is presumably a grep alias, so "g ." drops empty lines):
+    cut -f 15 hg19.dbSnp153.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
+#    10747 altIsAmbiguous
+#     5701 classMismatch
+#   454656 clinvar
+#   113678 clusterError
+# 12178426 commonAll
+# 20534330 commonSome
+#  3522349 diffMajor
+#     7649 freqIsAmbiguous
+#    25413 freqNotRefAlt
+#   561309 multiMap
+#106940656 overlapDiffClass
+# 16890303 overlapSameClass
+#      101 refIsAmbiguous
+# 16032028 refIsMinor
+#   142937 refIsRare
+#    44382 refIsSingleton
+#        4 refMismatch
+#  3813390 revStrand
+    cut -f 15 hg38.dbSnp153.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
+#    10873 altIsAmbiguous
+#     5864 classMismatch
+#   453954 clinvar
+#   126973 clusterError
+# 12430253 commonAll
+# 20893174 commonSome
+#  3573503 diffMajor
+#     7749 freqIsAmbiguous
+#    39038 freqNotRefAlt
+#   132015 multiMap
+#109838613 overlapDiffClass
+# 17228657 overlapSameClass
+#      111 refIsAmbiguous
+# 16277729 refIsMinor
+#   166192 refIsRare
+#    56491 refIsSingleton
+#       33 refMismatch
+#  4512600 revStrand
+
 
 ##############################################################################