356c96fb811f1c8649d7685de33011877268d062
angie
  Tue Nov 5 10:35:31 2019 -0800
dbSnp153: rebuild to get correct counts of diffMajor and refIsMinor.  refs #23283

diff --git src/hg/makeDb/doc/bigDbSnp.txt src/hg/makeDb/doc/bigDbSnp.txt
index ae7c3a7..9a5b53a 100644
--- src/hg/makeDb/doc/bigDbSnp.txt
+++ src/hg/makeDb/doc/bigDbSnp.txt
@@ -187,31 +187,31 @@
 # 15075710 overlapSameClass
 #      110 refIsAmbiguous
 #  3033691 refIsMinor
 #   189809 refIsRare
 #    63804 refIsSingleton
 #       33 refMismatch
 #  4439534 revStrand
 
     # 10/18/19: add subset tracks
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 152 $freqSourceOrder \
        -buildDir=`pwd` -continue=bigBed -stop=install >& subsets.log &
     tail -f subsets.log
 
 
 ##############################################################################
-# dbSnp153: dbSNP build 153 (DONE 9/19/19 angie)
+# dbSnp153: dbSNP build 153 (DONE 11/4/19 angie)
 
     topDir=/hive/data/outside/dbSNP/153
     mkdir -p $topDir/json
     cd $topDir/json
     wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\*
     md5sum -c CHECKSUMS
 #refsnp-chr10.json.bz2: OK
 #...
 #refsnp-withdrawn.json.bz2: OK
 
     # jsonQuery commands to figure out what assemblies, SO terms and frequency sources are in there,
     # by sampling first 10,000 variants on an arbitrary chrom:
     assemblyPath="primary_snapshot_data.placements_with_allele[*].placement_annot.seq_id_traits_by_assembly[*].assembly_name"
     rnaSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].sequence_ontology[*].accession"
     proteinSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].protein.sequence_ontology[*].accession"
@@ -318,48 +318,48 @@
       >& redo.log &
     tail -f redo.log
 # *** All done !  (through the 'install' step)  Elapsed time: 263m59s
 # *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
 
     #*** uh-oh... when checkBigDbSnp failed, doCheck.sh did not fail... I guess backgrounding
     #*** the jobs and 'wait' hide errors?
     cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -buildDir=`pwd` \
       -continue check -stop install \
       >& check.log &
     tail -f check.log
 
     # 9/19/19: and again after changing doBigDbSnp.pl to have args & wait on specific pids:
     # 10/30/19: and again after adding new ucscNotes (#23283).
+    # 11/4/19: and again after finding that refIsMinor & diffMajor could be appended to ucscNotes multiple times per variant, inflating their counts
     topDir=/hive/data/outside/dbSNP/153
     freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,PAGE_STUDY,GnomAD,GoESP,Estonian,ALSPAC,TWINSUK,NorthernSweden,Vietnamese
     # Run doBigDbSnp.pl (first with -debug to make runDir):
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder -debug
-# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-10-30
-    cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-10-30
+# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-04
+    cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-04
     # Link to ../bigDbSnp.2019-08-07/split, -continue convert to avoid re-splitting (the slowest part of the process):
     rmdir split
     ln -s ../bigDbSnp.2019-08-07/split split
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \
       -buildDir=`pwd` -continue convert -stop install \
       >& do.log &
     tail -f do.log
-# *** All done !  (through the 'install' step)  Elapsed time: 472m19s
-# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-10-30
+# *** All done !  (through the 'install' step)  Elapsed time: 525m19s
+# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-04
 
     # count up how many variants have freq counts for each project
-#TODO
     cut -f 4 dbSnp153Details.tab \
     | perl -wne 'chomp; next unless $_; @w = split ",";
         if ($w[0]) { print "1000Genomes\n" }
         if ($w[1]) { print "GnomAD_exomes\n"; }
         if ($w[2]) { print "TOPMED\n" }
         if ($w[3]) { print "ExAC\n" }
         if ($w[4]) { print "PAGE_STUDY\n" }
         if ($w[5]) { print "GnomAD\n" }
         if ($w[6]) { print "GoESP\n" }
         if ($w[7]) { print "Estonian\n" }
         if ($w[8]) { print "ALSPAC\n" }
         if ($w[9]) { print "TWINSUK\n" }
         if ($w[10]) { print "NorthernSweden\n" }
         if ($w[11]) { print "Vietnamese\n" }' \
     | sort | uniq -c | sort -nr
@@ -375,56 +375,56 @@
 #8854128 ExAC
 #1973841 GoESP
 #1323033 PAGE_STUDY
 
     # count up how many instances of each type of ucscNote:
     cut -f 15 hg19.dbSnp153.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
 #    10747 altIsAmbiguous
 #     5701 classMismatch
 #   454656 clinvar
 #   143844 clinvarBenign
 #     7932 clinvarConflicting
 #    96242 clinvarPathogenic
 #   113678 clusterError
 # 12178426 commonAll
 # 20534330 commonSome
-#  3522349 diffMajor
+#  1378125 diffMajor
 #     7649 freqIsAmbiguous
 #    25413 freqNotRefAlt
 #   561309 multiMap
 #106940656 overlapDiffClass
 # 16890303 overlapSameClass
 #662571654 rareAll
 #670927558 rareSome
 #      101 refIsAmbiguous
-# 16032028 refIsMinor
+#  3277722 refIsMinor
 #   142937 refIsRare
 #    44382 refIsSingleton
 #        4 refMismatch
 #  3813390 revStrand
     cut -f 15 hg38.dbSnp153.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
 #    10873 altIsAmbiguous
 #     5864 classMismatch
 #   453954 clinvar
 #   143696 clinvarBenign
 #     7950 clinvarConflicting
 #    95262 clinvarPathogenic
 #   126973 clusterError
 # 12430253 commonAll
 # 20893174 commonSome
-#  3573503 diffMajor
+#  1399317 diffMajor
 #     7749 freqIsAmbiguous
 #    39038 freqNotRefAlt
 #   132015 multiMap
 #109838613 overlapDiffClass
 # 17228657 overlapSameClass
 #681626796 rareAll
 #690089717 rareSome
 #      111 refIsAmbiguous
-# 16277729 refIsMinor
+#  3364788 refIsMinor
 #   166192 refIsRare
 #    56491 refIsSingleton
 #       33 refMismatch
 #  4512600 revStrand
 
 
 ##############################################################################