b49e61be4ad54a46e01be904fa8a8985e9850f0d
angie
  Tue Nov 12 12:27:30 2019 -0800
dbSnp153: add a bigBed4 subtrack of coordinate ranges for mappings that we previously dropped due to inconsistent SPDI.  refs #23283
Overall counts increased because we used to bail on an entire variant as soon as we discovered an inconsistent SPDI,
losing some valid mappings along with the bad one.  Now we go through all mappings, and the bad ones are stored instead of dropped.

diff --git src/hg/makeDb/doc/bigDbSnp.txt src/hg/makeDb/doc/bigDbSnp.txt
index f429904..47fa08e 100644
--- src/hg/makeDb/doc/bigDbSnp.txt
+++ src/hg/makeDb/doc/bigDbSnp.txt
@@ -187,31 +187,31 @@
 # 15075710 overlapSameClass
 #      110 refIsAmbiguous
 #  3033691 refIsMinor
 #   189809 refIsRare
 #    63804 refIsSingleton
 #       33 refMismatch
 #  4439534 revStrand
 
     # 10/18/19: add subset tracks
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 152 $freqSourceOrder \
        -buildDir=`pwd` -continue=bigBed -stop=install >& subsets.log &
     tail -f subsets.log
 
 
 ##############################################################################
-# dbSnp153: dbSNP build 153 (DONE 11/4/19 angie)
+# dbSnp153: dbSNP build 153 (DONE 11/8/19 angie)
 
     topDir=/hive/data/outside/dbSNP/153
     mkdir -p $topDir/json
     cd $topDir/json
     wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\*
     md5sum -c CHECKSUMS
 #refsnp-chr10.json.bz2: OK
 #...
 #refsnp-withdrawn.json.bz2: OK
 
     # jsonQuery commands to figure out what assemblies, SO terms and frequency sources are in there,
     # by sampling first 10,000 variants on an arbitrary chrom:
     assemblyPath="primary_snapshot_data.placements_with_allele[*].placement_annot.seq_id_traits_by_assembly[*].assembly_name"
     rnaSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].sequence_ontology[*].accession"
     proteinSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].protein.sequence_ontology[*].accession"
@@ -320,45 +320,46 @@
 # *** All done !  (through the 'install' step)  Elapsed time: 263m59s
 # *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
 
     #*** uh-oh... when checkBigDbSnp failed, doCheck.sh did not fail... I guess backgrounding
     #*** the jobs and 'wait' hide errors?
     cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -buildDir=`pwd` \
       -continue check -stop install \
       >& check.log &
     tail -f check.log
 
     # 9/19/19: and again after changing doBigDbSnp.pl to have args & wait on specific pids:
     # 10/30/19: and again after adding new ucscNotes (#23283).
     # 11/4/19: and again after finding that refIsMinor & diffMajor could be appended multiple times
     # 11/7/19: and again after finding that some cases of freqNotRefAlt are VCF normalization probs
+    # 11/8/19: and again after adding badCoords.bed and warnings output files
     topDir=/hive/data/outside/dbSNP/153
     freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,PAGE_STUDY,GnomAD,GoESP,Estonian,ALSPAC,TWINSUK,NorthernSweden,Vietnamese
     # Run doBigDbSnp.pl (first with -debug to make runDir):
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder -debug
-# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-07
-    cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-07
+# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-08
+    cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-08
     # Link to ../bigDbSnp.2019-08-07/split, -continue convert to avoid re-splitting (the slowest part of the process):
     rmdir split
     ln -s ../bigDbSnp.2019-08-07/split split
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \
       -buildDir=`pwd` -continue convert -stop install \
       >& do.log &
     tail -f do.log
 # *** All done !  (through the 'install' step)  Elapsed time: 504m48s
-# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-07
+# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-08
 
     # count up how many variants have freq counts for each project
     cut -f 4 dbSnp153Details.tab \
     | perl -wne 'chomp; next unless $_; @w = split ",";
         if ($w[0]) { print "1000Genomes\n" }
         if ($w[1]) { print "GnomAD_exomes\n"; }
         if ($w[2]) { print "TOPMED\n" }
         if ($w[3]) { print "ExAC\n" }
         if ($w[4]) { print "PAGE_STUDY\n" }
         if ($w[5]) { print "GnomAD\n" }
         if ($w[6]) { print "GoESP\n" }
         if ($w[7]) { print "Estonian\n" }
         if ($w[8]) { print "ALSPAC\n" }
         if ($w[9]) { print "TWINSUK\n" }
         if ($w[10]) { print "NorthernSweden\n" }
@@ -367,65 +368,66 @@
 #437625009 TOPMED
 #211192420 GnomAD
 #84744375 1000Genomes
 #44888383 TWINSUK
 #44888383 ALSPAC
 #31397940 Estonian
 #16351632 NorthernSweden
 #12283940 GnomAD_exomes
 #10004052 Vietnamese
 #8854128 ExAC
 #1973841 GoESP
 #1323033 PAGE_STUDY
 
     # count up how many instances of each type of ucscNote:
     cut -f 15 hg19.dbSnp153.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
-#    10747 altIsAmbiguous
-#     5701 classMismatch
-#   454656 clinvar
-#   143844 clinvarBenign
+#    10754 altIsAmbiguous
+#     5995 classMismatch
+#   454674 clinvar
+#   143860 clinvarBenign
 #     7932 clinvarConflicting
 #    96242 clinvarPathogenic
-#   113678 clusterError
-# 12178426 commonAll
-# 20534330 commonSome
-#  1377402 diffMajor
-#     7649 freqIsAmbiguous
-#    16950 freqNotRefAlt
-#   561309 multiMap
-#106940656 overlapDiffClass
-# 16890303 overlapSameClass
-#662571654 rareAll
-#670927558 rareSome
+#   114685 clusterError
+# 12184226 commonAll
+# 20540882 commonSome
+#  1377817 diffMajor
+#     7656 freqIsAmbiguous
+#    17684 freqNotRefAlt
+#   562157 multiMap
+#107003090 overlapDiffClass
+# 16910407 overlapSameClass
+#662595470 rareAll
+#670952126 rareSome
 #      101 refIsAmbiguous
-#  3269451 refIsMinor
-#   135265 refIsRare
-#    36709 refIsSingleton
+#  3271878 refIsMinor
+#   136452 refIsRare
+#    37783 refIsSingleton
 #        4 refMismatch
-#  3813390 revStrand
+#  3813467 revStrand
+
     cut -f 15 hg38.dbSnp153.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
-#    10873 altIsAmbiguous
-#     5864 classMismatch
-#   453954 clinvar
-#   143696 clinvarBenign
+#  10880 altIsAmbiguous
+#   6206 classMismatch
+# 453990 clinvar
+# 143730 clinvarBenign
 #   7950 clinvarConflicting
 #  95262 clinvarPathogenic
-#   126973 clusterError
-# 12430253 commonAll
-# 20893174 commonSome
-#  1398591 diffMajor
-#     7749 freqIsAmbiguous
-#    30615 freqNotRefAlt
-#   132015 multiMap
-#109838613 overlapDiffClass
-# 17228657 overlapSameClass
-#681626796 rareAll
-#690089717 rareSome
+# 128109 clusterError
+#12438325 commonAll
+#20902602 commonSome
+#1399094 diffMajor
+#   7756 freqIsAmbiguous
+#  32150 freqNotRefAlt
+# 132051 multiMap
+#109991096 overlapDiffClass
+#17281744 overlapSameClass
+#681685476 rareAll
+#690149753 rareSome
 #    111 refIsAmbiguous
-#  3356557 refIsMinor
-#   158562 refIsRare
-#    48859 refIsSingleton
+#3360159 refIsMinor
+# 160723 refIsRare
+#  50865 refIsSingleton
 #     33 refMismatch
-#  4512600 revStrand
+#4532270 revStrand
 
 
 ##############################################################################