src/hg/makeDb/doc/bigDbSnp.txt 726a4c9c82592c662ae6b8c715e67b783c95beb3

726a4c9c82592c662ae6b8c715e67b783c95beb3
angie
  Mon Nov 18 13:45:42 2019 -0800
Instead of dropping rs IDs that have incomplete frequency data, add two new ucscNotes: freqIncomplete and freqNotMapped.  refs #23283

diff --git src/hg/makeDb/doc/bigDbSnp.txt src/hg/makeDb/doc/bigDbSnp.txt
index 86d46a3..288d3b3 100644
--- src/hg/makeDb/doc/bigDbSnp.txt
+++ src/hg/makeDb/doc/bigDbSnp.txt
@@ -1,448 +1,454 @@
 # for emacs: -*- mode: sh; -*-
 
 # dbSnpNNN ("dbSNP 2.0") bigDbSnp tracks for hg38 / GRCh38 and hg19 / GRCh37
 
 ##############################################################################
 # dbSnp152: dbSNP build 152 (DONE 9/18/19 angie)
 
     topDir=/hive/data/outside/dbSNP/152
     mkdir -p $topDir/json
     cd $topDir/json
     wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\*
     md5sum -c CHECKSUMS
 #refsnp-chr10.json.bz2: OK
 #...
 #refsnp-withdrawn.json.bz2: OK
 
     # jsonQuery commands to figure out what assemblies, SO terms and frequency sources are in there,
     # by sampling first 10,000 variants on an arbitrary chrom:
     set assemblyPath = "primary_snapshot_data.placements_with_allele[*].placement_annot.seq_id_traits_by_assembly[*].assembly_name"
     set rnaSoPath = "primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].sequence_ontology[*].accession"
     set proteinSoPath = "primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].protein.sequence_ontology[*].accession"
     set freqSourcePath = "primary_snapshot_data.allele_annotations[*].frequency[*].study_name"
     foreach jPath ("$assemblyPath" "$rnaSoPath" "$proteinSoPath" "$freqSourcePath")
       echo "$jPath"
       bzcat refsnp-chr3.json.bz2 \
       | head -10000 \
       | jsonQuery -countUniq -verbose=2 stdin "$jPath" stdout \
       | sort -nr
       echo ""
     end
     # Assemblies:
 #     10229 "GRCh38.p12"
 #     10111 "GRCh37.p13"
     # RNA SO terms -- make sure all of these appear in soTerm.[ch]:
 #    147701 "SO:0001627"
 #     26809 "SO:0002153"
 #     19013 "SO:0002152"
 #      8334 "SO:0001624"
 #      3082 "SO:0001986"
 #      2657 "SO:0001580"
 #      1772 "SO:0001619"
 #      1741 "SO:0001987"
 #       833 "SO:0001623"
 #        30 "SO:0001575"
 #        10 "SO:0001574"
 #         4 "SO:0001590"
     # Protein SO terms -- ditto for soTerm.[ch]:
 #       784 "SO:0001819"
 #       713 "SO:0001583"
 #         6 "SO:0001821"
 #         5 "SO:0001587"
 #         2 "SO:0000865"
     # Made sure all those are in hg/{inc,lib}/soTerm.[ch]
 
     # Projects reporting allele counts/frequencies:
 #     18130 "GnomAD"
 #     17473 "1000Genomes"
 #     17376 "TOPMED"
 #     15952 "TWINSUK"
 #     15952 "ALSPAC"
 #     15188 "Estonian"
 #       555 "GnomAD_exomes"
 #       473 "GoESP"
 #       468 "ExAC"
     freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,GnomAD,GoESP,ALSPAC,TWINSUK,Estonian
 
     cd $topDir
     # Construct a mapping from RefSeq accessions like NC_000... to assembly, 2bit, and UCSC name.
     hgsql hg38 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source);' \
     | tawk '{print $1, "GRCh38.p12", "/hive/data/genomes/hg38/hg38.2bit", $2;}' \
       > refSeqToUcsc.tab
     hgsql hg19 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source);' \
     | tawk '{print $1, "GRCh37.p13", "/hive/data/genomes/hg19/hg19.2bit", $2;}' \
       >> refSeqToUcsc.tab
 
     # Construct a mapping of equivalent RefSeq assembly regions for GRCh38 and GRCh37,
     # so we can distinguish multiple mappings to PAR/alts/fixes from plain old multiple mappings.
     refseqAssemblies=/hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions
     grep -v ^# \
       $refseqAssemblies/GCF_000001405.{25_GRCh37.p13,38_GRCh38.p12}/*_assembly_structure/all_alt_scaffold_placement.txt \
     | tawk '{print $7, $12-1, $13, $4, $10-1, $11;}' \
     | sort -k 1,1 -k2n,2n \
     | tawk '{print $1 ":" $2 ":" $3, $4 ":" $5 ":" $6;}' \
       > equivRegions.tab
     # Add PARs:
     grep -w PAR \
       $refseqAssemblies/GCF_000001405.25_GRCh37.p13/*_assembly_regions.txt \
     | sort \
     | sed -e 's/X/NC_000023.10/; s/Y/NC_000024.9/;' \
     | tawk '{print $1, $2 ":" $3 - 1 ":" $4;}'
 #PAR#1   NC_000023.10:60000:2699520
 #PAR#1   NC_000024.9:10000:2649520
 #PAR#2   NC_000023.10:154931043:155260560
 #PAR#2   NC_000024.9:59034049:59363566
     echo -e "NC_000023.10:60000:2699520\tNC_000024.9:10000:2649520" >> equivRegions.tab
     echo -e "NC_000023.10:154931043:155260560\tNC_000024.9:59034049:59363566" >> equivRegions.tab
     grep -w PAR \
       $refseqAssemblies/GCF_000001405.38_GRCh38.p12/*_assembly_regions.txt \
     | sort \
     | sed -e 's/X/NC_000023.11/; s/Y/NC_000024.10/;' \
     | tawk '{print $1, $2 ":" $3 - 1 ":" $4;}'
 #PAR#1   NC_000023.11:10000:2781479
 #PAR#1   NC_000024.10:10000:2781479
 #PAR#2   NC_000023.11:155701382:156030895
 #PAR#2   NC_000024.10:56887902:57217415
     echo "NC_000023.11:10000:2781479\tNC_000024.10:10000:2781479" >> equivRegions.tab
     echo "NC_000023.11:155701382:156030895\tNC_000024.10:56887902:57217415" >> equivRegions.tab
 
 
     # Run doBigDbSnp.pl...
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -continue convert -stop bigBed \
       >& do.log &
     tail -f do.log
 # *** All done !  (through the 'install' step)  Elapsed time: 278m36s
 # *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
 
     # 9/17/19: re-run from dbSnpJsonToTab onward after lots of changes
     topDir=/hive/data/outside/dbSNP/152
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -stop install -debug
 # *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
     cd /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
     # Link to ../split, -continue convert to avoid re-splitting (the slowest part of the process):
     rm split
     ln -s ../split split
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 152 $freqSourceOrder \
       -buildDir=`pwd` -continue convert -stop install \
       >& do.log &
     tail -f do.log
 # *** All done !  (through the 'install' step)  Elapsed time: 449m6s
 # *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
 
     # 10/8/19: count up how many variants have freq counts for each project
     cut -f 4 dbSnp152Details.tab \
     | perl -wne 'chomp; next unless $_; @w = split ",";
         if ($w[0]) { print "1000Genomes\n" }
         if ($w[1]) { print "GnomAD_exomes\n"; }
         if ($w[2]) { print "TOPMED\n" }
         if ($w[3]) { print "ExAC\n" }
         if ($w[4]) { print "GnomAD\n" }
         if ($w[5]) { print "GoESP\n" }
         if ($w[6]) { print "ALSPAC\n" }
         if ($w[7]) { print "TWINSUK\n" }
         if ($w[8]) { print "Estonian\n" }' \
     | sort | uniq -c | sort -nr
 #437624857 TOPMED
 #234158623 GnomAD
 #84743526 1000Genomes
 #44887599 TWINSUK
 #44887599 ALSPAC
 #31397792 Estonian
 #11721224 GnomAD_exomes
 #8854021 ExAC
 #1973787 GoESP
 
     # 10/11/19: count up how many instances of each type of ucscNote:
     cut -f 15 hg19.dbSnp152.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
 #   10680 altIsAmbiguous
 #    4808 classMismatch
 #  409132 clinvar
 #  106941 clusterError
 #12757487 commonAll
 #18901486 commonSome
 #  823424 diffMajor
 #    7635 freqIsAmbiguous
 #   23027 freqNotRefAlt
 #  555144 multiMap
 #99618012 overlapDiffClass
 #14790469 overlapSameClass
 #     101 refIsAmbiguous
 # 2933684 refIsMinor
 #  150892 refIsRare
 #   45618 refIsSingleton
 #       4 refMismatch
 # 3761191 revStrand
     cut -f 15 hg38.dbSnp152.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
 #    10807 altIsAmbiguous
 #     5103 classMismatch
 #   408665 clinvar
 #    94310 clusterError
 # 13027110 commonAll
 # 19258751 commonSome
 #   836327 diffMajor
 #     7736 freqIsAmbiguous
 #    36306 freqNotRefAlt
 #   130175 multiMap
 #102260850 overlapDiffClass
 # 15075710 overlapSameClass
 #      110 refIsAmbiguous
 #  3033691 refIsMinor
 #   189809 refIsRare
 #    63804 refIsSingleton
 #       33 refMismatch
 #  4439534 revStrand
 
     # 10/18/19: add subset tracks
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 152 $freqSourceOrder \
        -buildDir=`pwd` -continue=bigBed -stop=install >& subsets.log &
     tail -f subsets.log
 
 
 ##############################################################################
-# dbSnp153: dbSNP build 153 (DONE 11/8/19 angie)
+# dbSnp153: dbSNP build 153 (DONE 11/18/19 angie)
 
     topDir=/hive/data/outside/dbSNP/153
     mkdir -p $topDir/json
     cd $topDir/json
     wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\*
     md5sum -c CHECKSUMS
 #refsnp-chr10.json.bz2: OK
 #...
 #refsnp-withdrawn.json.bz2: OK
 
     # jsonQuery commands to figure out what assemblies, SO terms and frequency sources are in there,
     # by sampling first 10,000 variants on an arbitrary chrom:
     assemblyPath="primary_snapshot_data.placements_with_allele[*].placement_annot.seq_id_traits_by_assembly[*].assembly_name"
     rnaSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].sequence_ontology[*].accession"
     proteinSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].protein.sequence_ontology[*].accession"
     freqSourcePath="primary_snapshot_data.allele_annotations[*].frequency[*].study_name"
     for jPath in "$assemblyPath" "$rnaSoPath" "$proteinSoPath" "$freqSourcePath"; do
       echo "$jPath"
       bzcat refsnp-chr3.json.bz2 \
       | head -10000 \
       | jsonQuery -countUniq -verbose=2 /dev/stdin "$jPath" stdout \
       | sort -nr
       echo ""
     done
     # Assemblies:
 #     10229 "GRCh38.p12"
 #     10111 "GRCh37.p13"
 
     # RNA SO terms -- make sure all of these appear in soTerm.[ch]:
 #    144288 "SO:0001627"
 #     25829 "SO:0002153"
 #     19729 "SO:0002152"
 #      8506 "SO:0001624"
 #      3112 "SO:0001986"
 #      2648 "SO:0001580"
 #      1769 "SO:0001619"
 #      1712 "SO:0001987"
 #       878 "SO:0001623"
 #        30 "SO:0001575"
 #        10 "SO:0001574"
 #         4 "SO:0001590"
     # Protein SO terms -- ditto for soTerm.[ch]:
 #       770 "SO:0001819"
 #       726 "SO:0001583"
 #         6 "SO:0001821"
 #         5 "SO:0001587"
 #         2 "SO:0000865"
     # Made sure all those are in hg/{inc,lib}/soTerm.[ch] (nothing new since b152)
 
     # Projects reporting allele counts/frequencies:
 #     17493 "1000Genomes"
 #     17392 "TOPMED"
 #     16619 "GnomAD"
 #     16306 "NorthernSweden"
 #     15970 "TWINSUK"
 #     15970 "ALSPAC"
 #     15202 "Estonian"
 #     13096 "Vietnamese"
 #      1844 "PAGE_STUDY"
 #       473 "GoESP"
 #       468 "ExAC"
 #       458 "GnomAD_exomes"
 
     # This time the JSON downloads include a file frequency_studies.json that describes each study.
     # Will be useful for making a details page, but some descriptions are just study names.
     # Get total_count values from refsnp-chr3.json.bz2, put 1000Genomes first, then order by
     # decreasing total_count:
     # 1000Genomes:  5008
     # GnomAD_exomes: 251006
     # TOPMED: 125568
     # ExAC: 121234
     # PAGE_STUDY: 78694
     # GnomAD: 31348
     # GoESP: 12494
     # Estonian: 4480
     # ALSPAC: 3854
     # TWINSUK: 3708
     # NorthernSweden: 600
     # Vietnamese: 212
     freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,PAGE_STUDY,GnomAD,GoESP,Estonian,ALSPAC,TWINSUK,NorthernSweden,Vietnamese
     # Hmm, the PAGE study is a population genetics study.  It excludes caucasians:
     # https://www.biorxiv.org/content/biorxiv/early/2018/10/17/188094.full.pdf
     # "Genotyped individuals self-identified as Hispanic/Latino (N=22,216),
     #  African American (N=17,299), Asian (N=4,680), Native Hawaiian (N=3,940),
     #  Native American (N=652), or Other (N=1,052, primarily South Asian or mixed heritage,
     #  as well as participants who did not identify with any of the available options."
     # They didn't attempt to make balanced global (non-caucasian) populations (e.g. almost
     # as many Native Hawaiians as Asians), so I'll keep 1000Genomes first.
 
     # Reuse assembly sequence mapping files from b152 since the assemblies are the same.
     cd $topDir
     cp ../152/refSeqToUcsc.tab .
     cp ../152/equivRegions.tab .
 
     # Run doBigDbSnp.pl (first with -debug to make runDir
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -debug
 # *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
     cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -stop install \
       >& do.log &
     tail -f do.log
 # *** All done !  (through the 'install' step)  Elapsed time: 2095m35s
 
     # 9/13/19: Now that checkDbSnp has been added to doBigDbSnp.pl, re-run from the check
     # stage onward (but don't cleanup just yet in case we need to debug files).
     cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -buildDir=`pwd` \
       -continue check -stop install \
       >& check.log &
     tail -f check.log
 
     # 9/18/19: re-run from dbSnpJsonToTab onward after adding several more ucscNotes.
     cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir \
       -buildDir=`pwd` -continue convert -stop install \
       >& redo.log &
     tail -f redo.log
 # *** All done !  (through the 'install' step)  Elapsed time: 263m59s
 # *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
 
     #*** uh-oh... when checkBigDbSnp failed, doCheck.sh did not fail... I guess backgrounding
     #*** the jobs and 'wait' hide errors?
     cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -buildDir=`pwd` \
       -continue check -stop install \
       >& check.log &
     tail -f check.log
 
     # 9/19/19: and again after changing doBigDbSnp.pl to have args & wait on specific pids:
     # 10/30/19: and again after adding new ucscNotes (#23283).
     # 11/4/19: and again after finding that refIsMinor & diffMajor could be appended multiple times
     # 11/7/19: and again after finding that some cases of freqNotRefAlt are VCF normalization probs
     # 11/8/19: and again after adding badCoords.bed and warnings output files
+    # 11/15/19: and again after adding ucscNotes freqIncomplete, freqNotMapped
     topDir=/hive/data/outside/dbSNP/153
     freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,PAGE_STUDY,GnomAD,GoESP,Estonian,ALSPAC,TWINSUK,NorthernSweden,Vietnamese
     # Run doBigDbSnp.pl (first with -debug to make runDir):
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder -debug
-# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-08
-    cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-08
+# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-15
+    cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-15
     # Link to ../bigDbSnp.2019-08-07/split, -continue convert to avoid re-splitting (the slowest part of the process):
     rmdir split
     ln -s ../bigDbSnp.2019-08-07/split split
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \
       -buildDir=`pwd` -continue convert -stop install \
       >& do.log &
     tail -f do.log
-# *** All done !  (through the 'install' step)  Elapsed time: 504m48s
-# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-08
-
-    # 11/12/19: Add new ucscNote otherMapErr by re-running from check onward.
-    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \
-      -buildDir=`pwd` -continue check -stop install \
-      >& check.log &
-    tail -f check.log
-# *** All done !  (through the 'install' step)  Elapsed time: 172m35s
+# *** All done !  (through the 'install' step)  Elapsed time: 545m26s
+# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-15
 
     # count up how many variants have freq counts for each project
     cut -f 4 dbSnp153Details.tab \
     | perl -wne 'chomp; next unless $_; @w = split ",";
         if ($w[0]) { print "1000Genomes\n" }
         if ($w[1]) { print "GnomAD_exomes\n"; }
         if ($w[2]) { print "TOPMED\n" }
         if ($w[3]) { print "ExAC\n" }
         if ($w[4]) { print "PAGE_STUDY\n" }
         if ($w[5]) { print "GnomAD\n" }
         if ($w[6]) { print "GoESP\n" }
         if ($w[7]) { print "Estonian\n" }
         if ($w[8]) { print "ALSPAC\n" }
         if ($w[9]) { print "TWINSUK\n" }
         if ($w[10]) { print "NorthernSweden\n" }
         if ($w[11]) { print "Vietnamese\n" }' \
     | sort | uniq -c | sort -nr
 #437625009 TOPMED
 #211192420 GnomAD
 #84744375 1000Genomes
 #44888383 TWINSUK
 #44888383 ALSPAC
 #31397940 Estonian
 #16351632 NorthernSweden
 #12283940 GnomAD_exomes
 #10004052 Vietnamese
 #8854128 ExAC
 #1973841 GoESP
 #1323033 PAGE_STUDY
 
     # count up how many instances of each type of ucscNote:
     time cut -f 15 hg19.dbSnp153.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
-#    10754 altIsAmbiguous
-#     5995 classMismatch
-#   454674 clinvar
-#   143860 clinvarBenign
+#    10755 altIsAmbiguous
+#     5998 classMismatch
+#   454678 clinvar
+#   143864 clinvarBenign
 #     7932 clinvarConflicting
 #    96242 clinvarPathogenic
-#   114685 clusterError
-# 12184226 commonAll
-# 20540882 commonSome
-#  1377817 diffMajor
+#   114686 clusterError
+# 12184520 commonAll
+# 20541189 commonSome
+#  1377831 diffMajor
+#     3922 freqIncomplete
 #     7656 freqIsAmbiguous
-#    17684 freqNotRefAlt
-#   562157 multiMap
-#   113416 otherMapErr
-#107003090 overlapDiffClass
-# 16910407 overlapSameClass
-#662595470 rareAll
-#670952126 rareSome
+#     2685 freqNotMapped
+#    17693 freqNotRefAlt
+#   562180 multiMap
+#   114094 otherMapErr
+#107015340 overlapDiffClass
+# 16915237 overlapSameClass
+#662601770 rareAll
+#670958439 rareSome
 #      101 refIsAmbiguous
-#  3271878 refIsMinor
-#   136452 refIsRare
-#    37783 refIsSingleton
+#  3272115 refIsMinor
+#   136546 refIsRare
+#    37831 refIsSingleton
 #        4 refMismatch
-#  3813467 revStrand
+#  3813702 revStrand
+#real    34m57.796s
+#user    47m49.283s
+#sys     4m29.442s
+
     # Check count of rs's with at least one bad mapping:
     grep otherMapErr hg19.dbSnp153.checked.bigDbSnp | cut -f 4 | sort -u | wc -l
-#54871
+#55453
 
     time cut -f 15 hg38.dbSnp153.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
-#  10880 altIsAmbiguous
-#   6206 classMismatch
-# 453990 clinvar
-# 143730 clinvarBenign
+#    10888 altIsAmbiguous
+#     6216 classMismatch
+#   453996 clinvar
+#   143736 clinvarBenign
 #     7950 clinvarConflicting
 #    95262 clinvarPathogenic
-# 128109 clusterError
-#12438325 commonAll
-#20902602 commonSome
-#1399094 diffMajor
+#   128306 clusterError
+# 12438654 commonAll
+# 20902943 commonSome
+#  1399109 diffMajor
+#     4673 freqIncomplete
 #     7756 freqIsAmbiguous
-#  32150 freqNotRefAlt
-# 132051 multiMap
-# 203580 otherMapErr
-#109991096 overlapDiffClass
-#17281744 overlapSameClass
-#681685476 rareAll
-#690149753 rareSome
+#     6590 freqNotMapped
+#    32169 freqNotRefAlt
+#   132123 multiMap
+#   204219 otherMapErr
+#110007681 overlapDiffClass
+# 17291287 overlapSameClass
+#681696398 rareAll
+#690160687 rareSome
 #      111 refIsAmbiguous
-#3360159 refIsMinor
-# 160723 refIsRare
-#  50865 refIsSingleton
+#  3360434 refIsMinor
+#   160826 refIsRare
+#    50926 refIsSingleton
 #       33 refMismatch
-#4532270 revStrand
+#  4532511 revStrand
+#real    36m36.972s
+#user    49m41.817s
+#sys     4m43.806s
+
     # Check count of rs's with at least one bad mapping:
     grep otherMapErr hg38.dbSnp153.checked.bigDbSnp | cut -f 4 | sort -u | wc -l
 #86258
 
 
 ##############################################################################