acca3deffc05c4d8d11590a1cf3d893763254712
angie
Thu Oct 31 13:43:05 2019 -0700
dbSnp153: Adding new ucscNotes suggested by Ana Benet: clinvar{Benign,Conflicting,Pathogenic}, rareAll, rareSome. refs #23283
diff --git src/hg/makeDb/doc/bigDbSnp.txt src/hg/makeDb/doc/bigDbSnp.txt
index 5b27f29..ae7c3a7 100644
--- src/hg/makeDb/doc/bigDbSnp.txt
+++ src/hg/makeDb/doc/bigDbSnp.txt
@@ -1,423 +1,430 @@
# for emacs: -*- mode: sh; -*-
# dbSnpNNN ("dbSNP 2.0") bigDbSnp tracks for hg38 / GRCh38 and hg19 / GRCh37
##############################################################################
# dbSnp152: dbSNP build 152 (DONE 9/18/19 angie)
topDir=/hive/data/outside/dbSNP/152
mkdir -p $topDir/json
cd $topDir/json
wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\*
md5sum -c CHECKSUMS
#refsnp-chr10.json.bz2: OK
#...
#refsnp-withdrawn.json.bz2: OK
# jsonQuery commands to figure out what assemblies, SO terms and frequency sources are in there,
# by sampling first 10,000 variants on an arbitrary chrom:
set assemblyPath = "primary_snapshot_data.placements_with_allele[*].placement_annot.seq_id_traits_by_assembly[*].assembly_name"
set rnaSoPath = "primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].sequence_ontology[*].accession"
set proteinSoPath = "primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].protein.sequence_ontology[*].accession"
set freqSourcePath = "primary_snapshot_data.allele_annotations[*].frequency[*].study_name"
foreach jPath ("$assemblyPath" "$rnaSoPath" "$proteinSoPath" "$freqSourcePath")
echo "$jPath"
bzcat refsnp-chr3.json.bz2 \
| head -10000 \
| jsonQuery -countUniq -verbose=2 stdin "$jPath" stdout \
| sort -nr
echo ""
end
# Assemblies:
# 10229 "GRCh38.p12"
# 10111 "GRCh37.p13"
# RNA SO terms -- make sure all of these appear in soTerm.[ch]:
# 147701 "SO:0001627"
# 26809 "SO:0002153"
# 19013 "SO:0002152"
# 8334 "SO:0001624"
# 3082 "SO:0001986"
# 2657 "SO:0001580"
# 1772 "SO:0001619"
# 1741 "SO:0001987"
# 833 "SO:0001623"
# 30 "SO:0001575"
# 10 "SO:0001574"
# 4 "SO:0001590"
# Protein SO terms -- ditto for soTerm.[ch]:
# 784 "SO:0001819"
# 713 "SO:0001583"
# 6 "SO:0001821"
# 5 "SO:0001587"
# 2 "SO:0000865"
# Made sure all those are in hg/{inc,lib}/soTerm.[ch]
# Projects reporting allele counts/frequencies:
# 18130 "GnomAD"
# 17473 "1000Genomes"
# 17376 "TOPMED"
# 15952 "TWINSUK"
# 15952 "ALSPAC"
# 15188 "Estonian"
# 555 "GnomAD_exomes"
# 473 "GoESP"
# 468 "ExAC"
freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,GnomAD,GoESP,ALSPAC,TWINSUK,Estonian
cd $topDir
# Construct a mapping from RefSeq accessions like NC_000... to assembly, 2bit, and UCSC name.
hgsql hg38 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source);' \
| tawk '{print $1, "GRCh38.p12", "/hive/data/genomes/hg38/hg38.2bit", $2;}' \
> refSeqToUcsc.tab
hgsql hg19 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source);' \
| tawk '{print $1, "GRCh37.p13", "/hive/data/genomes/hg19/hg19.2bit", $2;}' \
>> refSeqToUcsc.tab
# Construct a mapping of equivalent RefSeq assembly regions for GRCh38 and GRCh37,
# so we can distinguish multiple mappings to PAR/alts/fixes from plain old multiple mappings.
refseqAssemblies=/hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions
grep -v ^# \
$refseqAssemblies/GCF_000001405.{25_GRCh37.p13,38_GRCh38.p12}/*_assembly_structure/all_alt_scaffold_placement.txt \
| tawk '{print $7, $12-1, $13, $4, $10-1, $11;}' \
| sort -k 1,1 -k2n,2n \
| tawk '{print $1 ":" $2 ":" $3, $4 ":" $5 ":" $6;}' \
> equivRegions.tab
# Add PARs:
grep -w PAR \
$refseqAssemblies/GCF_000001405.25_GRCh37.p13/*_assembly_regions.txt \
| sort \
| sed -e 's/X/NC_000023.10/; s/Y/NC_000024.9/;' \
| tawk '{print $1, $2 ":" $3 - 1 ":" $4;}'
#PAR#1 NC_000023.10:60000:2699520
#PAR#1 NC_000024.9:10000:2649520
#PAR#2 NC_000023.10:154931043:155260560
#PAR#2 NC_000024.9:59034049:59363566
echo -e "NC_000023.10:60000:2699520\tNC_000024.9:10000:2649520" >> equivRegions.tab
echo -e "NC_000023.10:154931043:155260560\tNC_000024.9:59034049:59363566" >> equivRegions.tab
grep -w PAR \
$refseqAssemblies/GCF_000001405.38_GRCh38.p12/*_assembly_regions.txt \
| sort \
| sed -e 's/X/NC_000023.11/; s/Y/NC_000024.10/;' \
| tawk '{print $1, $2 ":" $3 - 1 ":" $4;}'
#PAR#1 NC_000023.11:10000:2781479
#PAR#1 NC_000024.10:10000:2781479
#PAR#2 NC_000023.11:155701382:156030895
#PAR#2 NC_000024.10:56887902:57217415
echo "NC_000023.11:10000:2781479\tNC_000024.10:10000:2781479" >> equivRegions.tab
echo "NC_000023.11:155701382:156030895\tNC_000024.10:56887902:57217415" >> equivRegions.tab
# Run doBigDbSnp.pl...
$HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -continue convert -stop bigBed \
>& do.log &
tail -f do.log
# *** All done ! (through the 'install' step) Elapsed time: 278m36s
# *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
# 9/17/19: re-run from dbSnpJsonToTab onward after lots of changes
topDir=/hive/data/outside/dbSNP/152
$HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -stop install -debug
# *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
cd /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
# Link to ../split, -continue convert to avoid re-splitting (the slowest part of the process):
rm split
ln -s ../split split
$HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 152 $freqSourceOrder \
-buildDir=`pwd` -continue convert -stop install \
>& do.log &
tail -f do.log
# *** All done ! (through the 'install' step) Elapsed time: 449m6s
# *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
# 10/8/19: count up how many variants have freq counts for each project
cut -f 4 dbSnp152Details.tab \
| perl -wne 'chomp; next unless $_; @w = split ",";
if ($w[0]) { print "1000Genomes\n" }
if ($w[1]) { print "GnomAD_exomes\n"; }
if ($w[2]) { print "TOPMED\n" }
if ($w[3]) { print "ExAC\n" }
if ($w[4]) { print "GnomAD\n" }
if ($w[5]) { print "GoESP\n" }
if ($w[6]) { print "ALSPAC\n" }
if ($w[7]) { print "TWINSUK\n" }
if ($w[8]) { print "Estonian\n" }' \
| sort | uniq -c | sort -nr
#437624857 TOPMED
#234158623 GnomAD
#84743526 1000Genomes
#44887599 TWINSUK
#44887599 ALSPAC
#31397792 Estonian
#11721224 GnomAD_exomes
#8854021 ExAC
#1973787 GoESP
# 10/11/19: count up how many instances of each type of ucscNote:
cut -f 15 hg19.dbSnp152.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
# 10680 altIsAmbiguous
# 4808 classMismatch
# 409132 clinvar
# 106941 clusterError
#12757487 commonAll
#18901486 commonSome
# 823424 diffMajor
# 7635 freqIsAmbiguous
# 23027 freqNotRefAlt
# 555144 multiMap
#99618012 overlapDiffClass
#14790469 overlapSameClass
# 101 refIsAmbiguous
# 2933684 refIsMinor
# 150892 refIsRare
# 45618 refIsSingleton
# 4 refMismatch
# 3761191 revStrand
cut -f 15 hg38.dbSnp152.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
# 10807 altIsAmbiguous
# 5103 classMismatch
# 408665 clinvar
# 94310 clusterError
# 13027110 commonAll
# 19258751 commonSome
# 836327 diffMajor
# 7736 freqIsAmbiguous
# 36306 freqNotRefAlt
# 130175 multiMap
#102260850 overlapDiffClass
# 15075710 overlapSameClass
# 110 refIsAmbiguous
# 3033691 refIsMinor
# 189809 refIsRare
# 63804 refIsSingleton
# 33 refMismatch
# 4439534 revStrand
# 10/18/19: add subset tracks
$HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 152 $freqSourceOrder \
-buildDir=`pwd` -continue=bigBed -stop=install >& subsets.log &
tail -f subsets.log
##############################################################################
# dbSnp153: dbSNP build 153 (DONE 9/19/19 angie)
topDir=/hive/data/outside/dbSNP/153
mkdir -p $topDir/json
cd $topDir/json
wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\*
md5sum -c CHECKSUMS
#refsnp-chr10.json.bz2: OK
#...
#refsnp-withdrawn.json.bz2: OK
# jsonQuery commands to figure out what assemblies, SO terms and frequency sources are in there,
# by sampling first 10,000 variants on an arbitrary chrom:
assemblyPath="primary_snapshot_data.placements_with_allele[*].placement_annot.seq_id_traits_by_assembly[*].assembly_name"
rnaSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].sequence_ontology[*].accession"
proteinSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].protein.sequence_ontology[*].accession"
freqSourcePath="primary_snapshot_data.allele_annotations[*].frequency[*].study_name"
for jPath in "$assemblyPath" "$rnaSoPath" "$proteinSoPath" "$freqSourcePath"; do
echo "$jPath"
bzcat refsnp-chr3.json.bz2 \
| head -10000 \
| jsonQuery -countUniq -verbose=2 /dev/stdin "$jPath" stdout \
| sort -nr
echo ""
done
# Assemblies:
# 10229 "GRCh38.p12"
# 10111 "GRCh37.p13"
# RNA SO terms -- make sure all of these appear in soTerm.[ch]:
# 144288 "SO:0001627"
# 25829 "SO:0002153"
# 19729 "SO:0002152"
# 8506 "SO:0001624"
# 3112 "SO:0001986"
# 2648 "SO:0001580"
# 1769 "SO:0001619"
# 1712 "SO:0001987"
# 878 "SO:0001623"
# 30 "SO:0001575"
# 10 "SO:0001574"
# 4 "SO:0001590"
# Protein SO terms -- ditto for soTerm.[ch]:
# 770 "SO:0001819"
# 726 "SO:0001583"
# 6 "SO:0001821"
# 5 "SO:0001587"
# 2 "SO:0000865"
# Made sure all those are in hg/{inc,lib}/soTerm.[ch] (nothing new since b152)
# Projects reporting allele counts/frequencies:
# 17493 "1000Genomes"
# 17392 "TOPMED"
# 16619 "GnomAD"
# 16306 "NorthernSweden"
# 15970 "TWINSUK"
# 15970 "ALSPAC"
# 15202 "Estonian"
# 13096 "Vietnamese"
# 1844 "PAGE_STUDY"
# 473 "GoESP"
# 468 "ExAC"
# 458 "GnomAD_exomes"
# This time the JSON downloads include a file frequency_studies.json that describes each study.
# Will be useful for making a details page, but some descriptions are just study names.
# Get total_count values from refsnp-chr3.json.bz2, put 1000Genomes first, then order by
# decreasing total_count:
# 1000Genomes: 5008
# GnomAD_exomes: 251006
# TOPMED: 125568
# ExAC: 121234
# PAGE_STUDY: 78694
# GnomAD: 31348
# GoESP: 12494
# Estonian: 4480
# ALSPAC: 3854
# TWINSUK: 3708
# NorthernSweden: 600
# Vietnamese: 212
freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,PAGE_STUDY,GnomAD,GoESP,Estonian,ALSPAC,TWINSUK,NorthernSweden,Vietnamese
# Hmm, the PAGE study is a population genetics study. It excludes caucasians:
# https://www.biorxiv.org/content/biorxiv/early/2018/10/17/188094.full.pdf
# "Genotyped individuals self-identified as Hispanic/Latino (N=22,216),
# African American (N=17,299), Asian (N=4,680), Native Hawaiian (N=3,940),
# Native American (N=652), or Other (N=1,052, primarily South Asian or mixed heritage,
# as well as participants who did not identify with any of the available options."
# They didn't attempt to make balanced global (non-caucasian) populations (e.g. almost
# as many Native Hawaiians as Asians), so I'll keep 1000Genomes first.
# Reuse assembly sequence mapping files from b152 since the assemblies are the same.
cd $topDir
cp ../152/refSeqToUcsc.tab .
cp ../152/equivRegions.tab .
# Run doBigDbSnp.pl (first with -debug to make runDir
$HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -debug
# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
$HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -stop install \
>& do.log &
tail -f do.log
# *** All done ! (through the 'install' step) Elapsed time: 2095m35s
# 9/13/19: Now that checkDbSnp has been added to doBigDbSnp.pl, re-run from the check
# stage onward (but don't cleanup just yet in case we need to debug files).
cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
$HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -buildDir=`pwd` \
-continue check -stop install \
>& check.log &
tail -f check.log
# 9/18/19: re-run from dbSnpJsonToTab onward after adding several more ucscNotes.
cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
$HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir \
-buildDir=`pwd` -continue convert -stop install \
>& redo.log &
tail -f redo.log
# *** All done ! (through the 'install' step) Elapsed time: 263m59s
# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
#*** uh-oh... when checkBigDbSnp failed, doCheck.sh did not fail... I guess backgrounding
#*** the jobs and 'wait' hide errors?
cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
$HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -buildDir=`pwd` \
-continue check -stop install \
>& check.log &
tail -f check.log
# 9/19/19: and again after changing doBigDbSnp.pl to have args & wait on specific pids:
+ # 10/30/19: and again after adding new ucscNotes (#23283).
+ topDir=/hive/data/outside/dbSNP/153
+ freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,PAGE_STUDY,GnomAD,GoESP,Estonian,ALSPAC,TWINSUK,NorthernSweden,Vietnamese
# Run doBigDbSnp.pl (first with -debug to make runDir):
$HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder -debug
-# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-09-19
- cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-09-19
+# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-10-30
+ cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-10-30
# Link to ../bigDbSnp.2019-08-07/split, -continue convert to avoid re-splitting (the slowest part of the process):
rmdir split
ln -s ../bigDbSnp.2019-08-07/split split
$HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \
-buildDir=`pwd` -continue convert -stop install \
>& do.log &
tail -f do.log
-# *** All done ! (through the 'install' step) Elapsed time: 491m30s
-# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-09-19
+# *** All done ! (through the 'install' step) Elapsed time: 472m19s
+# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-10-30
- # 10/8/19: count up how many variants have freq counts for each project
+ # count up how many variants have freq counts for each project
+#TODO
cut -f 4 dbSnp153Details.tab \
| perl -wne 'chomp; next unless $_; @w = split ",";
if ($w[0]) { print "1000Genomes\n" }
if ($w[1]) { print "GnomAD_exomes\n"; }
if ($w[2]) { print "TOPMED\n" }
if ($w[3]) { print "ExAC\n" }
if ($w[4]) { print "PAGE_STUDY\n" }
if ($w[5]) { print "GnomAD\n" }
if ($w[6]) { print "GoESP\n" }
if ($w[7]) { print "Estonian\n" }
if ($w[8]) { print "ALSPAC\n" }
if ($w[9]) { print "TWINSUK\n" }
if ($w[10]) { print "NorthernSweden\n" }
if ($w[11]) { print "Vietnamese\n" }' \
| sort | uniq -c | sort -nr
#437625009 TOPMED
#211192420 GnomAD
#84744375 1000Genomes
#44888383 TWINSUK
#44888383 ALSPAC
#31397940 Estonian
#16351632 NorthernSweden
#12283940 GnomAD_exomes
#10004052 Vietnamese
#8854128 ExAC
#1973841 GoESP
#1323033 PAGE_STUDY
- # 10/11/19: count up how many instances of each type of ucscNote:
+ # count up how many instances of each type of ucscNote:
cut -f 15 hg19.dbSnp153.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
# 10747 altIsAmbiguous
# 5701 classMismatch
# 454656 clinvar
+# 143844 clinvarBenign
+# 7932 clinvarConflicting
+# 96242 clinvarPathogenic
# 113678 clusterError
# 12178426 commonAll
# 20534330 commonSome
# 3522349 diffMajor
# 7649 freqIsAmbiguous
# 25413 freqNotRefAlt
# 561309 multiMap
#106940656 overlapDiffClass
# 16890303 overlapSameClass
+#662571654 rareAll
+#670927558 rareSome
# 101 refIsAmbiguous
# 16032028 refIsMinor
# 142937 refIsRare
# 44382 refIsSingleton
# 4 refMismatch
# 3813390 revStrand
cut -f 15 hg38.dbSnp153.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c
# 10873 altIsAmbiguous
# 5864 classMismatch
# 453954 clinvar
+# 143696 clinvarBenign
+# 7950 clinvarConflicting
+# 95262 clinvarPathogenic
# 126973 clusterError
# 12430253 commonAll
# 20893174 commonSome
# 3573503 diffMajor
# 7749 freqIsAmbiguous
# 39038 freqNotRefAlt
# 132015 multiMap
#109838613 overlapDiffClass
# 17228657 overlapSameClass
+#681626796 rareAll
+#690089717 rareSome
# 111 refIsAmbiguous
# 16277729 refIsMinor
# 166192 refIsRare
# 56491 refIsSingleton
# 33 refMismatch
# 4512600 revStrand
- # 10/18/19: add subset tracks
- # I added new commands to the bigBed and install steps of doBigDbSnp.pl,
- # ran -debug and copy-pasted commands from the generated scripts.
- $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 152 $freqSourceOrder \
- -buildDir=`pwd` -continue=bigBed -stop=install \
- -debug
-
##############################################################################