54452ec022a6073410955c04e110a1784f71fb57 angie Wed Nov 13 17:37:34 2019 -0800 dbSnp153: add new ucscNote otherMapErr for mappings with the same rs# as a mapping w/inconsistent SPDI in BadCoords/Map Err subtrack. refs #23283 diff --git src/hg/makeDb/doc/bigDbSnp.txt src/hg/makeDb/doc/bigDbSnp.txt index 47fa08e..86d46a3 100644 --- src/hg/makeDb/doc/bigDbSnp.txt +++ src/hg/makeDb/doc/bigDbSnp.txt @@ -1,433 +1,448 @@ # for emacs: -*- mode: sh; -*- # dbSnpNNN ("dbSNP 2.0") bigDbSnp tracks for hg38 / GRCh38 and hg19 / GRCh37 ############################################################################## # dbSnp152: dbSNP build 152 (DONE 9/18/19 angie) topDir=/hive/data/outside/dbSNP/152 mkdir -p $topDir/json cd $topDir/json wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\* md5sum -c CHECKSUMS #refsnp-chr10.json.bz2: OK #... #refsnp-withdrawn.json.bz2: OK # jsonQuery commands to figure out what assemblies, SO terms and frequency sources are in there, # by sampling first 10,000 variants on an arbitrary chrom: set assemblyPath = "primary_snapshot_data.placements_with_allele[*].placement_annot.seq_id_traits_by_assembly[*].assembly_name" set rnaSoPath = "primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].sequence_ontology[*].accession" set proteinSoPath = "primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].protein.sequence_ontology[*].accession" set freqSourcePath = "primary_snapshot_data.allele_annotations[*].frequency[*].study_name" foreach jPath ("$assemblyPath" "$rnaSoPath" "$proteinSoPath" "$freqSourcePath") echo "$jPath" bzcat refsnp-chr3.json.bz2 \ | head -10000 \ | jsonQuery -countUniq -verbose=2 stdin "$jPath" stdout \ | sort -nr echo "" end # Assemblies: # 10229 "GRCh38.p12" # 10111 "GRCh37.p13" # RNA SO terms -- make sure all of these appear in soTerm.[ch]: # 147701 "SO:0001627" # 26809 "SO:0002153" # 19013 "SO:0002152" # 8334 "SO:0001624" # 3082 "SO:0001986" # 2657 "SO:0001580" # 1772 "SO:0001619" # 1741 "SO:0001987" # 833 "SO:0001623" # 30 "SO:0001575" # 10 "SO:0001574" # 4 "SO:0001590" # Protein SO terms -- ditto for soTerm.[ch]: # 784 "SO:0001819" # 713 "SO:0001583" # 6 "SO:0001821" # 5 "SO:0001587" # 2 "SO:0000865" # Made sure all those are in hg/{inc,lib}/soTerm.[ch] # Projects reporting allele counts/frequencies: # 18130 "GnomAD" # 17473 "1000Genomes" # 17376 "TOPMED" # 15952 "TWINSUK" # 15952 "ALSPAC" # 15188 "Estonian" # 555 "GnomAD_exomes" # 473 "GoESP" # 468 "ExAC" freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,GnomAD,GoESP,ALSPAC,TWINSUK,Estonian cd $topDir # Construct a mapping from RefSeq accessions like NC_000... to assembly, 2bit, and UCSC name. hgsql hg38 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source);' \ | tawk '{print $1, "GRCh38.p12", "/hive/data/genomes/hg38/hg38.2bit", $2;}' \ > refSeqToUcsc.tab hgsql hg19 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source);' \ | tawk '{print $1, "GRCh37.p13", "/hive/data/genomes/hg19/hg19.2bit", $2;}' \ >> refSeqToUcsc.tab # Construct a mapping of equivalent RefSeq assembly regions for GRCh38 and GRCh37, # so we can distinguish multiple mappings to PAR/alts/fixes from plain old multiple mappings. refseqAssemblies=/hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions grep -v ^# \ $refseqAssemblies/GCF_000001405.{25_GRCh37.p13,38_GRCh38.p12}/*_assembly_structure/all_alt_scaffold_placement.txt \ | tawk '{print $7, $12-1, $13, $4, $10-1, $11;}' \ | sort -k 1,1 -k2n,2n \ | tawk '{print $1 ":" $2 ":" $3, $4 ":" $5 ":" $6;}' \ > equivRegions.tab # Add PARs: grep -w PAR \ $refseqAssemblies/GCF_000001405.25_GRCh37.p13/*_assembly_regions.txt \ | sort \ | sed -e 's/X/NC_000023.10/; s/Y/NC_000024.9/;' \ | tawk '{print $1, $2 ":" $3 - 1 ":" $4;}' #PAR#1 NC_000023.10:60000:2699520 #PAR#1 NC_000024.9:10000:2649520 #PAR#2 NC_000023.10:154931043:155260560 #PAR#2 NC_000024.9:59034049:59363566 echo -e "NC_000023.10:60000:2699520\tNC_000024.9:10000:2649520" >> equivRegions.tab echo -e "NC_000023.10:154931043:155260560\tNC_000024.9:59034049:59363566" >> equivRegions.tab grep -w PAR \ $refseqAssemblies/GCF_000001405.38_GRCh38.p12/*_assembly_regions.txt \ | sort \ | sed -e 's/X/NC_000023.11/; s/Y/NC_000024.10/;' \ | tawk '{print $1, $2 ":" $3 - 1 ":" $4;}' #PAR#1 NC_000023.11:10000:2781479 #PAR#1 NC_000024.10:10000:2781479 #PAR#2 NC_000023.11:155701382:156030895 #PAR#2 NC_000024.10:56887902:57217415 echo "NC_000023.11:10000:2781479\tNC_000024.10:10000:2781479" >> equivRegions.tab echo "NC_000023.11:155701382:156030895\tNC_000024.10:56887902:57217415" >> equivRegions.tab # Run doBigDbSnp.pl... $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -continue convert -stop bigBed \ >& do.log & tail -f do.log # *** All done ! (through the 'install' step) Elapsed time: 278m36s # *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17 # 9/17/19: re-run from dbSnpJsonToTab onward after lots of changes topDir=/hive/data/outside/dbSNP/152 $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -stop install -debug # *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17 cd /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17 # Link to ../split, -continue convert to avoid re-splitting (the slowest part of the process): rm split ln -s ../split split $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 152 $freqSourceOrder \ -buildDir=`pwd` -continue convert -stop install \ >& do.log & tail -f do.log # *** All done ! (through the 'install' step) Elapsed time: 449m6s # *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17 # 10/8/19: count up how many variants have freq counts for each project cut -f 4 dbSnp152Details.tab \ | perl -wne 'chomp; next unless $_; @w = split ","; if ($w[0]) { print "1000Genomes\n" } if ($w[1]) { print "GnomAD_exomes\n"; } if ($w[2]) { print "TOPMED\n" } if ($w[3]) { print "ExAC\n" } if ($w[4]) { print "GnomAD\n" } if ($w[5]) { print "GoESP\n" } if ($w[6]) { print "ALSPAC\n" } if ($w[7]) { print "TWINSUK\n" } if ($w[8]) { print "Estonian\n" }' \ | sort | uniq -c | sort -nr #437624857 TOPMED #234158623 GnomAD #84743526 1000Genomes #44887599 TWINSUK #44887599 ALSPAC #31397792 Estonian #11721224 GnomAD_exomes #8854021 ExAC #1973787 GoESP # 10/11/19: count up how many instances of each type of ucscNote: cut -f 15 hg19.dbSnp152.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c # 10680 altIsAmbiguous # 4808 classMismatch # 409132 clinvar # 106941 clusterError #12757487 commonAll #18901486 commonSome # 823424 diffMajor # 7635 freqIsAmbiguous # 23027 freqNotRefAlt # 555144 multiMap #99618012 overlapDiffClass #14790469 overlapSameClass # 101 refIsAmbiguous # 2933684 refIsMinor # 150892 refIsRare # 45618 refIsSingleton # 4 refMismatch # 3761191 revStrand cut -f 15 hg38.dbSnp152.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c # 10807 altIsAmbiguous # 5103 classMismatch # 408665 clinvar # 94310 clusterError # 13027110 commonAll # 19258751 commonSome # 836327 diffMajor # 7736 freqIsAmbiguous # 36306 freqNotRefAlt # 130175 multiMap #102260850 overlapDiffClass # 15075710 overlapSameClass # 110 refIsAmbiguous # 3033691 refIsMinor # 189809 refIsRare # 63804 refIsSingleton # 33 refMismatch # 4439534 revStrand # 10/18/19: add subset tracks $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 152 $freqSourceOrder \ -buildDir=`pwd` -continue=bigBed -stop=install >& subsets.log & tail -f subsets.log ############################################################################## # dbSnp153: dbSNP build 153 (DONE 11/8/19 angie) topDir=/hive/data/outside/dbSNP/153 mkdir -p $topDir/json cd $topDir/json wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\* md5sum -c CHECKSUMS #refsnp-chr10.json.bz2: OK #... #refsnp-withdrawn.json.bz2: OK # jsonQuery commands to figure out what assemblies, SO terms and frequency sources are in there, # by sampling first 10,000 variants on an arbitrary chrom: assemblyPath="primary_snapshot_data.placements_with_allele[*].placement_annot.seq_id_traits_by_assembly[*].assembly_name" rnaSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].sequence_ontology[*].accession" proteinSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].protein.sequence_ontology[*].accession" freqSourcePath="primary_snapshot_data.allele_annotations[*].frequency[*].study_name" for jPath in "$assemblyPath" "$rnaSoPath" "$proteinSoPath" "$freqSourcePath"; do echo "$jPath" bzcat refsnp-chr3.json.bz2 \ | head -10000 \ | jsonQuery -countUniq -verbose=2 /dev/stdin "$jPath" stdout \ | sort -nr echo "" done # Assemblies: # 10229 "GRCh38.p12" # 10111 "GRCh37.p13" # RNA SO terms -- make sure all of these appear in soTerm.[ch]: # 144288 "SO:0001627" # 25829 "SO:0002153" # 19729 "SO:0002152" # 8506 "SO:0001624" # 3112 "SO:0001986" # 2648 "SO:0001580" # 1769 "SO:0001619" # 1712 "SO:0001987" # 878 "SO:0001623" # 30 "SO:0001575" # 10 "SO:0001574" # 4 "SO:0001590" # Protein SO terms -- ditto for soTerm.[ch]: # 770 "SO:0001819" # 726 "SO:0001583" # 6 "SO:0001821" # 5 "SO:0001587" # 2 "SO:0000865" # Made sure all those are in hg/{inc,lib}/soTerm.[ch] (nothing new since b152) # Projects reporting allele counts/frequencies: # 17493 "1000Genomes" # 17392 "TOPMED" # 16619 "GnomAD" # 16306 "NorthernSweden" # 15970 "TWINSUK" # 15970 "ALSPAC" # 15202 "Estonian" # 13096 "Vietnamese" # 1844 "PAGE_STUDY" # 473 "GoESP" # 468 "ExAC" # 458 "GnomAD_exomes" # This time the JSON downloads include a file frequency_studies.json that describes each study. # Will be useful for making a details page, but some descriptions are just study names. # Get total_count values from refsnp-chr3.json.bz2, put 1000Genomes first, then order by # decreasing total_count: # 1000Genomes: 5008 # GnomAD_exomes: 251006 # TOPMED: 125568 # ExAC: 121234 # PAGE_STUDY: 78694 # GnomAD: 31348 # GoESP: 12494 # Estonian: 4480 # ALSPAC: 3854 # TWINSUK: 3708 # NorthernSweden: 600 # Vietnamese: 212 freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,PAGE_STUDY,GnomAD,GoESP,Estonian,ALSPAC,TWINSUK,NorthernSweden,Vietnamese # Hmm, the PAGE study is a population genetics study. It excludes caucasians: # https://www.biorxiv.org/content/biorxiv/early/2018/10/17/188094.full.pdf # "Genotyped individuals self-identified as Hispanic/Latino (N=22,216), # African American (N=17,299), Asian (N=4,680), Native Hawaiian (N=3,940), # Native American (N=652), or Other (N=1,052, primarily South Asian or mixed heritage, # as well as participants who did not identify with any of the available options." # They didn't attempt to make balanced global (non-caucasian) populations (e.g. almost # as many Native Hawaiians as Asians), so I'll keep 1000Genomes first. # Reuse assembly sequence mapping files from b152 since the assemblies are the same. cd $topDir cp ../152/refSeqToUcsc.tab . cp ../152/equivRegions.tab . # Run doBigDbSnp.pl (first with -debug to make runDir $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -debug # *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07 cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07 $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -stop install \ >& do.log & tail -f do.log # *** All done ! (through the 'install' step) Elapsed time: 2095m35s # 9/13/19: Now that checkDbSnp has been added to doBigDbSnp.pl, re-run from the check # stage onward (but don't cleanup just yet in case we need to debug files). cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07 $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -buildDir=`pwd` \ -continue check -stop install \ >& check.log & tail -f check.log # 9/18/19: re-run from dbSnpJsonToTab onward after adding several more ucscNotes. cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07 $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir \ -buildDir=`pwd` -continue convert -stop install \ >& redo.log & tail -f redo.log # *** All done ! (through the 'install' step) Elapsed time: 263m59s # *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07 #*** uh-oh... when checkBigDbSnp failed, doCheck.sh did not fail... I guess backgrounding #*** the jobs and 'wait' hide errors? cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07 $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -buildDir=`pwd` \ -continue check -stop install \ >& check.log & tail -f check.log # 9/19/19: and again after changing doBigDbSnp.pl to have args & wait on specific pids: # 10/30/19: and again after adding new ucscNotes (#23283). # 11/4/19: and again after finding that refIsMinor & diffMajor could be appended multiple times # 11/7/19: and again after finding that some cases of freqNotRefAlt are VCF normalization probs # 11/8/19: and again after adding badCoords.bed and warnings output files topDir=/hive/data/outside/dbSNP/153 freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,PAGE_STUDY,GnomAD,GoESP,Estonian,ALSPAC,TWINSUK,NorthernSweden,Vietnamese # Run doBigDbSnp.pl (first with -debug to make runDir): $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder -debug # *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-08 cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-08 # Link to ../bigDbSnp.2019-08-07/split, -continue convert to avoid re-splitting (the slowest part of the process): rmdir split ln -s ../bigDbSnp.2019-08-07/split split $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \ -buildDir=`pwd` -continue convert -stop install \ >& do.log & tail -f do.log # *** All done ! (through the 'install' step) Elapsed time: 504m48s # *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-08 + # 11/12/19: Add new ucscNote otherMapErr by re-running from check onward. + $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \ + -buildDir=`pwd` -continue check -stop install \ + >& check.log & + tail -f check.log +# *** All done ! (through the 'install' step) Elapsed time: 172m35s + # count up how many variants have freq counts for each project cut -f 4 dbSnp153Details.tab \ | perl -wne 'chomp; next unless $_; @w = split ","; if ($w[0]) { print "1000Genomes\n" } if ($w[1]) { print "GnomAD_exomes\n"; } if ($w[2]) { print "TOPMED\n" } if ($w[3]) { print "ExAC\n" } if ($w[4]) { print "PAGE_STUDY\n" } if ($w[5]) { print "GnomAD\n" } if ($w[6]) { print "GoESP\n" } if ($w[7]) { print "Estonian\n" } if ($w[8]) { print "ALSPAC\n" } if ($w[9]) { print "TWINSUK\n" } if ($w[10]) { print "NorthernSweden\n" } if ($w[11]) { print "Vietnamese\n" }' \ | sort | uniq -c | sort -nr #437625009 TOPMED #211192420 GnomAD #84744375 1000Genomes #44888383 TWINSUK #44888383 ALSPAC #31397940 Estonian #16351632 NorthernSweden #12283940 GnomAD_exomes #10004052 Vietnamese #8854128 ExAC #1973841 GoESP #1323033 PAGE_STUDY # count up how many instances of each type of ucscNote: - cut -f 15 hg19.dbSnp153.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c + time cut -f 15 hg19.dbSnp153.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c # 10754 altIsAmbiguous # 5995 classMismatch # 454674 clinvar # 143860 clinvarBenign # 7932 clinvarConflicting # 96242 clinvarPathogenic # 114685 clusterError # 12184226 commonAll # 20540882 commonSome # 1377817 diffMajor # 7656 freqIsAmbiguous # 17684 freqNotRefAlt # 562157 multiMap +# 113416 otherMapErr #107003090 overlapDiffClass # 16910407 overlapSameClass #662595470 rareAll #670952126 rareSome # 101 refIsAmbiguous # 3271878 refIsMinor # 136452 refIsRare # 37783 refIsSingleton # 4 refMismatch # 3813467 revStrand + # Check count of rs's with at least one bad mapping: + grep otherMapErr hg19.dbSnp153.checked.bigDbSnp | cut -f 4 | sort -u | wc -l +#54871 - cut -f 15 hg38.dbSnp153.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c + time cut -f 15 hg38.dbSnp153.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c # 10880 altIsAmbiguous # 6206 classMismatch # 453990 clinvar # 143730 clinvarBenign # 7950 clinvarConflicting # 95262 clinvarPathogenic # 128109 clusterError #12438325 commonAll #20902602 commonSome #1399094 diffMajor # 7756 freqIsAmbiguous # 32150 freqNotRefAlt # 132051 multiMap +# 203580 otherMapErr #109991096 overlapDiffClass #17281744 overlapSameClass #681685476 rareAll #690149753 rareSome # 111 refIsAmbiguous #3360159 refIsMinor # 160723 refIsRare # 50865 refIsSingleton # 33 refMismatch #4532270 revStrand + # Check count of rs's with at least one bad mapping: + grep otherMapErr hg38.dbSnp153.checked.bigDbSnp | cut -f 4 | sort -u | wc -l +#86258 ##############################################################################