69087d3a65af31c39337085920e99e5b2db13082 galt Fri Jun 17 15:05:28 2022 -0700 Ran the dbsnp pipeline designed by Angie for dbsnp v155. It produces huge bigBed output and I found and fixed a problem encountered on the bedToBigBed. I also tweaked dbSnpJsonToTab to deal with some dbsnp data having multiple study subversions, by ignoring the old datasets and just using the latest one. Added a track description page that has lots of content and counts to update. dbsnp155 is ready for QA on hgwdev. refs #rm27751 diff --git src/hg/makeDb/doc/bigDbSnp.txt src/hg/makeDb/doc/bigDbSnp.txt index 0563b42..d2ee72e 100644 --- src/hg/makeDb/doc/bigDbSnp.txt +++ src/hg/makeDb/doc/bigDbSnp.txt @@ -1,504 +1,821 @@ # for emacs: -*- mode: sh; -*- # dbSnpNNN ("dbSNP 2.0") bigDbSnp tracks for hg38 / GRCh38 and hg19 / GRCh37 ############################################################################## # dbSnp152: dbSNP build 152 (DONE 9/18/19 angie) topDir=/hive/data/outside/dbSNP/152 mkdir -p $topDir/json cd $topDir/json wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\* md5sum -c CHECKSUMS #refsnp-chr10.json.bz2: OK #... #refsnp-withdrawn.json.bz2: OK # jsonQuery commands to figure out what assemblies, SO terms and frequency sources are in there, # by sampling first 10,000 variants on an arbitrary chrom: set assemblyPath = "primary_snapshot_data.placements_with_allele[*].placement_annot.seq_id_traits_by_assembly[*].assembly_name" set rnaSoPath = "primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].sequence_ontology[*].accession" set proteinSoPath = "primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].protein.sequence_ontology[*].accession" set freqSourcePath = "primary_snapshot_data.allele_annotations[*].frequency[*].study_name" foreach jPath ("$assemblyPath" "$rnaSoPath" "$proteinSoPath" "$freqSourcePath") echo "$jPath" bzcat refsnp-chr3.json.bz2 \ | head -10000 \ | jsonQuery -countUniq -verbose=2 stdin "$jPath" stdout \ | sort -nr echo "" end # Assemblies: # 10229 "GRCh38.p12" # 10111 "GRCh37.p13" # RNA SO terms -- make sure all of these appear in soTerm.[ch]: # 147701 "SO:0001627" # 26809 "SO:0002153" # 19013 "SO:0002152" # 8334 "SO:0001624" # 3082 "SO:0001986" # 2657 "SO:0001580" # 1772 "SO:0001619" # 1741 "SO:0001987" # 833 "SO:0001623" # 30 "SO:0001575" # 10 "SO:0001574" # 4 "SO:0001590" # Protein SO terms -- ditto for soTerm.[ch]: # 784 "SO:0001819" # 713 "SO:0001583" # 6 "SO:0001821" # 5 "SO:0001587" # 2 "SO:0000865" # Made sure all those are in hg/{inc,lib}/soTerm.[ch] # Projects reporting allele counts/frequencies: # 18130 "GnomAD" # 17473 "1000Genomes" # 17376 "TOPMED" # 15952 "TWINSUK" # 15952 "ALSPAC" # 15188 "Estonian" # 555 "GnomAD_exomes" # 473 "GoESP" # 468 "ExAC" freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,GnomAD,GoESP,ALSPAC,TWINSUK,Estonian cd $topDir # Construct a mapping from RefSeq accessions like NC_000... to assembly, 2bit, and UCSC name. hgsql hg38 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source);' \ | tawk '{print $1, "GRCh38.p12", "/hive/data/genomes/hg38/hg38.2bit", $2;}' \ > refSeqToUcsc.tab hgsql hg19 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source);' \ | tawk '{print $1, "GRCh37.p13", "/hive/data/genomes/hg19/hg19.2bit", $2;}' \ >> refSeqToUcsc.tab # Construct a mapping of equivalent RefSeq assembly regions for GRCh38 and GRCh37, # so we can distinguish multiple mappings to PAR/alts/fixes from plain old multiple mappings. refseqAssemblies=/hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions grep -v ^# \ $refseqAssemblies/GCF_000001405.{25_GRCh37.p13,38_GRCh38.p12}/*_assembly_structure/all_alt_scaffold_placement.txt \ | tawk '{print $7, $12-1, $13, $4, $10-1, $11;}' \ | sort -k 1,1 -k2n,2n \ | tawk '{print $1 ":" $2 ":" $3, $4 ":" $5 ":" $6;}' \ > equivRegions.tab # Add PARs: grep -w PAR \ $refseqAssemblies/GCF_000001405.25_GRCh37.p13/*_assembly_regions.txt \ | sort \ | sed -e 's/X/NC_000023.10/; s/Y/NC_000024.9/;' \ | tawk '{print $1, $2 ":" $3 - 1 ":" $4;}' #PAR#1 NC_000023.10:60000:2699520 #PAR#1 NC_000024.9:10000:2649520 #PAR#2 NC_000023.10:154931043:155260560 #PAR#2 NC_000024.9:59034049:59363566 echo -e "NC_000023.10:60000:2699520\tNC_000024.9:10000:2649520" >> equivRegions.tab echo -e "NC_000023.10:154931043:155260560\tNC_000024.9:59034049:59363566" >> equivRegions.tab grep -w PAR \ $refseqAssemblies/GCF_000001405.38_GRCh38.p12/*_assembly_regions.txt \ | sort \ | sed -e 's/X/NC_000023.11/; s/Y/NC_000024.10/;' \ | tawk '{print $1, $2 ":" $3 - 1 ":" $4;}' #PAR#1 NC_000023.11:10000:2781479 #PAR#1 NC_000024.10:10000:2781479 #PAR#2 NC_000023.11:155701382:156030895 #PAR#2 NC_000024.10:56887902:57217415 - echo "NC_000023.11:10000:2781479\tNC_000024.10:10000:2781479" >> equivRegions.tab - echo "NC_000023.11:155701382:156030895\tNC_000024.10:56887902:57217415" >> equivRegions.tab + echo -e "NC_000023.11:10000:2781479\tNC_000024.10:10000:2781479" >> equivRegions.tab + echo -e "NC_000023.11:155701382:156030895\tNC_000024.10:56887902:57217415" >> equivRegions.tab # Run doBigDbSnp.pl... $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -continue convert -stop bigBed \ >& do.log & tail -f do.log # *** All done ! (through the 'install' step) Elapsed time: 278m36s # *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17 # 9/17/19: re-run from dbSnpJsonToTab onward after lots of changes topDir=/hive/data/outside/dbSNP/152 $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -stop install -debug # *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17 cd /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17 # Link to ../split, -continue convert to avoid re-splitting (the slowest part of the process): rm split ln -s ../split split $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 152 $freqSourceOrder \ -buildDir=`pwd` -continue convert -stop install \ >& do.log & tail -f do.log # *** All done ! (through the 'install' step) Elapsed time: 449m6s # *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17 # 10/8/19: count up how many variants have freq counts for each project cut -f 4 dbSnp152Details.tab \ | perl -wne 'chomp; next unless $_; @w = split ","; if ($w[0]) { print "1000Genomes\n" } if ($w[1]) { print "GnomAD_exomes\n"; } if ($w[2]) { print "TOPMED\n" } if ($w[3]) { print "ExAC\n" } if ($w[4]) { print "GnomAD\n" } if ($w[5]) { print "GoESP\n" } if ($w[6]) { print "ALSPAC\n" } if ($w[7]) { print "TWINSUK\n" } if ($w[8]) { print "Estonian\n" }' \ | sort | uniq -c | sort -nr #437624857 TOPMED #234158623 GnomAD #84743526 1000Genomes #44887599 TWINSUK #44887599 ALSPAC #31397792 Estonian #11721224 GnomAD_exomes #8854021 ExAC #1973787 GoESP # 10/11/19: count up how many instances of each type of ucscNote: cut -f 15 hg19.dbSnp152.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c # 10680 altIsAmbiguous # 4808 classMismatch # 409132 clinvar # 106941 clusterError #12757487 commonAll #18901486 commonSome # 823424 diffMajor # 7635 freqIsAmbiguous # 23027 freqNotRefAlt # 555144 multiMap #99618012 overlapDiffClass #14790469 overlapSameClass # 101 refIsAmbiguous # 2933684 refIsMinor # 150892 refIsRare # 45618 refIsSingleton # 4 refMismatch # 3761191 revStrand cut -f 15 hg38.dbSnp152.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c # 10807 altIsAmbiguous # 5103 classMismatch # 408665 clinvar # 94310 clusterError # 13027110 commonAll # 19258751 commonSome # 836327 diffMajor # 7736 freqIsAmbiguous # 36306 freqNotRefAlt # 130175 multiMap #102260850 overlapDiffClass # 15075710 overlapSameClass # 110 refIsAmbiguous # 3033691 refIsMinor # 189809 refIsRare # 63804 refIsSingleton # 33 refMismatch # 4439534 revStrand # 10/18/19: add subset tracks $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 152 $freqSourceOrder \ -buildDir=`pwd` -continue=bigBed -stop=install >& subsets.log & tail -f subsets.log ############################################################################## # dbSnp153: dbSNP build 153 (DONE 11/25/19 angie) topDir=/hive/data/outside/dbSNP/153 mkdir -p $topDir/json cd $topDir/json wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\* md5sum -c CHECKSUMS #refsnp-chr10.json.bz2: OK #... #refsnp-withdrawn.json.bz2: OK # jsonQuery commands to figure out what assemblies, SO terms and frequency sources are in there, # by sampling first 10,000 variants on an arbitrary chrom: assemblyPath="primary_snapshot_data.placements_with_allele[*].placement_annot.seq_id_traits_by_assembly[*].assembly_name" rnaSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].sequence_ontology[*].accession" proteinSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].protein.sequence_ontology[*].accession" freqSourcePath="primary_snapshot_data.allele_annotations[*].frequency[*].study_name" for jPath in "$assemblyPath" "$rnaSoPath" "$proteinSoPath" "$freqSourcePath"; do echo "$jPath" bzcat refsnp-chr3.json.bz2 \ | head -10000 \ | jsonQuery -countUniq -verbose=2 /dev/stdin "$jPath" stdout \ | sort -nr echo "" done # Assemblies: # 10229 "GRCh38.p12" # 10111 "GRCh37.p13" # RNA SO terms -- make sure all of these appear in soTerm.[ch]: # 144288 "SO:0001627" # 25829 "SO:0002153" # 19729 "SO:0002152" # 8506 "SO:0001624" # 3112 "SO:0001986" # 2648 "SO:0001580" # 1769 "SO:0001619" # 1712 "SO:0001987" # 878 "SO:0001623" # 30 "SO:0001575" # 10 "SO:0001574" # 4 "SO:0001590" # Protein SO terms -- ditto for soTerm.[ch]: # 770 "SO:0001819" # 726 "SO:0001583" # 6 "SO:0001821" # 5 "SO:0001587" # 2 "SO:0000865" # Made sure all those are in hg/{inc,lib}/soTerm.[ch] (nothing new since b152) # Projects reporting allele counts/frequencies: # 17493 "1000Genomes" # 17392 "TOPMED" # 16619 "GnomAD" # 16306 "NorthernSweden" # 15970 "TWINSUK" # 15970 "ALSPAC" # 15202 "Estonian" # 13096 "Vietnamese" # 1844 "PAGE_STUDY" # 473 "GoESP" # 468 "ExAC" # 458 "GnomAD_exomes" # This time the JSON downloads include a file frequency_studies.json that describes each study. # Will be useful for making a details page, but some descriptions are just study names. # Get total_count values from refsnp-chr3.json.bz2, put 1000Genomes first, then order by # decreasing total_count: # 1000Genomes: 5008 # GnomAD_exomes: 251006 # TOPMED: 125568 # ExAC: 121234 # PAGE_STUDY: 78694 # GnomAD: 31348 # GoESP: 12494 # Estonian: 4480 # ALSPAC: 3854 # TWINSUK: 3708 # NorthernSweden: 600 # Vietnamese: 212 freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,PAGE_STUDY,GnomAD,GoESP,Estonian,ALSPAC,TWINSUK,NorthernSweden,Vietnamese # Hmm, the PAGE study is a population genetics study. It excludes caucasians: # https://www.biorxiv.org/content/biorxiv/early/2018/10/17/188094.full.pdf # "Genotyped individuals self-identified as Hispanic/Latino (N=22,216), # African American (N=17,299), Asian (N=4,680), Native Hawaiian (N=3,940), # Native American (N=652), or Other (N=1,052, primarily South Asian or mixed heritage, # as well as participants who did not identify with any of the available options." # They didn't attempt to make balanced global (non-caucasian) populations (e.g. almost # as many Native Hawaiians as Asians), so I'll keep 1000Genomes first. # Reuse assembly sequence mapping files from b152 since the assemblies are the same. cd $topDir cp ../152/refSeqToUcsc.tab . cp ../152/equivRegions.tab . # Run doBigDbSnp.pl (first with -debug to make runDir $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -debug # *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07 cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07 $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -stop install \ >& do.log & tail -f do.log # *** All done ! (through the 'install' step) Elapsed time: 2095m35s # 9/13/19: Now that checkDbSnp has been added to doBigDbSnp.pl, re-run from the check # stage onward (but don't cleanup just yet in case we need to debug files). cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07 $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -buildDir=`pwd` \ -continue check -stop install \ >& check.log & tail -f check.log # 9/18/19: re-run from dbSnpJsonToTab onward after adding several more ucscNotes. cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07 $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir \ -buildDir=`pwd` -continue convert -stop install \ >& redo.log & tail -f redo.log # *** All done ! (through the 'install' step) Elapsed time: 263m59s # *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07 #*** uh-oh... when checkBigDbSnp failed, doCheck.sh did not fail... I guess backgrounding #*** the jobs and 'wait' hide errors? cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07 $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -buildDir=`pwd` \ -continue check -stop install \ >& check.log & tail -f check.log # 9/19/19: and again after changing doBigDbSnp.pl to have args & wait on specific pids: # 10/30/19: and again after adding new ucscNotes (#23283). # 11/4/19: and again after finding that refIsMinor & diffMajor could be appended multiple times # 11/7/19: and again after finding that some cases of freqNotRefAlt are VCF normalization probs # 11/8/19: and again after adding badCoords.bed and warnings output files # 11/15/19: and again after adding ucscNotes freqIncomplete, freqNotMapped # 11/24/19: and again: support for alt alleles with different expansion ranges (rs782394990) topDir=/hive/data/outside/dbSNP/153 freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,PAGE_STUDY,GnomAD,GoESP,Estonian,ALSPAC,TWINSUK,NorthernSweden,Vietnamese # Run doBigDbSnp.pl (first with -debug to make runDir): $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder -debug # *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-24 cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-24 # Link to ../bigDbSnp.2019-08-07/split, -continue convert to avoid re-splitting (the slowest part of the process): rmdir split ln -s ../bigDbSnp.2019-08-07/split split $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \ -buildDir=`pwd` -continue convert -stop install \ >& do.log & tail -f do.log # *** All done ! (through the 'install' step) Elapsed time: 579m31s # *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-11-24 wc -l dbSnp153Errors.tab #0 dbSnp153Errors.tab # Yay! Now we don't need to document any dropped variants. :) awk '{print $1, $2, $3;}' dbSnp153Warnings.tab | sort | uniq -c # 5339 Frequency report not # 3852 Incomplete freq data # 65257 Mismatching SPDI del # 197684 Mismatching SPDI pos # Statistics: bigBedInfo -extraIndex hg19.dbSnp153.bb #extraIndexCount: 1 # name (field 3) with 683142960 items #itemCount: 683,142,960 #chromCount: 297 #basesCovered: 823,963,745 #meanDepth (of bases covered): 1.279107 #minDepth: 1.000000 #maxDepth: 321.000000 #std of depth: 1.993272 bigBedInfo -extraIndex hg38.dbSnp153.bb #extraIndexCount: 1 # name (field 3) with 702599342 items #itemCount: 702,599,342 #chromCount: 575 #basesCovered: 845,712,321 #meanDepth (of bases covered): 1.279667 #minDepth: 1.000000 #maxDepth: 321.000000 #std of depth: 1.990520 bigBedInfo -extraIndex hg19.dbSnp153BadCoords.bb # name (field 3) with 121484 items #itemCount: 121,484 #chromCount: 191 #basesCovered: 750,840 #meanDepth (of bases covered): 2.603311 #minDepth: 1.000000 #maxDepth: 150.000000 #std of depth: 4.266904 bigBedInfo -extraIndex hg38.dbSnp153BadCoords.bb #extraIndexCount: 1 # name (field 3) with 149475 items #itemCount: 149,475 #chromCount: 418 #basesCovered: 986,919 #meanDepth (of bases covered): 2.867679 #minDepth: 1.000000 #maxDepth: 119.000000 #std of depth: 4.568874 # count up how many variants (mappings) have freq counts for each project cut -f 4 dbSnp153Details.tab \ | perl -we ' my @freqSourceOrder = ("1000Genomes", "GnomAD_exomes", "TOPMED", "ExAC", "PAGE_STUDY", "GnomAD", "GoESP", "Estonian", "ALSPAC", "TWINSUK", "NorthernSweden", "Vietnamese"); my @counts = (); while (<>) { chomp; next unless $_; @w = split ","; for (my $i = 0; $i < @w; $i++) { if ($w[$i]) { $counts[$i]++; } } } for (my $i = 0; $i < @freqSourceOrder; $i++) { print sprintf("%10d %s", $counts[$i] || 0, $freqSourceOrder[$i]) . "\n"; }' # 437625304 TOPMED # 211192464 GnomAD # 84744519 1000Genomes # 44888412 TWINSUK # 44888412 ALSPAC # 31397949 Estonian # 16351704 NorthernSweden # 12283947 GnomAD_exomes # 10004068 Vietnamese # 8854135 ExAC # 1973844 GoESP # 1323049 PAGE_STUDY # count up how many instances of each type of ucscNote: time cut -f 15 hg19.dbSnp153.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c # 10755 altIsAmbiguous # 5998 classMismatch # 454678 clinvar # 143864 clinvarBenign # 7932 clinvarConflicting # 96242 clinvarPathogenic # 114826 clusterError # 12184521 commonAll # 20541190 commonSome # 1377831 diffMajor # 3922 freqIncomplete # 7656 freqIsAmbiguous # 2685 freqNotMapped # 17694 freqNotRefAlt # 562180 multiMap # 114095 otherMapErr #107015341 overlapDiffClass # 16915239 overlapSameClass #662601770 rareAll #670958439 rareSome # 101 refIsAmbiguous # 3272116 refIsMinor # 136547 refIsRare # 37832 refIsSingleton # 4 refMismatch # 3813702 revStrand #real 34m57.796s #user 47m49.283s #sys 4m29.442s # Check count of rs's with at least one bad mapping: grep otherMapErr hg19.dbSnp153.checked.bigDbSnp | cut -f 4 | sort -u | wc -l #55454 time cut -f 15 hg38.dbSnp153.checked.bigDbSnp | sed -re 's/,/\n/g;' | g . | sort | uniq -c # 10888 altIsAmbiguous # 6216 classMismatch # 453996 clinvar # 143736 clinvarBenign # 7950 clinvarConflicting # 95262 clinvarPathogenic # 128306 clusterError # 12438655 commonAll # 20902944 commonSome # 1399109 diffMajor # 4673 freqIncomplete # 7756 freqIsAmbiguous # 6590 freqNotMapped # 32170 freqNotRefAlt # 132123 multiMap # 204219 otherMapErr #110007682 overlapDiffClass # 17291289 overlapSameClass #681696398 rareAll #690160687 rareSome # 111 refIsAmbiguous # 3360435 refIsMinor # 160827 refIsRare # 50927 refIsSingleton # 33 refMismatch # 4532511 revStrand #real 36m36.972s #user 49m41.817s #sys 4m43.806s # Check count of rs's with at least one bad mapping: grep otherMapErr hg38.dbSnp153.checked.bigDbSnp | cut -f 4 | sort -u | wc -l #86636 +############################################################################## +# dbSnp155: dbSNP build 155 (IN-PROGRESS 11/25/19 galt) + + topDir=/hive/data/outside/dbSNP/155 + mkdir -p $topDir/json + cd $topDir/json + + wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\* + md5sum -c CHECKSUMS +#refsnp-chr10.json.bz2: OK +#... +#refsnp-withdrawn.json.bz2: OK + + + + # jsonQuery commands to figure out what assemblies, SO terms and frequency sources are in there, + # by sampling first 10,000 variants on an arbitrary chrom: + assemblyPath="primary_snapshot_data.placements_with_allele[*].placement_annot.seq_id_traits_by_assembly[*].assembly_name" + rnaSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].sequence_ontology[*].accession" + proteinSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].protein.sequence_ontology[*].accession" + freqSourcePath="primary_snapshot_data.allele_annotations[*].frequency[*].study_name" + for jPath in "$assemblyPath" "$rnaSoPath" "$proteinSoPath" "$freqSourcePath"; do + echo "$jPath" + bzcat refsnp-chr3.json.bz2 \ + | head -10000 \ + | jsonQuery -countUniq -verbose=2 /dev/stdin "$jPath" stdout \ + | sort -nr + echo "" + done + # Assemblies: +# 10232 "GRCh38.p13" +# 10111 "GRCh37.p13" + + # RNA SO terms -- make sure all of these appear in soTerm.[ch]: + +# 144288 "SO:0001627" +# 25829 "SO:0002153" +# 19729 "SO:0002152" +# 8506 "SO:0001624" +# 3112 "SO:0001986" +# 2648 "SO:0001580" +# 1769 "SO:0001619" +# 1712 "SO:0001987" +# 878 "SO:0001623" +# 30 "SO:0001575" +# 10 "SO:0001574" +# 4 "SO:0001590" + + # Made sure all RNA SO terms are found in hg/{inc,lib}/soTerm.[ch] (nothing new since b152) + + + # Protein SO terms -- make sure all of these appear in soTerm.[ch]: + +# 770 "SO:0001819" +# 726 "SO:0001583" +# 6 "SO:0001821" +# 5 "SO:0001587" +# 2 "SO:0000865" + + # Made sure all Protein SO terms are found in hg/{inc,lib}/soTerm.[ch] (nothing new since b152) + + + # Projects reporting allele counts/frequencies: + +# 34713 "TOPMED" +# 22432 "dbGaP_PopFreq" +# 21128 "KOREAN" +# 17553 "SGDP_PRJ" +# 17470 "1000Genomes" +# 17069 "Qatari" +# 16284 "NorthernSweden" +# 16074 "Siberian" +# 15948 "TWINSUK" +# 15948 "TOMMO" +# 15948 "ALSPAC" +# 15408 "GENOME_DK" +# 15356 "GnomAD" +# 15340 "GoNL" +# 15198 "Estonian" +# 13086 "Vietnamese" +# 12000 "Korea1K" +# 11238 "HapMap" +# 3406 "PRJEB36033" +# 2454 "HGDP_Stanford" +# 2018 "Daghestan" +# 1846 "PAGE_STUDY" +# 1204 "Chileans" +# 1134 "MGP" +# 1116 "PRJEB37584" +# 474 "GoESP" +# 468 "ExAC" +# 460 "GnomAD_exomes" +# 318 "FINRISK" +# 46 "PharmGKB" +# 6 "PRJEB37766" + + # This time the JSON downloads include a file frequency_studies.json that describes each study. + # Will be useful for making a details page, but some descriptions are just study names. + + # Got total_count values from refsnp-chr3.json.bz2 above, put ALFA 2.0 (dbGaP_PopFreq) first, then order by + + # decreasing total_count: + +# 22432 "dbGaP_PopFreq" +# 34713 "TOPMED" +# 21128 "KOREAN" +# 17553 "SGDP_PRJ" +# 17470 "1000Genomes" +# 17069 "Qatari" +# 16284 "NorthernSweden" +# 16074 "Siberian" +# 15948 "TWINSUK" +# 15948 "TOMMO" +# 15948 "ALSPAC" +# 15408 "GENOME_DK" +# 15356 "GnomAD" +# 15340 "GoNL" +# 15198 "Estonian" +# 13086 "Vietnamese" +# 12000 "Korea1K" +# 11238 "HapMap" +# 3406 "PRJEB36033" +# 2454 "HGDP_Stanford" +# 2018 "Daghestan" +# 1846 "PAGE_STUDY" +# 1204 "Chileans" +# 1134 "MGP" +# 1116 "PRJEB37584" +# 474 "GoESP" +# 468 "ExAC" +# 460 "GnomAD_exomes" +# 318 "FINRISK" +# 46 "PharmGKB" +# 6 "PRJEB37766" + + + +#NOTE freqSourceOrder with ALFA2 aka dbGaP_PopFreq as the primary study. This was later abandoned since the common set was much smaller, went back to putting 1000Genomes first later on. + freqSourceOrder=dbGaP_PopFreq,TOPMED,KOREAN,SGDP_PRJ,1000Genomes,Qatari,NorthernSweden,Siberian,TWINSUK,TOMMO,ALSPAC,GENOME_DK,GnomAD,GoNL,Estonian,Vietnamese,Korea1K,HapMap,PRJEB36033,HGDP_Stanford,Daghestan,PAGE_STUDY,Chileans,MGP,PRJEB37584,GoESP,ExAC,GnomAD_exomes,FINRISK,PharmGKB,PRJEB37766 + + # Hmm, the PAGE study is a population genetics study. It excludes caucasians: + # https://www.biorxiv.org/content/biorxiv/early/2018/10/17/188094.full.pdf + # "Genotyped individuals self-identified as Hispanic/Latino (N=22,216), + # African American (N=17,299), Asian (N=4,680), Native Hawaiian (N=3,940), + # Native American (N=652), or Other (N=1,052, primarily South Asian or mixed heritage, + # as well as participants who did not identify with any of the available options." + # They didn't attempt to make balanced global (non-caucasian) populations (e.g. almost + # as many Native Hawaiians as Asians), so I'll keep 1000Genomes first. + + cd $topDir + + # dbSnp has changed since v152, it now uses patch13 too, so "GRCh38.p13" instead of "GRCh38.p12" + + # Construct a mapping from RefSeq accessions like NC_000... to assembly, 2bit, and UCSC name. + hgsql hg38 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source);' \ + | tawk '{print $1, "GRCh38.p13", "/hive/data/genomes/hg38/hg38.2bit", $2;}' \ + > refSeqToUcsc.tab + hgsql hg19 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source);' \ + | tawk '{print $1, "GRCh37.p13", "/hive/data/genomes/hg19/hg19.2bit", $2;}' \ + >> refSeqToUcsc.tab + + # has 45 more alts and fixes for hg38 just as expected for patch13. + + + #make equivRegions.tab + + # hg38 patch13 RefSeq assembly accession: GCF_000001405.39 (latest) + + # Construct a mapping of equivalent RefSeq assembly regions for GRCh38 and GRCh37, + # so we can distinguish multiple mappings to PAR/alts/fixes from plain old multiple mappings. + refseqAssemblies=/hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions + grep -v ^# \ + $refseqAssemblies/GCF_000001405.{25_GRCh37.p13,39_GRCh38.p13}/*_assembly_structure/all_alt_scaffold_placement.txt \ + | tawk '{print $7, $12-1, $13, $4, $10-1, $11;}' \ + | sort -k 1,1 -k2n,2n \ + | tawk '{print $1 ":" $2 ":" $3, $4 ":" $5 ":" $6;}' \ + > equivRegions.tab + + # Add PARs: + grep -w PAR \ + $refseqAssemblies/GCF_000001405.25_GRCh37.p13/*_assembly_regions.txt \ + | sort \ + | sed -e 's/X/NC_000023.10/; s/Y/NC_000024.9/;' \ + | tawk '{print $1, $2 ":" $3 - 1 ":" $4;}' + +#PAR#1 NC_000023.10:60000:2699520 +#PAR#1 NC_000024.9:10000:2649520 +#PAR#2 NC_000023.10:154931043:155260560 +#PAR#2 NC_000024.9:59034049:59363566 + echo -e "NC_000023.10:60000:2699520\tNC_000024.9:10000:2649520" >> equivRegions.tab + echo -e "NC_000023.10:154931043:155260560\tNC_000024.9:59034049:59363566" >> equivRegions.tab + + grep -w PAR \ + $refseqAssemblies/GCF_000001405.39_GRCh38.p13/*_assembly_regions.txt \ + | sort \ + | sed -e 's/X/NC_000023.11/; s/Y/NC_000024.10/;' \ + | tawk '{print $1, $2 ":" $3 - 1 ":" $4;}' + +#PAR#1 NC_000023.11:10000:2781479 +#PAR#1 NC_000024.10:10000:2781479 +#PAR#2 NC_000023.11:155701382:156030895 +#PAR#2 NC_000024.10:56887902:57217415 + echo -e "NC_000023.11:10000:2781479\tNC_000024.10:10000:2781479" >> equivRegions.tab + echo -e "NC_000023.11:155701382:156030895\tNC_000024.10:56887902:57217415" >> equivRegions.tab + + # Patch doBigDbSnp.pl to update the default hg38 assembly to p13. + vi $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl + # around line 47, change to GRCh38.p13 and save and commit and push this new default. + -my $assemblyList = 'GRCh37.p13,GRCh38.p12'; + +my $assemblyList = 'GRCh37.p13,GRCh38.p13'; + + + # Run doBigDbSnp.pl (first with -debug to make runDir + # FYI the system is currently using ku cluster to run parasol jobs. + + $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder -debug +# *** Steps were performed in /hive/data/outside/dbSNP/155/bigDbSnp.2021-09-30 + cd /hive/data/outside/dbSNP/155/bigDbSnp.2021-09-30 + $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder -stop install \ + >& do.log & + tail -f do.log + + +convert step choked because the step above for create equivRegions.tab lacked -e on echo statement +which meant that the "\t" was NOT converted to an actual tab. Must re-run. + +Re-ran from start but died on step fixHg19ChrM which was looking for NC_012920 but that has been renamed to just chrMT in the dbsnp data for hg19. +fixed doFixHg19ChrM.sh by tweaking the doBigDbSnp.pl that creates it. + +Even though perhaps $freqSourceOrder is only needed for the convert step, +the script requires it. + +Lame that after starting up screen, it lost my local variables, +so I had to redefine topDir and freqSourceOrder after launching screen program for running background jobs. + +# resuming + cd /hive/data/outside/dbSNP/155/bigDbSnp.2021-09-30 + $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder\ + -buildDir=`pwd` -continue check -stop install \ + >& check.log & + tail -f check.log + + *** All done ! (through the 'install' step) Elapsed time: 408m55s + *** Steps were performed in /hive/data/outside/dbSNP/155/bigDbSnp.2021-09-30 + +# GALT 2022-05-14 + + # FYI + # split step runs on hgwdev + # convert step runs on ku which has 32 machines that have 32 CPUs each. + # fixed freqSoureOrder changed, back to using 1000Genomes as the primary study, instead of ALFA2 dbGaP_PopFreq + + topDir=/hive/data/outside/dbSNP/155 + # after trying ALFA2 as primary study, it did not have enought SNPs, so I am going back to using 1000Genomes as the primary study. + freqSourceOrder=1000Genomes,dbGaP_PopFreq,TOPMED,KOREAN,SGDP_PRJ,Qatari,NorthernSweden,Siberian,TWINSUK,TOMMO,ALSPAC,GENOME_DK,GnomAD,GoNL,Estonian,Vietnamese,Korea1K,HapMap,PRJEB36033,HGDP_Stanford,Daghestan,PAGE_STUDY,Chileans,MGP,PRJEB37584,GoESP,ExAC,GnomAD_exomes,FINRISK,PharmGKB,PRJEB37766 + # Run doBigDbSnp.pl (first with -debug to make runDir): + $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder -debug + +# *** Steps were performed in /hive/data/outside/dbSNP/155/bigDbSnp.2022-05-14 + + + cd /hive/data/outside/dbSNP/155/bigDbSnp.2022-05-14 + # Link to ../bigDbSnp.2021-09-30/split, -continue convert to avoid re-splitting (the slowest part of the process): + rmdir split + ln -s ../bigDbSnp.2021-09-30/split split + rmdir splitProcessed + ln -s ../bigDbSnp.2021-09-30/splitProcessed splitProcessed + $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder \ + -buildDir=`pwd` -continue convert -stop install \ + >& do.log & + tail -f do.log + # crashed on the mergeToChrom step + + # NOTICE that on mergeToChrom it uses /dev/shm/ virtual memory disk for temp files, + # and it can run out of space, and if you have to stop and restart it, + # be sure to clean it out properly like this: +# para stop # seems to flush results too. +# para freeBatch # should also fix any sick batches and machines. +# +# TO CLEAN: +# rm -fr /dev/shm/dbSnpMergeSortBed.*/ +# rm -fr /dev/shm/dbSnpMergeSort.*/ +# +# Note that it also overflowed the relatively small space in /scratch/tmp/ which +# is used by sort in the sort command. So I had to drastically reduce the maximum +# number of simultaneous jobs, all the way down to just 6 jobs! +# added -maxJob=6 to the para make jobList command in doMergeToChrom.sh +/parasol/bin/para make -maxJob=6 jobList +# and cleaning out temp files in /dev/shm/ and then para stop, para freeBatch, +# and finally I re-ran doMergeToChrom.sh manually. + + $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder \ + -buildDir=`pwd` -continue mergeChroms -stop install \ + >& do2.log & + tail -f do2.log + +# step bigBed choked on hg19 because of a subtle bug in resSizes counting which is now fixed. + $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder \ + -buildDir=`pwd` -continue bigBed -stop install \ + >& do3.log & + tail -f do3.log + + *** All done ! (through the 'install' step) Elapsed time: 326m34s + *** Steps were performed in /hive/data/outside/dbSNP/155/bigDbSnp.2022-05-14 + + wc -l dbSnp155Errors.tab + +#13318 dbSnp155Errors.tab +# nearly all of these "Errors" are because a small study called 'ChromosomeY' was not picked up +# in our scan of the first 10000 lines of chrom 3 that we used above. However, this study is not big enough to be important. +1011954 NOSEQ freqSourceOrder does not contains 'ChromosomeY' + +I copied track description page fron 153 and updated the content +src/hg/makeDb/trackDb/human/dbSnp155Composite.html + +Informally, I made a couple of scripts in here to help update the studies list and the keywords counts sections of the description page. +/hive/data/outside/dbSNP/155/json/ ##############################################################################