src/hg/makeDb/doc/bigDbSnp.txt 69087d3a65af31c39337085920e99e5b2db13082

69087d3a65af31c39337085920e99e5b2db13082
galt
  Fri Jun 17 15:05:28 2022 -0700
Ran the dbsnp pipeline designed by Angie for dbsnp v155. It produces huge bigBed output and I found and fixed a problem encountered on the bedToBigBed. I also tweaked dbSnpJsonToTab to deal with some dbsnp data having multiple study subversions, by ignoring the old datasets and just using the latest one. Added a track description page that has lots of content and counts to update. dbsnp155 is ready for QA on hgwdev. refs #rm27751

diff --git src/hg/makeDb/doc/bigDbSnp.txt src/hg/makeDb/doc/bigDbSnp.txt
index 0563b42..d2ee72e 100644
--- src/hg/makeDb/doc/bigDbSnp.txt
+++ src/hg/makeDb/doc/bigDbSnp.txt
@@ -91,32 +91,32 @@
 #PAR#1   NC_000023.10:60000:2699520
 #PAR#1   NC_000024.9:10000:2649520
 #PAR#2   NC_000023.10:154931043:155260560
 #PAR#2   NC_000024.9:59034049:59363566
     echo -e "NC_000023.10:60000:2699520\tNC_000024.9:10000:2649520" >> equivRegions.tab
     echo -e "NC_000023.10:154931043:155260560\tNC_000024.9:59034049:59363566" >> equivRegions.tab
     grep -w PAR \
       $refseqAssemblies/GCF_000001405.38_GRCh38.p12/*_assembly_regions.txt \
     | sort \
     | sed -e 's/X/NC_000023.11/; s/Y/NC_000024.10/;' \
     | tawk '{print $1, $2 ":" $3 - 1 ":" $4;}'
 #PAR#1   NC_000023.11:10000:2781479
 #PAR#1   NC_000024.10:10000:2781479
 #PAR#2   NC_000023.11:155701382:156030895
 #PAR#2   NC_000024.10:56887902:57217415
-    echo "NC_000023.11:10000:2781479\tNC_000024.10:10000:2781479" >> equivRegions.tab
-    echo "NC_000023.11:155701382:156030895\tNC_000024.10:56887902:57217415" >> equivRegions.tab
+    echo -e "NC_000023.11:10000:2781479\tNC_000024.10:10000:2781479" >> equivRegions.tab
+    echo -e "NC_000023.11:155701382:156030895\tNC_000024.10:56887902:57217415" >> equivRegions.tab
 
 
     # Run doBigDbSnp.pl...
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -continue convert -stop bigBed \
       >& do.log &
     tail -f do.log
 # *** All done !  (through the 'install' step)  Elapsed time: 278m36s
 # *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
 
     # 9/17/19: re-run from dbSnpJsonToTab onward after lots of changes
     topDir=/hive/data/outside/dbSNP/152
     $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -stop install -debug
 # *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
     cd /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
     # Link to ../split, -continue convert to avoid re-splitting (the slowest part of the process):
@@ -488,17 +488,334 @@
 #690160687 rareSome
 #      111 refIsAmbiguous
 #  3360435 refIsMinor
 #   160827 refIsRare
 #    50927 refIsSingleton
 #       33 refMismatch
 #  4532511 revStrand
 #real    36m36.972s
 #user    49m41.817s
 #sys     4m43.806s
 
     # Check count of rs's with at least one bad mapping:
     grep otherMapErr hg38.dbSnp153.checked.bigDbSnp | cut -f 4 | sort -u | wc -l
 #86636
 
+##############################################################################
+# dbSnp155: dbSNP build 155 (IN-PROGRESS 11/25/19 galt)
+
+    topDir=/hive/data/outside/dbSNP/155
+    mkdir -p $topDir/json
+    cd $topDir/json
+
+    wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\*
+    md5sum -c CHECKSUMS
+#refsnp-chr10.json.bz2: OK
+#...
+#refsnp-withdrawn.json.bz2: OK
+
+
+
+    # jsonQuery commands to figure out what assemblies, SO terms and frequency sources are in there,
+    # by sampling first 10,000 variants on an arbitrary chrom:
+    assemblyPath="primary_snapshot_data.placements_with_allele[*].placement_annot.seq_id_traits_by_assembly[*].assembly_name"
+    rnaSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].sequence_ontology[*].accession"
+    proteinSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].protein.sequence_ontology[*].accession"
+    freqSourcePath="primary_snapshot_data.allele_annotations[*].frequency[*].study_name"
+    for jPath in "$assemblyPath" "$rnaSoPath" "$proteinSoPath" "$freqSourcePath"; do
+      echo "$jPath"
+      bzcat refsnp-chr3.json.bz2 \
+      | head -10000 \
+      | jsonQuery -countUniq -verbose=2 /dev/stdin "$jPath" stdout \
+      | sort -nr
+      echo ""
+    done
+    # Assemblies:
+#     10232 "GRCh38.p13"
+#     10111 "GRCh37.p13"
+
+    # RNA SO terms -- make sure all of these appear in soTerm.[ch]:
+
+#    144288 "SO:0001627"
+#     25829 "SO:0002153"
+#     19729 "SO:0002152"
+#      8506 "SO:0001624" 
+#      3112 "SO:0001986"
+#      2648 "SO:0001580"
+#      1769 "SO:0001619"
+#      1712 "SO:0001987"
+#       878 "SO:0001623"
+#        30 "SO:0001575"
+#        10 "SO:0001574"
+#         4 "SO:0001590"
+
+    # Made sure all RNA SO terms are found in hg/{inc,lib}/soTerm.[ch] (nothing new since b152)
+
+
+    # Protein SO terms -- make sure all of these appear in soTerm.[ch]:
+
+#       770 "SO:0001819"
+#       726 "SO:0001583"
+#         6 "SO:0001821"
+#         5 "SO:0001587"
+#         2 "SO:0000865"
+
+    # Made sure all Protein SO terms are found in hg/{inc,lib}/soTerm.[ch] (nothing new since b152)
+
+
+    # Projects reporting allele counts/frequencies:
+
+#     34713 "TOPMED"
+#     22432 "dbGaP_PopFreq"
+#     21128 "KOREAN"
+#     17553 "SGDP_PRJ"
+#     17470 "1000Genomes"
+#     17069 "Qatari"
+#     16284 "NorthernSweden"
+#     16074 "Siberian"
+#     15948 "TWINSUK"
+#     15948 "TOMMO"
+#     15948 "ALSPAC"
+#     15408 "GENOME_DK"
+#     15356 "GnomAD"
+#     15340 "GoNL"
+#     15198 "Estonian"
+#     13086 "Vietnamese"
+#     12000 "Korea1K"
+#     11238 "HapMap"
+#      3406 "PRJEB36033"
+#      2454 "HGDP_Stanford"
+#      2018 "Daghestan"
+#      1846 "PAGE_STUDY"
+#      1204 "Chileans"
+#      1134 "MGP"
+#      1116 "PRJEB37584"
+#       474 "GoESP"
+#       468 "ExAC"
+#       460 "GnomAD_exomes"
+#       318 "FINRISK"
+#        46 "PharmGKB"
+#         6 "PRJEB37766"
+
+    # This time the JSON downloads include a file frequency_studies.json that describes each study.
+    # Will be useful for making a details page, but some descriptions are just study names.
+
+    # Got total_count values from refsnp-chr3.json.bz2 above, put ALFA 2.0 (dbGaP_PopFreq) first, then order by
+
+    # decreasing total_count:
+
+#     22432 "dbGaP_PopFreq"
+#     34713 "TOPMED"
+#     21128 "KOREAN"
+#     17553 "SGDP_PRJ"
+#     17470 "1000Genomes"
+#     17069 "Qatari"
+#     16284 "NorthernSweden"
+#     16074 "Siberian"
+#     15948 "TWINSUK"
+#     15948 "TOMMO"
+#     15948 "ALSPAC"
+#     15408 "GENOME_DK"
+#     15356 "GnomAD"
+#     15340 "GoNL"
+#     15198 "Estonian"
+#     13086 "Vietnamese"
+#     12000 "Korea1K"
+#     11238 "HapMap"
+#      3406 "PRJEB36033"
+#      2454 "HGDP_Stanford"
+#      2018 "Daghestan"
+#      1846 "PAGE_STUDY"
+#      1204 "Chileans"
+#      1134 "MGP"
+#      1116 "PRJEB37584"
+#       474 "GoESP"
+#       468 "ExAC"
+#       460 "GnomAD_exomes"
+#       318 "FINRISK"
+#        46 "PharmGKB"
+#         6 "PRJEB37766"
+
+
+
+#NOTE freqSourceOrder with ALFA2 aka dbGaP_PopFreq as the primary study. This was later abandoned since the common set was much smaller, went back to putting 1000Genomes first later on.
+    freqSourceOrder=dbGaP_PopFreq,TOPMED,KOREAN,SGDP_PRJ,1000Genomes,Qatari,NorthernSweden,Siberian,TWINSUK,TOMMO,ALSPAC,GENOME_DK,GnomAD,GoNL,Estonian,Vietnamese,Korea1K,HapMap,PRJEB36033,HGDP_Stanford,Daghestan,PAGE_STUDY,Chileans,MGP,PRJEB37584,GoESP,ExAC,GnomAD_exomes,FINRISK,PharmGKB,PRJEB37766
+
+    # Hmm, the PAGE study is a population genetics study.  It excludes caucasians:
+    # https://www.biorxiv.org/content/biorxiv/early/2018/10/17/188094.full.pdf
+    # "Genotyped individuals self-identified as Hispanic/Latino (N=22,216),
+    #  African American (N=17,299), Asian (N=4,680), Native Hawaiian (N=3,940),
+    #  Native American (N=652), or Other (N=1,052, primarily South Asian or mixed heritage,
+    #  as well as participants who did not identify with any of the available options."
+    # They didn't attempt to make balanced global (non-caucasian) populations (e.g. almost
+    # as many Native Hawaiians as Asians), so I'll keep 1000Genomes first.
+
+    cd $topDir
+
+    # dbSnp has changed since v152, it now uses patch13 too, so "GRCh38.p13" instead of "GRCh38.p12"
+
+    # Construct a mapping from RefSeq accessions like NC_000... to assembly, 2bit, and UCSC name.
+    hgsql hg38 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source);' \
+    | tawk '{print $1, "GRCh38.p13", "/hive/data/genomes/hg38/hg38.2bit", $2;}' \
+      > refSeqToUcsc.tab
+    hgsql hg19 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source);' \
+    | tawk '{print $1, "GRCh37.p13", "/hive/data/genomes/hg19/hg19.2bit", $2;}' \
+      >> refSeqToUcsc.tab
+
+    # has 45 more alts and fixes for hg38 just as expected for patch13.
+    
+
+    #make equivRegions.tab
+
+    # hg38 patch13 RefSeq assembly accession: GCF_000001405.39 (latest)
+
+    # Construct a mapping of equivalent RefSeq assembly regions for GRCh38 and GRCh37,
+    # so we can distinguish multiple mappings to PAR/alts/fixes from plain old multiple mappings.
+    refseqAssemblies=/hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions
+    grep -v ^# \
+      $refseqAssemblies/GCF_000001405.{25_GRCh37.p13,39_GRCh38.p13}/*_assembly_structure/all_alt_scaffold_placement.txt \
+    | tawk '{print $7, $12-1, $13, $4, $10-1, $11;}' \
+    | sort -k 1,1 -k2n,2n \
+    | tawk '{print $1 ":" $2 ":" $3, $4 ":" $5 ":" $6;}' \
+      > equivRegions.tab
+
+    # Add PARs:
+    grep -w PAR \
+      $refseqAssemblies/GCF_000001405.25_GRCh37.p13/*_assembly_regions.txt \
+    | sort \
+    | sed -e 's/X/NC_000023.10/; s/Y/NC_000024.9/;' \
+    | tawk '{print $1, $2 ":" $3 - 1 ":" $4;}'
+
+#PAR#1   NC_000023.10:60000:2699520
+#PAR#1   NC_000024.9:10000:2649520
+#PAR#2   NC_000023.10:154931043:155260560
+#PAR#2   NC_000024.9:59034049:59363566
+    echo -e "NC_000023.10:60000:2699520\tNC_000024.9:10000:2649520" >> equivRegions.tab
+    echo -e "NC_000023.10:154931043:155260560\tNC_000024.9:59034049:59363566" >> equivRegions.tab
+
+    grep -w PAR \
+      $refseqAssemblies/GCF_000001405.39_GRCh38.p13/*_assembly_regions.txt \
+    | sort \
+    | sed -e 's/X/NC_000023.11/; s/Y/NC_000024.10/;' \
+    | tawk '{print $1, $2 ":" $3 - 1 ":" $4;}'
+
+#PAR#1   NC_000023.11:10000:2781479
+#PAR#1   NC_000024.10:10000:2781479
+#PAR#2   NC_000023.11:155701382:156030895
+#PAR#2   NC_000024.10:56887902:57217415
+    echo -e "NC_000023.11:10000:2781479\tNC_000024.10:10000:2781479" >> equivRegions.tab
+    echo -e "NC_000023.11:155701382:156030895\tNC_000024.10:56887902:57217415" >> equivRegions.tab
+
+    # Patch doBigDbSnp.pl to update the default hg38 assembly to p13.
+    vi $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl
+    # around line 47, change to GRCh38.p13 and save and commit and push this new default.
+    -my $assemblyList = 'GRCh37.p13,GRCh38.p12';
+    +my $assemblyList = 'GRCh37.p13,GRCh38.p13';
+
+
+    # Run doBigDbSnp.pl (first with -debug to make runDir
+    # FYI the system is currently using ku cluster to run parasol jobs.
+
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder -debug
+# *** Steps were performed in /hive/data/outside/dbSNP/155/bigDbSnp.2021-09-30
+    cd /hive/data/outside/dbSNP/155/bigDbSnp.2021-09-30
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder -stop install \
+      >& do.log &
+    tail -f do.log
+
+
+convert step choked because the step above for create equivRegions.tab lacked -e on echo statement
+which meant that the "\t" was NOT converted to an actual tab. Must re-run.
+
+Re-ran from start but died on step fixHg19ChrM which was looking for NC_012920 but that has been renamed to just chrMT in the dbsnp data for hg19.
+fixed doFixHg19ChrM.sh by tweaking the doBigDbSnp.pl that creates it.
+
+Even though perhaps $freqSourceOrder is only needed for the convert step,
+the script requires it.
+
+Lame that after starting up screen, it lost my local variables,
+so I had to redefine topDir and freqSourceOrder after launching screen program for running background jobs.
+
+# resuming
+    cd /hive/data/outside/dbSNP/155/bigDbSnp.2021-09-30
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder\
+      -buildDir=`pwd` -continue check -stop install \
+      >& check.log &
+    tail -f check.log
+
+ *** All done !  (through the 'install' step)  Elapsed time: 408m55s
+ *** Steps were performed in /hive/data/outside/dbSNP/155/bigDbSnp.2021-09-30
+
+# GALT 2022-05-14
+
+    # FYI
+    # split   step runs on hgwdev
+    # convert step runs on ku  which has 32 machines that have 32 CPUs each.
+    # fixed freqSoureOrder changed, back to using 1000Genomes as the primary study, instead of ALFA2 dbGaP_PopFreq
+
+    topDir=/hive/data/outside/dbSNP/155
+    # after trying ALFA2 as primary study, it did not have enought SNPs, so I am going back to using 1000Genomes as the primary study.
+    freqSourceOrder=1000Genomes,dbGaP_PopFreq,TOPMED,KOREAN,SGDP_PRJ,Qatari,NorthernSweden,Siberian,TWINSUK,TOMMO,ALSPAC,GENOME_DK,GnomAD,GoNL,Estonian,Vietnamese,Korea1K,HapMap,PRJEB36033,HGDP_Stanford,Daghestan,PAGE_STUDY,Chileans,MGP,PRJEB37584,GoESP,ExAC,GnomAD_exomes,FINRISK,PharmGKB,PRJEB37766
+    # Run doBigDbSnp.pl (first with -debug to make runDir):
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder -debug
+
+# *** Steps were performed in /hive/data/outside/dbSNP/155/bigDbSnp.2022-05-14
+
+
+    cd /hive/data/outside/dbSNP/155/bigDbSnp.2022-05-14
+    # Link to ../bigDbSnp.2021-09-30/split, -continue convert to avoid re-splitting (the slowest part of the process):
+    rmdir split
+    ln -s ../bigDbSnp.2021-09-30/split split
+    rmdir splitProcessed
+    ln -s ../bigDbSnp.2021-09-30/splitProcessed splitProcessed
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder \
+      -buildDir=`pwd` -continue convert -stop install \
+      >& do.log &
+    tail -f do.log
+    # crashed on the mergeToChrom step
+
+    # NOTICE that on mergeToChrom it uses /dev/shm/ virtual memory disk for temp files,
+    # and it can run out of space, and if you have to stop and restart it,
+    # be sure to clean it out properly like this:
+# para stop  # seems to flush results too.
+# para freeBatch  # should also fix any sick batches and machines.
+#
+# TO CLEAN:
+# rm -fr /dev/shm/dbSnpMergeSortBed.*/
+# rm -fr /dev/shm/dbSnpMergeSort.*/
+#
+# Note that it also overflowed the relatively small space in /scratch/tmp/ which
+# is used by sort in the sort command.  So I had to drastically reduce the maximum
+# number of simultaneous jobs, all the way down to just 6 jobs!
+# added -maxJob=6 to the para make jobList command in doMergeToChrom.sh
+/parasol/bin/para make -maxJob=6 jobList
+# and cleaning out temp files in /dev/shm/ and then para stop, para freeBatch, 
+# and finally I re-ran doMergeToChrom.sh manually.
+
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder \
+      -buildDir=`pwd` -continue mergeChroms -stop install \
+      >& do2.log &
+    tail -f do2.log
+
+# step bigBed choked on hg19 because of a subtle bug in resSizes counting which is now fixed.
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 155 $freqSourceOrder \
+      -buildDir=`pwd` -continue bigBed -stop install \
+      >& do3.log &
+    tail -f do3.log
+
+ *** All done !  (through the 'install' step)  Elapsed time: 326m34s
+ *** Steps were performed in /hive/data/outside/dbSNP/155/bigDbSnp.2022-05-14
+
+    wc -l dbSnp155Errors.tab
+
+#13318 dbSnp155Errors.tab
+# nearly all of these "Errors" are because a small study called 'ChromosomeY' was not picked up
+# in our scan of the first 10000 lines of chrom 3 that we used above. However, this study is not big enough to be important.
+1011954 NOSEQ   freqSourceOrder does not contains 'ChromosomeY'
+
+I copied track description page fron 153 and updated the content
+src/hg/makeDb/trackDb/human/dbSnp155Composite.html
+
+Informally, I made a couple of scripts in here to help update the studies list and the keywords counts sections of the description page.
+/hive/data/outside/dbSNP/155/json/
 
 ##############################################################################