src/hg/makeDb/doc/bigDbSnp.txt 122806164798753fbfbce3cc77411c911a4d9d1e

122806164798753fbfbce3cc77411c911a4d9d1e
angie
  Tue Oct 1 13:15:36 2019 -0700
Adding automation script doBigDbSnp.pl that downloads dbSNP JSON files, does a cluster run of dbSnpJsonToTab, aggregates results, runs checkBigDbSnp & installs track files.  refs #23283

diff --git src/hg/makeDb/doc/bigDbSnp.txt src/hg/makeDb/doc/bigDbSnp.txt
new file mode 100644
index 0000000..fd7c647
--- /dev/null
+++ src/hg/makeDb/doc/bigDbSnp.txt
@@ -0,0 +1,279 @@
+# for emacs: -*- mode: sh; -*-
+
+# dbSnpNNN ("dbSNP 2.0") bigDbSnp tracks for hg38 / GRCh38 and hg19 / GRCh37
+
+##############################################################################
+# dbSnp152: dbSNP build 152 (DONE 9/18/19 angie)
+
+    topDir=/hive/data/outside/dbSNP/152
+    mkdir -p $topDir/json
+    cd $topDir/json
+    wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\*
+    md5sum -c CHECKSUMS
+#refsnp-chr10.json.bz2: OK
+#...
+#refsnp-withdrawn.json.bz2: OK
+
+    # jsonQuery commands to figure out what assemblies, SO terms and frequency sources are in there,
+    # by sampling first 10,000 variants on an arbitrary chrom:
+    set assemblyPath = "primary_snapshot_data.placements_with_allele[*].placement_annot.seq_id_traits_by_assembly[*].assembly_name"
+    set rnaSoPath = "primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].sequence_ontology[*].accession"
+    set proteinSoPath = "primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].protein.sequence_ontology[*].accession"
+    set freqSourcePath = "primary_snapshot_data.allele_annotations[*].frequency[*].study_name"
+    foreach jPath ("$assemblyPath" "$rnaSoPath" "$proteinSoPath" "$freqSourcePath")
+      echo "$jPath"
+      bzcat refsnp-chr3.json.bz2 \
+      | head -10000 \
+      | jsonQuery -countUniq -verbose=2 stdin "$jPath" stdout \
+      | sort -nr
+      echo ""
+    end
+    # Assemblies:
+#     10229 "GRCh38.p12"
+#     10111 "GRCh37.p13"
+    # RNA SO terms -- make sure all of these appear in soTerm.[ch]:
+#    147701 "SO:0001627"
+#     26809 "SO:0002153"
+#     19013 "SO:0002152"
+#      8334 "SO:0001624"
+#      3082 "SO:0001986"
+#      2657 "SO:0001580"
+#      1772 "SO:0001619"
+#      1741 "SO:0001987"
+#       833 "SO:0001623"
+#        30 "SO:0001575"
+#        10 "SO:0001574"
+#         4 "SO:0001590"
+    # Protein SO terms -- ditto for soTerm.[ch]:
+#       784 "SO:0001819"
+#       713 "SO:0001583"
+#         6 "SO:0001821"
+#         5 "SO:0001587"
+#         2 "SO:0000865"
+    # Made sure all those are in hg/{inc,lib}/soTerm.[ch]
+
+    # Projects reporting allele counts/frequencies:
+#     18130 "GnomAD"
+#     17473 "1000Genomes"
+#     17376 "TOPMED"
+#     15952 "TWINSUK"
+#     15952 "ALSPAC"
+#     15188 "Estonian"
+#       555 "GnomAD_exomes"
+#       473 "GoESP"
+#       468 "ExAC"
+    freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,GnomAD,GoESP,ALSPAC,TWINSUK,Estonian
+
+    cd $topDir
+    # Construct a mapping from RefSeq accessions like NC_000... to assembly, 2bit, and UCSC name.
+    hgsql hg38 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source);' \
+    | tawk '{print $1, "GRCh38.p12", "/hive/data/genomes/hg38/hg38.2bit", $2;}' \
+      > refSeqToUcsc.tab
+    hgsql hg19 -NBe 'select alias, chrom from chromAlias where find_in_set("refseq", source);' \
+    | tawk '{print $1, "GRCh37.p13", "/hive/data/genomes/hg19/hg19.2bit", $2;}' \
+      >> refSeqToUcsc.tab
+
+    # Construct a mapping of equivalent RefSeq assembly regions for GRCh38 and GRCh37,
+    # so we can distinguish multiple mappings to PAR/alts/fixes from plain old multiple mappings.
+    refseqAssemblies=/hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions
+    grep -v ^# \
+      $refseqAssemblies/GCF_000001405.{25_GRCh37.p13,38_GRCh38.p12}/*_assembly_structure/all_alt_scaffold_placement.txt \
+    | tawk '{print $7, $12-1, $13, $4, $10-1, $11;}' \
+    | sort -k 1,1 -k2n,2n \
+    | tawk '{print $1 ":" $2 ":" $3, $4 ":" $5 ":" $6;}' \
+      > equivRegions.tab
+    # Add PARs:
+    grep -w PAR \
+      $refseqAssemblies/GCF_000001405.25_GRCh37.p13/*_assembly_regions.txt \
+    | sort \
+    | sed -e 's/X/NC_000023.10/; s/Y/NC_000024.9/;' \
+    | tawk '{print $1, $2 ":" $3 - 1 ":" $4;}'
+#PAR#1   NC_000023.10:60000:2699520
+#PAR#1   NC_000024.9:10000:2649520
+#PAR#2   NC_000023.10:154931043:155260560
+#PAR#2   NC_000024.9:59034049:59363566
+    echo -e "NC_000023.10:60000:2699520\tNC_000024.9:10000:2649520" >> equivRegions.tab
+    echo -e "NC_000023.10:154931043:155260560\tNC_000024.9:59034049:59363566" >> equivRegions.tab
+    grep -w PAR \
+      $refseqAssemblies/GCF_000001405.38_GRCh38.p12/*_assembly_regions.txt \
+    | sort \
+    | sed -e 's/X/NC_000023.11/; s/Y/NC_000024.10/;' \
+    | tawk '{print $1, $2 ":" $3 - 1 ":" $4;}'
+#PAR#1   NC_000023.11:10000:2781479
+#PAR#1   NC_000024.10:10000:2781479
+#PAR#2   NC_000023.11:155701382:156030895
+#PAR#2   NC_000024.10:56887902:57217415
+    echo "NC_000023.11:10000:2781479\tNC_000024.10:10000:2781479" >> equivRegions.tab
+    echo "NC_000023.11:155701382:156030895\tNC_000024.10:56887902:57217415" >> equivRegions.tab
+
+
+    # Run doBigDbSnp.pl...
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -continue convert -stop bigBed \
+      >& do.log &
+    tail -f do.log
+# *** All done !  (through the 'install' step)  Elapsed time: 278m36s
+# *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
+
+    # 9/17/19: re-run from dbSnpJsonToTab onward after lots of changes
+    topDir=/hive/data/outside/dbSNP/152
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -stop install -debug
+# *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
+    cd /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
+    # Link to ../split, -continue convert to avoid re-splitting (the slowest part of the process):
+    rm split
+    ln -s ../split split
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 152 $freqSourceOrder \
+      -buildDir=`pwd` -continue convert -stop install \
+      >& do.log &
+    tail -f do.log
+# *** All done !  (through the 'install' step)  Elapsed time: 449m6s
+# *** Steps were performed in /hive/data/outside/dbSNP/152/bigDbSnp.2019-09-17
+
+
+##############################################################################
+# dbSnp153: dbSNP build 153 (DONE 9/19/19 angie)
+
+    topDir=/hive/data/outside/dbSNP/153
+    mkdir -p $topDir/json
+    cd $topDir/json
+    wget --timestamping -nd ftp://ftp.ncbi.nih.gov/snp/latest_release/JSON/\*
+    md5sum -c CHECKSUMS
+#refsnp-chr10.json.bz2: OK
+#...
+#refsnp-withdrawn.json.bz2: OK
+
+    # jsonQuery commands to figure out what assemblies, SO terms and frequency sources are in there,
+    # by sampling first 10,000 variants on an arbitrary chrom:
+    assemblyPath="primary_snapshot_data.placements_with_allele[*].placement_annot.seq_id_traits_by_assembly[*].assembly_name"
+    rnaSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].sequence_ontology[*].accession"
+    proteinSoPath="primary_snapshot_data.allele_annotations[*].assembly_annotation[*].genes[*].rnas[*].protein.sequence_ontology[*].accession"
+    freqSourcePath="primary_snapshot_data.allele_annotations[*].frequency[*].study_name"
+    for jPath in "$assemblyPath" "$rnaSoPath" "$proteinSoPath" "$freqSourcePath"; do
+      echo "$jPath"
+      bzcat refsnp-chr3.json.bz2 \
+      | head -10000 \
+      | jsonQuery -countUniq -verbose=2 /dev/stdin "$jPath" stdout \
+      | sort -nr
+      echo ""
+    done
+    # Assemblies:
+#     10229 "GRCh38.p12"
+#     10111 "GRCh37.p13"
+
+    # RNA SO terms -- make sure all of these appear in soTerm.[ch]:
+#    144288 "SO:0001627"
+#     25829 "SO:0002153"
+#     19729 "SO:0002152"
+#      8506 "SO:0001624"
+#      3112 "SO:0001986"
+#      2648 "SO:0001580"
+#      1769 "SO:0001619"
+#      1712 "SO:0001987"
+#       878 "SO:0001623"
+#        30 "SO:0001575"
+#        10 "SO:0001574"
+#         4 "SO:0001590"
+    # Protein SO terms -- ditto for soTerm.[ch]:
+#       770 "SO:0001819"
+#       726 "SO:0001583"
+#         6 "SO:0001821"
+#         5 "SO:0001587"
+#         2 "SO:0000865"
+    # Made sure all those are in hg/{inc,lib}/soTerm.[ch] (nothing new since b152)
+
+    # Projects reporting allele counts/frequencies:
+#     17493 "1000Genomes"
+#     17392 "TOPMED"
+#     16619 "GnomAD"
+#     16306 "NorthernSweden"
+#     15970 "TWINSUK"
+#     15970 "ALSPAC"
+#     15202 "Estonian"
+#     13096 "Vietnamese"
+#      1844 "PAGE_STUDY"
+#       473 "GoESP"
+#       468 "ExAC"
+#       458 "GnomAD_exomes"
+
+    # This time the JSON downloads include a file frequency_studies.json that describes each study.
+    # Will be useful for making a details page, but some descriptions are just study names.
+    # Get total_count values from refsnp-chr3.json.bz2, put 1000Genomes first, then order by
+    # decreasing total_count:
+    # 1000Genomes:  5008
+    # GnomAD_exomes: 251006
+    # TOPMED: 125568
+    # ExAC: 121234
+    # PAGE_STUDY: 78694
+    # GnomAD: 31348
+    # GoESP: 12494
+    # Estonian: 4480
+    # ALSPAC: 3854
+    # TWINSUK: 3708
+    # NorthernSweden: 600
+    # Vietnamese: 212
+    freqSourceOrder=1000Genomes,GnomAD_exomes,TOPMED,ExAC,PAGE_STUDY,GnomAD,GoESP,Estonian,ALSPAC,TWINSUK,NorthernSweden,Vietnamese
+    # Hmm, the PAGE study is a population genetics study.  It excludes caucasians:
+    # https://www.biorxiv.org/content/biorxiv/early/2018/10/17/188094.full.pdf
+    # "Genotyped individuals self-identified as Hispanic/Latino (N=22,216),
+    #  African American (N=17,299), Asian (N=4,680), Native Hawaiian (N=3,940),
+    #  Native American (N=652), or Other (N=1,052, primarily South Asian or mixed heritage,
+    #  as well as participants who did not identify with any of the available options."
+    # They didn't attempt to make balanced global (non-caucasian) populations (e.g. almost
+    # as many Native Hawaiians as Asians), so I'll keep 1000Genomes first.
+
+    # Reuse assembly sequence mapping files from b152 since the assemblies are the same.
+    cd $topDir
+    cp ../152/refSeqToUcsc.tab .
+    cp ../152/equivRegions.tab .
+
+    # Run doBigDbSnp.pl (first with -debug to make runDir
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -debug
+# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
+    cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -stop install \
+      >& do.log &
+    tail -f do.log
+# *** All done !  (through the 'install' step)  Elapsed time: 2095m35s
+
+    # 9/13/19: Now that checkDbSnp has been added to doBigDbSnp.pl, re-run from the check
+    # stage onward (but don't cleanup just yet in case we need to debug files).
+    cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -buildDir=`pwd` \
+      -continue check -stop install \
+      >& check.log &
+    tail -f check.log
+
+    # 9/18/19: re-run from dbSnpJsonToTab onward after adding several more ucscNotes.
+    cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir \
+      -buildDir=`pwd` -continue convert -stop install \
+      >& redo.log &
+    tail -f redo.log
+# *** All done !  (through the 'install' step)  Elapsed time: 263m59s
+# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
+
+    #*** uh-oh... when checkBigDbSnp failed, doCheck.sh did not fail... I guess backgrounding
+    #*** the jobs and 'wait' hide errors?
+    cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-08-07
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir -buildDir=`pwd` \
+      -continue check -stop install \
+      >& check.log &
+    tail -f check.log
+
+    # 9/19/19: and again after changing doBigDbSnp.pl to have args & wait on specific pids:
+    # Run doBigDbSnp.pl (first with -debug to make runDir):
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder -debug
+# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-09-19
+    cd /hive/data/outside/dbSNP/153/bigDbSnp.2019-09-19
+    # Link to ../bigDbSnp.2019-08-07/split, -continue convert to avoid re-splitting (the slowest part of the process):
+    rmdir split
+    ln -s ../bigDbSnp.2019-08-07/split split
+    $HOME/kent/src/hg/utils/automation/doBigDbSnp.pl $topDir 153 $freqSourceOrder \
+      -buildDir=`pwd` -continue convert -stop install \
+      >& do.log &
+    tail -f do.log
+# *** All done !  (through the 'install' step)  Elapsed time: 491m30s
+# *** Steps were performed in /hive/data/outside/dbSNP/153/bigDbSnp.2019-09-19
+
+
+##############################################################################