0f4667da863293da78301c7341e6d14151b6458e
chmalee
  Wed Jan 15 14:36:30 2020 -0800
Forgot makedoc for missense constraint track and new util for converting between amino acid ranges and genomic positions, refs #24751

diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt
index 6603bb7..863a7e9 100644
--- src/hg/makeDb/doc/hg19.txt
+++ src/hg/makeDb/doc/hg19.txt
@@ -33871,15 +33871,94 @@
 #     lstring cohortDescription; "Description of sample population for the study"
 #     lstring genes;      "Genes overlapping this variant"
 #     lstring samples;    "Sample IDs if available"
 #     uint _size;         "Genomic Size of variant"
 #     )
 bedToBigBed -tab -as=dgvPlusSize.as -type=bed9+14 dgvMergedWithSize.bed  /hive/data/genomes/hg19/chrom.sizes dgvMerged.bb
 # pass1 - making usageList (24 chroms): 1054 millis
 # pass2 - checking and writing primary data (392583 records, 23 fields): 8495 millis
 bedToBigBed -tab -as=dgvPlusSize.as -type=bed9+14 dgvSupportingWithSize.bed  /hive/data/genomes/hg19/chrom.sizes dgvSupporting.bb
 # pass1 - making usageList (25 chroms): 2577 millis
 # pass2 - checking and writing primary data (6668715 records, 23 fields): 27271 millis
 cd /gbdb/hg19/dgv/
 ln -s /hive/data/genomes/hg19/bed/dgv/160810/dgvMerged.bb
 ln -s /hive/data/genomes/hg19/bed/dgv/160810/dgvSupporting.bb
 ##############################################################################
+# gnomAD Missense Constraint Scores track - Jan 15 2020 - ChrisL
+
+    cd /hive/data/outside/gnomAD.2/constraint/
+    mkdir missense
+
+    gsutil cat gs://gnomad-public/legacy/exacv1_downloads/release1/regional_missense_constraint/README_fordist_mpc_values > README.txt
+
+    # so the v2 files are what we want since they are most recent:
+    gsutil cp gs://gnomad-public/legacy/exacv1_downloads/release1/regional_missense_constraint/fordist_constraint_official_mpc_values_v2.txt.* .
+
+    # this is not what we want, at least not for what should be a bigBed, although this would make a nice bigWig file?
+    chrom	pos	ref	alt	gene_name	ENST	ENSG	CCDS	Consequence	HGVSc	HGVSp	Amino_acids	context	SIFT	PolyPhen	obs_exp	mis_badness	fitted_score	MPC
+    1	69094	G	A	OR4F5	ENST00000335137	ENSG00000186092	CCDS30547.1	missense_variant	ENST00000335137.3:c.4G>A	ENSP00000334393.3:p.Val2Met	V/M	GGT	deleterious(0.01)	possibly_damaging(0.828)	0.115421521859	0.359925753267	0.689727513402	2.7340307082
+
+    # After reading the paper: https://www.biorxiv.org/content/10.1101/148353v1.full
+    # it looks like what I want is in supp table 4, which is an excel sheet ugh
+    wget https://www.biorxiv.org/highwire/filestream/44323/field_highwire_adjunct_files/2/148353-3.xlsx
+    xlsx2csv -a -d 'tab' 148353-3.xlsx 148353-3
+    ls 148353-3
+    # Info.csv  Table_S4.csv
+
+    # Table_S4.csv is where it's at:
+    # head -2 148353-3/Table_S4.csv
+    transcript  gene    chr amino_acids genomic_start   genomic_end obs_mis exp_mis obs_exp chisq_diff_null region_name
+    ENST00000337907.3   RERE    1   1-507   8716356 8424825 97  197.9807    0.489947    51.505535   RERE_1
+    ENST00000337907.3	RERE	1	508-1567	8424824	8415147	355	438.045275	0.810419	15.743847	RERE_2
+
+    # now I need to get this into exons somehow
+    hgsql -Ne "select * from wgEncodeGencodeCompV19" | cut -f2- | genePredToBed > hg19.gencodeV19.txt
+    bedToPsl /hive/data/genomes/hg19/chrom.sizes hg19.gencodeV19.txt v19.psl
+    # pslMap would work here but since I don't know how to make a psl for RERE:1-507 I can't supply
+    # the input psl that pslMap needs. thus I'll need a new util
+
+    # first trim the utrs from v19:
+    ./trimUtrs.py hg19.gencodeV19.txt trimmedUtrs.txt
+    # 99448 transcript added to transcript dict
+    # are these correct?
+    bedToExons trimmedUtrs.txt my.gencode.exonsOnly
+    bedToExons -cdsOnly hg19.gencodeV19.txt gencode.exonsOnly
+    # the awk removes the non-coding transcripts
+    diff <(cut -f1-4 gencode.exonsOnly | tawk '{if ($3 != $2) print}' | sort -k4) <(cut -f1-4 trimmedUtrs.txt | sort -k4)
+    # no diffs so we're good
+
+    # now chop up exons according to the amino acids:
+    ./aaToGenomic.py trimmedUtrs.txt 148353-3/Table_S4.csv > aaToBed.out
+    # make autoSql file, regular bed12 plus one for the gene name and one for the chi square value
+    # table missenseConstraint
+    # "Parts of transcripts shaded according to how well that region of the transcript tolerates missense variation."
+    #     (
+    #     string chrom;      "Chromosome (or contig, scaffold, etc.)"
+    #     uint   chromStart; "Start position in chromosome"
+    #     uint   chromEnd;   "End position in chromosome"
+    #     string name;       "Name of item"
+    #     uint   score;      "Score from 0-1000"
+    #     char[1] strand;    "+ or -"
+    #     uint thickStart;   "Start of where display should be thick (start codon)"
+    #     uint thickEnd;     "End of where display should be thick (stop codon)"
+    #     uint reserved;     "RGB color of item"
+    #     int blockCount;    "Number of blocks"
+    #     int[blockCount] blockSizes; "Comma separated list of block sizes"
+    #     int[blockCount] chromStarts; "Start positions relative to chromStart"
+    #     string geneName;     "Name of corresponding gene"
+    #     int observed;      "Number of observed missense variants"
+    #     float expected;      "Number of expected missense variants"
+    #     float obs_exp;      "Observed/expected score"
+    #     float chisq;       "Chi-Squared Difference"
+    #     )
+
+    # now we can make the bigBed:
+    ./aaToGenomic.py trimmedUtrs.txt 148353-3/Table_S4.csv | sort -k1,1 -k2,2n > missenseConstrained.bed
+    bedToBigBed -tab -type=bed12+5 -as=missenseBed.as missenseConstrained.bed /hive/data/genomes/hg19/chrom.sizes missenseConstrained.bb
+    # pass1 - making usageList (24 chroms): 5 millis
+    # pass2 - checking and writing primary data (6507 records, 17 fields): 134 millis
+    # only a few genes:
+    cut -f13 missenseConstrained.bed | sort | uniq | wc -l
+    # 2700
+    ln -s /gbdb/hg19/gnomAD/missenseConstrained.bb missenseConstrained.bb
+
+##############################################################################