0f4667da863293da78301c7341e6d14151b6458e chmalee Wed Jan 15 14:36:30 2020 -0800 Forgot makedoc for missense constraint track and new util for converting between amino acid ranges and genomic positions, refs #24751 diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt index 6603bb7..863a7e9 100644 --- src/hg/makeDb/doc/hg19.txt +++ src/hg/makeDb/doc/hg19.txt @@ -33871,15 +33871,94 @@ # lstring cohortDescription; "Description of sample population for the study" # lstring genes; "Genes overlapping this variant" # lstring samples; "Sample IDs if available" # uint _size; "Genomic Size of variant" # ) bedToBigBed -tab -as=dgvPlusSize.as -type=bed9+14 dgvMergedWithSize.bed /hive/data/genomes/hg19/chrom.sizes dgvMerged.bb # pass1 - making usageList (24 chroms): 1054 millis # pass2 - checking and writing primary data (392583 records, 23 fields): 8495 millis bedToBigBed -tab -as=dgvPlusSize.as -type=bed9+14 dgvSupportingWithSize.bed /hive/data/genomes/hg19/chrom.sizes dgvSupporting.bb # pass1 - making usageList (25 chroms): 2577 millis # pass2 - checking and writing primary data (6668715 records, 23 fields): 27271 millis cd /gbdb/hg19/dgv/ ln -s /hive/data/genomes/hg19/bed/dgv/160810/dgvMerged.bb ln -s /hive/data/genomes/hg19/bed/dgv/160810/dgvSupporting.bb ############################################################################## +# gnomAD Missense Constraint Scores track - Jan 15 2020 - ChrisL + + cd /hive/data/outside/gnomAD.2/constraint/ + mkdir missense + + gsutil cat gs://gnomad-public/legacy/exacv1_downloads/release1/regional_missense_constraint/README_fordist_mpc_values > README.txt + + # so the v2 files are what we want since they are most recent: + gsutil cp gs://gnomad-public/legacy/exacv1_downloads/release1/regional_missense_constraint/fordist_constraint_official_mpc_values_v2.txt.* . + + # this is not what we want, at least not for what should be a bigBed, although this would make a nice bigWig file? + chrom pos ref alt gene_name ENST ENSG CCDS Consequence HGVSc HGVSp Amino_acids context SIFT PolyPhen obs_exp mis_badness fitted_score MPC + 1 69094 G A OR4F5 ENST00000335137 ENSG00000186092 CCDS30547.1 missense_variant ENST00000335137.3:c.4G>A ENSP00000334393.3:p.Val2Met V/M GGT deleterious(0.01) possibly_damaging(0.828) 0.115421521859 0.359925753267 0.689727513402 2.7340307082 + + # After reading the paper: https://www.biorxiv.org/content/10.1101/148353v1.full + # it looks like what I want is in supp table 4, which is an excel sheet ugh + wget https://www.biorxiv.org/highwire/filestream/44323/field_highwire_adjunct_files/2/148353-3.xlsx + xlsx2csv -a -d 'tab' 148353-3.xlsx 148353-3 + ls 148353-3 + # Info.csv Table_S4.csv + + # Table_S4.csv is where it's at: + # head -2 148353-3/Table_S4.csv + transcript gene chr amino_acids genomic_start genomic_end obs_mis exp_mis obs_exp chisq_diff_null region_name + ENST00000337907.3 RERE 1 1-507 8716356 8424825 97 197.9807 0.489947 51.505535 RERE_1 + ENST00000337907.3 RERE 1 508-1567 8424824 8415147 355 438.045275 0.810419 15.743847 RERE_2 + + # now I need to get this into exons somehow + hgsql -Ne "select * from wgEncodeGencodeCompV19" | cut -f2- | genePredToBed > hg19.gencodeV19.txt + bedToPsl /hive/data/genomes/hg19/chrom.sizes hg19.gencodeV19.txt v19.psl + # pslMap would work here but since I don't know how to make a psl for RERE:1-507 I can't supply + # the input psl that pslMap needs. thus I'll need a new util + + # first trim the utrs from v19: + ./trimUtrs.py hg19.gencodeV19.txt trimmedUtrs.txt + # 99448 transcript added to transcript dict + # are these correct? + bedToExons trimmedUtrs.txt my.gencode.exonsOnly + bedToExons -cdsOnly hg19.gencodeV19.txt gencode.exonsOnly + # the awk removes the non-coding transcripts + diff <(cut -f1-4 gencode.exonsOnly | tawk '{if ($3 != $2) print}' | sort -k4) <(cut -f1-4 trimmedUtrs.txt | sort -k4) + # no diffs so we're good + + # now chop up exons according to the amino acids: + ./aaToGenomic.py trimmedUtrs.txt 148353-3/Table_S4.csv > aaToBed.out + # make autoSql file, regular bed12 plus one for the gene name and one for the chi square value + # table missenseConstraint + # "Parts of transcripts shaded according to how well that region of the transcript tolerates missense variation." + # ( + # string chrom; "Chromosome (or contig, scaffold, etc.)" + # uint chromStart; "Start position in chromosome" + # uint chromEnd; "End position in chromosome" + # string name; "Name of item" + # uint score; "Score from 0-1000" + # char[1] strand; "+ or -" + # uint thickStart; "Start of where display should be thick (start codon)" + # uint thickEnd; "End of where display should be thick (stop codon)" + # uint reserved; "RGB color of item" + # int blockCount; "Number of blocks" + # int[blockCount] blockSizes; "Comma separated list of block sizes" + # int[blockCount] chromStarts; "Start positions relative to chromStart" + # string geneName; "Name of corresponding gene" + # int observed; "Number of observed missense variants" + # float expected; "Number of expected missense variants" + # float obs_exp; "Observed/expected score" + # float chisq; "Chi-Squared Difference" + # ) + + # now we can make the bigBed: + ./aaToGenomic.py trimmedUtrs.txt 148353-3/Table_S4.csv | sort -k1,1 -k2,2n > missenseConstrained.bed + bedToBigBed -tab -type=bed12+5 -as=missenseBed.as missenseConstrained.bed /hive/data/genomes/hg19/chrom.sizes missenseConstrained.bb + # pass1 - making usageList (24 chroms): 5 millis + # pass2 - checking and writing primary data (6507 records, 17 fields): 134 millis + # only a few genes: + cut -f13 missenseConstrained.bed | sort | uniq | wc -l + # 2700 + ln -s /gbdb/hg19/gnomAD/missenseConstrained.bb missenseConstrained.bb + +##############################################################################