18af2bb33abcd036c8cd8539580a8943ce8cb8ad chmalee Fri Jan 17 08:44:45 2020 -0800 Add full path to gnomad related track building scripts, from Max email diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt index 4e460b2..c6fd5ad 100644 --- src/hg/makeDb/doc/hg19.txt +++ src/hg/makeDb/doc/hg19.txt @@ -33762,37 +33762,37 @@ # 289674 total # check that v19 has all the transcripts: comm -12 hg19.gencodeV19.transcripts pliByGene.transcripts | wc -l 19704 comm -12 hg19.gencodeV19.transcripts pliByTranscripts.transcripts | wc -l 80950 rm hg19.gencodeV19.transcripts # ok safe to use v19 exon boundaries, just need to drop the version numbers: hgsql -Ne "select * from wgEncodeGencodeCompV19" hg19 | cut -f2- | genePredToBed | sed -Ee 's/\.[0-9]+//' | sort -k4 > hg19.gencodeCompV19.bed12 gzip -cd $geneFile | tail -n +2 \ | tawk '{print $75,$76,$77,$64,$65,$1,$2,$3,$4,$5,$33,$12,$13,$14,$32,$17,$20,$21,$24,$25,$26,$27,$28,$29,$30}' \ | sort -k7 | join -t $'\t' -1 4 -2 7 hg19.gencodeCompV19.bed12 - \ - | ./combine.awk -v doTranscripts=false 2>genes.chromMismatches \ + | ~/kent/src/hg/makeDb/gnomad/combine.awk -v doTranscripts=false 2>genes.chromMismatches \ | sort -k1,1 -k2,2n > pliByGene.bed gzip -cd $transcriptFile | tail -n +2 \ | tawk '{print $76,$77,$78,$65,$66,$1,$2,$4,$5,$6,$34,$13,$14,$15,$33,$18,$21,$22,$25,$26,$27,$28,$29,$30,$31}' \ | sort -k7 | join -t $'\t' -1 4 -2 7 hg19.gencodeCompV19.bed12 - \ - | ./combine.awk -v doTranscripts=true 2>transcripts.chromMismatches \ + | ~/kent/src/hg/makeDb/gnomad/combine.awk -v doTranscripts=true 2>transcripts.chromMismatches \ | sort -k1,1 -k2,2n > pliByTranscript.bed # make .as file: # table pliMetrics # "bed12+5 for displaying gnomAD haploinsufficiency prediction scores" # ( # string chrom; "Reference sequence chromosome or scaffold" # uint chromStart; "Start position in chromosome" # uint chromEnd; "End position in chromosome" # string name; "ENST or ENSG Name" # uint score; "pLI score between 0-1000" # char[1] strand; "strand of transcript" # uint thickStart; "Start of where display is thick" # uint thickEnd; "End of where display should be thick" # uint itemRgb; "Color of item" @@ -33906,41 +33906,41 @@ # Info.csv Table_S4.csv # Table_S4.csv is where it's at: # head -2 148353-3/Table_S4.csv transcript gene chr amino_acids genomic_start genomic_end obs_mis exp_mis obs_exp chisq_diff_null region_name ENST00000337907.3 RERE 1 1-507 8716356 8424825 97 197.9807 0.489947 51.505535 RERE_1 ENST00000337907.3 RERE 1 508-1567 8424824 8415147 355 438.045275 0.810419 15.743847 RERE_2 # now I need to get this into exons somehow hgsql -Ne "select * from wgEncodeGencodeCompV19" | cut -f2- | genePredToBed > hg19.gencodeV19.txt bedToPsl /hive/data/genomes/hg19/chrom.sizes hg19.gencodeV19.txt v19.psl # pslMap would work here but since I don't know how to make a psl for RERE:1-507 I can't supply # the input psl that pslMap needs. thus I'll need a new util # first trim the utrs from v19: - ./trimUtrs.py hg19.gencodeV19.txt trimmedUtrs.txt + ~/kent/src/hg/makeDb/gnomad/trimUtrs.py hg19.gencodeV19.txt trimmedUtrs.txt # 99448 transcript added to transcript dict # are these correct? bedToExons trimmedUtrs.txt my.gencode.exonsOnly bedToExons -cdsOnly hg19.gencodeV19.txt gencode.exonsOnly # the awk removes the non-coding transcripts diff <(cut -f1-4 gencode.exonsOnly | tawk '{if ($3 != $2) print}' | sort -k4) <(cut -f1-4 trimmedUtrs.txt | sort -k4) # no diffs so we're good # now chop up exons according to the amino acids: - ./aaToGenomic.py trimmedUtrs.txt 148353-3/Table_S4.csv > aaToBed.out + ~/kent/src/hg/makeDb/gnomad/aaToGenomic.py trimmedUtrs.txt 148353-3/Table_S4.csv > aaToBed.out # make autoSql file, regular bed12 plus one for the gene name and one for the chi square value # table missenseConstraint # "Parts of transcripts shaded according to how well that region of the transcript tolerates missense variation." # ( # string chrom; "Chromosome (or contig, scaffold, etc.)" # uint chromStart; "Start position in chromosome" # uint chromEnd; "End position in chromosome" # string name; "Name of item" # uint score; "Score from 0-1000" # char[1] strand; "+ or -" # uint thickStart; "Start of where display should be thick (start codon)" # uint thickEnd; "End of where display should be thick (stop codon)" # uint reserved; "RGB color of item" # int blockCount; "Number of blocks" # int[blockCount] blockSizes; "Comma separated list of block sizes"