8979882863a20beb00f7c72cfc4263a1e5a9863b mspeir Fri Oct 15 15:28:44 2021 -0700 Updating gtex README with latest version of script and other details about latest version of the track. Also tweaking track description page to be more accurate, refs #27947 diff --git src/hg/makeDb/doc/hg38/gtex.txt src/hg/makeDb/doc/hg38/gtex.txt index c777492..a9094e0 100644 --- src/hg/makeDb/doc/hg38/gtex.txt +++ src/hg/makeDb/doc/hg38/gtex.txt @@ -1,367 +1,394 @@ ############################################################################# # GTEx V6 (October 2015) Kate # Create BED from hgFixed tables (see doc/gtex) # Reloading during QA of track (fixing gene classes, adding scores). (March 2016) Kate cd /hive/data/outside/gtex/V6 # see doc/hg19.txt for how this genePred was made set chain = /hive/data/genomes/hg19/bed/liftOver/hg19ToHg38.over.chain.gz liftOver -genePred gencodeV19.hg19.genePred $chain gtexGeneModelV6.hg38.genePred \ gencode.V19.hg38.unmapped # 926 unmapped hgLoadGenePred hg38 gtexGeneModelV6 gtexGeneModelV6.hg38.genePred # OLD: creates gtexGeneModelV6.hg38.genePred # OLD: NOTE: drops 192 transcripts. One I spot-checked indeed didn't exist in our hg38 genes cd /hive/data/genomes/hg38/bed mkdir -p gtex cd gtex # table renamed to gtexGeneModel later # Use latest GENCODE attrs file to get biotypes # create bed table ~/kent/src/hg/makeDb/outside/hgGtexGeneBed/hgGtexGeneBed hg38 -gtexVersion=V6 \ -noLoad -gencodeVersion=$gencodeVersion gtexGeneBedV6 -verbose=2 >&! makeGeneBed.log # 45 genes not found in GENCODE attributes table #Max score: 219385.906250 wc -l *.tab wc -l *.tab 52896 gtexGeneBedV6.tab # 55810 gtexGeneBedV6.tab # exploratory for assigning score based on sum of scores (expScores field) set bedScore = ~/kent/src/utils/bedScore/bedScore $bedScore -col=10 -minScore=1 -method=std2 gtexGeneBedV6.tab gtexGeneBedV6.std2.bed $bedScore -col=10 -minScore=1 -method=encode gtexGeneBedV6.tab gtexGeneBedV6.encode.bed $bedScore -col=10 -minScore=1 -method=reg gtexGeneBedV6.tab gtexGeneBedV6.reg.bed $bedScore -col=10 -minScore=1 -method=asinh gtexGeneBedV6.tab gtexGeneBedV6.asinh.bed $bedScore -col=10 -minScore=1 -method=reg -log gtexGeneBedV6.tab gtexGeneBedV6.reg.log.bed $bedScore -log -col=10 -minScore=1 -method=std2 gtexGeneBedV6.tab gtexGeneBedV6.std2.log.bed $bedScore -log -col=10 -minScore=1 -method=encode gtexGeneBedV6.tab gtexGeneBedV6.encode.log.bed $bedScore -log -col=10 -minScore=1 -method=asinh gtexGeneBedV6.tab gtexGeneBedV6.asinh.log.bed # Using -log -method=encode, as this is closest to density plot of all scores # as here: GtexTotalExpressionDensity.png, GtexTotalExpressionFrequency.png textHistogram -real -autoScale=14 -log -col=5 gtexGeneBedV6.encode.log.bed 1.000000 ************************************************************ 22889 72.357214 *************************************************** 4862 143.714428 ************************************************** 4199 215.071643 ************************************************* 3931 286.428857 ************************************************** 4329 357.786071 *************************************************** 5419 429.143285 ************************************************** 4472 500.500500 ********************************************* 1953 571.857714 ************************************** 564 643.214928 ******************************** 200 714.572142 ************************* 61 785.929356 *********** 6 857.286571 ******** 4 928.643785 ************ 7 $bedScore -col=10 -minScore=0 -log -method=encode gtexGeneBedV6.tab gtexGeneBedV6.bed # load up set lib = ~/kent/src/hg/lib hgLoadBed hg38 -noBin -tab -type=bed6+4 \ -as=$lib/gtexGeneBed.as -sqlTable=$lib/gtexGeneBed.sql -renameSqlTable \ gtexGeneBedNewV6 gtexGeneBedV6.bed # Add GTEx to Gene Sorter (2016-08-18 kate) # See hg/near/makeNear.doc ############################################################################# # GTEx V8 (Apr 2020) Kate # Create BED from hgFixed tables (see doc/gtex) # Load gene models (Gencode V26 transcript union from GTEx) cd /hive/data/outside/gtex/V8/rnaSeq gtfToGenePred gencode.v26.GRCh38.genes.gtf gencodeV26.hg38.genePred \ -infoOut=gtexGeneModelInfoV8.tab hgLoadGenePred hg38 gtexGeneModelV8 gencodeV26.hg38.genePred # Get transcript for each gene (why ?) tail -n +2 gtexGeneModelInfoV8.tab | awk '{printf("%s\t%s\n", $1, $9)}' > gtexGeneTranscriptsV8.tab #hgLoadSqlTab hgFixed gtexTranscriptV8 ~/kent/src/hg/lib/gtexTranscript.sql gtexGeneTranscriptsV8.tab # no schema (or table on hgwdev.hgFixed) # Load BED table cd /hive/data/genomes/hg38/bed/gtex mkdir V8 cd V8 set gencode = V26 ~/kent/src/hg/makeDb/outside/hgGtexGeneBed/hgGtexGeneBed \ hg38 -noLoad -gtexVersion=V8 -gencodeVersion=$gencode gtexGeneV8 -verbose=2 >&! log.txt & Reading wgEncodeGencodeAttrs table Reading gtexGeneModelV8 table Reading gtexTissueMedian table Writing tab file gtexGeneV8 Max score: 267400.000000 # Add scores set bedScore = ~/kent/src/utils/bedScore/bedScore $bedScore -col=10 -minScore=0 -log -method=encode gtexGeneV8.tab gtexGeneBedV8.bed textHistogram -real -autoScale=14 -log -col=5 gtexGeneBedV8.bed 0.000000 ************************************************************ 21287 71.428643 **************************************************** 5635 142.857286 **************************************************** 5513 214.285929 *************************************************** 4683 285.714571 *************************************************** 4480 357.143214 *************************************************** 4748 428.571857 **************************************************** 5466 500.000500 ************************************************ 3117 571.429143 ***************************************** 911 642.857786 ********************************* 252 714.286429 ************************** 81 785.715071 ************** 11 857.143714 ******** 4 928.572357 *************** 12 # load up set lib = ~/kent/src/hg/lib hgLoadBed hg38 -noBin -tab -type=bed6+4 \ -as=$lib/gtexGeneBed.as -sqlTable=$lib/gtexGeneBed.sql -renameSqlTable \ gtexGeneV8 gtexGeneBedV8.bed #Read 56200 elements of size 10 from gtexGeneBedV8.bed ### TODO # Add GTEx to Gene Sorter (2016-08-18 kate) # See hg/near/makeNear.doc ############################################################################# # GTEx V8 cis-eQTLs CAVIAR High Confidence (Sept 2021) Matt cd /hive/data/genomes/hg38/bed/gtex/V8/eQtl/finemap_CAVIAR # Tar files were downloaded from https://gtexportal.org/home/datasets#filesetFilesDiv15 # This file was used for this track: wget https://storage.googleapis.com/gtex_analysis_v8/single_tissue_qtl_data/GTEx_v8_finemapping_CAVIAR.tar # Then unpacked +# Other files used: -# Used this file: CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz + +# Initially planned to use this file: # CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz +# as it seemed to be a filtered subset of eQTLs # Description from GTEx_v8_finemapping_CAVIAR/README.txt -***CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz --> is a single file for all GTEx tissues and all eGene where we report -all the high causal variants (variants that have posterior probability of > 0.1). -# Started with this as it seems this is the data Kate used for hg19 eQTL tracks +# ***CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz --> is a single file for all GTEx tissues and all eGene where we report +# all the high causal variants (variants that have posterior probability of > 0.1). -# Sample line from file +# Sample header line: # TISSUE GENE eQTL CHROM POS Probability # Brain_Caudate_basal_ganglia ENSG00000248485.1 1_161274374 1 161274374 0.157456 +# However, the names/positions in the eQTL column are not unique meaning +# We want file with unique variant/eQTL names that match those in the GTEx +# variant mapping file: GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz +# Looks like the CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF_with_Allele.txt.gz has names +# that match the names in the variant/eQTL mapping file + +# Description from GTEx_v8_finemapping_CAVIAR/README.txt +# ***CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF_with_Allele.txt.gz —> is a single file for all GTEx tissues and all eGenes where we reported +# the CPP (Causal Posterior Probability). Each eQTL contains the allele information. Sample header file: + +# TISSUE GENE eQTL CHROM POS Probability +# Brain_Caudate_basal_ganglia ENSG00000248485.1 chr1_161274374_G_A_b38 1 161274374 0.157456 + +# So, take NOCUTOFF_with_Allele file and then filter for probability >0.1 (which is how HighConfidentVariants file was created according to GTEx README) +zcat CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF_with_Allele.txt.gz | awk '$6 > 0.1' | gzip -c > CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz + +# Confirm that it has the same size as original HighConfidentVariants file +zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz | wc -l +1257158 +zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz | wc -l +1257158 + # There seem to 31 duplicate eQTLs, meaning that the two lines for these entries are duplicated (probability values, etc.) # These will essentially be collapsed into a single entry by the buildInteract script -zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz | sort | uniq -c | sort -k1,1nr | awk '$1 > 1' > dupe_eQtls.txt +zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz | sort | uniq -c | sort -k1,1nr | awk '$1 > 1' > dupe_eQtls.txt wc -l dupe_eQtls.txt 31 dupe_eQtls.txt # Wrote buildInteract script to help build interact-format tracks from CAVIAR files: cat << '_EOF_' > buildInteract #!/usr/bin/env python3 import sys, gzip qtlFile = sys.argv[1] gpFile = sys.argv[2] alleleFile = sys.argv[3] +qtlIdCol = sys.argv[4] # Column name in file header, e.g. eQTL +probCol = sys.argv[5] # Column name in file header, e.g. Probability # Open up all of our files qfh = gzip.open(qtlFile,"r") # QTL file gfh = open(gpFile,"r") # genePred file afh = gzip.open(alleleFile,"r") # allele file # Set up dicts for each qtlDict = dict() gpDict = dict() alleleDict = dict() +# Get indices of eQTL and probability fields +header = qfh.readline().decode('ASCII').rstrip().split("\t") +qtl_col = header.index(qtlIdCol) +prob_col = header.index(probCol) + # Process eQTL file for line in qfh: qtlLine = line.decode('ASCII').strip().split("\t") # Skip header line in file if "TISSUE" not in qtlLine: tissue = qtlLine[0] gene = qtlLine[1] - e_qtl = qtlLine[2] + e_qtl = qtlLine[qtl_col] # each of these elements on their own aren't unique, but together they are qtlKey = e_qtl + "|" + tissue + "|" + gene # Put these into a dict where key is unique name we've made qtlDict[qtlKey] = qtlLine # Process genePred file for line in gfh: gpLine = line.strip().split("\t") gene_id = gpLine[0] # Put into dict with key on ENSG* ID gpDict[gene_id] = gpLine # Process allele file from GTEx for line in afh: # gzip file, so we need to decode to asii first aLine = line.decode('ASCII').strip().split("\t") allele = aLine[0] - chrom = aLine[1].replace("chr","") - pos = aLine[2] - # Create key from combination of chrom + position - # In theory this should match the keys from the eQTL dict - alleleDict[chrom + "_" + pos] = aLine + alleleDict[allele] = aLine #7 is rsID for entry in qtlDict: # Create list to store elements that will become a single interact line # Interact format: http://genome.ucsc.edu/goldenPath/help/interact.html # Get gene information from the genePred dict # eQTL file/dict has ENSG id in it gene_id = qtlDict[entry][1] if gene_id in gpDict.keys(): gene_sym = gpDict[gene_id][11] gene_chrom = gpDict[gene_id][1] gene_strand = gpDict[gene_id][2] gene_start = int(gpDict[gene_id][3]) gene_end = int(gpDict[gene_id][4]) # Check if eQTL is in allele/eQTL dictionary so that we can grab the rsID var_pos = qtlDict[entry][2] if var_pos in alleleDict.keys(): rsID = alleleDict[var_pos][6] var_start = int(qtlDict[entry][4]) # Do some things to make sure start isn't ever greater than end for items # Deals with case where eQTL is internal to gene if var_start > gene_start and var_start < gene_end: chromStart = gene_start chromEnd = gene_end # Deals with case where eQTL is downstream elif var_start > gene_end: chromStart = gene_start chromEnd = var_start # Deals with case where eQTL is upstream elif var_start < gene_start: chromStart = var_start - 1 chromEnd = gene_end interactLine = list() interactLine.append("chr" + qtlDict[entry][3]) # chrom interactLine.append(chromStart) # chrStart interactLine.append(chromEnd) # chrEnd interactLine.append(rsID + "/" + gene_sym + "/" + qtlDict[entry][0]) # name interactLine.append(int("1000")) # Maybe I should scale up to 0-1000 range? # score - interactLine.append(qtlDict[entry][5]) # value + interactLine.append(qtlDict[entry][prob_col]) # value interactLine.append(qtlDict[entry][0]) # exp interactLine.append("0,0,0") #color - eventually replaced by 'addColor' script interactLine.append(interactLine[0]) # sourceChrom interactLine.append(var_start - 1) # sourceStart interactLine.append(var_start) # sourceEnd interactLine.append(rsID) # sourceName interactLine.append(".") # sourceStrand interactLine.append(gene_chrom) # targetChrom interactLine.append(gene_start) # targetStart interactLine.append(gene_end) # targetEnd interactLine.append(gene_sym) # targetEnd interactLine.append(gene_strand) # targetStrand # Print interact line to stdout, can redirect to a file to save it print(*interactLine, sep="\t") '_EOF_' # Script takes in eQTL, SNP info, and GENCODE genepred file # Uses this information to build an interact line for each item in the eQTL file # Need to convert GTF to genePredExt # (Kate had converted GTF to genePred for GTEx V8 expression track work, but that didn't include gene name as I was hoping) gtfToGenePred -genePredExt -geneNameAsName2 -includeVersion gencode.v26.GRCh38.genes.gtf gencode.v26.GRCh38.genes.gpExt # Build interact files and sort resulting bed file -./buildInteract CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz ../gencode.v26.GRCh38.genes.gpExt \ - ../GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz | \ +./buildInteract CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz ../gencode.v26.GRCh38.genes.gpExt \ + ../GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz eQTL Probability| \ bedSort stdin gtexCaviar.interact.sorted.txt ## Add colors # Make list of tissues in V8 file zcat GTEx_v8_finemapping_CAVIAR/CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF.txt.gz | cut -f1 -d$'\t' |sort -u |grep -v TISSUE> gtexTissuesV8.txt # Using GTEx V6p colors, manually match up to names in V8 file ln -s /hive/data/outside/GTEx/V6p/eQtl/Caviar2/gtexTissueColor.tab gtexTissueColor.v8.tab # Write addColors script to add colors from this file to the interact file: cat << '_EOF_' > addColors #### # Intially created interact file w/o colors # Wrote this quick script to automatically add them to the interact format #### #!/usr/bin/env python3 import sys interactFile = sys.argv[1] colorFile = sys.argv[2] # Read in color file from gtexTisseColor.v8.tab cfh = open(colorFile, "r") # Process this file and turn it into a dictionary colors = dict() for line in cfh: splitLine = line.strip().split("\t") # Keys in dictionary are tissue colors[splitLine[0]] = splitLine # Read in interact file ifh = open(interactFile, "r") for line in ifh: splitLine = line.strip().split("\t") # Extract tisue information from interact file tissue = splitLine[6] # If tissue is in color dict keys, then we're good if tissue in colors.keys(): color = colors[tissue][2] # Set color field equal to tissue color splitLine[7] = color # Output modified line to stdout, can save to file by redirecting stdout to a file print(*splitLine, sep="\t") '_EOF_' # Run script to add colors ./addColors gtexCaviar.interact.sorted.txt ../gtexTissueColor.v8.tab > gtexCaviar.interact.sorted.colors.txt # Create custom as file for this bigBed: cat << '_EOF_' > gtexInteract.as table interact "Interaction between an eQTL and a target gene" ( string chrom; "Chromosome (or contig, scaffold, etc.). For interchromosomal, use 2 records" uint chromStart; "Start position of eQTL." uint chromEnd; "End position of gene." string name; "Name of item for display, dbSNP rsID/Gene Symbol/Tissue." uint score; "Not used, always 1000." double cpp; "Causal Posterior Probability (CPP) that quantifies the probability of each variant to be causal." string tissue; "Tissue" string color; "Item color, based on GTEx Gene expression track colors." string eqtlChrom; "Chromosome of eQTL." uint eqtlStart; "Start position of eQTL." uint eqtlEnd; "End position of eQTL." string eqtlName; "eQTL name, which is a dbSNP identifier." string eqtlStrand; "Not applicable, always '.'" string geneChrom; "Chromosome of target gene." uint geneStart; "Start position of target gene." uint geneEnd; "End position of target gene." string geneName; "Gene symbol of target gene." string geneStrand; "Strand of target gene." ) '_EOF_' # Build bigInteract file with colors bedToBigBed -as=../gtexInteract.as -type=bed5+13 gtexCaviar.interact.sorted.colors.txt /hive/data/genomes/hg38/chrom.sizes gtexCaviar.interact.colors.bb