src/hg/makeDb/doc/hg38/gtex.txt 8979882863a20beb00f7c72cfc4263a1e5a9863b

8979882863a20beb00f7c72cfc4263a1e5a9863b
mspeir
  Fri Oct 15 15:28:44 2021 -0700
Updating gtex README with latest version of script and other details about latest version of the track. Also tweaking track description page to be more accurate, refs #27947

diff --git src/hg/makeDb/doc/hg38/gtex.txt src/hg/makeDb/doc/hg38/gtex.txt
index c777492..a9094e0 100644
--- src/hg/makeDb/doc/hg38/gtex.txt
+++ src/hg/makeDb/doc/hg38/gtex.txt
@@ -138,98 +138,125 @@
 
 
 ### TODO 
 # Add GTEx to Gene Sorter (2016-08-18 kate)
 # See hg/near/makeNear.doc
 
 #############################################################################
 # GTEx V8 cis-eQTLs CAVIAR High Confidence (Sept 2021) Matt
 
 cd /hive/data/genomes/hg38/bed/gtex/V8/eQtl/finemap_CAVIAR
 
 # Tar files were downloaded from https://gtexportal.org/home/datasets#filesetFilesDiv15
 # This file was used for this track:
 wget https://storage.googleapis.com/gtex_analysis_v8/single_tissue_qtl_data/GTEx_v8_finemapping_CAVIAR.tar
 # Then unpacked
+# Other files used:
 
-# Used this file: CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz
+
+# Initially planned to use this file: # CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz
+# as it seemed to be a filtered subset of eQTLs
 # Description from GTEx_v8_finemapping_CAVIAR/README.txt
-***CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz --> is a single file for all GTEx tissues and all eGene where we report
-all the high causal variants (variants that have posterior probability of > 0.1).
-# Started with this as it seems this is the data Kate used for hg19 eQTL tracks
+# ***CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz --> is a single file for all GTEx tissues and all eGene where we report
+# all the high causal variants (variants that have posterior probability of > 0.1).
 
-# Sample line from file
+# Sample header line:
 # TISSUE  GENE    eQTL    CHROM   POS     Probability
 # Brain_Caudate_basal_ganglia     ENSG00000248485.1       1_161274374     1       161274374       0.157456
 
+# However, the names/positions in the eQTL column are not unique meaning
+# We want file with unique variant/eQTL names that match those in the GTEx
+# variant mapping file: GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz
+# Looks like the CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF_with_Allele.txt.gz has names 
+# that match the names in the variant/eQTL mapping file
+
+# Description from GTEx_v8_finemapping_CAVIAR/README.txt
+# ***CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF_with_Allele.txt.gz  —> is a single file for all GTEx tissues and all eGenes where we reported
+# the CPP (Causal Posterior Probability). Each eQTL contains the allele information. Sample header file:
+
+# TISSUE  GENE    eQTL    CHROM   POS Probability
+# Brain_Caudate_basal_ganglia ENSG00000248485.1   chr1_161274374_G_A_b38  1   161274374   0.157456
+
+# So, take NOCUTOFF_with_Allele file and then filter for probability >0.1 (which is how HighConfidentVariants file was created according to GTEx README)
+zcat CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF_with_Allele.txt.gz | awk '$6 > 0.1' | gzip -c > CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz
+
+# Confirm that it has the same size as original HighConfidentVariants file
+zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz | wc -l
+1257158
+zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz | wc -l
+1257158
+
 # There seem to 31 duplicate eQTLs, meaning that the two lines for these entries are duplicated (probability values, etc.)
 # These will essentially be collapsed into a single entry by the buildInteract script
-zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz | sort | uniq -c | sort -k1,1nr | awk '$1 > 1' > dupe_eQtls.txt
+zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz | sort | uniq -c | sort -k1,1nr | awk '$1 > 1' > dupe_eQtls.txt
 wc -l dupe_eQtls.txt
 31 dupe_eQtls.txt
 
 # Wrote buildInteract script to help build interact-format tracks from CAVIAR files:
 cat << '_EOF_' > buildInteract
 #!/usr/bin/env python3
 
 import sys, gzip
 
 qtlFile = sys.argv[1]
 gpFile = sys.argv[2]
 alleleFile = sys.argv[3]
+qtlIdCol = sys.argv[4] # Column name in file header, e.g. eQTL
+probCol = sys.argv[5] # Column name in file header, e.g. Probability
 
 # Open up all of our files
 qfh = gzip.open(qtlFile,"r") # QTL file
 gfh = open(gpFile,"r") # genePred file
 afh = gzip.open(alleleFile,"r") # allele file
 
 # Set up dicts for each
 qtlDict = dict()
 gpDict = dict()
 alleleDict = dict()
 
+# Get indices of eQTL and probability fields
+header = qfh.readline().decode('ASCII').rstrip().split("\t")
+qtl_col = header.index(qtlIdCol)
+prob_col = header.index(probCol)
+
 # Process eQTL file
 for line in qfh:
     qtlLine = line.decode('ASCII').strip().split("\t")
     # Skip header line in file
     if "TISSUE" not in qtlLine:
         tissue = qtlLine[0]
         gene = qtlLine[1]
-        e_qtl = qtlLine[2]
+        e_qtl = qtlLine[qtl_col]
         # each of these elements on their own aren't unique, but together they are
         qtlKey = e_qtl + "|" + tissue + "|" + gene
         # Put these into a dict where key is unique name we've made
         qtlDict[qtlKey] = qtlLine
 
 # Process genePred file
 for line in gfh:
     gpLine = line.strip().split("\t")
     gene_id = gpLine[0]
     # Put into dict with key on ENSG* ID
     gpDict[gene_id] = gpLine
 
 # Process allele file from GTEx
 for line in afh:
     # gzip file, so we need to decode to asii first
     aLine = line.decode('ASCII').strip().split("\t")
 
     allele = aLine[0]
-    chrom = aLine[1].replace("chr","")
-    pos = aLine[2]
-    # Create key from combination of chrom + position
-    # In theory this should match the keys from the eQTL dict
-    alleleDict[chrom + "_" + pos] = aLine
+    alleleDict[allele] = aLine
     #7 is rsID
 
 for entry in qtlDict:
     # Create list to store elements that will become a single interact line
     # Interact format: http://genome.ucsc.edu/goldenPath/help/interact.html
 
     # Get gene information from the genePred dict
     # eQTL file/dict has ENSG id in it
     gene_id = qtlDict[entry][1]
     if gene_id in gpDict.keys():
         gene_sym = gpDict[gene_id][11]
         gene_chrom = gpDict[gene_id][1]
         gene_strand = gpDict[gene_id][2]
         gene_start = int(gpDict[gene_id][3])
         gene_end = int(gpDict[gene_id][4])
@@ -247,58 +274,58 @@
     # Deals with case where eQTL is downstream
     elif var_start > gene_end:
         chromStart = gene_start
         chromEnd = var_start
     # Deals with case where eQTL is upstream
     elif var_start < gene_start:
         chromStart = var_start - 1
         chromEnd = gene_end
 
     interactLine = list()
     interactLine.append("chr" + qtlDict[entry][3]) # chrom
     interactLine.append(chromStart) # chrStart
     interactLine.append(chromEnd) # chrEnd
     interactLine.append(rsID + "/" + gene_sym + "/" + qtlDict[entry][0]) # name
     interactLine.append(int("1000")) # Maybe I should scale up to 0-1000 range? # score
-    interactLine.append(qtlDict[entry][5]) # value
+    interactLine.append(qtlDict[entry][prob_col]) # value
     interactLine.append(qtlDict[entry][0]) # exp
     interactLine.append("0,0,0") #color - eventually replaced by 'addColor' script
     interactLine.append(interactLine[0]) # sourceChrom
     interactLine.append(var_start - 1) # sourceStart
     interactLine.append(var_start) # sourceEnd
     interactLine.append(rsID) # sourceName
     interactLine.append(".") # sourceStrand
     interactLine.append(gene_chrom) # targetChrom
     interactLine.append(gene_start) # targetStart
     interactLine.append(gene_end) # targetEnd
     interactLine.append(gene_sym) # targetEnd
     interactLine.append(gene_strand) # targetStrand
 
     # Print interact line to stdout, can redirect to a file to save it
     print(*interactLine, sep="\t")
 '_EOF_'
 
 # Script takes in eQTL, SNP info, and GENCODE genepred file
 # Uses this information to build an interact line for each item in the eQTL file
 
 # Need to convert GTF to genePredExt
 # (Kate had converted GTF to genePred for GTEx V8 expression track work, but that didn't include gene name as I was hoping)
 gtfToGenePred -genePredExt -geneNameAsName2 -includeVersion gencode.v26.GRCh38.genes.gtf gencode.v26.GRCh38.genes.gpExt 
 
 # Build interact files and sort resulting bed file
-./buildInteract CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz ../gencode.v26.GRCh38.genes.gpExt \
-  ../GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz | \
+./buildInteract CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz ../gencode.v26.GRCh38.genes.gpExt \
+  ../GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz eQTL Probability| \
     bedSort stdin gtexCaviar.interact.sorted.txt
 
 ## Add colors
 # Make list of tissues in V8 file
 zcat GTEx_v8_finemapping_CAVIAR/CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF.txt.gz | cut -f1 -d$'\t' |sort -u |grep -v TISSUE> gtexTissuesV8.txt
 
 # Using GTEx V6p colors, manually match up to names in V8 file
 ln -s /hive/data/outside/GTEx/V6p/eQtl/Caviar2/gtexTissueColor.tab 
 gtexTissueColor.v8.tab 
 
 # Write addColors script to add colors from this file to the interact file:
 cat << '_EOF_' > addColors
 ####
 # Intially created interact file w/o colors
 # Wrote this quick script to automatically add them to the interact format