5e75c13af084da1dbfdfc81a248eaa00903819ed
mspeir
  Fri Sep 24 14:27:57 2021 -0700
Changes to makedoc for gtex based on CR. Added two scripts and as file contents to makrdoc. refs #27947 #28163

diff --git src/hg/makeDb/doc/hg38/gtex.txt src/hg/makeDb/doc/hg38/gtex.txt
index 218c75f..c777492 100644
--- src/hg/makeDb/doc/hg38/gtex.txt
+++ src/hg/makeDb/doc/hg38/gtex.txt
@@ -132,62 +132,236 @@
 # load up
 set lib = ~/kent/src/hg/lib
 hgLoadBed hg38 -noBin -tab -type=bed6+4 \
         -as=$lib/gtexGeneBed.as -sqlTable=$lib/gtexGeneBed.sql -renameSqlTable \
                 gtexGeneV8 gtexGeneBedV8.bed
 #Read 56200 elements of size 10 from gtexGeneBedV8.bed
 
 
 ### TODO 
 # Add GTEx to Gene Sorter (2016-08-18 kate)
 # See hg/near/makeNear.doc
 
 #############################################################################
 # GTEx V8 cis-eQTLs CAVIAR High Confidence (Sept 2021) Matt
 
+cd /hive/data/genomes/hg38/bed/gtex/V8/eQtl/finemap_CAVIAR
+
 # Tar files were downloaded from https://gtexportal.org/home/datasets#filesetFilesDiv15
+# This file was used for this track:
+wget https://storage.googleapis.com/gtex_analysis_v8/single_tissue_qtl_data/GTEx_v8_finemapping_CAVIAR.tar
 # Then unpacked
 
 # Used this file: CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz
 # Description from GTEx_v8_finemapping_CAVIAR/README.txt
 ***CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz --> is a single file for all GTEx tissues and all eGene where we report
 all the high causal variants (variants that have posterior probability of > 0.1).
 # Started with this as it seems this is the data Kate used for hg19 eQTL tracks
 
 # Sample line from file
 #TISSUE  GENE    eQTL    CHROM   POS     Probability
 #Brain_Caudate_basal_ganglia     ENSG00000248485.1       1_161274374     1       161274374       0.157456
 
-# Wrote script to help build interact-format tracks from CAVIAR files:
-buildInteract
+# There seem to 31 duplicate eQTLs, meaning that the two lines for these entries are duplicated (probability values, etc.)
+# These will essentially be collapsed into a single entry by the buildInteract script
+zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz | sort | uniq -c | sort -k1,1nr | awk '$1 > 1' > dupe_eQtls.txt
+wc -l dupe_eQtls.txt
+31 dupe_eQtls.txt
+
+# Wrote buildInteract script to help build interact-format tracks from CAVIAR files:
+cat << '_EOF_' > buildInteract
+#!/usr/bin/env python3
+
+import sys, gzip
+
+qtlFile = sys.argv[1]
+gpFile = sys.argv[2]
+alleleFile = sys.argv[3]
+
+# Open up all of our files
+qfh = gzip.open(qtlFile,"r") # QTL file
+gfh = open(gpFile,"r") # genePred file
+afh = gzip.open(alleleFile,"r") # allele file
+
+# Set up dicts for each
+qtlDict = dict()
+gpDict = dict()
+alleleDict = dict()
+
+# Process eQTL file
+for line in qfh:
+    qtlLine = line.decode('ASCII').strip().split("\t")
+    # Skip header line in file
+    if "TISSUE" not in qtlLine:
+        tissue = qtlLine[0]
+        gene = qtlLine[1]
+        e_qtl = qtlLine[2]
+        # each of these elements on their own aren't unique, but together they are
+        qtlKey = e_qtl + "|" + tissue + "|" + gene
+        # Put these into a dict where key is unique name we've made
+        qtlDict[qtlKey] = qtlLine
+
+# Process genePred file
+for line in gfh:
+    gpLine = line.strip().split("\t")
+    gene_id = gpLine[0]
+    # Put into dict with key on ENSG* ID
+    gpDict[gene_id] = gpLine
+
+# Process allele file from GTEx
+for line in afh:
+    # gzip file, so we need to decode to asii first
+    aLine = line.decode('ASCII').strip().split("\t")
+
+    allele = aLine[0]
+    chrom = aLine[1].replace("chr","")
+    pos = aLine[2]
+    # Create key from combination of chrom + position
+    # In theory this should match the keys from the eQTL dict
+    alleleDict[chrom + "_" + pos] = aLine
+    #7 is rsID
+
+for entry in qtlDict:
+    # Create list to store elements that will become a single interact line
+    # Interact format: http://genome.ucsc.edu/goldenPath/help/interact.html
+
+    # Get gene information from the genePred dict
+    # eQTL file/dict has ENSG id in it
+    gene_id = qtlDict[entry][1]
+    if gene_id in gpDict.keys():
+        gene_sym = gpDict[gene_id][11]
+        gene_chrom = gpDict[gene_id][1]
+        gene_strand = gpDict[gene_id][2]
+        gene_start = int(gpDict[gene_id][3])
+        gene_end = int(gpDict[gene_id][4])
+    # Check if eQTL is in allele/eQTL dictionary so that we can grab the rsID
+    var_pos = qtlDict[entry][2]
+    if var_pos in alleleDict.keys():
+        rsID = alleleDict[var_pos][6]
+    var_start = int(qtlDict[entry][4])
+
+    # Do some things to make sure start isn't ever greater than end for items
+    # Deals with case where eQTL is internal to gene
+    if var_start > gene_start and var_start < gene_end:
+        chromStart = gene_start
+        chromEnd = gene_end
+    # Deals with case where eQTL is downstream
+    elif var_start > gene_end:
+        chromStart = gene_start
+        chromEnd = var_start
+    # Deals with case where eQTL is upstream
+    elif var_start < gene_start:
+        chromStart = var_start - 1
+        chromEnd = gene_end
+
+    interactLine = list()
+    interactLine.append("chr" + qtlDict[entry][3]) # chrom
+    interactLine.append(chromStart) # chrStart
+    interactLine.append(chromEnd) # chrEnd
+    interactLine.append(rsID + "/" + gene_sym + "/" + qtlDict[entry][0]) # name
+    interactLine.append(int("1000")) # Maybe I should scale up to 0-1000 range? # score
+    interactLine.append(qtlDict[entry][5]) # value
+    interactLine.append(qtlDict[entry][0]) # exp
+    interactLine.append("0,0,0") #color - eventually replaced by 'addColor' script
+    interactLine.append(interactLine[0]) # sourceChrom
+    interactLine.append(var_start - 1) # sourceStart
+    interactLine.append(var_start) # sourceEnd
+    interactLine.append(rsID) # sourceName
+    interactLine.append(".") # sourceStrand
+    interactLine.append(gene_chrom) # targetChrom
+    interactLine.append(gene_start) # targetStart
+    interactLine.append(gene_end) # targetEnd
+    interactLine.append(gene_sym) # targetEnd
+    interactLine.append(gene_strand) # targetStrand
+
+    # Print interact line to stdout, can redirect to a file to save it
+    print(*interactLine, sep="\t")
+'_EOF_'
+
 # Script takes in eQTL, SNP info, and GENCODE genepred file
 # Uses this information to build an interact line for each item in the eQTL file
-# Not sure if script is that generalizable since each eQTL file seems to have its own format
-# Will see if I can do it though
 
 # Need to convert GTF to genePredExt
 # (Kate had converted GTF to genePred for GTEx V8 expression track work, but that didn't include gene name as I was hoping)
 gtfToGenePred -genePredExt -geneNameAsName2 -includeVersion gencode.v26.GRCh38.genes.gtf gencode.v26.GRCh38.genes.gpExt 
 
-# Command to build interact files
-./buildInteract CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz ../gencode.v26.GRCh38.genes.gpExt ../GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz > gtexCaviar.interact.txt
-
-# Sort resulting bed file
-bedSort gtexCaviar.interact.txt gtexCaviar.interact.sorted.txt 
-
-# Build bigInteract
-bedToBigBed -as=../interact.as -type=bed5+13 gtexCaviar.interact.sorted.txt /hive/data/genomes/hg38/chrom.sizes gtexCaviar.interact.bb
+# Build interact files and sort resulting bed file
+./buildInteract CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz ../gencode.v26.GRCh38.genes.gpExt \
+  ../GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz | \
+    bedSort stdin gtexCaviar.interact.sorted.txt
 
 ## Add colors
 # Make list of tissues in V8 file
 zcat GTEx_v8_finemapping_CAVIAR/CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF.txt.gz | cut -f1 -d$'\t' |sort -u |grep -v TISSUE> gtexTissuesV8.txt
 
 # Using GTEx V6p colors, manually match up to names in V8 file
 ln -s /hive/data/outside/GTEx/V6p/eQtl/Caviar2/gtexTissueColor.tab 
 gtexTissueColor.v8.tab 
 
-# Write script to add colors from this file to the interact file
-addColors
+# Write addColors script to add colors from this file to the interact file:
+cat << '_EOF_' > addColors
+####
+# Intially created interact file w/o colors
+# Wrote this quick script to automatically add them to the interact format
+####
+#!/usr/bin/env python3
+
+import sys
+
+interactFile = sys.argv[1]
+colorFile = sys.argv[2]
+
+# Read in color file from gtexTisseColor.v8.tab
+cfh = open(colorFile, "r")
+# Process this file and turn it into a dictionary
+colors = dict()
+for line in cfh:
+    splitLine = line.strip().split("\t")
+    # Keys in dictionary are tissue
+    colors[splitLine[0]] = splitLine
+
+# Read in interact file
+ifh = open(interactFile, "r")
+for line in ifh:
+    splitLine = line.strip().split("\t")
+    # Extract tisue information from interact file
+    tissue = splitLine[6]
+    # If tissue is in color dict keys, then we're good
+    if tissue in colors.keys():
+        color = colors[tissue][2]
+        # Set color field equal to tissue color
+        splitLine[7] = color
+    # Output modified line to stdout, can save to file by redirecting stdout to a file
+    print(*splitLine, sep="\t")
+'_EOF_'
+
+# Run script to add colors
 ./addColors gtexCaviar.interact.sorted.txt ../gtexTissueColor.v8.tab > gtexCaviar.interact.sorted.colors.txt
 
-# Rebuild bigInteract file
-bedToBigBed -as=../interact.as -type=bed5+13 gtexCaviar.interact.sorted.colors.txt /hive/data/genomes/hg38/chrom.sizes gtexCaviar.interact.colors.bb
+# Create custom as file for this bigBed:
+cat << '_EOF_' > gtexInteract.as
+table interact
+"Interaction between an eQTL and a target gene"
+    (
+        string chrom;       "Chromosome (or contig, scaffold, etc.). For interchromosomal, use 2 records"
+        uint chromStart;    "Start position of eQTL."
+        uint chromEnd;      "End position of gene."
+        string name;        "Name of item for display, dbSNP rsID/Gene Symbol/Tissue."
+        uint score;         "Not used, always 1000."
+        double cpp;         "Causal Posterior Probability (CPP) that quantifies the probability of each variant to be causal."
+        string tissue;      "Tissue"
+        string color;       "Item color, based on GTEx Gene expression track colors."
+        string eqtlChrom;   "Chromosome of eQTL."
+        uint eqtlStart;     "Start position of eQTL."
+        uint eqtlEnd;       "End position of eQTL."
+        string eqtlName;    "eQTL name, which is a dbSNP identifier."
+        string eqtlStrand;  "Not applicable, always '.'"
+        string geneChrom;   "Chromosome of target gene."
+        uint geneStart;     "Start position of target gene."
+        uint geneEnd;       "End position of target gene."
+        string geneName;    "Gene symbol of target gene."
+        string geneStrand;  "Strand of target gene."
+        )
+'_EOF_'
+
+# Build bigInteract file with colors
+bedToBigBed -as=../gtexInteract.as -type=bed5+13 gtexCaviar.interact.sorted.colors.txt /hive/data/genomes/hg38/chrom.sizes gtexCaviar.interact.colors.bb