557e23cb23716c1e543903446c03bb3ed7a5a195
mspeir
  Fri Oct 15 15:31:42 2021 -0700
adding details about other files used, refs #27947

diff --git src/hg/makeDb/doc/hg38/gtex.txt src/hg/makeDb/doc/hg38/gtex.txt
index a9094e0..504f861 100644
--- src/hg/makeDb/doc/hg38/gtex.txt
+++ src/hg/makeDb/doc/hg38/gtex.txt
@@ -1,394 +1,397 @@
 #############################################################################
 # GTEx V6 (October 2015) Kate
 # Create BED from hgFixed tables (see doc/gtex)
 # Reloading during QA of track (fixing gene classes, adding scores).  (March 2016) Kate
 
 cd /hive/data/outside/gtex/V6
 
 # see doc/hg19.txt for how this genePred was made
 set chain = /hive/data/genomes/hg19/bed/liftOver/hg19ToHg38.over.chain.gz
 liftOver -genePred gencodeV19.hg19.genePred $chain gtexGeneModelV6.hg38.genePred \
                 gencode.V19.hg38.unmapped
 # 926 unmapped
 hgLoadGenePred hg38 gtexGeneModelV6 gtexGeneModelV6.hg38.genePred
 
 # OLD: creates gtexGeneModelV6.hg38.genePred
 # OLD: NOTE: drops 192 transcripts.  One I spot-checked indeed didn't exist in our hg38 genes
 
 cd /hive/data/genomes/hg38/bed
 mkdir -p gtex
 cd gtex
 
 # table renamed to gtexGeneModel later
 
 # Use latest GENCODE attrs file to get biotypes
 
 # create bed table
 ~/kent/src/hg/makeDb/outside/hgGtexGeneBed/hgGtexGeneBed  hg38 -gtexVersion=V6 \
         -noLoad -gencodeVersion=$gencodeVersion gtexGeneBedV6 -verbose=2 >&! makeGeneBed.log
 
 # 45 genes not found in GENCODE attributes table
 
 #Max score: 219385.906250
 
 wc -l *.tab
 wc -l *.tab
 52896 gtexGeneBedV6.tab
 # 55810 gtexGeneBedV6.tab
 
 # exploratory for assigning score based on sum of scores (expScores field)
 set bedScore = ~/kent/src/utils/bedScore/bedScore
 $bedScore -col=10 -minScore=1 -method=std2 gtexGeneBedV6.tab gtexGeneBedV6.std2.bed
 $bedScore -col=10 -minScore=1 -method=encode gtexGeneBedV6.tab gtexGeneBedV6.encode.bed
 $bedScore -col=10 -minScore=1 -method=reg gtexGeneBedV6.tab gtexGeneBedV6.reg.bed
 $bedScore -col=10 -minScore=1 -method=asinh gtexGeneBedV6.tab gtexGeneBedV6.asinh.bed
 
 $bedScore -col=10 -minScore=1 -method=reg -log gtexGeneBedV6.tab gtexGeneBedV6.reg.log.bed
 $bedScore -log -col=10 -minScore=1 -method=std2 gtexGeneBedV6.tab gtexGeneBedV6.std2.log.bed
 $bedScore -log -col=10 -minScore=1 -method=encode gtexGeneBedV6.tab gtexGeneBedV6.encode.log.bed
 $bedScore -log -col=10 -minScore=1 -method=asinh gtexGeneBedV6.tab gtexGeneBedV6.asinh.log.bed
 
 # Using -log -method=encode, as this is closest to density plot of all scores
 # as here: GtexTotalExpressionDensity.png, GtexTotalExpressionFrequency.png
 textHistogram -real -autoScale=14 -log -col=5 gtexGeneBedV6.encode.log.bed
 1.000000 ************************************************************ 22889
 72.357214 *************************************************** 4862
 143.714428 ************************************************** 4199
 215.071643 ************************************************* 3931
 286.428857 ************************************************** 4329
 357.786071 *************************************************** 5419
 429.143285 ************************************************** 4472
 500.500500 ********************************************* 1953
 571.857714 ************************************** 564
 643.214928 ******************************** 200
 714.572142 ************************* 61
 785.929356 *********** 6
 857.286571 ******** 4
 928.643785 ************ 7
 
 $bedScore -col=10 -minScore=0 -log -method=encode gtexGeneBedV6.tab gtexGeneBedV6.bed
 
 
 # load up
 set lib = ~/kent/src/hg/lib
 
 hgLoadBed hg38 -noBin -tab -type=bed6+4 \
         -as=$lib/gtexGeneBed.as -sqlTable=$lib/gtexGeneBed.sql -renameSqlTable \
                 gtexGeneBedNewV6 gtexGeneBedV6.bed
 
 # Add GTEx to Gene Sorter (2016-08-18 kate)
 # See hg/near/makeNear.doc
 
 #############################################################################
 # GTEx V8 (Apr 2020) Kate
 # Create BED from hgFixed tables (see doc/gtex)
  
 # Load gene models (Gencode V26 transcript union from GTEx)
 
 cd /hive/data/outside/gtex/V8/rnaSeq
 gtfToGenePred gencode.v26.GRCh38.genes.gtf gencodeV26.hg38.genePred \
                 -infoOut=gtexGeneModelInfoV8.tab
 hgLoadGenePred hg38 gtexGeneModelV8 gencodeV26.hg38.genePred
 
 # Get transcript for each gene (why ?)
 tail -n +2 gtexGeneModelInfoV8.tab | awk '{printf("%s\t%s\n", $1, $9)}' > gtexGeneTranscriptsV8.tab
 #hgLoadSqlTab hgFixed gtexTranscriptV8 ~/kent/src/hg/lib/gtexTranscript.sql gtexGeneTranscriptsV8.tab
 # no schema (or table on hgwdev.hgFixed)
 
 # Load BED table
 cd /hive/data/genomes/hg38/bed/gtex
 mkdir V8
 cd V8
 
 set gencode = V26
 ~/kent/src/hg/makeDb/outside/hgGtexGeneBed/hgGtexGeneBed \
         hg38 -noLoad -gtexVersion=V8 -gencodeVersion=$gencode gtexGeneV8 -verbose=2 >&! log.txt &
 
 Reading wgEncodeGencodeAttrs table
 Reading gtexGeneModelV8 table
 Reading gtexTissueMedian table
 Writing tab file gtexGeneV8
 Max score: 267400.000000
 
 # Add scores
 set bedScore = ~/kent/src/utils/bedScore/bedScore
 $bedScore -col=10 -minScore=0 -log -method=encode gtexGeneV8.tab gtexGeneBedV8.bed
 textHistogram -real -autoScale=14 -log -col=5 gtexGeneBedV8.bed
 0.000000 ************************************************************ 21287
 71.428643 **************************************************** 5635
 142.857286 **************************************************** 5513
 214.285929 *************************************************** 4683
 285.714571 *************************************************** 4480
 357.143214 *************************************************** 4748
 428.571857 **************************************************** 5466
 500.000500 ************************************************ 3117
 571.429143 ***************************************** 911
 642.857786 ********************************* 252
 714.286429 ************************** 81
 785.715071 ************** 11
 857.143714 ******** 4
 928.572357 *************** 12
 
 # load up
 set lib = ~/kent/src/hg/lib
 hgLoadBed hg38 -noBin -tab -type=bed6+4 \
         -as=$lib/gtexGeneBed.as -sqlTable=$lib/gtexGeneBed.sql -renameSqlTable \
                 gtexGeneV8 gtexGeneBedV8.bed
 #Read 56200 elements of size 10 from gtexGeneBedV8.bed
 
 
 ### TODO 
 # Add GTEx to Gene Sorter (2016-08-18 kate)
 # See hg/near/makeNear.doc
 
 #############################################################################
 # GTEx V8 cis-eQTLs CAVIAR High Confidence (Sept 2021) Matt
 
 cd /hive/data/genomes/hg38/bed/gtex/V8/eQtl/finemap_CAVIAR
 
 # Tar files were downloaded from https://gtexportal.org/home/datasets#filesetFilesDiv15
 # This file was used for this track:
 wget https://storage.googleapis.com/gtex_analysis_v8/single_tissue_qtl_data/GTEx_v8_finemapping_CAVIAR.tar
 # Then unpacked
 # Other files used:
-
+# Lookup table for all variants genotyped in GTEx
+wget https://storage.googleapis.com/gtex_analysis_v8/reference/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz
+# Gene-level model based on the GENCODE 26 transcript model, where isoforms were collapsed to a single transcript per gene.
+wget https://storage.googleapis.com/gtex_analysis_v8/reference/gencode.v26.GRCh38.genes.gtf
 
 # Initially planned to use this file: # CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz
 # as it seemed to be a filtered subset of eQTLs
 # Description from GTEx_v8_finemapping_CAVIAR/README.txt
 # ***CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz --> is a single file for all GTEx tissues and all eGene where we report
 # all the high causal variants (variants that have posterior probability of > 0.1).
 
 # Sample header line:
 # TISSUE  GENE    eQTL    CHROM   POS     Probability
 # Brain_Caudate_basal_ganglia     ENSG00000248485.1       1_161274374     1       161274374       0.157456
 
 # However, the names/positions in the eQTL column are not unique meaning
 # We want file with unique variant/eQTL names that match those in the GTEx
 # variant mapping file: GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz
 # Looks like the CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF_with_Allele.txt.gz has names 
 # that match the names in the variant/eQTL mapping file
 
 # Description from GTEx_v8_finemapping_CAVIAR/README.txt
 # ***CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF_with_Allele.txt.gz  —> is a single file for all GTEx tissues and all eGenes where we reported
 # the CPP (Causal Posterior Probability). Each eQTL contains the allele information. Sample header file:
 
 # TISSUE  GENE    eQTL    CHROM   POS Probability
 # Brain_Caudate_basal_ganglia ENSG00000248485.1   chr1_161274374_G_A_b38  1   161274374   0.157456
 
 # So, take NOCUTOFF_with_Allele file and then filter for probability >0.1 (which is how HighConfidentVariants file was created according to GTEx README)
 zcat CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF_with_Allele.txt.gz | awk '$6 > 0.1' | gzip -c > CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz
 
 # Confirm that it has the same size as original HighConfidentVariants file
 zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz | wc -l
 1257158
 zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz | wc -l
 1257158
 
 # There seem to 31 duplicate eQTLs, meaning that the two lines for these entries are duplicated (probability values, etc.)
 # These will essentially be collapsed into a single entry by the buildInteract script
 zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz | sort | uniq -c | sort -k1,1nr | awk '$1 > 1' > dupe_eQtls.txt
 wc -l dupe_eQtls.txt
 31 dupe_eQtls.txt
 
 # Wrote buildInteract script to help build interact-format tracks from CAVIAR files:
 cat << '_EOF_' > buildInteract
 #!/usr/bin/env python3
 
 import sys, gzip
 
 qtlFile = sys.argv[1]
 gpFile = sys.argv[2]
 alleleFile = sys.argv[3]
 qtlIdCol = sys.argv[4] # Column name in file header, e.g. eQTL
 probCol = sys.argv[5] # Column name in file header, e.g. Probability
 
 # Open up all of our files
 qfh = gzip.open(qtlFile,"r") # QTL file
 gfh = open(gpFile,"r") # genePred file
 afh = gzip.open(alleleFile,"r") # allele file
 
 # Set up dicts for each
 qtlDict = dict()
 gpDict = dict()
 alleleDict = dict()
 
 # Get indices of eQTL and probability fields
 header = qfh.readline().decode('ASCII').rstrip().split("\t")
 qtl_col = header.index(qtlIdCol)
 prob_col = header.index(probCol)
 
 # Process eQTL file
 for line in qfh:
     qtlLine = line.decode('ASCII').strip().split("\t")
     # Skip header line in file
     if "TISSUE" not in qtlLine:
         tissue = qtlLine[0]
         gene = qtlLine[1]
         e_qtl = qtlLine[qtl_col]
         # each of these elements on their own aren't unique, but together they are
         qtlKey = e_qtl + "|" + tissue + "|" + gene
         # Put these into a dict where key is unique name we've made
         qtlDict[qtlKey] = qtlLine
 
 # Process genePred file
 for line in gfh:
     gpLine = line.strip().split("\t")
     gene_id = gpLine[0]
     # Put into dict with key on ENSG* ID
     gpDict[gene_id] = gpLine
 
 # Process allele file from GTEx
 for line in afh:
     # gzip file, so we need to decode to asii first
     aLine = line.decode('ASCII').strip().split("\t")
 
     allele = aLine[0]
     alleleDict[allele] = aLine
     #7 is rsID
 
 for entry in qtlDict:
     # Create list to store elements that will become a single interact line
     # Interact format: http://genome.ucsc.edu/goldenPath/help/interact.html
 
     # Get gene information from the genePred dict
     # eQTL file/dict has ENSG id in it
     gene_id = qtlDict[entry][1]
     if gene_id in gpDict.keys():
         gene_sym = gpDict[gene_id][11]
         gene_chrom = gpDict[gene_id][1]
         gene_strand = gpDict[gene_id][2]
         gene_start = int(gpDict[gene_id][3])
         gene_end = int(gpDict[gene_id][4])
     # Check if eQTL is in allele/eQTL dictionary so that we can grab the rsID
     var_pos = qtlDict[entry][2]
     if var_pos in alleleDict.keys():
         rsID = alleleDict[var_pos][6]
     var_start = int(qtlDict[entry][4])
 
     # Do some things to make sure start isn't ever greater than end for items
     # Deals with case where eQTL is internal to gene
     if var_start > gene_start and var_start < gene_end:
         chromStart = gene_start
         chromEnd = gene_end
     # Deals with case where eQTL is downstream
     elif var_start > gene_end:
         chromStart = gene_start
         chromEnd = var_start
     # Deals with case where eQTL is upstream
     elif var_start < gene_start:
         chromStart = var_start - 1
         chromEnd = gene_end
 
     interactLine = list()
     interactLine.append("chr" + qtlDict[entry][3]) # chrom
     interactLine.append(chromStart) # chrStart
     interactLine.append(chromEnd) # chrEnd
     interactLine.append(rsID + "/" + gene_sym + "/" + qtlDict[entry][0]) # name
     interactLine.append(int("1000")) # Maybe I should scale up to 0-1000 range? # score
     interactLine.append(qtlDict[entry][prob_col]) # value
     interactLine.append(qtlDict[entry][0]) # exp
     interactLine.append("0,0,0") #color - eventually replaced by 'addColor' script
     interactLine.append(interactLine[0]) # sourceChrom
     interactLine.append(var_start - 1) # sourceStart
     interactLine.append(var_start) # sourceEnd
     interactLine.append(rsID) # sourceName
     interactLine.append(".") # sourceStrand
     interactLine.append(gene_chrom) # targetChrom
     interactLine.append(gene_start) # targetStart
     interactLine.append(gene_end) # targetEnd
     interactLine.append(gene_sym) # targetEnd
     interactLine.append(gene_strand) # targetStrand
 
     # Print interact line to stdout, can redirect to a file to save it
     print(*interactLine, sep="\t")
 '_EOF_'
 
 # Script takes in eQTL, SNP info, and GENCODE genepred file
 # Uses this information to build an interact line for each item in the eQTL file
 
 # Need to convert GTF to genePredExt
 # (Kate had converted GTF to genePred for GTEx V8 expression track work, but that didn't include gene name as I was hoping)
 gtfToGenePred -genePredExt -geneNameAsName2 -includeVersion gencode.v26.GRCh38.genes.gtf gencode.v26.GRCh38.genes.gpExt 
 
 # Build interact files and sort resulting bed file
 ./buildInteract CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz ../gencode.v26.GRCh38.genes.gpExt \
   ../GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz eQTL Probability| \
     bedSort stdin gtexCaviar.interact.sorted.txt
 
 ## Add colors
 # Make list of tissues in V8 file
 zcat GTEx_v8_finemapping_CAVIAR/CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF.txt.gz | cut -f1 -d$'\t' |sort -u |grep -v TISSUE> gtexTissuesV8.txt
 
 # Using GTEx V6p colors, manually match up to names in V8 file
 ln -s /hive/data/outside/GTEx/V6p/eQtl/Caviar2/gtexTissueColor.tab 
 gtexTissueColor.v8.tab 
 
 # Write addColors script to add colors from this file to the interact file:
 cat << '_EOF_' > addColors
 ####
 # Intially created interact file w/o colors
 # Wrote this quick script to automatically add them to the interact format
 ####
 #!/usr/bin/env python3
 
 import sys
 
 interactFile = sys.argv[1]
 colorFile = sys.argv[2]
 
 # Read in color file from gtexTisseColor.v8.tab
 cfh = open(colorFile, "r")
 # Process this file and turn it into a dictionary
 colors = dict()
 for line in cfh:
     splitLine = line.strip().split("\t")
     # Keys in dictionary are tissue
     colors[splitLine[0]] = splitLine
 
 # Read in interact file
 ifh = open(interactFile, "r")
 for line in ifh:
     splitLine = line.strip().split("\t")
     # Extract tisue information from interact file
     tissue = splitLine[6]
     # If tissue is in color dict keys, then we're good
     if tissue in colors.keys():
         color = colors[tissue][2]
         # Set color field equal to tissue color
         splitLine[7] = color
     # Output modified line to stdout, can save to file by redirecting stdout to a file
     print(*splitLine, sep="\t")
 '_EOF_'
 
 # Run script to add colors
 ./addColors gtexCaviar.interact.sorted.txt ../gtexTissueColor.v8.tab > gtexCaviar.interact.sorted.colors.txt
 
 # Create custom as file for this bigBed:
 cat << '_EOF_' > gtexInteract.as
 table interact
 "Interaction between an eQTL and a target gene"
     (
         string chrom;       "Chromosome (or contig, scaffold, etc.). For interchromosomal, use 2 records"
         uint chromStart;    "Start position of eQTL."
         uint chromEnd;      "End position of gene."
         string name;        "Name of item for display, dbSNP rsID/Gene Symbol/Tissue."
         uint score;         "Not used, always 1000."
         double cpp;         "Causal Posterior Probability (CPP) that quantifies the probability of each variant to be causal."
         string tissue;      "Tissue"
         string color;       "Item color, based on GTEx Gene expression track colors."
         string eqtlChrom;   "Chromosome of eQTL."
         uint eqtlStart;     "Start position of eQTL."
         uint eqtlEnd;       "End position of eQTL."
         string eqtlName;    "eQTL name, which is a dbSNP identifier."
         string eqtlStrand;  "Not applicable, always '.'"
         string geneChrom;   "Chromosome of target gene."
         uint geneStart;     "Start position of target gene."
         uint geneEnd;       "End position of target gene."
         string geneName;    "Gene symbol of target gene."
         string geneStrand;  "Strand of target gene."
         )
 '_EOF_'
 
 # Build bigInteract file with colors
 bedToBigBed -as=../gtexInteract.as -type=bed5+13 gtexCaviar.interact.sorted.colors.txt /hive/data/genomes/hg38/chrom.sizes gtexCaviar.interact.colors.bb