src/hg/makeDb/doc/hg38/gtex.txt ab7847c47ac5507a7e126691bb78826ed4ba345d

ab7847c47ac5507a7e126691bb78826ed4ba345d
chmalee
  Mon Sep 13 16:34:47 2021 -0700
Staging GTEx single nuclei gene expression track, refs #29954

diff --git src/hg/makeDb/doc/hg38/gtex.txt src/hg/makeDb/doc/hg38/gtex.txt
index 9f9dbaf..047c6d0 100644
--- src/hg/makeDb/doc/hg38/gtex.txt
+++ src/hg/makeDb/doc/hg38/gtex.txt
@@ -1,709 +1,783 @@
 #############################################################################
 # GTEx V6 (October 2015) Kate
 # Create BED from hgFixed tables (see doc/gtex)
 # Reloading during QA of track (fixing gene classes, adding scores).  (March 2016) Kate
 
 cd /hive/data/outside/gtex/V6
 
 # see doc/hg19.txt for how this genePred was made
 set chain = /hive/data/genomes/hg19/bed/liftOver/hg19ToHg38.over.chain.gz
 liftOver -genePred gencodeV19.hg19.genePred $chain gtexGeneModelV6.hg38.genePred \
                 gencode.V19.hg38.unmapped
 # 926 unmapped
 hgLoadGenePred hg38 gtexGeneModelV6 gtexGeneModelV6.hg38.genePred
 
 # OLD: creates gtexGeneModelV6.hg38.genePred
 # OLD: NOTE: drops 192 transcripts.  One I spot-checked indeed didn't exist in our hg38 genes
 
 cd /hive/data/genomes/hg38/bed
 mkdir -p gtex
 cd gtex
 
 # table renamed to gtexGeneModel later
 
 # Use latest GENCODE attrs file to get biotypes
 
 # create bed table
 ~/kent/src/hg/makeDb/outside/hgGtexGeneBed/hgGtexGeneBed  hg38 -gtexVersion=V6 \
         -noLoad -gencodeVersion=$gencodeVersion gtexGeneBedV6 -verbose=2 >&! makeGeneBed.log
 
 # 45 genes not found in GENCODE attributes table
 
 #Max score: 219385.906250
 
 wc -l *.tab
 wc -l *.tab
 52896 gtexGeneBedV6.tab
 # 55810 gtexGeneBedV6.tab
 
 # exploratory for assigning score based on sum of scores (expScores field)
 set bedScore = ~/kent/src/utils/bedScore/bedScore
 $bedScore -col=10 -minScore=1 -method=std2 gtexGeneBedV6.tab gtexGeneBedV6.std2.bed
 $bedScore -col=10 -minScore=1 -method=encode gtexGeneBedV6.tab gtexGeneBedV6.encode.bed
 $bedScore -col=10 -minScore=1 -method=reg gtexGeneBedV6.tab gtexGeneBedV6.reg.bed
 $bedScore -col=10 -minScore=1 -method=asinh gtexGeneBedV6.tab gtexGeneBedV6.asinh.bed
 
 $bedScore -col=10 -minScore=1 -method=reg -log gtexGeneBedV6.tab gtexGeneBedV6.reg.log.bed
 $bedScore -log -col=10 -minScore=1 -method=std2 gtexGeneBedV6.tab gtexGeneBedV6.std2.log.bed
 $bedScore -log -col=10 -minScore=1 -method=encode gtexGeneBedV6.tab gtexGeneBedV6.encode.log.bed
 $bedScore -log -col=10 -minScore=1 -method=asinh gtexGeneBedV6.tab gtexGeneBedV6.asinh.log.bed
 
 # Using -log -method=encode, as this is closest to density plot of all scores
 # as here: GtexTotalExpressionDensity.png, GtexTotalExpressionFrequency.png
 textHistogram -real -autoScale=14 -log -col=5 gtexGeneBedV6.encode.log.bed
 1.000000 ************************************************************ 22889
 72.357214 *************************************************** 4862
 143.714428 ************************************************** 4199
 215.071643 ************************************************* 3931
 286.428857 ************************************************** 4329
 357.786071 *************************************************** 5419
 429.143285 ************************************************** 4472
 500.500500 ********************************************* 1953
 571.857714 ************************************** 564
 643.214928 ******************************** 200
 714.572142 ************************* 61
 785.929356 *********** 6
 857.286571 ******** 4
 928.643785 ************ 7
 
 $bedScore -col=10 -minScore=0 -log -method=encode gtexGeneBedV6.tab gtexGeneBedV6.bed
 
 
 # load up
 set lib = ~/kent/src/hg/lib
 
 hgLoadBed hg38 -noBin -tab -type=bed6+4 \
         -as=$lib/gtexGeneBed.as -sqlTable=$lib/gtexGeneBed.sql -renameSqlTable \
                 gtexGeneBedNewV6 gtexGeneBedV6.bed
 
 # Add GTEx to Gene Sorter (2016-08-18 kate)
 # See hg/near/makeNear.doc
 
 #############################################################################
 # GTEx V8 (Apr 2020) Kate
 # Create BED from hgFixed tables (see doc/gtex)
  
 # Load gene models (Gencode V26 transcript union from GTEx)
 
 cd /hive/data/outside/gtex/V8/rnaSeq
 gtfToGenePred gencode.v26.GRCh38.genes.gtf gencodeV26.hg38.genePred \
                 -infoOut=gtexGeneModelInfoV8.tab
 hgLoadGenePred hg38 gtexGeneModelV8 gencodeV26.hg38.genePred
 
 # Get transcript for each gene (why ?)
 tail -n +2 gtexGeneModelInfoV8.tab | awk '{printf("%s\t%s\n", $1, $9)}' > gtexGeneTranscriptsV8.tab
 #hgLoadSqlTab hgFixed gtexTranscriptV8 ~/kent/src/hg/lib/gtexTranscript.sql gtexGeneTranscriptsV8.tab
 # no schema (or table on hgwdev.hgFixed)
 
 # Load BED table
 cd /hive/data/genomes/hg38/bed/gtex
 mkdir V8
 cd V8
 
 set gencode = V26
 ~/kent/src/hg/makeDb/outside/hgGtexGeneBed/hgGtexGeneBed \
         hg38 -noLoad -gtexVersion=V8 -gencodeVersion=$gencode gtexGeneV8 -verbose=2 >&! log.txt &
 
 Reading wgEncodeGencodeAttrs table
 Reading gtexGeneModelV8 table
 Reading gtexTissueMedian table
 Writing tab file gtexGeneV8
 Max score: 267400.000000
 
 # Add scores
 set bedScore = ~/kent/src/utils/bedScore/bedScore
 $bedScore -col=10 -minScore=0 -log -method=encode gtexGeneV8.tab gtexGeneBedV8.bed
 textHistogram -real -autoScale=14 -log -col=5 gtexGeneBedV8.bed
 0.000000 ************************************************************ 21287
 71.428643 **************************************************** 5635
 142.857286 **************************************************** 5513
 214.285929 *************************************************** 4683
 285.714571 *************************************************** 4480
 357.143214 *************************************************** 4748
 428.571857 **************************************************** 5466
 500.000500 ************************************************ 3117
 571.429143 ***************************************** 911
 642.857786 ********************************* 252
 714.286429 ************************** 81
 785.715071 ************** 11
 857.143714 ******** 4
 928.572357 *************** 12
 
 # load up
 set lib = ~/kent/src/hg/lib
 hgLoadBed hg38 -noBin -tab -type=bed6+4 \
         -as=$lib/gtexGeneBed.as -sqlTable=$lib/gtexGeneBed.sql -renameSqlTable \
                 gtexGeneV8 gtexGeneBedV8.bed
 #Read 56200 elements of size 10 from gtexGeneBedV8.bed
 
 
 ### TODO 
 # Add GTEx to Gene Sorter (2016-08-18 kate)
 # See hg/near/makeNear.doc
 
 #############################################################################
 # GTEx V8 cis-eQTLs CAVIAR High Confidence (Apr 2022) Matt
 
 cd /hive/data/genomes/hg38/bed/gtex/V8/eQtl/finemap_CAVIAR
 
 # Download files from
 wget https://storage.googleapis.com/gtex_analysis_v8/single_tissue_qtl_data/GTEx_v8_finemapping_CAVIAR.tar
 # unpack files
 tar xvf GTEx_v8_finemapping_CAVIAR.tar
 
 # Other files used:
 # Lookup table for all variants genotyped in GTEx
 wget https://storage.googleapis.com/gtex_analysis_v8/reference/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz
 # Gene-level model based on the GENCODE 26 transcript model, where isoforms were collapsed to a single transcript per gene.
 wget https://storage.googleapis.com/gtex_analysis_v8/reference/gencode.v26.GRCh38.genes.gtf
 
 # Initially planned to use this file: # CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz
 # as it seemed to be a filtered subset of eQTLs
 # Description from GTEx_v8_finemapping_CAVIAR/README.txt
 # ***CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz --> is a single file for all GTEx tissues and all eGene where we report
 # all the high causal variants (variants that have posterior probability of > 0.1).
 
 # Sample header line:
 # TISSUE  GENE    eQTL    CHROM   POS     Probability
 # Brain_Caudate_basal_ganglia     ENSG00000248485.1       1_161274374     1       161274374       0.157456
 
 # However, the names/positions in the eQTL column are not unique meaning
 # We want file with unique variant/eQTL names that match those in the GTEx
 # variant mapping file: GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz
 # Looks like the CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF_with_Allele.txt.gz has names 
 # that match the names in the variant/eQTL mapping file
 
 # Description from GTEx_v8_finemapping_CAVIAR/README.txt
 # ***CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF_with_Allele.txt.gz  —> is a single file for all GTEx tissues and all eGenes where we reported
 # the CPP (Causal Posterior Probability). Each eQTL contains the allele information. Sample header file:
 # TISSUE  GENE    eQTL    CHROM   POS Probability
 # Brain_Caudate_basal_ganglia ENSG00000248485.1   chr1_161274374_G_A_b38  1   161274374   0.157456
 
 # So, take NOCUTOFF_with_Allele file and then filter for probability >0.1 (which is how HighConfidentVariants file was created according to GTEx README)
 zcat CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF_with_Allele.txt.gz | awk '$6 > 0.1' | gzip -c > CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz
 
 # Confirm that it has the same size as original HighConfidentVariants file
 zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz | wc -l
 1257158
 zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz | wc -l
 1257158
 
 # There seem to 31 duplicate eQTLs, meaning that the two lines for these entries are duplicated (probability values, etc.)
 # These will essentially be collapsed into a single entry by the buildInteract script
 zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz | sort | uniq -c | sort -k1,1nr | awk '$1 > 1' > dupe_eQtls.txt
 wc -l dupe_eQtls.txt
 31 dupe_eQtls.txt
 
 # Wrote buildGtexEqtlBed script to help build bed files from eQTL files:
 cat << '_EOF_' > buildGtexEqtlBed
 #!/usr/bin/env python3
 
 import sys, gzip
 
 qtlFile = sys.argv[1]
 gpFile = sys.argv[2]
 alleleFile = sys.argv[3]
 qtlIdCol = sys.argv[4] # Column name in file header, e.g. eQTL
 probCol = sys.argv[5] # Column name in file header, e.g. Probability
 # DAP-G files have an extra probability column that we want to capture
 if len(sys.argv) > 6:
     clusterProbCol = sys.argv[6]
 else:
     clusterProbCol = False
 
 # Open up all of our files
 qfh = gzip.open(qtlFile,"r") # QTL file
 gfh = open(gpFile,"r") # genePred file
 afh = gzip.open(alleleFile,"r") # allele file
 
 # Set up dicts for each
 qtlDict = dict()
 gpDict = dict()
 alleleDict = dict()
 
 # Get index of field that we want to replace values in
 header = qfh.readline().decode('ASCII').rstrip().split("\t")
 qtl_col = header.index(qtlIdCol)
 prob_col = header.index(probCol)
 if clusterProbCol:
     clus_prob_col = header.index(clusterProbCol)
 
 # Process eQTL file
 for line in qfh:
     qtlLine = line.decode('ASCII').strip().split("\t")
     # Skip header line in file
     #if "TISSUE" not in qtlLine:
     tissue = qtlLine[0]
     gene = qtlLine[1]
     e_qtl = qtlLine[qtl_col]
     # each of these elements on their own aren't unique, but together they are
     qtlKey = e_qtl + "|" + tissue + "|" + gene
     # Put these into a dict where key is unique name we've made
     qtlDict[qtlKey] = qtlLine
 
 # Process genePred file
 for line in gfh:
     gpLine = line.strip().split("\t")
     gene_id = gpLine[0]
     # Put into dict with key on ENSG* ID, e.g.
     gpDict[gene_id] = gpLine
 
 # Process allele file from GTEx
 for line in afh:
     # gzip file, so we need to decode to asii first
     aLine = line.decode('ASCII').strip().split("\t")
 
     # Key our dict using the allele, which is unique in the file
     allele = aLine[0]
     alleleDict[allele] = aLine
 
 for entry in qtlDict:
     # Get gene information from the genePred dict
     # eQTL file/dict has ENSG id in it
     gene_id = qtlDict[entry][1]
     if gene_id in gpDict.keys():
         gene_sym = gpDict[gene_id][11]
         gene_chrom = gpDict[gene_id][1]
         gene_strand = gpDict[gene_id][2]
         gene_start = int(gpDict[gene_id][3])
         gene_end = int(gpDict[gene_id][4])
     # Check if eQTL is in allele/eQTL dictionary so that we can grab the rsID
     var_name = qtlDict[entry][qtl_col]
     if var_name in alleleDict.keys():
         rsID = alleleDict[var_name][6]
         if rsID == '.':
             rsID = var_name
         var_start = int(alleleDict[var_name][2]) - 1
         var_end = int(alleleDict[var_name][2])
         chrom = alleleDict[var_name][1]
 
     # This block of if/else sets up the blockSizes/blockStart/blockCount for
     # various orientations of eQTLs/genes
     if chrom == gene_chrom:
         # Handles case where eQTL is internal to a gene
         if var_start > gene_start and var_start < gene_end:
             chromStart = gene_start
             chromEnd = gene_end
             blockCount = "1"
             blockSizes = gene_end - gene_start
             blockStarts = "0"
             strand = "."
         # Handles case where eQTL is downstream of gene
         elif var_start > gene_end:
             chromStart = gene_start
             chromEnd = var_end
             bs1 = gene_end - gene_start
             blockCount = "2"
             blockSizes = str(bs1) + ",1"
             blockStarts = "0," + str(var_start - gene_start)
             strand = "-"
         # Handles case where eQTL is upstream of gene
         elif var_start < gene_start:
             chromStart = var_start
             chromEnd = gene_end
             blockCount = "2"
             bs2 = gene_end - gene_start
             blockSizes = "1," + str(bs2)
             blockStarts = "0," + str(gene_start - var_start)
             strand = "+"
         # Handles case where eQTL and start of gene overlap
         elif var_start == gene_start:
             chromStart = var_start
             chromEnd = gene_end
             blockCount = "1"
             blockSizes = str(gene_end - var_start)
             blockStarts = "0"
             strand = "."
         # Handles case where eQTL and end of gene overlap
         elif gene_end == var_start:
             chromStart = gene_start
             chromEnd = var_end
             blockCount = "1"
             blockSizes = str(var_end - gene_start)
             blockStarts = "0"
             strand = "."
 
         # Add each element of our bed line to a list to print later
         bedLine = list()
         bedLine.append(chrom) # chrom
         bedLine.append(chromStart) # chrStart
         bedLine.append(chromEnd) # chrEnd
         bedLine.append(rsID + "/" + gene_sym + "/" + qtlDict[entry][0]) # name
         bedLine.append(int("1000"))# score
         bedLine.append(strand)# strand
         bedLine.append(var_start) # thickStart
         bedLine.append(var_end) # thickEnd
         bedLine.append("0,0,0") #color - eventually replaced by 'addTissueColors' script
         bedLine.append(blockCount) # blockCount
         bedLine.append(blockSizes) # blockSizes
         bedLine.append(blockStarts) # blockSizes
         bedLine.append(chrom + ":" + str(var_start+1) + "-" + str(var_end)) # prettified gene pos
         bedLine.append(rsID) # sourceName
         bedLine.append(chrom + ":" + str(gene_start+1) + "-" + str(gene_end)) # prettified gene pos
         bedLine.append(gene_sym) # geneName
         bedLine.append(gene_id) # geneId
         bedLine.append(gene_strand) # targetStrand
         bedLine.append(qtlDict[entry][0]) # tissue
         bedLine.append(qtlDict[entry][prob_col]) # CAVIAR == Causal Posterior Probability; DAPG == SNP posterior inclusion probability
         if clusterProbCol:
             # Signal-level posterior_inclusion probability (sum of the PIPs from all members of the signal cluster)
             bedLine.append(qtlDict[entry][clus_prob_col])
 
     # Print bed line to stdout, can redirect to a file to save it
     print(*bedLine, sep="\t")
 '_EOF_'
 
 
 # Then run buildGtexEqtlBed to build an bed file
 ./buildGtexEqtlBed CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz ../gencode.v26.GRCh38.genes.gpExt \
   ../GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz eQTL Probability | \
   bedSort stdin gtexCaviar.sorted.bed
 
 # Create script to add colors to bed file
 cat << '_EOF_' > addTissueColors
 
 #!/usr/bin/env python3
 
 ####
 # Intially created bed file w/o colors
 # This script will replace col 9 with tissue-specific colors
 ####
 
 import sys
 
 bedFile = sys.argv[1]
 colorFile = sys.argv[2]
 
 # Read in color file from //
 cfh = open(colorFile, "r")
 # Process this file and turn it into a dictionary
 colors = dict()
 for line in cfh:
     splitLine = line.strip().split("\t")
     # Keys in dictionary are tissue, e.g. Adipose_Subcutaneous
     colors[splitLine[0]] = splitLine
 
 # Read in bed file
 ifh = open(bedFile, "r")
 for line in ifh:
     splitLine = line.strip().split("\t")
     # Extract tisue information from bed file
     tissue = splitLine[18]
     # If tissue is in color dict keys, then we're good
     if tissue in colors.keys():
         color = colors[tissue][2]
         # Set color field equal to tissue color
         splitLine[8] = color
     # Output modified line to stdout, can save to file by redirecting stdout to a file
     print(*splitLine, sep="\t")
 
 '_EOF_'
 
 # Then add colors:
 ./addTissueColors gtexCaviar.sorted.bed ../gtexTissueColor.v8.tab > gtexCaviar.sorted.colors.bed
 
 # Create custom as file for this bigBed:
 cat << '_EOF_' > dapg.as
 table bed
 "bed12+9 describing interaction between an eQTL and a target gene"
     (
     string          chrom;       "Chromosome (or contig, scaffold, etc.). For interchromosomal, use 2 records"
     uint            chromStart;  "Start position of eQTL"
     uint            chromEnd;    "End position of gene"
     string          name;        "Name of item for display in pattern eQTL name/Gene Symbol/Tissue"
     uint            score;       "Score. Always 1000."
     char[1]         strand;      "+ or - for strand"
     uint            thickStart;  "eQTL start"
     uint            thickEnd;    "eQTL end"
     uint            reserved;    "Item color, based on tissue colors GTEx Gene track"
     int             blockCount;  "Number of blocks"
     int[blockCount] blockSizes;  "Comma separated list of block sizes"
     int[blockCount] chromStarts; "Start positions of blocks relative to chromStart"
     string          eqtlPos;     "eQTL position"
     string          eqtlName;    "eQTL name (most often a dbSNP identifier)"
     string          genePos;     "Gene position"
     string          geneName;    "Gene symbol of target gene"
     string          geneId;      "ENSG ID of target gene"
     string          geneStrand;  "Strand of target gene"
     string          tissue;      "Tissue"
     double          cpp;         "Causal Posterior Probability (CPP) that quantifies the probability that a variant is causal"
     )
 '_EOF_'
 
 # Build bigBed
 bedToBigBed -as=caviar.as -type=bed12+8 gtexCaviar.sorted.colors.bed /hive/data/genomes/hg38/chrom.sizes gtexCaviar.bb
 
 #############################################################################
 # GTEx V8 cis-eQTLs DAP-G High Confidence (Apr 2022) Matt
 
 cd /hive/data/genomes/hg38/bed/gtex/V8/eQtl/finemap_dap-g
 
 # Download files from
 wget https://storage.googleapis.com/gtex_analysis_v8/single_tissue_qtl_data/GTEx_v8_finemapping_DAPG.tar
 
 # unpack files
 tar xvf GTEx_v8_finemapping_DAPG.tar
 
 # Most files in VCF, but there is which is a TSV, GTEx_v8_finemapping_DAPG.txt.gz
 # Sample header lines
 tissue_id       gene_id cluster_id      cluster_pip     variant_id      variant_pip
 Adipose_Subcutaneous    ENSG00000000457.13      1       1.000   chr1_169891332_G_A_b38  9.881e-01
 Adipose_Subcutaneous    ENSG00000000457.13      1       1.000   chr1_169913898_G_A_b38  1.170e-02
 Adipose_Subcutaneous    ENSG00000000457.13      2       0.994   chr1_169894024_A_C_b38  1.208e-01
 
 # That's a big file:
 zcat GTEx_v8_finemapping_DAPG.txt.gz |wc -l
 21648585
 # How can we filter that down?
 # Looks like they provide a filtered file, GTEx_v8_finemapping_DAPG.CS95.txt.gz,
 # but it's in VCF format. Can we replicate it using their TSV file?
 
 # That VCF is filtered on 'SPIP' only including those clusters with SPIP > 0.95, where
 # SPIP: signal-level posterior_inclusion probability (sum of the PIPs from all members of the signal cluster)
 # SPIP seems equivalent to the 'cluster_pip' in our TSV file
 # So, create a new file where we only include those entries with cluster pip > 0.95
 zcat GTEx_v8_finemapping_DAPG.txt.gz | awk '$4>0.95' | gzip -c > GTEx_v8_finemapping_DAPG.pip_95.txt.gz
 
 # A much smaller file (though still quite a lot of variants!)
 zcat GTEx_v8_finemapping_DAPG.pip_95.txt.gz |wc -l
 7608545
 
 # Wrote buildGtexEqtlBed script to help build bed files from eQTL files:
 cat << '_EOF_' > buildGtexEqtlBed
 #!/usr/bin/env python3
 
 import sys, gzip
 
 qtlFile = sys.argv[1]
 gpFile = sys.argv[2]
 alleleFile = sys.argv[3]
 qtlIdCol = sys.argv[4] # Column name in file header, e.g. eQTL
 probCol = sys.argv[5] # Column name in file header, e.g. Probability
 # DAP-G files have an extra probability column that we want to capture
 if len(sys.argv) > 6:
     clusterProbCol = sys.argv[6]
 else:
     clusterProbCol = False
 
 # Open up all of our files
 qfh = gzip.open(qtlFile,"r") # QTL file
 gfh = open(gpFile,"r") # genePred file
 afh = gzip.open(alleleFile,"r") # allele file
 
 # Set up dicts for each
 qtlDict = dict()
 gpDict = dict()
 alleleDict = dict()
 
 # Get index of field that we want to replace values in
 header = qfh.readline().decode('ASCII').rstrip().split("\t")
 qtl_col = header.index(qtlIdCol)
 prob_col = header.index(probCol)
 if clusterProbCol:
     clus_prob_col = header.index(clusterProbCol)
 
 # Process eQTL file
 for line in qfh:
     qtlLine = line.decode('ASCII').strip().split("\t")
     # Skip header line in file
     #if "TISSUE" not in qtlLine:
     tissue = qtlLine[0]
     gene = qtlLine[1]
     e_qtl = qtlLine[qtl_col]
     # each of these elements on their own aren't unique, but together they are
     qtlKey = e_qtl + "|" + tissue + "|" + gene
     # Put these into a dict where key is unique name we've made
     qtlDict[qtlKey] = qtlLine
 
 # Process genePred file
 for line in gfh:
     gpLine = line.strip().split("\t")
     gene_id = gpLine[0]
     # Put into dict with key on ENSG* ID
     gpDict[gene_id] = gpLine
 
 # Process allele file from GTEx
 for line in afh:
     # gzip file, so we need to decode to asii first
     aLine = line.decode('ASCII').strip().split("\t")
 
     # Key our dict using the allele, which is unique in the file
     allele = aLine[0]
     alleleDict[allele] = aLine
 
 for entry in qtlDict:
     # Get gene information from the genePred dict
     # eQTL file/dict has ENSG id in it
     gene_id = qtlDict[entry][1]
     if gene_id in gpDict.keys():
         gene_sym = gpDict[gene_id][11]
         gene_chrom = gpDict[gene_id][1]
         gene_strand = gpDict[gene_id][2]
         gene_start = int(gpDict[gene_id][3])
         gene_end = int(gpDict[gene_id][4])
     # Check if eQTL is in allele/eQTL dictionary so that we can grab the rsID
     var_name = qtlDict[entry][qtl_col]
     if var_name in alleleDict.keys():
         rsID = alleleDict[var_name][6]
         if rsID == '.':
             rsID = var_name
         var_start = int(alleleDict[var_name][2]) - 1
         var_end = int(alleleDict[var_name][2])
         chrom = alleleDict[var_name][1]
 
     # This block of if/else sets up the blockSizes/blockStart/blockCount for
     # various orientations of eQTLs/genes
     if chrom == gene_chrom:
         # Handles case where eQTL is internal to a gene
         if var_start > gene_start and var_start < gene_end:
             chromStart = gene_start
             chromEnd = gene_end
             blockCount = "1"
             blockSizes = gene_end - gene_start
             blockStarts = "0"
             strand = "."
         # Handles case where eQTL is downstream of gene
         elif var_start > gene_end:
             chromStart = gene_start
             chromEnd = var_end
             bs1 = gene_end - gene_start
             blockCount = "2"
             blockSizes = str(bs1) + ",1"
             blockStarts = "0," + str(var_start - gene_start)
             strand = "-"
         # Handles case where eQTL is upstream of gene
         elif var_start < gene_start:
             chromStart = var_start
             chromEnd = gene_end
             blockCount = "2"
             bs2 = gene_end - gene_start
             blockSizes = "1," + str(bs2)
             blockStarts = "0," + str(gene_start - var_start)
             strand = "+"
         # Handles case where eQTL and start of gene overlap
         elif var_start == gene_start:
             chromStart = var_start
             chromEnd = gene_end
             blockCount = "1"
             blockSizes = str(gene_end - var_start)
             blockStarts = "0"
             strand = "."
         # Handles case where eQTL and end of gene overlap
         elif gene_end == var_start:
             chromStart = gene_start
             chromEnd = var_end
             blockCount = "1"
             blockSizes = str(var_end - gene_start)
             blockStarts = "0"
             strand = "."
 
         # Add each element of our bed line to a list to print later
         bedLine = list()
         bedLine.append(chrom) # chrom
         bedLine.append(chromStart) # chrStart
         bedLine.append(chromEnd) # chrEnd
         bedLine.append(rsID + "/" + gene_sym + "/" + qtlDict[entry][0]) # name
         bedLine.append(int("1000"))# score
         bedLine.append(strand)# strand
         bedLine.append(var_start) # thickStart
         bedLine.append(var_end) # thickEnd
         bedLine.append("0,0,0") #color - eventually replaced by 'addTissueColors' script
         bedLine.append(blockCount) # blockCount
         bedLine.append(blockSizes) # blockSizes
         bedLine.append(blockStarts) # blockSizes
         bedLine.append(chrom + ":" + str(var_start+1) + "-" + str(var_end)) # prettified gene pos
         bedLine.append(rsID) # sourceName
         bedLine.append(chrom + ":" + str(gene_start+1) + "-" + str(gene_end)) # prettified gene pos
         bedLine.append(gene_sym) # geneName
         bedLine.append(gene_id) # geneId
         bedLine.append(gene_strand) # targetStrand
         bedLine.append(qtlDict[entry][0]) # tissue
         bedLine.append(qtlDict[entry][prob_col]) # CAVIAR == Causal Posterior Probability; DAPG == SNP posterior inclusion probability
         if clusterProbCol:
             # Signal-level posterior_inclusion probability (sum of the PIPs from all members of the signal cluster)
             bedLine.append(qtlDict[entry][clus_prob_col])
 
     # Print bed line to stdout, can redirect to a file to save it
     print(*bedLine, sep="\t")
 '_EOF_'
 
 
 # Then run buildGtexEqtlBed to build an bed file
 ./buildGtexEqtlBed GTEx_v8_finemapping_DAPG.pip_95.txt.gz ../gencode.v26.GRCh38.genes.gpExt \
   ../GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz variant_id variant_pip cluster_pip | \
   bedSort stdin gtexDapg.sorted.bed
 
 # Create script to add colors to bed file
 cat << '_EOF_' > addTissueColors
 
 #!/usr/bin/env python3
 
 ####
 # Intially created bed file w/o colors
 # This script will replace col 9 with tissue-specific colors
 ####
 
 import sys
 
 bedFile = sys.argv[1]
 colorFile = sys.argv[2]
 
 # Read in color file from //
 cfh = open(colorFile, "r")
 # Process this file and turn it into a dictionary
 colors = dict()
 for line in cfh:
     splitLine = line.strip().split("\t")
     # Keys in dictionary are tissue, e.g. Adipose_Subcutaneous
     colors[splitLine[0]] = splitLine
 
 # Read in bed file
 ifh = open(bedFile, "r")
 for line in ifh:
     splitLine = line.strip().split("\t")
     # Extract tisue information from bed file
     tissue = splitLine[18]
     # If tissue is in color dict keys, then we're good
     if tissue in colors.keys():
         color = colors[tissue][2]
         # Set color field equal to tissue color
         splitLine[8] = color
     # Output modified line to stdout, can save to file by redirecting stdout to a file
     print(*splitLine, sep="\t")
 
 '_EOF_'
 
 # Then add colors:
 ./addTissueColors gtexDapg.sorted.bed ../gtexTissueColor.v8.tab > gtexDapg.sorted.colors.bed
 
 # Create custom as file for this bigBed:
 cat << '_EOF_' > dapg.as
 table bed
 "bed12+9 describing interaction between an eQTL and a target gene"
     (
     string          chrom;       "Chromosome (or contig, scaffold, etc.). For interchromosomal, use 2 records"
     uint            chromStart;  "Start position of eQTL"
     uint            chromEnd;    "End position of gene"
     string          name;        "Name of item for display in pattern eQTL name/Gene Symbol/Tissue"
     uint            score;       "Score. Always 1000."
     char[1]         strand;      "+ or - for strand"
     uint            thickStart;  "eQTL start"
     uint            thickEnd;    "eQTL end"
     uint            reserved;    "Item color, based on tissue colors GTEx Gene track"
     int             blockCount;  "Number of blocks"
     int[blockCount] blockSizes;  "Comma separated list of block sizes"
     int[blockCount] chromStarts; "Start positions of blocks relative to chromStart"
     string          eqtlPos;     "eQTL position"
     string          eqtlName;    "eQTL name (most often a dbSNP identifier)"
     string          genePos;     "Gene position"
     string          geneName;    "Gene symbol of target gene"
     string          geneId;      "ENSG ID of target gene"
     string          geneStrand;  "Strand of target gene"
     string          tissue;      "Tissue"
     double          pip;         "SNP posterior inclusion probability (higher PIP value indicates the SNP is more likely to be the casual eQTL)"
     double          clusterPip;  "Signal-level posterior inclusion probability (sum of the PIPs from all members of the signal cluster)"
     )
 '_EOF_'
 
 # Build bigBed
 bedToBigBed -as=dapg.as -type=bed12+9 gtexDapg.sorted.colors.bed /hive/data/genomes/hg38/chrom.sizes gtexDapg.bb
+
+#############################################################################
+# GTEx V9 Single Nuclei Expression August 31, 2022 ChrisL
+#############################################################################
+For building single cell rna-seq tracks from this paper:
+https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9383269/
+
+Files:
+https://storage.googleapis.com/gtex_analysis_v9/snrna_seq_data/GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad
+
+WORKDIR=/hive/data/outside/GTEx/V9/snRnaSeq/
+mkdir -p $WORKDIR && cd $WORKDIR
+
+wget https://storage.googleapis.com/gtex_analysis_v9/snrna_seq_data/GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad
+
+# Turn scanpy files into matrices and metadata:
+time hcaUnpack5 GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad atlasOut
+# Data on 209126 cells x 17695 genes
+# Sparse matrix with 173681687 elements
+# outputting 17695 row matrix, a dot every 100 rows
+# 
+# real    3m24.718s
+# user    2m39.348s
+# sys 0m27.629s
+
+# Look at the column summaries to see what is interesting:
+./getSimpleMetaStats atlasOut/meta.tsv | less -S
+cd atlasOut
+mkdir bbi
+mkdir clust
+~/bin/x86_64/matrixClusterColumns -makeIndex=clust/exprMatrix.ix exprMatrix.tsv meta.tsv \
+    prep clust/prepMatrix.tsv clust/prepStats.tsv \
+    "Tissue Composition" clust/TissueCompMatrix.tsv clust/TissueCompStats.tsv \
+    "Tissue Site Detail" clust/TissueSiteDetailMatrix.tsv clust/TissueSiteDetailStats.tsv \
+    "Sample Ischemic Time (mins)" clust/SampleIschemicTimeMatrix.tsv clust/SampleIschemicTimeStats.tsv \
+    "Granular cell type" clust/GranularMatrix.tsv clust/GranularStats.tsv \
+    "Broad cell type" clust/BroadCellTypeMatrix.tsv clust/BroadCellTypeStats.tsv \
+    Age_bin clust/Age_binMatrix.tsv clust/Age_binStats.tsv \
+    Sex clust/SexMatrix.tsv clust/SexStats.tsv
+
+# Figure out gene set
+gencodeVersionForGenes gene.lst /hive/data/inside/geneSymVerTx.tsv -bed=mapping.bed
+
+# Turn into bigBarCharts
+for s in prep TissueComp TissueSiteDetail SampleIschemicTime Granular BroadCellType Age_bin Sex
+do
+    matrixToBarChartBed clust/${s}Matrix.tsv mapping.bed clust/${s}.bed -stats=clust/${s}Stats.tsv -trackDb=clust/${s}.ra
+    bedSort clust/${s}.bed clust/${s}.bed
+    bedToBigBed clust/${s}.bed /hive/data/genomes/hg38/chrom.sizes bbi/${s}.bb -type=bed6+3 -as=${HOME}/kent/src/hg/lib/simpleBarChartBed.as
+done
+
+# Make a combined column for facetting
+tawk '{print $0,$31 "-" $46}' meta.tsv > meta.tsv.plusCombined
+~/bin/x86_64/matrixClusterColumns exprMatrix.tsv meta.tsv.plusCombined \
+    "Tissue-Broad cell type" clust/combined.tsv clust/combinedStats.tsv
+matrixToBarChartBed clust/combined.tsv mapping.bed clust/combined.bed -stats=clust/combinedStats.tsv -trackDb=clust/combined.ra
+bedSort clust/combined.bed clust/combined.bed 
+bedToBigBed clust/combined.bed /hive/data/genomes/hg38/chrom.sizes bbi/combined.bb -type=bed6+3 -as=${HOME}/kent/src/hg/lib/simpleBarChartBed.as
+
+# now un extract the original combined columns back into the counted up meta.tsv
+# which will give you the columns to facet on
+echo -e "#Organ and Cell Type	count	total	Organ	Broad_Cell_Type" > combinedStats.withFacets
+paste <(tail -n +2 clust/combinedStats.tsv) <(tail -n +2 meta.tsv | cut -f31,46 | sort -u) >> combinedStats.withFacets
+# manually make cells.colors by taking colors for Broad Cell Type from the paper
+# make categories file for trackDb
+join -t$'\t' -1 5 -2 1 <(grep -v "^#" combinedStats.withFacets | sort -t$'\t' -k5) ../cells.colors | cut -f2,6 | sort > atlasOut.categories
+# Make stats file for trackDb 
+echo -e "#Organ and Cell Type      count   total   Organ   Broad_Cell_Type color" > combinedStats.withFacets.withColors.tsv
+join -t$'\t' -1 5 -2 1 <(grep -v "^#" combinedStats.withFacets | sort -t$'\t' -k5) ../cells.colors | tawk '{print $2,$3,$4,$5,$1,$6}' | sort >> combinedStats.withFacets.withColors.tsv 
+
+# link the tracks
+cd /gbdb/hg38/gtex
+mkdir snRnaSeq
+ln -s /hive/data/outside/gtex/V9/snRnaSeq/atlasOut/bbi/*.bb .