ab7847c47ac5507a7e126691bb78826ed4ba345d chmalee Mon Sep 13 16:34:47 2021 -0700 Staging GTEx single nuclei gene expression track, refs #29954 diff --git src/hg/makeDb/doc/hg38/gtex.txt src/hg/makeDb/doc/hg38/gtex.txt index 9f9dbaf..047c6d0 100644 --- src/hg/makeDb/doc/hg38/gtex.txt +++ src/hg/makeDb/doc/hg38/gtex.txt @@ -1,709 +1,783 @@ ############################################################################# # GTEx V6 (October 2015) Kate # Create BED from hgFixed tables (see doc/gtex) # Reloading during QA of track (fixing gene classes, adding scores). (March 2016) Kate cd /hive/data/outside/gtex/V6 # see doc/hg19.txt for how this genePred was made set chain = /hive/data/genomes/hg19/bed/liftOver/hg19ToHg38.over.chain.gz liftOver -genePred gencodeV19.hg19.genePred $chain gtexGeneModelV6.hg38.genePred \ gencode.V19.hg38.unmapped # 926 unmapped hgLoadGenePred hg38 gtexGeneModelV6 gtexGeneModelV6.hg38.genePred # OLD: creates gtexGeneModelV6.hg38.genePred # OLD: NOTE: drops 192 transcripts. One I spot-checked indeed didn't exist in our hg38 genes cd /hive/data/genomes/hg38/bed mkdir -p gtex cd gtex # table renamed to gtexGeneModel later # Use latest GENCODE attrs file to get biotypes # create bed table ~/kent/src/hg/makeDb/outside/hgGtexGeneBed/hgGtexGeneBed hg38 -gtexVersion=V6 \ -noLoad -gencodeVersion=$gencodeVersion gtexGeneBedV6 -verbose=2 >&! makeGeneBed.log # 45 genes not found in GENCODE attributes table #Max score: 219385.906250 wc -l *.tab wc -l *.tab 52896 gtexGeneBedV6.tab # 55810 gtexGeneBedV6.tab # exploratory for assigning score based on sum of scores (expScores field) set bedScore = ~/kent/src/utils/bedScore/bedScore $bedScore -col=10 -minScore=1 -method=std2 gtexGeneBedV6.tab gtexGeneBedV6.std2.bed $bedScore -col=10 -minScore=1 -method=encode gtexGeneBedV6.tab gtexGeneBedV6.encode.bed $bedScore -col=10 -minScore=1 -method=reg gtexGeneBedV6.tab gtexGeneBedV6.reg.bed $bedScore -col=10 -minScore=1 -method=asinh gtexGeneBedV6.tab gtexGeneBedV6.asinh.bed $bedScore -col=10 -minScore=1 -method=reg -log gtexGeneBedV6.tab gtexGeneBedV6.reg.log.bed $bedScore -log -col=10 -minScore=1 -method=std2 gtexGeneBedV6.tab gtexGeneBedV6.std2.log.bed $bedScore -log -col=10 -minScore=1 -method=encode gtexGeneBedV6.tab gtexGeneBedV6.encode.log.bed $bedScore -log -col=10 -minScore=1 -method=asinh gtexGeneBedV6.tab gtexGeneBedV6.asinh.log.bed # Using -log -method=encode, as this is closest to density plot of all scores # as here: GtexTotalExpressionDensity.png, GtexTotalExpressionFrequency.png textHistogram -real -autoScale=14 -log -col=5 gtexGeneBedV6.encode.log.bed 1.000000 ************************************************************ 22889 72.357214 *************************************************** 4862 143.714428 ************************************************** 4199 215.071643 ************************************************* 3931 286.428857 ************************************************** 4329 357.786071 *************************************************** 5419 429.143285 ************************************************** 4472 500.500500 ********************************************* 1953 571.857714 ************************************** 564 643.214928 ******************************** 200 714.572142 ************************* 61 785.929356 *********** 6 857.286571 ******** 4 928.643785 ************ 7 $bedScore -col=10 -minScore=0 -log -method=encode gtexGeneBedV6.tab gtexGeneBedV6.bed # load up set lib = ~/kent/src/hg/lib hgLoadBed hg38 -noBin -tab -type=bed6+4 \ -as=$lib/gtexGeneBed.as -sqlTable=$lib/gtexGeneBed.sql -renameSqlTable \ gtexGeneBedNewV6 gtexGeneBedV6.bed # Add GTEx to Gene Sorter (2016-08-18 kate) # See hg/near/makeNear.doc ############################################################################# # GTEx V8 (Apr 2020) Kate # Create BED from hgFixed tables (see doc/gtex) # Load gene models (Gencode V26 transcript union from GTEx) cd /hive/data/outside/gtex/V8/rnaSeq gtfToGenePred gencode.v26.GRCh38.genes.gtf gencodeV26.hg38.genePred \ -infoOut=gtexGeneModelInfoV8.tab hgLoadGenePred hg38 gtexGeneModelV8 gencodeV26.hg38.genePred # Get transcript for each gene (why ?) tail -n +2 gtexGeneModelInfoV8.tab | awk '{printf("%s\t%s\n", $1, $9)}' > gtexGeneTranscriptsV8.tab #hgLoadSqlTab hgFixed gtexTranscriptV8 ~/kent/src/hg/lib/gtexTranscript.sql gtexGeneTranscriptsV8.tab # no schema (or table on hgwdev.hgFixed) # Load BED table cd /hive/data/genomes/hg38/bed/gtex mkdir V8 cd V8 set gencode = V26 ~/kent/src/hg/makeDb/outside/hgGtexGeneBed/hgGtexGeneBed \ hg38 -noLoad -gtexVersion=V8 -gencodeVersion=$gencode gtexGeneV8 -verbose=2 >&! log.txt & Reading wgEncodeGencodeAttrs table Reading gtexGeneModelV8 table Reading gtexTissueMedian table Writing tab file gtexGeneV8 Max score: 267400.000000 # Add scores set bedScore = ~/kent/src/utils/bedScore/bedScore $bedScore -col=10 -minScore=0 -log -method=encode gtexGeneV8.tab gtexGeneBedV8.bed textHistogram -real -autoScale=14 -log -col=5 gtexGeneBedV8.bed 0.000000 ************************************************************ 21287 71.428643 **************************************************** 5635 142.857286 **************************************************** 5513 214.285929 *************************************************** 4683 285.714571 *************************************************** 4480 357.143214 *************************************************** 4748 428.571857 **************************************************** 5466 500.000500 ************************************************ 3117 571.429143 ***************************************** 911 642.857786 ********************************* 252 714.286429 ************************** 81 785.715071 ************** 11 857.143714 ******** 4 928.572357 *************** 12 # load up set lib = ~/kent/src/hg/lib hgLoadBed hg38 -noBin -tab -type=bed6+4 \ -as=$lib/gtexGeneBed.as -sqlTable=$lib/gtexGeneBed.sql -renameSqlTable \ gtexGeneV8 gtexGeneBedV8.bed #Read 56200 elements of size 10 from gtexGeneBedV8.bed ### TODO # Add GTEx to Gene Sorter (2016-08-18 kate) # See hg/near/makeNear.doc ############################################################################# # GTEx V8 cis-eQTLs CAVIAR High Confidence (Apr 2022) Matt cd /hive/data/genomes/hg38/bed/gtex/V8/eQtl/finemap_CAVIAR # Download files from wget https://storage.googleapis.com/gtex_analysis_v8/single_tissue_qtl_data/GTEx_v8_finemapping_CAVIAR.tar # unpack files tar xvf GTEx_v8_finemapping_CAVIAR.tar # Other files used: # Lookup table for all variants genotyped in GTEx wget https://storage.googleapis.com/gtex_analysis_v8/reference/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz # Gene-level model based on the GENCODE 26 transcript model, where isoforms were collapsed to a single transcript per gene. wget https://storage.googleapis.com/gtex_analysis_v8/reference/gencode.v26.GRCh38.genes.gtf # Initially planned to use this file: # CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz # as it seemed to be a filtered subset of eQTLs # Description from GTEx_v8_finemapping_CAVIAR/README.txt # ***CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz --> is a single file for all GTEx tissues and all eGene where we report # all the high causal variants (variants that have posterior probability of > 0.1). # Sample header line: # TISSUE GENE eQTL CHROM POS Probability # Brain_Caudate_basal_ganglia ENSG00000248485.1 1_161274374 1 161274374 0.157456 # However, the names/positions in the eQTL column are not unique meaning # We want file with unique variant/eQTL names that match those in the GTEx # variant mapping file: GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz # Looks like the CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF_with_Allele.txt.gz has names # that match the names in the variant/eQTL mapping file # Description from GTEx_v8_finemapping_CAVIAR/README.txt # ***CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF_with_Allele.txt.gz —> is a single file for all GTEx tissues and all eGenes where we reported # the CPP (Causal Posterior Probability). Each eQTL contains the allele information. Sample header file: # TISSUE GENE eQTL CHROM POS Probability # Brain_Caudate_basal_ganglia ENSG00000248485.1 chr1_161274374_G_A_b38 1 161274374 0.157456 # So, take NOCUTOFF_with_Allele file and then filter for probability >0.1 (which is how HighConfidentVariants file was created according to GTEx README) zcat CAVIAR_Results_v8_GTEx_LD_ALL_NOCUTOFF_with_Allele.txt.gz | awk '$6 > 0.1' | gzip -c > CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz # Confirm that it has the same size as original HighConfidentVariants file zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants.gz | wc -l 1257158 zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz | wc -l 1257158 # There seem to 31 duplicate eQTLs, meaning that the two lines for these entries are duplicated (probability values, etc.) # These will essentially be collapsed into a single entry by the buildInteract script zcat CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz | sort | uniq -c | sort -k1,1nr | awk '$1 > 1' > dupe_eQtls.txt wc -l dupe_eQtls.txt 31 dupe_eQtls.txt # Wrote buildGtexEqtlBed script to help build bed files from eQTL files: cat << '_EOF_' > buildGtexEqtlBed #!/usr/bin/env python3 import sys, gzip qtlFile = sys.argv[1] gpFile = sys.argv[2] alleleFile = sys.argv[3] qtlIdCol = sys.argv[4] # Column name in file header, e.g. eQTL probCol = sys.argv[5] # Column name in file header, e.g. Probability # DAP-G files have an extra probability column that we want to capture if len(sys.argv) > 6: clusterProbCol = sys.argv[6] else: clusterProbCol = False # Open up all of our files qfh = gzip.open(qtlFile,"r") # QTL file gfh = open(gpFile,"r") # genePred file afh = gzip.open(alleleFile,"r") # allele file # Set up dicts for each qtlDict = dict() gpDict = dict() alleleDict = dict() # Get index of field that we want to replace values in header = qfh.readline().decode('ASCII').rstrip().split("\t") qtl_col = header.index(qtlIdCol) prob_col = header.index(probCol) if clusterProbCol: clus_prob_col = header.index(clusterProbCol) # Process eQTL file for line in qfh: qtlLine = line.decode('ASCII').strip().split("\t") # Skip header line in file #if "TISSUE" not in qtlLine: tissue = qtlLine[0] gene = qtlLine[1] e_qtl = qtlLine[qtl_col] # each of these elements on their own aren't unique, but together they are qtlKey = e_qtl + "|" + tissue + "|" + gene # Put these into a dict where key is unique name we've made qtlDict[qtlKey] = qtlLine # Process genePred file for line in gfh: gpLine = line.strip().split("\t") gene_id = gpLine[0] # Put into dict with key on ENSG* ID, e.g. gpDict[gene_id] = gpLine # Process allele file from GTEx for line in afh: # gzip file, so we need to decode to asii first aLine = line.decode('ASCII').strip().split("\t") # Key our dict using the allele, which is unique in the file allele = aLine[0] alleleDict[allele] = aLine for entry in qtlDict: # Get gene information from the genePred dict # eQTL file/dict has ENSG id in it gene_id = qtlDict[entry][1] if gene_id in gpDict.keys(): gene_sym = gpDict[gene_id][11] gene_chrom = gpDict[gene_id][1] gene_strand = gpDict[gene_id][2] gene_start = int(gpDict[gene_id][3]) gene_end = int(gpDict[gene_id][4]) # Check if eQTL is in allele/eQTL dictionary so that we can grab the rsID var_name = qtlDict[entry][qtl_col] if var_name in alleleDict.keys(): rsID = alleleDict[var_name][6] if rsID == '.': rsID = var_name var_start = int(alleleDict[var_name][2]) - 1 var_end = int(alleleDict[var_name][2]) chrom = alleleDict[var_name][1] # This block of if/else sets up the blockSizes/blockStart/blockCount for # various orientations of eQTLs/genes if chrom == gene_chrom: # Handles case where eQTL is internal to a gene if var_start > gene_start and var_start < gene_end: chromStart = gene_start chromEnd = gene_end blockCount = "1" blockSizes = gene_end - gene_start blockStarts = "0" strand = "." # Handles case where eQTL is downstream of gene elif var_start > gene_end: chromStart = gene_start chromEnd = var_end bs1 = gene_end - gene_start blockCount = "2" blockSizes = str(bs1) + ",1" blockStarts = "0," + str(var_start - gene_start) strand = "-" # Handles case where eQTL is upstream of gene elif var_start < gene_start: chromStart = var_start chromEnd = gene_end blockCount = "2" bs2 = gene_end - gene_start blockSizes = "1," + str(bs2) blockStarts = "0," + str(gene_start - var_start) strand = "+" # Handles case where eQTL and start of gene overlap elif var_start == gene_start: chromStart = var_start chromEnd = gene_end blockCount = "1" blockSizes = str(gene_end - var_start) blockStarts = "0" strand = "." # Handles case where eQTL and end of gene overlap elif gene_end == var_start: chromStart = gene_start chromEnd = var_end blockCount = "1" blockSizes = str(var_end - gene_start) blockStarts = "0" strand = "." # Add each element of our bed line to a list to print later bedLine = list() bedLine.append(chrom) # chrom bedLine.append(chromStart) # chrStart bedLine.append(chromEnd) # chrEnd bedLine.append(rsID + "/" + gene_sym + "/" + qtlDict[entry][0]) # name bedLine.append(int("1000"))# score bedLine.append(strand)# strand bedLine.append(var_start) # thickStart bedLine.append(var_end) # thickEnd bedLine.append("0,0,0") #color - eventually replaced by 'addTissueColors' script bedLine.append(blockCount) # blockCount bedLine.append(blockSizes) # blockSizes bedLine.append(blockStarts) # blockSizes bedLine.append(chrom + ":" + str(var_start+1) + "-" + str(var_end)) # prettified gene pos bedLine.append(rsID) # sourceName bedLine.append(chrom + ":" + str(gene_start+1) + "-" + str(gene_end)) # prettified gene pos bedLine.append(gene_sym) # geneName bedLine.append(gene_id) # geneId bedLine.append(gene_strand) # targetStrand bedLine.append(qtlDict[entry][0]) # tissue bedLine.append(qtlDict[entry][prob_col]) # CAVIAR == Causal Posterior Probability; DAPG == SNP posterior inclusion probability if clusterProbCol: # Signal-level posterior_inclusion probability (sum of the PIPs from all members of the signal cluster) bedLine.append(qtlDict[entry][clus_prob_col]) # Print bed line to stdout, can redirect to a file to save it print(*bedLine, sep="\t") '_EOF_' # Then run buildGtexEqtlBed to build an bed file ./buildGtexEqtlBed CAVIAR_Results_v8_GTEx_LD_HighConfidentVariants_with_Allele.gz ../gencode.v26.GRCh38.genes.gpExt \ ../GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz eQTL Probability | \ bedSort stdin gtexCaviar.sorted.bed # Create script to add colors to bed file cat << '_EOF_' > addTissueColors #!/usr/bin/env python3 #### # Intially created bed file w/o colors # This script will replace col 9 with tissue-specific colors #### import sys bedFile = sys.argv[1] colorFile = sys.argv[2] # Read in color file from // cfh = open(colorFile, "r") # Process this file and turn it into a dictionary colors = dict() for line in cfh: splitLine = line.strip().split("\t") # Keys in dictionary are tissue, e.g. Adipose_Subcutaneous colors[splitLine[0]] = splitLine # Read in bed file ifh = open(bedFile, "r") for line in ifh: splitLine = line.strip().split("\t") # Extract tisue information from bed file tissue = splitLine[18] # If tissue is in color dict keys, then we're good if tissue in colors.keys(): color = colors[tissue][2] # Set color field equal to tissue color splitLine[8] = color # Output modified line to stdout, can save to file by redirecting stdout to a file print(*splitLine, sep="\t") '_EOF_' # Then add colors: ./addTissueColors gtexCaviar.sorted.bed ../gtexTissueColor.v8.tab > gtexCaviar.sorted.colors.bed # Create custom as file for this bigBed: cat << '_EOF_' > dapg.as table bed "bed12+9 describing interaction between an eQTL and a target gene" ( string chrom; "Chromosome (or contig, scaffold, etc.). For interchromosomal, use 2 records" uint chromStart; "Start position of eQTL" uint chromEnd; "End position of gene" string name; "Name of item for display in pattern eQTL name/Gene Symbol/Tissue" uint score; "Score. Always 1000." char[1] strand; "+ or - for strand" uint thickStart; "eQTL start" uint thickEnd; "eQTL end" uint reserved; "Item color, based on tissue colors GTEx Gene track" int blockCount; "Number of blocks" int[blockCount] blockSizes; "Comma separated list of block sizes" int[blockCount] chromStarts; "Start positions of blocks relative to chromStart" string eqtlPos; "eQTL position" string eqtlName; "eQTL name (most often a dbSNP identifier)" string genePos; "Gene position" string geneName; "Gene symbol of target gene" string geneId; "ENSG ID of target gene" string geneStrand; "Strand of target gene" string tissue; "Tissue" double cpp; "Causal Posterior Probability (CPP) that quantifies the probability that a variant is causal" ) '_EOF_' # Build bigBed bedToBigBed -as=caviar.as -type=bed12+8 gtexCaviar.sorted.colors.bed /hive/data/genomes/hg38/chrom.sizes gtexCaviar.bb ############################################################################# # GTEx V8 cis-eQTLs DAP-G High Confidence (Apr 2022) Matt cd /hive/data/genomes/hg38/bed/gtex/V8/eQtl/finemap_dap-g # Download files from wget https://storage.googleapis.com/gtex_analysis_v8/single_tissue_qtl_data/GTEx_v8_finemapping_DAPG.tar # unpack files tar xvf GTEx_v8_finemapping_DAPG.tar # Most files in VCF, but there is which is a TSV, GTEx_v8_finemapping_DAPG.txt.gz # Sample header lines tissue_id gene_id cluster_id cluster_pip variant_id variant_pip Adipose_Subcutaneous ENSG00000000457.13 1 1.000 chr1_169891332_G_A_b38 9.881e-01 Adipose_Subcutaneous ENSG00000000457.13 1 1.000 chr1_169913898_G_A_b38 1.170e-02 Adipose_Subcutaneous ENSG00000000457.13 2 0.994 chr1_169894024_A_C_b38 1.208e-01 # That's a big file: zcat GTEx_v8_finemapping_DAPG.txt.gz |wc -l 21648585 # How can we filter that down? # Looks like they provide a filtered file, GTEx_v8_finemapping_DAPG.CS95.txt.gz, # but it's in VCF format. Can we replicate it using their TSV file? # That VCF is filtered on 'SPIP' only including those clusters with SPIP > 0.95, where # SPIP: signal-level posterior_inclusion probability (sum of the PIPs from all members of the signal cluster) # SPIP seems equivalent to the 'cluster_pip' in our TSV file # So, create a new file where we only include those entries with cluster pip > 0.95 zcat GTEx_v8_finemapping_DAPG.txt.gz | awk '$4>0.95' | gzip -c > GTEx_v8_finemapping_DAPG.pip_95.txt.gz # A much smaller file (though still quite a lot of variants!) zcat GTEx_v8_finemapping_DAPG.pip_95.txt.gz |wc -l 7608545 # Wrote buildGtexEqtlBed script to help build bed files from eQTL files: cat << '_EOF_' > buildGtexEqtlBed #!/usr/bin/env python3 import sys, gzip qtlFile = sys.argv[1] gpFile = sys.argv[2] alleleFile = sys.argv[3] qtlIdCol = sys.argv[4] # Column name in file header, e.g. eQTL probCol = sys.argv[5] # Column name in file header, e.g. Probability # DAP-G files have an extra probability column that we want to capture if len(sys.argv) > 6: clusterProbCol = sys.argv[6] else: clusterProbCol = False # Open up all of our files qfh = gzip.open(qtlFile,"r") # QTL file gfh = open(gpFile,"r") # genePred file afh = gzip.open(alleleFile,"r") # allele file # Set up dicts for each qtlDict = dict() gpDict = dict() alleleDict = dict() # Get index of field that we want to replace values in header = qfh.readline().decode('ASCII').rstrip().split("\t") qtl_col = header.index(qtlIdCol) prob_col = header.index(probCol) if clusterProbCol: clus_prob_col = header.index(clusterProbCol) # Process eQTL file for line in qfh: qtlLine = line.decode('ASCII').strip().split("\t") # Skip header line in file #if "TISSUE" not in qtlLine: tissue = qtlLine[0] gene = qtlLine[1] e_qtl = qtlLine[qtl_col] # each of these elements on their own aren't unique, but together they are qtlKey = e_qtl + "|" + tissue + "|" + gene # Put these into a dict where key is unique name we've made qtlDict[qtlKey] = qtlLine # Process genePred file for line in gfh: gpLine = line.strip().split("\t") gene_id = gpLine[0] # Put into dict with key on ENSG* ID gpDict[gene_id] = gpLine # Process allele file from GTEx for line in afh: # gzip file, so we need to decode to asii first aLine = line.decode('ASCII').strip().split("\t") # Key our dict using the allele, which is unique in the file allele = aLine[0] alleleDict[allele] = aLine for entry in qtlDict: # Get gene information from the genePred dict # eQTL file/dict has ENSG id in it gene_id = qtlDict[entry][1] if gene_id in gpDict.keys(): gene_sym = gpDict[gene_id][11] gene_chrom = gpDict[gene_id][1] gene_strand = gpDict[gene_id][2] gene_start = int(gpDict[gene_id][3]) gene_end = int(gpDict[gene_id][4]) # Check if eQTL is in allele/eQTL dictionary so that we can grab the rsID var_name = qtlDict[entry][qtl_col] if var_name in alleleDict.keys(): rsID = alleleDict[var_name][6] if rsID == '.': rsID = var_name var_start = int(alleleDict[var_name][2]) - 1 var_end = int(alleleDict[var_name][2]) chrom = alleleDict[var_name][1] # This block of if/else sets up the blockSizes/blockStart/blockCount for # various orientations of eQTLs/genes if chrom == gene_chrom: # Handles case where eQTL is internal to a gene if var_start > gene_start and var_start < gene_end: chromStart = gene_start chromEnd = gene_end blockCount = "1" blockSizes = gene_end - gene_start blockStarts = "0" strand = "." # Handles case where eQTL is downstream of gene elif var_start > gene_end: chromStart = gene_start chromEnd = var_end bs1 = gene_end - gene_start blockCount = "2" blockSizes = str(bs1) + ",1" blockStarts = "0," + str(var_start - gene_start) strand = "-" # Handles case where eQTL is upstream of gene elif var_start < gene_start: chromStart = var_start chromEnd = gene_end blockCount = "2" bs2 = gene_end - gene_start blockSizes = "1," + str(bs2) blockStarts = "0," + str(gene_start - var_start) strand = "+" # Handles case where eQTL and start of gene overlap elif var_start == gene_start: chromStart = var_start chromEnd = gene_end blockCount = "1" blockSizes = str(gene_end - var_start) blockStarts = "0" strand = "." # Handles case where eQTL and end of gene overlap elif gene_end == var_start: chromStart = gene_start chromEnd = var_end blockCount = "1" blockSizes = str(var_end - gene_start) blockStarts = "0" strand = "." # Add each element of our bed line to a list to print later bedLine = list() bedLine.append(chrom) # chrom bedLine.append(chromStart) # chrStart bedLine.append(chromEnd) # chrEnd bedLine.append(rsID + "/" + gene_sym + "/" + qtlDict[entry][0]) # name bedLine.append(int("1000"))# score bedLine.append(strand)# strand bedLine.append(var_start) # thickStart bedLine.append(var_end) # thickEnd bedLine.append("0,0,0") #color - eventually replaced by 'addTissueColors' script bedLine.append(blockCount) # blockCount bedLine.append(blockSizes) # blockSizes bedLine.append(blockStarts) # blockSizes bedLine.append(chrom + ":" + str(var_start+1) + "-" + str(var_end)) # prettified gene pos bedLine.append(rsID) # sourceName bedLine.append(chrom + ":" + str(gene_start+1) + "-" + str(gene_end)) # prettified gene pos bedLine.append(gene_sym) # geneName bedLine.append(gene_id) # geneId bedLine.append(gene_strand) # targetStrand bedLine.append(qtlDict[entry][0]) # tissue bedLine.append(qtlDict[entry][prob_col]) # CAVIAR == Causal Posterior Probability; DAPG == SNP posterior inclusion probability if clusterProbCol: # Signal-level posterior_inclusion probability (sum of the PIPs from all members of the signal cluster) bedLine.append(qtlDict[entry][clus_prob_col]) # Print bed line to stdout, can redirect to a file to save it print(*bedLine, sep="\t") '_EOF_' # Then run buildGtexEqtlBed to build an bed file ./buildGtexEqtlBed GTEx_v8_finemapping_DAPG.pip_95.txt.gz ../gencode.v26.GRCh38.genes.gpExt \ ../GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz variant_id variant_pip cluster_pip | \ bedSort stdin gtexDapg.sorted.bed # Create script to add colors to bed file cat << '_EOF_' > addTissueColors #!/usr/bin/env python3 #### # Intially created bed file w/o colors # This script will replace col 9 with tissue-specific colors #### import sys bedFile = sys.argv[1] colorFile = sys.argv[2] # Read in color file from // cfh = open(colorFile, "r") # Process this file and turn it into a dictionary colors = dict() for line in cfh: splitLine = line.strip().split("\t") # Keys in dictionary are tissue, e.g. Adipose_Subcutaneous colors[splitLine[0]] = splitLine # Read in bed file ifh = open(bedFile, "r") for line in ifh: splitLine = line.strip().split("\t") # Extract tisue information from bed file tissue = splitLine[18] # If tissue is in color dict keys, then we're good if tissue in colors.keys(): color = colors[tissue][2] # Set color field equal to tissue color splitLine[8] = color # Output modified line to stdout, can save to file by redirecting stdout to a file print(*splitLine, sep="\t") '_EOF_' # Then add colors: ./addTissueColors gtexDapg.sorted.bed ../gtexTissueColor.v8.tab > gtexDapg.sorted.colors.bed # Create custom as file for this bigBed: cat << '_EOF_' > dapg.as table bed "bed12+9 describing interaction between an eQTL and a target gene" ( string chrom; "Chromosome (or contig, scaffold, etc.). For interchromosomal, use 2 records" uint chromStart; "Start position of eQTL" uint chromEnd; "End position of gene" string name; "Name of item for display in pattern eQTL name/Gene Symbol/Tissue" uint score; "Score. Always 1000." char[1] strand; "+ or - for strand" uint thickStart; "eQTL start" uint thickEnd; "eQTL end" uint reserved; "Item color, based on tissue colors GTEx Gene track" int blockCount; "Number of blocks" int[blockCount] blockSizes; "Comma separated list of block sizes" int[blockCount] chromStarts; "Start positions of blocks relative to chromStart" string eqtlPos; "eQTL position" string eqtlName; "eQTL name (most often a dbSNP identifier)" string genePos; "Gene position" string geneName; "Gene symbol of target gene" string geneId; "ENSG ID of target gene" string geneStrand; "Strand of target gene" string tissue; "Tissue" double pip; "SNP posterior inclusion probability (higher PIP value indicates the SNP is more likely to be the casual eQTL)" double clusterPip; "Signal-level posterior inclusion probability (sum of the PIPs from all members of the signal cluster)" ) '_EOF_' # Build bigBed bedToBigBed -as=dapg.as -type=bed12+9 gtexDapg.sorted.colors.bed /hive/data/genomes/hg38/chrom.sizes gtexDapg.bb + +############################################################################# +# GTEx V9 Single Nuclei Expression August 31, 2022 ChrisL +############################################################################# +For building single cell rna-seq tracks from this paper: +https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9383269/ + +Files: +https://storage.googleapis.com/gtex_analysis_v9/snrna_seq_data/GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad + +WORKDIR=/hive/data/outside/GTEx/V9/snRnaSeq/ +mkdir -p $WORKDIR && cd $WORKDIR + +wget https://storage.googleapis.com/gtex_analysis_v9/snrna_seq_data/GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad + +# Turn scanpy files into matrices and metadata: +time hcaUnpack5 GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad atlasOut +# Data on 209126 cells x 17695 genes +# Sparse matrix with 173681687 elements +# outputting 17695 row matrix, a dot every 100 rows +# +# real 3m24.718s +# user 2m39.348s +# sys 0m27.629s + +# Look at the column summaries to see what is interesting: +./getSimpleMetaStats atlasOut/meta.tsv | less -S +cd atlasOut +mkdir bbi +mkdir clust +~/bin/x86_64/matrixClusterColumns -makeIndex=clust/exprMatrix.ix exprMatrix.tsv meta.tsv \ + prep clust/prepMatrix.tsv clust/prepStats.tsv \ + "Tissue Composition" clust/TissueCompMatrix.tsv clust/TissueCompStats.tsv \ + "Tissue Site Detail" clust/TissueSiteDetailMatrix.tsv clust/TissueSiteDetailStats.tsv \ + "Sample Ischemic Time (mins)" clust/SampleIschemicTimeMatrix.tsv clust/SampleIschemicTimeStats.tsv \ + "Granular cell type" clust/GranularMatrix.tsv clust/GranularStats.tsv \ + "Broad cell type" clust/BroadCellTypeMatrix.tsv clust/BroadCellTypeStats.tsv \ + Age_bin clust/Age_binMatrix.tsv clust/Age_binStats.tsv \ + Sex clust/SexMatrix.tsv clust/SexStats.tsv + +# Figure out gene set +gencodeVersionForGenes gene.lst /hive/data/inside/geneSymVerTx.tsv -bed=mapping.bed + +# Turn into bigBarCharts +for s in prep TissueComp TissueSiteDetail SampleIschemicTime Granular BroadCellType Age_bin Sex +do + matrixToBarChartBed clust/${s}Matrix.tsv mapping.bed clust/${s}.bed -stats=clust/${s}Stats.tsv -trackDb=clust/${s}.ra + bedSort clust/${s}.bed clust/${s}.bed + bedToBigBed clust/${s}.bed /hive/data/genomes/hg38/chrom.sizes bbi/${s}.bb -type=bed6+3 -as=${HOME}/kent/src/hg/lib/simpleBarChartBed.as +done + +# Make a combined column for facetting +tawk '{print $0,$31 "-" $46}' meta.tsv > meta.tsv.plusCombined +~/bin/x86_64/matrixClusterColumns exprMatrix.tsv meta.tsv.plusCombined \ + "Tissue-Broad cell type" clust/combined.tsv clust/combinedStats.tsv +matrixToBarChartBed clust/combined.tsv mapping.bed clust/combined.bed -stats=clust/combinedStats.tsv -trackDb=clust/combined.ra +bedSort clust/combined.bed clust/combined.bed +bedToBigBed clust/combined.bed /hive/data/genomes/hg38/chrom.sizes bbi/combined.bb -type=bed6+3 -as=${HOME}/kent/src/hg/lib/simpleBarChartBed.as + +# now un extract the original combined columns back into the counted up meta.tsv +# which will give you the columns to facet on +echo -e "#Organ and Cell Type count total Organ Broad_Cell_Type" > combinedStats.withFacets +paste <(tail -n +2 clust/combinedStats.tsv) <(tail -n +2 meta.tsv | cut -f31,46 | sort -u) >> combinedStats.withFacets +# manually make cells.colors by taking colors for Broad Cell Type from the paper +# make categories file for trackDb +join -t$'\t' -1 5 -2 1 <(grep -v "^#" combinedStats.withFacets | sort -t$'\t' -k5) ../cells.colors | cut -f2,6 | sort > atlasOut.categories +# Make stats file for trackDb +echo -e "#Organ and Cell Type count total Organ Broad_Cell_Type color" > combinedStats.withFacets.withColors.tsv +join -t$'\t' -1 5 -2 1 <(grep -v "^#" combinedStats.withFacets | sort -t$'\t' -k5) ../cells.colors | tawk '{print $2,$3,$4,$5,$1,$6}' | sort >> combinedStats.withFacets.withColors.tsv + +# link the tracks +cd /gbdb/hg38/gtex +mkdir snRnaSeq +ln -s /hive/data/outside/gtex/V9/snRnaSeq/atlasOut/bbi/*.bb .