ab7847c47ac5507a7e126691bb78826ed4ba345d chmalee Mon Sep 13 16:34:47 2021 -0700 Staging GTEx single nuclei gene expression track, refs #29954 diff --git src/hg/makeDb/doc/hg38/gtex.txt src/hg/makeDb/doc/hg38/gtex.txt index 9f9dbaf..047c6d0 100644 --- src/hg/makeDb/doc/hg38/gtex.txt +++ src/hg/makeDb/doc/hg38/gtex.txt @@ -695,15 +695,89 @@ int[blockCount] chromStarts; "Start positions of blocks relative to chromStart" string eqtlPos; "eQTL position" string eqtlName; "eQTL name (most often a dbSNP identifier)" string genePos; "Gene position" string geneName; "Gene symbol of target gene" string geneId; "ENSG ID of target gene" string geneStrand; "Strand of target gene" string tissue; "Tissue" double pip; "SNP posterior inclusion probability (higher PIP value indicates the SNP is more likely to be the casual eQTL)" double clusterPip; "Signal-level posterior inclusion probability (sum of the PIPs from all members of the signal cluster)" ) '_EOF_' # Build bigBed bedToBigBed -as=dapg.as -type=bed12+9 gtexDapg.sorted.colors.bed /hive/data/genomes/hg38/chrom.sizes gtexDapg.bb + +############################################################################# +# GTEx V9 Single Nuclei Expression August 31, 2022 ChrisL +############################################################################# +For building single cell rna-seq tracks from this paper: +https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9383269/ + +Files: +https://storage.googleapis.com/gtex_analysis_v9/snrna_seq_data/GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad + +WORKDIR=/hive/data/outside/GTEx/V9/snRnaSeq/ +mkdir -p $WORKDIR && cd $WORKDIR + +wget https://storage.googleapis.com/gtex_analysis_v9/snrna_seq_data/GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad + +# Turn scanpy files into matrices and metadata: +time hcaUnpack5 GTEx_8_tissues_snRNAseq_atlas_071421.public_obs.h5ad atlasOut +# Data on 209126 cells x 17695 genes +# Sparse matrix with 173681687 elements +# outputting 17695 row matrix, a dot every 100 rows +# +# real 3m24.718s +# user 2m39.348s +# sys 0m27.629s + +# Look at the column summaries to see what is interesting: +./getSimpleMetaStats atlasOut/meta.tsv | less -S +cd atlasOut +mkdir bbi +mkdir clust +~/bin/x86_64/matrixClusterColumns -makeIndex=clust/exprMatrix.ix exprMatrix.tsv meta.tsv \ + prep clust/prepMatrix.tsv clust/prepStats.tsv \ + "Tissue Composition" clust/TissueCompMatrix.tsv clust/TissueCompStats.tsv \ + "Tissue Site Detail" clust/TissueSiteDetailMatrix.tsv clust/TissueSiteDetailStats.tsv \ + "Sample Ischemic Time (mins)" clust/SampleIschemicTimeMatrix.tsv clust/SampleIschemicTimeStats.tsv \ + "Granular cell type" clust/GranularMatrix.tsv clust/GranularStats.tsv \ + "Broad cell type" clust/BroadCellTypeMatrix.tsv clust/BroadCellTypeStats.tsv \ + Age_bin clust/Age_binMatrix.tsv clust/Age_binStats.tsv \ + Sex clust/SexMatrix.tsv clust/SexStats.tsv + +# Figure out gene set +gencodeVersionForGenes gene.lst /hive/data/inside/geneSymVerTx.tsv -bed=mapping.bed + +# Turn into bigBarCharts +for s in prep TissueComp TissueSiteDetail SampleIschemicTime Granular BroadCellType Age_bin Sex +do + matrixToBarChartBed clust/${s}Matrix.tsv mapping.bed clust/${s}.bed -stats=clust/${s}Stats.tsv -trackDb=clust/${s}.ra + bedSort clust/${s}.bed clust/${s}.bed + bedToBigBed clust/${s}.bed /hive/data/genomes/hg38/chrom.sizes bbi/${s}.bb -type=bed6+3 -as=${HOME}/kent/src/hg/lib/simpleBarChartBed.as +done + +# Make a combined column for facetting +tawk '{print $0,$31 "-" $46}' meta.tsv > meta.tsv.plusCombined +~/bin/x86_64/matrixClusterColumns exprMatrix.tsv meta.tsv.plusCombined \ + "Tissue-Broad cell type" clust/combined.tsv clust/combinedStats.tsv +matrixToBarChartBed clust/combined.tsv mapping.bed clust/combined.bed -stats=clust/combinedStats.tsv -trackDb=clust/combined.ra +bedSort clust/combined.bed clust/combined.bed +bedToBigBed clust/combined.bed /hive/data/genomes/hg38/chrom.sizes bbi/combined.bb -type=bed6+3 -as=${HOME}/kent/src/hg/lib/simpleBarChartBed.as + +# now un extract the original combined columns back into the counted up meta.tsv +# which will give you the columns to facet on +echo -e "#Organ and Cell Type count total Organ Broad_Cell_Type" > combinedStats.withFacets +paste <(tail -n +2 clust/combinedStats.tsv) <(tail -n +2 meta.tsv | cut -f31,46 | sort -u) >> combinedStats.withFacets +# manually make cells.colors by taking colors for Broad Cell Type from the paper +# make categories file for trackDb +join -t$'\t' -1 5 -2 1 <(grep -v "^#" combinedStats.withFacets | sort -t$'\t' -k5) ../cells.colors | cut -f2,6 | sort > atlasOut.categories +# Make stats file for trackDb +echo -e "#Organ and Cell Type count total Organ Broad_Cell_Type color" > combinedStats.withFacets.withColors.tsv +join -t$'\t' -1 5 -2 1 <(grep -v "^#" combinedStats.withFacets | sort -t$'\t' -k5) ../cells.colors | tawk '{print $2,$3,$4,$5,$1,$6}' | sort >> combinedStats.withFacets.withColors.tsv + +# link the tracks +cd /gbdb/hg38/gtex +mkdir snRnaSeq +ln -s /hive/data/outside/gtex/V9/snRnaSeq/atlasOut/bbi/*.bb .