a2ad2a8f3e71fe90a3c335f967ff3fcec9d37296
kate
  Thu Apr 23 10:38:28 2020 -0700
Initial work for GTEx V8 gene expression track: parse files and load gene expression and metadata tables. refs #25130

diff --git src/hg/makeDb/doc/gtex/V8.txt src/hg/makeDb/doc/gtex/V8.txt
new file mode 100644
index 0000000..817c0fc
--- /dev/null
+++ src/hg/makeDb/doc/gtex/V8.txt
@@ -0,0 +1,149 @@
+# Download and load GTEx V8 (released August 2019) from portal:
+#      gtexportal.org
+# 2/20 KRR
+
+cd /hive/data/outside/GTEx
+
+# Download normalized gene expression levels (TPM)
+
+mkdir -p V8/rnaSeq
+cd V8/rnaSeq
+
+wget -nd https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz
+set dataFile = GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct
+gunzip $dataFile.gz
+wc -l $dataFile
+# 56203 GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct
+
+# Download subject and sample metadata and compare to V6
+
+wget -nd https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
+wget -nd https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDD.xlsx
+
+wc -l *Sample*txt ../../V6/*Sample*txt
+# 22952 GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
+# 11984 GTEx_Data_V6_Annotations_SampleAttributesDS.txt
+
+# nearly 2x as many samples
+
+wget -nd https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt
+wget -nd https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SubjectPhenotypesDD.xlsx
+
+wc -l *Phenotypes*.txt ../../V6/*Phenotypes*.txt
+# 981 GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt
+# 571 ../../V6/GTEx_Data_V6_Annotations_SubjectPhenotypesDS.txt
+
+# nearly 2x subjects
+
+# Download gene models
+wget -nd  https://storage.googleapis.com/gtex_analysis_v8/reference/gencode.v26.GRCh38.genes.gtf
+
+# Parse and load
+# 4/20 KRR
+
+set subjectFile = GTEx_Analysis_v8_Annotations_SubjectPhenotypesDS.txt
+set sampleFile = GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
+
+set hgGtex = ~kate/kent/src/hg/makeDb/outside/hgGtex/hgGtex
+set dir = tables
+mkdir $dir
+
+# test run -- metadata, and just a few rows of data
+# (NOTE: $tissueFile must be set first; see 'set tissueFile' below)
+$hgGtex -tab=$dir -noLoad -limit=2 gtexV8 V8 $dataFile $sampleFile $subjectFile $tissueFile -verbose=2 >&! hgGtex.log &
+
+# NOTE: change to tissues in sample descriptions... (field 7)
+# Changed fibroblasts description, and added Kidney - Medulla (4 samples)
+diff sampleTissues.txt /hive/data/outside/GTEx/V6
+0a1
+> 
+22d22
+< Cells - Cultured fibroblasts
+24a25
+> Cells - Transformed fibroblasts
+36d36
+< Kidney - Medulla
+
+# Manually update tissueFile, with version that includes GTEx abbrevs
+cp ../../V6/metadata/gtexTissue.new.tab gtexColorTissue.tab
+# edit
+set tissueFile = gtexColorTissue.tab
+
+# sanity check files
+cd tables
+ls -l
+-rw-rw-r-- 1 kate genecats       17346 Apr 21 16:33 gtexV8Donor.tab
+-rw-rw-r-- 1 kate genecats          14 Apr 21 16:53 gtexV8Info.tab
+-rw-rw-r-- 1 kate genecats     3062798 Apr 21 16:33 gtexV8Sample.tab
+-rw-rw-r-- 1 kate genecats 61521590867 Apr 21 16:53 gtexV8SampleData.tab
+-rw-rw-r-- 1 kate genecats   189120561 Apr 21 16:53 gtexV8TissueData.tab
+-rw-rw-r-- 1 kate genecats    19937702 Apr 21 16:53 gtexV8TissueMedianAll.tab
+-rw-rw-r-- 1 kate genecats    19913568 Apr 21 16:53 gtexV8TissueMedianFemale.tab
+-rw-rw-r-- 1 kate genecats    19871866 Apr 21 16:53 gtexV8TissueMedianMale.tab
+
+wc -l gtex*.tab
+        980 gtexV8Donor.tab
+          1 gtexV8Info.tab
+      17382 gtexV8Sample.tab
+  976868400 gtexV8SampleData.tab
+    3034800 gtexV8TissueData.tab
+      56200 gtexV8TissueMedianAll.tab
+      56200 gtexV8TissueMedianFemale.tab
+      56200 gtexV8TissueMedianMale.tab
+
+# peruse Info.tab, first lines of others
+
+# Look OK, so load into database
+
+$hgGtex -tab=$dir gtexV8 V8 $dataFile $sampleFile $subjectFile $tissueFile -verbose=2 >&! load.log &
+
+[hgFixed]> show tables like 'gtexV8%';
++-----------------------------+
+| Tables_in_hgFixed (gtexV8%) |
++-----------------------------+
+| gtexV8Donor                 |
+| gtexV8Info                  |
+| gtexV8Sample                |
+| gtexV8SampleData            |
+| gtexV8Tissue                |
+| gtexV8TissueMedianAll       |
+| gtexV8TissueMedianFemale    |
+| gtexV8TissueMedianMale      |
++-----------------------------+
+
+# merge gtexV8Info into gtexInfo (i.e. add a row for V8)
+hgsql hgFixed -e 'select * from gtexV8Info';
++---------+-------------+----------+----------------+
+| version | releaseDate | maxScore | maxMedianScore |
++---------+-------------+----------+----------------+
+| V8      | 0           |        0 |         747400 |
++---------+-------------+----------+----------------+
+
+# get release date from GTEx portal
+hgsql hgFixed -e "insert into gtexInfo set version='V8', releaseDate='2019-08-26', maxMedianScore=747400"
+
+# Rename tables for use by hgGtexGeneBed
+
+hgsql hgFixed -e 'alter table gtexV8TissueMedianAll rename to gtexTissueMedianV8';
+
+#######################################
+# Download fine-mapped eQTLs
+
+cd ../
+mkdir eQtl
+cd eQtl
+wget -nd https://storage.googleapis.com/gtex_analysis_v8/single_tissue_qtl_data/README_eQTL_v8.txt
+
+wget -nd https://storage.googleapis.com/gtex_analysis_v8/single_tissue_qtl_data/GTEx_v8_finemapping_CAVIAR.tar
+
+tar xvf GTEx_v8_finemapping_CAVIAR.tar
+ln -s GTEx_v8_finemapping_CAVIAR caviar
+cd caviar
+
+# Download variant mapping and VEP annotation
+
+wget -nd https://storage.googleapis.com/gtex_analysis_v8/reference/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table.txt.gz
+
+wget -nd https://storage.googleapis.com/gtex_analysis_v8/reference/WGS_Feature_overlap_collapsed_VEP_short_4torus.MAF01.txt.gz
+
+gunzip *.gz
+