d96751b9e97f16417a020a9f6356432e6d102bc1 kate Wed Apr 29 10:21:59 2020 -0700 Correct tissue description, as per GTEx portal page for V8. refs #25130 diff --git src/hg/makeDb/doc/gtex/V6.txt src/hg/makeDb/doc/gtex/V6.txt index b7d5aca..3755d14 100644 --- src/hg/makeDb/doc/gtex/V6.txt +++ src/hg/makeDb/doc/gtex/V6.txt @@ -1,129 +1,143 @@ # Download and load tissue expression and sample metadata for GTEx V6 (October 2015) from portal: # gtexportal.org # 11/2015 KRR # Reloaded gtexSampleData table to restore zero-scored rows (2016-03-21 Kate) # Download normalized gene expression levels (RPKM) wget http://www.gtexportal.org/static/datasets/gtex_analysis_v6/rna_seq_data/GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct.gz gunzip $dataFile.gz wc -l $dataFile ../V4/*.gct # 56321 GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct # 55996 ../V4/GTEx_Analysis_2014-01-17_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct # Download subject and sample metadata and compare to V4 wget http://www.gtexportal.org/static/datasets/gtex_analysis_v6/annotations/GTEx_Data_V6_Annotations_SampleAttributesDD.xlsx wget http://www.gtexportal.org/static/datasets/gtex_analysis_v6/annotations/GTEx_Data_V6_Annotations_SampleAttributesDS.txt wc -l *Sample*txt ../V4/*Sample*txt 11984 GTEx_Data_V6_Annotations_SampleAttributesDS.txt 4502 ../V4/GTEx_Data_2014-01-17_Annotations_SampleAttributesDS.txt wget http://www.gtexportal.org/static/datasets/gtex_analysis_v6/annotations/GTEx_Data_V6_Annotations_SubjectPhenotypesDS.txt wget http://www.gtexportal.org/static/datasets/gtex_analysis_v6/annotations/GTEx_Data_V6_Annotations_SubjectPhenotypes_DD.xlsx wc --l *Subject*txt ../V4/*Subject*txt 571 GTEx_Data_V6_Annotations_SubjectPhenotypesDS.txt 215 ../V4/GTEx_Data_2014-01-17_Annotations_SubjectPhenotypes_DS.txt # Twice as many donors and samples # NOTE: format of subject file has changed slightly -- no longer including the word 'years' in age column. Sample file format appears unchanged. 
# Parser looks like it will work unchanged. # Start with tissues hgGtex $dataFile $sampleFile tissues.tab >&! parseTissues.log & # Looks like no changes from V4 # Download gene models wget http://www.gtexportal.org/static/datasets/gtex_analysis_v6/reference/gencode.v19.genes.patched_contigs.gtf.gz # Create main tables set subjectFile = GTEx_Data_V6_Annotations_SubjectPhenotypesDS.txt set sampleFile = GTEx_Data_V6_Annotations_SampleAttributesDS.txt set dataFile = GTEx_Analysis_v6_RNA-seq_RNA-SeQCv1.1.8_gene_rpkm.gct set tissueFile = ../V4/portal/gtexColorTissue.dec.tab hgGtex -tab=tables -noLoad gtexV6 V6 $dataFile $sampleFile $subjectFile $tissueFile >&! parseData.log & ls -l tables -rw-rw-r-- 1 kate genecats 9968 Dec 22 16:50 gtexV6Donor.tab -rw-rw-r-- 1 kate genecats 14 Dec 22 17:00 gtexV6Info.tab -rw-rw-r-- 1 kate genecats 1164893 Dec 22 16:50 gtexV6Sample.tab -rw-rw-r-- 1 kate genecats 30090841561 Dec 22 17:00 gtexV6SampleData.tab -rw-rw-r-- 1 kate genecats 184672753 Dec 22 17:00 gtexV6TissueData.tab -rw-rw-r-- 1 kate genecats 19436982 Dec 22 17:00 gtexV6TissueMedian.tab $ wc -l gtex*.tab 570 gtexV6Donor.tab 1 gtexV6Info.tab 8555 gtexV6Sample.tab 481800490 gtexV6SampleData.tab 2984854 gtexV6TissueData.tab 56318 gtexV6TissueMedian.tab # NOTE: half of the scores are zero valued. Keeping -- if we drop them code must # be adapted so median computation and sample counts are correct select count(*) from gtexSampleData where score=0; +-----------+ | count(*) | +-----------+ | 242219267 | +-----------+ # keeping these instructions for reference, but we will keep zeros #hgsql hgFixed -e 'alter table gtexSampleData disable keys; delete from gtexSampleData where score=0; alter table gtexSampleData enable keys' # 30 minutes or so # looks good, load tables hgGtex gtexV6 V6 $dataFile $sampleFile $subjectFile $tissueFile >&! parseData2.log & # merge gtexV6Info into gtexInfo (i.e. 
add a row for V6) hgsql hgFixed -e 'select * from gtexInfoV6'; hgsql hgFixed -e "insert into gtexInfo set version='V6', releaseDate='2015-10-01', maxMedianScore=711778" # Fix sample table (V6 format changed) # Required changes to hgGtex parser # 2016-03-01 kate hgGtex -tab=newtables -noData -noLoad gtex2V6 V6 $dataFile $sampleFile $subjectFile $tissueFile hgLoadSqlTab hgFixed gtexSample ~kate/kent/src/hg/lib/gtexSample.sql \ newtables/gtex2V6Sample.tab set dir = tables.2016-03-22 mkdir $dir set hgGtex = ~kate/kent/src/hg/makeDb/outside/hgGtex/hgGtex $hgGtex -tab=$dir -noLoad gtex4V6 V6 $dataFile $sampleFile $subjectFile $tissueFile -verbose=2 >&! parseData4.log & ls -l $dir wc -l $dir 570 gtex4V6Donor.tab 1 gtex4V6Info.tab 8555 gtex4V6Sample.tab 481800490 gtex4V6SampleData.tab 2984854 gtex4V6TissueData.tab 56318 gtex4V6TissueMedianAll.tab 56318 gtex4V6TissueMedianFemale.tab 56318 gtex4V6TissueMedianMale.tab # looks OK (SampleData table same size), so load it cd $dir hgLoadSqlTab hgFixed gtexSampleDataV6_full ~kate/kent/src/hg/lib/gtexSampleData.sql \ $dir/gtex4V6SampleData.tab hgLoadSqlTab hgFixed gtexTissueMedianV6 ~kate/kent/src/hg/lib/gtexTissueMedian.sql \ $dir/gtex4V6TissueMedianAll.tab ############ # Add GTEX consortium abbreviation column to tissue table (e.g. 
for ASE) cd /hive/data/outside/gtex/V6/metadata hgsql hgFixed -e 'select * from gtexTissue' > gtexTissue.old.tab awk -F"\t" 'NR>1 {print $2}' gtex_tissue_colors.txt | paste gtexTissue.old.tab - > gtexTissue.new.tab hgLoadSqlTab hgFixed gtexTissueNew ~kate/kent/src/hg/lib/gtexTissue.sql $dir/gtexTissue.new.tab hgsql hgFixed -e 'alter table gtexTissueV6 rename to gtexTissueV6Old; alter table gtexTissueNew rename to gtexTissueV6' # Test GTEx track on hgwdev still works with new table +############ +# Correct cell type description, as per V8 portal page +# 4/29/20 KRR + +MariaDB [hgFixed]> select * from gtexTissue where name like '%fibro%'; ++----+--------------------+---------------------------------+-------+----------+---------+ +| id | name | description | organ | color | abbrev | ++----+--------------------+---------------------------------+-------+----------+---------+ +| 22 | xformedfibroblasts | Cells - Transformed fibroblasts | Skin | 10141901 | FIBRBLS | ++----+--------------------+---------------------------------+-------+----------+---------+ +1 row in set (0.00 sec) + +MariaDB [hgFixed]> update gtexTissue set description="Cells - Cultured fibroblasts" where id=22; +