src/hg/makeDb/doc/hg19.txt 1.67
1.67 2009/12/12 04:45:10 hartera
Added documentation for making track of Burge lab RNA-seq reads mapped by GEM mapper.
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.66
retrieving revision 1.67
diff -b -B -U 4 -r1.66 -r1.67
--- src/hg/makeDb/doc/hg19.txt 7 Dec 2009 17:42:13 -0000 1.66
+++ src/hg/makeDb/doc/hg19.txt 12 Dec 2009 04:45:10 -0000 1.67
@@ -7267,9 +7267,74 @@
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
-
+############################################################################
+# BURGE LAB DATA MAPPED WITH GEMMAPPER. PROVIDED BY THOMAS DERRIEN FROM RODERIC
+# GUIGO'S LAB AT CRG. (E-MAIL: thomas.derrien@crg.es). Data received on
+# 09/14/09.
+# (hartera, 2009-09-28, DONE)
+ mkdir /hive/groups/gencode/browser/hg19/burgeRnaSeqGemMapperAlign
+ cd /hive/groups/gencode/browser/hg19/burgeRnaSeqGemMapperAlign
+
+# The wget statements below were added to a script so that it can be run to
+# fetch all the data files.
+
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325476_brain_HCT168_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325477_liver_HCT169_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325478_heart_HCT170_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325479_skelMuscle_HCT171_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325480_colon_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325481_adipose_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325482_testes_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325483_lymphNode_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325484_HCT204_bt474_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325485_HCT205_HME_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325486_HCT202_s2468_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325487_HCT203_s2468.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325488_HCT206_s2468_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325489_HCT207_s2468_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+
+ # Load this data into tables for hg19.
+ # Unzip the files:
+ gunzip *.gff.gz
+ # Create a file with the list of file names and tissues.
+ ls *.gff > burgeDataFiles.txt
+ GSM325486_HCT202_s2468 breast
+ GSM325487_HCT203_s2468 MCF-7
+ GSM325488_HCT206_s2468 MB435
+ GSM325489_HCT207_s2468 T47D
+ # The following two samples were not mapped as their reads are not 32 bp.
+ GSM325490_brain_s1368 MAQC mixed human brain tissue/cell lines
+ GSM325491_UHR_s247 MAQC_UHR mixed human cell lines
+ # Edit the file above so that the file name and tissue name are separated
+ # by a tab. Then remove the "read_name: " prefix from the last field in each
+ # file, otherwise it gets included in the name, and load the data into hg19.
+ # Write a script to do this:
+cat << '_EOF_' > formatAndLoadData
+#!/bin/bash -e
+
+# Assign variables
+# Tab-separated file of file names and tissue/cell line names
+DATAFILES=$1
+# track name used as prefix for subtracks
+TRACK=$2
+# database
+DATABASE=$3
+
+cat $DATAFILES | while read file tissue; do
+ subTrack=`echo $TRACK$tissue`
+ echo $subTrack
+ sed -e 's/read_name:\s//' $file > ${subTrack}.gff
+ ldHgGene -exon=read $DATABASE ${subTrack} ${subTrack}.gff
+done
+'_EOF_'
+ chmod +x formatAndLoadData
+ ./formatAndLoadData burgeDataFiles.txt burgeRnaSeqGemMapperAlign hg19 \
+ >& load.log &
+ # Loading the tables took about 2 hours.
+ # Copy the trackDb entry from the hg18 trackDb.ra to
+ # ccds/trunk/gencode/browser/trackDb/human/hg19/trackDb.ra
##########################################################################
# BUILD ALLEN BRAIN TRACK (DONE 09/30/09 kent)
# Make the working directory