src/hg/makeDb/doc/hg19.txt 1.67

1.67 2009/12/12 04:45:10 hartera
Added documentation for making track of Burge lab RNA-seq reads mapped by GEM mapper.
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.66
retrieving revision 1.67
diff -b -B -U 4 -r1.66 -r1.67
--- src/hg/makeDb/doc/hg19.txt	7 Dec 2009 17:42:13 -0000	1.66
+++ src/hg/makeDb/doc/hg19.txt	12 Dec 2009 04:45:10 -0000	1.67
@@ -7267,9 +7267,74 @@
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
 
 see doc/builds.txt for specific details.
-
+############################################################################
+# BURGE LAB DATA MAPPED WITH GEMMAPPER. PROVIDED BY THOMAS DERRIEN FROM RODERIC
+# GUIGO'S LAB AT CRG. (E-MAIL: thomas.derrien@crg.es). Data received on
+# 09/14/09. 
+# (hartera, 2009-09-28, DONE) 
+   mkdir /hive/groups/gencode/browser/hg19/burgeRnaSeqGemMapperAlign
+   cd /hive/groups/gencode/browser/hg19/burgeRnaSeqGemMapperAlign
+
+# Added the statements below to a script so that it can be run to fetch
+# all the sequences.
+
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325476_brain_HCT168_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325477_liver_HCT169_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325478_heart_HCT170_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325479_skelMuscle_HCT171_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325480_colon_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325481_adipose_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325482_testes_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325483_lymphNode_hg18.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325484_HCT204_bt474_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325485_HCT205_HME_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325486_HCT202_s2468_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325487_HCT203_s2468.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325488_HCT206_s2468_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+wget --timestamping "http://genome.imim.es/~tderrien/Gencode/For_Rachel/GFF_Burge_32bp_Gencode3/GSM325489_HCT207_s2468_.fa.GEM_index.GRCh37.gencode.v3.annotation.GRCh37.exon.gtf.length_32.gff.gz"
+  
+   # Load this data into tables for hg19.
+   # Unzip the files:
+   gunzip *.gff.gz 
+   # Create a file with the list of file names and tissues.
+   ls *.gff > burgeDataFiles.txt
+   GSM325486_HCT202_s2468	breast
+   GSM325487_HCT203_s2468	MCF-7
+   GSM325488_HCT206_s2468	MB435 
+   GSM325489_HCT207_s2468	T47D
+   # Did not map these two as they are not 32 bp. 
+   GSM325490_brain_s1368	MAQC	mixed human brain tissue/cell lines
+   GSM325491_UHR_s247		MAQC_UHR mixed human cell lines
+   # Edit the file above to add a tab separation between file name and tissue
+   # name. Then remove the "read_name: " prefix from the last field in each
+   # file (otherwise it gets included in the name) and load the data into hg19.
+   # Write a script to do this: 
+cat << '_EOF_' > formatAndLoadData
+#!/bin/bash -e
+   
+# Assign variables
+# Tab-separated file of file names and tissue/cell line names
+DATAFILES=$1
+# track name used as prefix for subtracks
+TRACK=$2
+# database
+DATABASE=$3
+
+cat $DATAFILES | while read file tissue; do
+    subTrack=`echo $TRACK$tissue`
+    echo $subTrack   
+    sed -e 's/read_name:\s//' $file > ${subTrack}.gff
+    ldHgGene -exon=read $DATABASE ${subTrack} ${subTrack}.gff 
+done
+'_EOF_'
+   chmod +x formatAndLoadData
+   ./formatAndLoadData burgeDataFiles.txt burgeRnaSeqGemMapperAlign hg19 \
+     >& load.log &
+   # Took about 2 hours to load the tables.
+   # Copy the trackDb entry from the hg18 trackDb.ra to
+   # ccds/trunk/gencode/browser/trackDb/human/hg19/trackDb.ra
 ##########################################################################
 # BUILD ALLEN BRAIN TRACK (DONE 09/30/09 kent)
 
 # Make the working directory