src/hg/makeDb/doc/ce8.txt 1.1

1.1 2009/07/24 22:54:44 hiram
Initial build through repeat masking and kluster scratch data populating
Index: src/hg/makeDb/doc/ce8.txt
===================================================================
RCS file: src/hg/makeDb/doc/ce8.txt
diff -N src/hg/makeDb/doc/ce8.txt
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/makeDb/doc/ce8.txt	24 Jul 2009 22:54:44 -0000	1.1
@@ -0,0 +1,167 @@
+# for emacs: -*- mode: sh; -*-
+
+# Caenorhabditis elegans
+# Washington University School of Medicine GSC and Sanger Institute WS204
+
+#  $Id$
+
+#########################################################################
+# DOWNLOAD SEQUENCE (DONE - 2009-07-22 - Hiram)
+    mkdir /hive/data/genomes/ce8
+    cd /hive/data/genomes/ce8
+    mkdir ws204
+    cd ws204
+TOP=/hive/data/genomes/ce8/ws204
+export TOP
+for D in annotation genome_feature_tables/GFF2 \
+        genome_feature_tables/SUPPLEMENTARY_GFF sequences/dna \
+        sequences/protein sequences/rna
+do
+    mkdir -p ${D}
+    cd ${D}
+    wget --timestamping \
+ftp://ftp.sanger.ac.uk/pub2/wormbase/WS204/genomes/c_elegans/${D}/*.*
+    cd ${TOP}
+done
+    #	about 1h24m
+
+#########################################################################
+# NORMALIZE SEQUENCE NAMES TO BEGIN WITH chr (DONE - 2009-07-24 - Hiram)
+    mkdir /hive/data/genomes/ce8/sanger
+    cd /hive/data/genomes/ce8/sanger
+    # Fix fasta names:
+    zcat ../ws204/sequences/dna/CHR*[XVAI].dna.gz \
+    | sed -e '/^$/ d;  s/^>CHROMOSOME_MtDNA/>chrM/;  s/^>CHROMOSOME_/>chr/;' \
+    | gzip -c > UCSC.fa.gz
+    faSize -detailed UCSC.fa.gz
+# chrI    15072421
+# chrII   15279323
+# chrIII  13783685
+# chrIV   17493784
+# chrM    13794
+# chrV    20924143
+# chrX    17718854
+
+    # chrII(-1) and chrIII(+3) differ slightly in size from WS200
+    # Make sure we get the same sizes from this command:
+    zcat ../ws204/sequences/dna/CHR*[XVAI].dna.gz | sed -e '/^$/ d;' \
+	| faSize -detailed stdin
+
+    faCount UCSC.fa.gz
+#seq    len     A       C       G       T       N       cpg
+# chrI    15072421        4835939 2695879 2692150 4848453 0       503521
+# chrII   15279323        4878195 2769216 2762198 4869714 0       492149
+# chrIII  13783685        4444652 2449141 2466322 4423570 0       459669
+# chrIV   17493784        5711040 3034767 3017008 5730969 0       522372
+# chrM    13794   4335    1225    2055    6179    0       110
+# chrV    20924143        6750393 3712058 3701397 6760295 0       638983
+# chrX    17718854        5747199 3119702 3117868 5734085 0       514715
+# total   100286004       32371753  17781988  17758998  32373265  0  3131519
+
+#  WS200:
+# chrII   15279324        4878196 2769216 2762198 4869714 0       492149
+# chrIII  13783682        4444652 2449139 2466321 4423570 0       459669
+# total   100286002       32371754  17781986  17758997  32373265  0  3131519
+
+    # Fix AGP names:
+    sed -e 's/^/chr/' ../ws204/sequences/dna/CHR*.agp > UCSC.agp
+    # And add a fake mitochondrial AGP entry for the sake of downstream
+    # tools (make sure the GenBank sequence is identical to the given one):
+    echo -e "chrM\t1\t13794\t1\tF\tNC_001328.1\t1\t13794\t+" >> UCSC.agp
+
+#########################################################################
+# run the makeGenomeDb procedure to create the db and unmasked sequence
+#	(DONE - 2009-07-22 - Hiram)
+    cd /hive/data/genomes/ce8
+    cat << '_EOF_' > ce8.config.ra
+# Config parameters for makeGenomeDb.pl:
+db ce8
+clade worm
+genomeCladePriority 10
+scientificName Caenorhabditis elegans
+commonName C. elegans
+assemblyDate Jun 2009
+assemblyLabel Washington University School of Medicine GSC and Sanger Institute WS204
+orderKey 824
+mitoAcc none
+fastaFiles /hive/data/genomes/ce8/sanger/UCSC.fa.gz
+agpFiles   /hive/data/genomes/ce8/sanger/UCSC.agp
+# qualFiles /dev/null
+dbDbSpeciesDir worm
+taxId 6239
+'_EOF_'
+    # << emacs
+
+    mkdir jkStuff
+    #	run just to AGP to make sure things are sane first
+    nice -n +19 makeGenomeDb.pl ce8.config.ra -stop=agp \
+      > jkStuff/makeGenomeDb.agp.log 2>&1
+    #	now, continuing to make the Db and all
+    time nice -n +19 makeGenomeDb.pl ce8.config.ra -continue=db \
+      > jkStuff/makeGenomeDb.db.log 2>&1
+    #	real    1m33.036s
+    #	take the trackDb business there and check it into the source tree
+    #	fixup the description, gap and gold html page descriptions
+
+#########################################################################
+# REPEATMASKER (DONE - 2009-07-24 - Hiram)
+    screen 	#	use screen to control the job
+    mkdir /hive/data/genomes/ce8/bed/repeatMasker
+    cd /hive/data/genomes/ce8/bed/repeatMasker
+    time nice -n +19 doRepeatMasker.pl -bigClusterHub=swarm \
+	-buildDir=`pwd` ce8 > do.log 2>&1 &
+    #	real    35m58.812s
+    #	from the do.log:
+    #   June 4 2009 (open-3-2-8) version of RepeatMasker
+    #	CC   RELEASE 20090604;   
+    cat faSize.rmsk.txt
+    #	100286004 bases (0 N's 100286004 real 87035623 upper 13250381 lower)
+    #	in 7 sequences in 1 files
+    #	%13.21 masked total, %13.21 masked real
+
+#########################################################################
+# SIMPLE REPEATS (DONE - 2009-07-24 - Hiram)
+    screen 	#	use screen to control the job
+    mkdir /hive/data/genomes/ce8/bed/simpleRepeat
+    cd /hive/data/genomes/ce8/bed/simpleRepeat
+    time nice -n +19 doSimpleRepeat.pl -smallClusterHub=encodek \
+	-buildDir=`pwd` ce8 > do.log 2>&1 &
+    #	real    18m30.323s
+
+    cat fb.simpleRepeat 
+    #	4331076 bases of 100286004 (4.319%) in intersection
+
+#########################################################################
+# MASK SEQUENCE WITH RM+TRF (DONE - 2009-07-24 - Hiram)
+    # Since both doRepeatMasker.pl and doSimpleRepeat.pl have completed,
+    # now it's time to combine the masking into the final ce8.2bit,
+    # following the instructions at the end of doSimpleRepeat's output.
+    cd /hive/data/genomes/ce8
+    twoBitMask ce8.rmsk.2bit -add bed/simpleRepeat/trfMask.bed ce8.2bit
+    # You can safely ignore the warning about extra BED columns
+    twoBitToFa ce8.2bit stdout | faSize stdin > faSize.ce8.2bit.txt
+    cat faSize.ce8.2bit.txt
+    #	100286004 bases (0 N's 100286004 real 86863769 upper 13422235 lower)
+    #	in 7 sequences in 1 files
+    #	%13.38 masked total, %13.38 masked real
+    #	set the symlink on hgwdev to /gbdb/ce8
+    rm -f /gbdb/ce8/ce8.2bit
+    ln -s `pwd`/ce8.2bit /gbdb/ce8/ce8.2bit
+
+#########################################################################
+# MAKE 11.OOC FILE FOR BLAT (DONE - 2009-07-22 - Hiram)
+    # Use -repMatch=100 (based on size -- for human we use 1024, and
+    # worm size is ~3.4% of human judging by gapless ce4 vs. hg18 genome
+    # size from featureBits).  So we would use 34, but that yields a very
+    # high number of tiles to ignore, especially for a small, more compact
+    # genome.  Bump that up a bit to be more conservative.
+    cd /hive/data/genomes/ce8
+    blat ce8.2bit /dev/null /dev/null -tileSize=11 \
+      -makeOoc=jkStuff/ce8.11.ooc -repMatch=100
+    #	Wrote 8514 overused 11-mers to jkStuff/ce8.11.ooc
+    #	copy all of this stuff to the klusters:
+    mkdir /hive/data/staging/data/ce8
+    cp -p jkStuff/ce8.11.ooc chrom.sizes ce8.2bit /hive/data/staging/data/ce8
+    #	request push of that data to kluster nodes /scratch/data/ce8/
+
+#########################################################################