src/hg/makeDb/doc/ce8.txt 1.1
1.1 2009/07/24 22:54:44 hiram
Initial build through repeat masking and kluster scratch data populating
Index: src/hg/makeDb/doc/ce8.txt
===================================================================
RCS file: src/hg/makeDb/doc/ce8.txt
diff -N src/hg/makeDb/doc/ce8.txt
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/makeDb/doc/ce8.txt 24 Jul 2009 22:54:44 -0000 1.1
@@ -0,0 +1,167 @@
+# for emacs: -*- mode: sh; -*-
+
+# Caenorhabditis elegans
+# Washington University School of Medicine GSC and Sanger Institute WS204
+
+# $Id$
+
+#########################################################################
+# DOWNLOAD SEQUENCE (DONE - 2009-07-22 - Hiram)
+ mkdir /hive/data/genomes/ce8
+ cd /hive/data/genomes/ce8
+ mkdir ws204
+ cd ws204
+TOP=/hive/data/genomes/ce8/ws204
+export TOP
+for D in annotation genome_feature_tables/GFF2 \
+ genome_feature_tables/SUPPLEMENTARY_GFF sequences/dna \
+ sequences/protein sequences/rna
+do
+ mkdir -p ${D}
+ cd ${D}
+ wget --timestamping \
+ftp://ftp.sanger.ac.uk/pub2/wormbase/WS204/genomes/c_elegans/${D}/*.*
+ cd ${TOP}
+done
+ # about 1h24m
+
+#########################################################################
+# NORMALIZE SEQUENCE NAMES TO BEGIN WITH chr (DONE - 2009-07-24 - Hiram)
+ mkdir /hive/data/genomes/ce8/sanger
+ cd /hive/data/genomes/ce8/sanger
+ # Fix fasta names:
+ zcat ../ws204/sequences/dna/CHR*[XVAI].dna.gz \
+ | sed -e '/^$/ d; s/^>CHROMOSOME_MtDNA/>chrM/; s/^>CHROMOSOME_/>chr/;' \
+ | gzip -c > UCSC.fa.gz
+ faSize -detailed UCSC.fa.gz
+# chrI 15072421
+# chrII 15279323
+# chrIII 13783685
+# chrIV 17493784
+# chrM 13794
+# chrV 20924143
+# chrX 17718854
+
+ # chrII(-1) and chrIII(+3) are slightly different than WS200
+ # Make sure we get the same sizes from this command:
+ zcat ../ws204/sequences/dna/CHR*[XVAI].dna.gz | sed -e '/^$/ d;' \
+ | faSize -detailed stdin
+
+ faCount UCSC.fa.gz
+#seq len A C G T N cpg
+# chrI 15072421 4835939 2695879 2692150 4848453 0 503521
+# chrII 15279323 4878195 2769216 2762198 4869714 0 492149
+# chrIII 13783685 4444652 2449141 2466322 4423570 0 459669
+# chrIV 17493784 5711040 3034767 3017008 5730969 0 522372
+# chrM 13794 4335 1225 2055 6179 0 110
+# chrV 20924143 6750393 3712058 3701397 6760295 0 638983
+# chrX 17718854 5747199 3119702 3117868 5734085 0 514715
+# total 100286004 32371753 17781988 17758998 32373265 0 3131519
+
+# WS200:
+# chrII 15279324 4878196 2769216 2762198 4869714 0 492149
+# chrIII 13783682 4444652 2449139 2466321 4423570 0 459669
+# total 100286002 32371754 17781986 17758997 32373265 0 3131519
+
+ # Fix AGP names:
+ sed -e 's/^/chr/' ../ws204/sequences/dna/CHR*.agp > UCSC.agp
+ # And add a fake mitochondrial AGP entry for the sake of downstream
+ # tools (make sure the GenBank sequence is identical to given):
+ echo -e "chrM\t1\t13794\t1\tF\tNC_001328.1\t1\t13794\t+" >> UCSC.agp
+
+#########################################################################
+# run the makeGenomeDb procedure to create the db and unmasked sequence
+# (DONE - 2009-07-22 - Hiram)
+ cd /hive/data/genomes/ce8
+ cat << '_EOF_' > ce8.config.ra
+# Config parameters for makeGenomeDb.pl:
+db ce8
+clade worm
+genomeCladePriority 10
+scientificName Caenorhabditis elegans
+commonName C. elegans
+assemblyDate Jun 2009
+assemblyLabel Washington University School of Medicine GSC and Sanger Institute WS204
+orderKey 824
+mitoAcc none
+fastaFiles /hive/data/genomes/ce8/sanger/UCSC.fa.gz
+agpFiles /hive/data/genomes/ce8/sanger/UCSC.agp
+# qualFiles /dev/null
+dbDbSpeciesDir worm
+taxId 6239
+'_EOF_'
+ # << emacs
+
+ mkdir jkStuff
+ # run just to AGP to make sure things are sane first
+ nice -n +19 makeGenomeDb.pl ce8.config.ra -stop=agp \
+ > jkStuff/makeGenomeDb.agp.log 2>&1
+ # now, continuing to make the Db and all
+ time nice -n +19 makeGenomeDb.pl ce8.config.ra -continue=db \
+ > jkStuff/makeGenomeDb.db.log 2>&1
+ # real 1m33.036s
+ # take the trackDb business there and check it into the source tree
+ # fixup the description, gap and gold html page descriptions
+
+#########################################################################
+# REPEATMASKER (DONE - 2009-07-24 - Hiram)
+ screen # use screen to control the job
+ mkdir /hive/data/genomes/ce8/bed/repeatMasker
+ cd /hive/data/genomes/ce8/bed/repeatMasker
+ time nice -n +19 doRepeatMasker.pl -bigClusterHub=swarm \
+ -buildDir=`pwd` ce8 > do.log 2>&1 &
+ # real 35m58.812s
+ # from the do.log:
+ # June 4 2009 (open-3-2-8) version of RepeatMasker
+ # CC RELEASE 20090604;
+ cat faSize.rmsk.txt
+ # 100286004 bases (0 N's 100286004 real 87035623 upper 13250381 lower)
+ # in 7 sequences in 1 files
+ # %13.21 masked total, %13.21 masked real
+
+#########################################################################
+# SIMPLE REPEATS (DONE - 2009-07-24 - Hiram)
+ screen # use screen to control the job
+ mkdir /hive/data/genomes/ce8/bed/simpleRepeat
+ cd /hive/data/genomes/ce8/bed/simpleRepeat
+ time nice -n +19 doSimpleRepeat.pl -smallClusterHub=encodek \
+ -buildDir=`pwd` ce8 > do.log 2>&1 &
+ # real 18m30.323s
+
+ cat fb.simpleRepeat
+ # 4331076 bases of 100286004 (4.319%) in intersection
+
+#########################################################################
+# MASK SEQUENCE WITH RM+TRF (DONE - 2009-07-24 - Hiram)
+ # Since both doRepeatMasker.pl and doSimpleRepeat.pl have completed,
+ # now it's time to combine the masking into the final ce8.2bit,
+ # following the instructions at the end of doSimpleRepeat's output.
+ cd /hive/data/genomes/ce8
+ twoBitMask ce8.rmsk.2bit -add bed/simpleRepeat/trfMask.bed ce8.2bit
+ # You can safely ignore the warning about extra BED columns
+ twoBitToFa ce8.2bit stdout | faSize stdin > faSize.ce8.2bit.txt
+ cat faSize.ce8.2bit.txt
+ # 100286004 bases (0 N's 100286004 real 86863769 upper 13422235 lower)
+ # in 7 sequences in 1 files
+ # %13.38 masked total, %13.38 masked real
+ # set the symlink on hgwdev to /gbdb/ce8
+ rm -f /gbdb/ce8/ce8.2bit
+ ln -s `pwd`/ce8.2bit /gbdb/ce8/ce8.2bit
+
+#########################################################################
+# MAKE 11.OOC FILE FOR BLAT (DONE - 2009-07-22 - Hiram)
+ # Use -repMatch=100 (based on size -- for human we use 1024, and
+ # worm size is ~3.4% of human judging by gapless ce4 vs. hg18 genome
+ # size from featureBits). So we would use 34, but that yields a very
+ # high number of tiles to ignore, especially for a small, more compact
+ # genome. Bump that up a bit to be more conservative.
+ cd /hive/data/genomes/ce8
+ blat ce8.2bit /dev/null /dev/null -tileSize=11 \
+ -makeOoc=jkStuff/ce8.11.ooc -repMatch=100
+ # Wrote 8514 overused 11-mers to jkStuff/ce8.11.ooc
+ # copy all of this stuff to the klusters:
+ mkdir /hive/data/staging/data/ce8
+ cp -p jkStuff/ce8.11.ooc chrom.sizes ce8.2bit /hive/data/staging/data/ce8
+ # request push of that data to kluster nodes /scratch/data/ce8/
+
+#########################################################################