src/hg/makeDb/doc/ce7.txt 1.1

1.1 2009/07/23 20:26:21 hiram
Through genome build and first lastz of caePb2
Index: src/hg/makeDb/doc/ce7.txt
===================================================================
RCS file: src/hg/makeDb/doc/ce7.txt
diff -N src/hg/makeDb/doc/ce7.txt
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/makeDb/doc/ce7.txt	23 Jul 2009 20:26:21 -0000	1.1
@@ -0,0 +1,217 @@
+# for emacs: -*- mode: sh; -*-
+
+# Caenorhabditis elegans
+# Washington University School of Medicine GSC and Sanger Institute WS200
+
+#  $Id$
+
+#########################################################################
+# DOWNLOAD SEQUENCE (DONE - 2009-07-21 - Hiram)
+    mkdir /hive/data/genomes/ce7
+    cd /hive/data/genomes/ce7
+    mkdir ws200
+    cd ws200
+TOP=/hive/data/genomes/ce7/ws200
+export TOP
+for D in annotation genome_feature_tables/GFF2 \
+        genome_feature_tables/SUPPLEMENTARY_GFF sequences/dna \
+        sequences/protein sequences/rna
+do
+    mkdir -p ${D}
+    cd ${D}
+    wget --timestamping \
+ftp://ftp.sanger.ac.uk/pub2/wormbase/WS200/genomes/c_elegans/${D}/*.*
+    cd ${TOP}
+done
+    #	that took a long time, many many hours.  The transfer speed from
+    #	sanger was very slow
+
+#########################################################################
+# NORMALIZE SEQUENCE NAMES TO BEGIN WITH chr (DONE - 2009-07-22 - Hiram)
+    mkdir /hive/data/genomes/ce7/sanger
+    cd /hive/data/genomes/ce7/sanger
+    # Fix fasta names:
+    cat ../ws200/sequences/dna/CHR*.dna \
+    | sed -e '/^$/ d;  s/^>CHROMOSOME_MtDNA/>chrM/;  s/^>CHROMOSOME_/>chr/;' \
+    | gzip -c > UCSC.fa.gz
+    faSize -detailed UCSC.fa.gz
+# chrI    15072421
+# chrII   15279324
+# chrIII  13783682
+# chrIV   17493784
+# chrM    13794
+# chrV    20924143
+# chrX    17718854
+
+    # Make sure we get the same sizes from this command:
+    cat ../ws200/sequences/dna/CHR*.dna | sed -e '/^$/ d;' \
+	| faSize -detailed stdin
+
+    faCount UCSC.fa.gz
+#seq    len     A       C       G       T       N       cpg
+# chrI    15072421        4835939 2695879 2692150 4848453 0       503521
+# chrII   15279324        4878196 2769216 2762198 4869714 0       492149
+# chrIII  13783682        4444652 2449139 2466321 4423570 0       459669
+# chrIV   17493784        5711040 3034767 3017008 5730969 0       522372
+# chrM    13794   4335    1225    2055    6179    0       110
+# chrV    20924143        6750393 3712058 3701397 6760295 0       638983
+# chrX    17718854        5747199 3119702 3117868 5734085 0       514715
+# total   100286002       32371754  17781986  17758997  32373265  0  3131519
+
+    # Fix AGP names:
+    sed -e 's/^/chr/' ../ws200/sequences/dna/CHR*.agp > UCSC.agp
+    # And add a fake mitochondrial AGP entry for the sake of downstream
+    # tools (first make sure the GenBank sequence NC_001328.1 is identical
+    # to the chrM sequence given here):
+    echo -e "chrM\t1\t13794\t1\tF\tNC_001328.1\t1\t13794\t+" >> UCSC.agp
+
+#########################################################################
+# run the makeGenomeDb procedure to create the db and unmasked sequence
+#	(DONE - 2009-07-22 - Hiram)
+    cd /hive/data/genomes/ce7
+    cat << '_EOF_' > ce7.config.ra
+# Config parameters for makeGenomeDb.pl:
+db ce7
+clade worm
+genomeCladePriority 10
+scientificName Caenorhabditis elegans
+commonName C. elegans
+assemblyDate Feb 2009
+assemblyLabel Washington University School of Medicine GSC and Sanger Institute WS200
+orderKey 825
+mitoAcc none
+fastaFiles /hive/data/genomes/ce7/sanger/UCSC.fa.gz
+agpFiles   /hive/data/genomes/ce7/sanger/UCSC.agp
+# qualFiles /dev/null
+dbDbSpeciesDir worm
+taxId 6239
+'_EOF_'
+    # << emacs
+
+    mkdir jkStuff
+    #	run just to AGP to make sure things are sane first
+    nice -n +19 makeGenomeDb.pl ce7.config.ra -stop agp \
+      > jkStuff/makeGenomeDb.agp.log 2>&1
+    #	now, continuing to make the Db and all
+    time nice -n +19 makeGenomeDb.pl ce7.config.ra -continue db \
+      > jkStuff/makeGenomeDb.db.log 2>&1
+    #	real    1m26.382s
+    #	take the trackDb files created there and check them into the source
+    #	tree; fix up the description, gap and gold html pages
+
+#########################################################################
+# REPEATMASKER (DONE - 2009-07-22 - Hiram)
+    screen 	#	use screen to control the job
+    mkdir /hive/data/genomes/ce7/bed/repeatMasker
+    cd /hive/data/genomes/ce7/bed/repeatMasker
+    time nice -n +19 doRepeatMasker.pl -bigClusterHub=swarm \
+	-buildDir=`pwd` ce7 > do.log 2>&1 &
+    #	real    35m46.794s
+    cat faSize.rmsk.txt
+# 100286002 bases (0 N's 100286002 real 87035663 upper 13250339 lower)
+#	in 7 sequences in 1 files
+# %13.21 masked total, %13.21 masked real
+
+    #	from the do.log:
+    #   June 4 2009 (open-3-2-8) version of RepeatMasker
+    #	CC   RELEASE 20090604;   
+
+#########################################################################
+# SIMPLE REPEATS (DONE - 2009-07-22 - Hiram)
+    ssh kkstore06
+    screen 	#	use screen to control the job
+    mkdir /hive/data/genomes/ce7/bed/simpleRepeat
+    cd /hive/data/genomes/ce7/bed/simpleRepeat
+    time nice -n +19 doSimpleRepeat.pl -smallClusterHub=encodek \
+	-buildDir=`pwd` ce7 > do.log 2>&1 &
+    #	about 18 minutes
+
+#########################################################################
+# MASK SEQUENCE WITH RM+TRF (DONE - 2009-07-22 - Hiram)
+    # Since both doRepeatMasker.pl and doSimpleRepeat.pl have completed,
+    # now it's time to combine the masking into the final ce7.2bit,
+    # following the instructions at the end of doSimpleRepeat's output.
+    cd /hive/data/genomes/ce7
+    twoBitMask ce7.rmsk.2bit -add bed/simpleRepeat/trfMask.bed ce7.2bit
+    # You can safely ignore the warning about extra BED columns
+    twoBitToFa ce7.2bit stdout | faSize stdin
+# 100286002 bases (0 N's 100286002 real 86863809 upper 13422193 lower)
+#	in 7 sequences in 1 files
+# %13.38 masked total, %13.38 masked real
+
+    #	set the symlink on hgwdev to /gbdb/ce7
+    rm -f /gbdb/ce7/ce7.2bit
+    ln -s /hive/data/genomes/ce7/ce7.2bit /gbdb/ce7/ce7.2bit
+
+#########################################################################
+# MAKE 11.OOC FILE FOR BLAT (DONE - 2009-07-22 - Hiram)
+    # Use -repMatch=100.  Scaling by genome size (for human we use 1024, and
+    # worm size is ~3.4% of human judging by gapless ce4 vs. hg18 genome
+    # size from featureBits) we would use 34, but that yields a very
+    # high number of tiles to ignore, especially for a small, more compact
+    # genome.  Bump that up a bit to be more conservative.
+    cd /hive/data/genomes/ce7
+    blat ce7.2bit /dev/null /dev/null -tileSize=11 \
+      -makeOoc=jkStuff/ce7.11.ooc -repMatch=100
+    #	Wrote 8502 overused 11-mers to jkStuff/ce7.11.ooc
+    #	copy all of this stuff to the klusters:
+    mkdir /hive/data/staging/data/ce7
+    cp -p jkStuff/ce7.11.ooc chrom.sizes ce7.2bit /hive/data/staging/data/ce7
+
+#########################################################################
+## BLASTZ caePb2 (DONE - 2009-07-23 - Hiram)
+    screen 	#	use screen to control the job
+    mkdir /hive/data/genomes/ce7/bed/lastzCaePb2.2009-07-23
+    cd /hive/data/genomes/ce7/bed/lastzCaePb2.2009-07-23
+
+    cat << '_EOF_' > DEF
+# ce7 vs caePb2
+BLASTZ_H=2000
+BLASTZ_M=50
+
+# TARGET: elegans Ce7
+SEQ1_DIR=/scratch/data/ce7/ce7.2bit
+SEQ1_LEN=/scratch/data/ce7/chrom.sizes
+SEQ1_CHUNK=1000000
+SEQ1_LAP=10000
+
+# QUERY: C. PB2801 caePb2
+SEQ2_DIR=/scratch/data/caePb2/caePb2.2bit
+SEQ2_LEN=/scratch/data/caePb2/chrom.sizes
+SEQ2_CTGDIR=/scratch/data/caePb2/caePb2.supercontigs.2bit
+SEQ2_CTGLEN=/scratch/data/caePb2/caePb2.supercontigs.sizes
+SEQ2_LIFT=/scratch/data/caePb2/caePb2.supercontigs.lift
+SEQ2_CHUNK=1000000
+SEQ2_LAP=0
+SEQ2_LIMIT=50
+
+BASE=/hive/data/genomes/ce7/bed/lastzCaePb2.2009-07-23
+TMPDIR=/scratch/tmp
+'_EOF_'
+    # << happy emacs
+
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	`pwd`/DEF -verbose=2 -bigClusterHub=pk -workhorse=hgwdev \
+	-qRepeats=windowmaskerSdust -noLoadChainSplit -smallClusterHub=memk \
+	> do.log 2>&1 &
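+    #	about 1h30m
+    #	the first run forgot the -qRepeats=windowmaskerSdust option (the
+    #	command above is shown with it included), so remove the net file
+    #	and re-run from the load step with the option: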
+    rm axtChain/ce7.caePb2.net
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	`pwd`/DEF -verbose=2 -bigClusterHub=pk -workhorse=hgwdev \
+	-qRepeats=windowmaskerSdust -noLoadChainSplit -smallClusterHub=memk \
+	-continue=load > load.log 2>&1 &
+    cat fb.ce7.chainCaePb2Link.txt 
+    #	40793071 bases of 100286002 (40.677%) in intersection
+
+    #	swap, this is also in caePb2.txt
+    mkdir /hive/data/genomes/caePb2/bed/blastz.ce7.swap
+    cd /hive/data/genomes/caePb2/bed/blastz.ce7.swap
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	-workhorse=hgwdev -qRepeats=windowmaskerSdust \
+	/hive/data/genomes/ce7/bed/lastzCaePb2.2009-07-23/DEF \
+	-bigClusterHub=pk -smallClusterHub=memk -swap > swap.log 2>&1 &
+    #	real    3m22.808s
+    cat fb.caePb2.chainCe7Link.txt
+    #	55084634 bases of 170473138 (32.313%) in intersection
+
+#########################################################################