src/hg/makeDb/doc/ce7.txt 1.2

1.2 2009/07/24 20:32:52 hiram
lastz runs done for caePb2, caeRem3, cb3, caeJap2
Index: src/hg/makeDb/doc/ce7.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/ce7.txt,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/makeDb/doc/ce7.txt	23 Jul 2009 20:26:21 -0000	1.1
+++ src/hg/makeDb/doc/ce7.txt	24 Jul 2009 20:32:52 -0000	1.2
@@ -1,217 +1,367 @@
 # for emacs: -*- mode: sh; -*-
 
 # Caenorhabditis elegans
 # Washington University School of Medicine GSC and Sanger Institute WS200
 
 #  $Id$
 
 #########################################################################
 # DOWNLOAD SEQUENCE (DONE - 2009-07-21 - Hiram)
     mkdir /hive/data/genomes/ce7
     cd /hive/data/genomes/ce7
     mkdir ws200
     cd ws200
 TOP=/hive/data/genomes/ce7/ws200
 export TOP
 for D in annotation genome_feature_tables/GFF2 \
         genome_feature_tables/SUPPLEMENTARY_GFF sequences/dna \
         sequences/protein sequences/rna
 do
     mkdir -p ${D}
     cd ${D}
     wget --timestamping \
 ftp://ftp.sanger.ac.uk/pub2/wormbase/WS200/genomes/c_elegans/${D}/*.*
     cd ${TOP}
 done
     #	that took a long time, many many hours.  The transfer speed from
     #	sanger was very slow
 
 #########################################################################
 # NORMALIZE SEQUENCE NAMES TO BEGIN WITH chr (DONE - 2009-07-22 - Hiram)
     mkdir /hive/data/genomes/ce7/sanger
     cd /hive/data/genomes/ce7/sanger
     # Fix fasta names:
     cat ../ws200/sequences/dna/CHR*.dna \
     | sed -e '/^$/ d;  s/^>CHROMOSOME_MtDNA/>chrM/;  s/^>CHROMOSOME_/>chr/;' \
     | gzip -c > UCSC.fa.gz
     faSize -detailed UCSC.fa.gz
 # chrI    15072421
 # chrII   15279324
 # chrIII  13783682
 # chrIV   17493784
 # chrM    13794
 # chrV    20924143
 # chrX    17718854
 
     # Make sure we get the same sizes from this command:
     cat ../ws200/sequences/dna/CHR*.dna | sed -e '/^$/ d;' \
 	| faSize -detailed stdin
 
     faCount UCSC.fa.gz
 #seq    len     A       C       G       T       N       cpg
 # chrI    15072421        4835939 2695879 2692150 4848453 0       503521
 # chrII   15279324        4878196 2769216 2762198 4869714 0       492149
 # chrIII  13783682        4444652 2449139 2466321 4423570 0       459669
 # chrIV   17493784        5711040 3034767 3017008 5730969 0       522372
 # chrM    13794   4335    1225    2055    6179    0       110
 # chrV    20924143        6750393 3712058 3701397 6760295 0       638983
 # chrX    17718854        5747199 3119702 3117868 5734085 0       514715
 # total   100286002       32371754  17781986  17758997  323732650  3131519
 
     # Fix AGP names:
     sed -e 's/^/chr/' ../ws200/sequences/dna/CHR*.agp > UCSC.agp
     # And add a fake mitochondrial AGP entry for the sake of downstream
     # tools (make sure the GenBank sequence is identical to given):
     echo -e "chrM\t1\t13794\t1\tF\tNC_001328.1\t1\t13794\t+" >> UCSC.agp
 
 #########################################################################
 # run the makeGenomeDb procedure to create the db and unmasked sequence
 #	(DONE - 2009-07-22 - Hiram)
     cd /hive/data/genomes/ce7
     cat << '_EOF_' > ce7.config.ra
 # Config parameters for makeGenomeDb.pl:
 db ce7
 clade worm
 genomeCladePriority 10
 scientificName Caenorhabditis elegans
 commonName C. elegans
 assemblyDate Feb 2009
 assemblyLabel Washington University School of Medicine GSC and Sanger Institute WS200
 orderKey 825
 mitoAcc none
 fastaFiles /hive/data/genomes/ce7/sanger/UCSC.fa.gz
 agpFiles   /hive/data/genomes/ce7/sanger/UCSC.agp
 # qualFiles /dev/null
 dbDbSpeciesDir worm
 taxId 6239
 '_EOF_'
     # << emacs
 
     mkdir jkStuff
     #	run just to AGP to make sure things are sane first
     nice -n +19 makeGenomeDb.pl ce7.config.ra -stop agp \
       > jkStuff/makeGenomeDb.agp.log 2>&1
     #	now, continuing to make the Db and all
     time nice -n +19 makeGenomeDb.pl ce7.config.ra -continue db \
       > jkStuff/makeGenomeDb.db.log 2>&1
     #	real    1m26.382s
     #	take the trackDb business there and check it into the source tree
     #	fixup the description, gap and gold html page descriptions
 
 #########################################################################
 # REPEATMASKER (DONE - 2009-07-22 - Hiram)
     screen 	#	use screen to control the job
     mkdir /hive/data/genomes/ce7/bed/repeatMasker
     cd /hive/data/genomes/ce7/bed/repeatMasker
     time nice -n +19 doRepeatMasker.pl -bigClusterHub=swarm \
 	-buildDir=`pwd` ce7 > do.log 2>&1 &
     #	real    35m46.794s
     cat faSize.rmsk.txt
 # 100286002 bases (0 N's 100286002 real 87035663 upper 13250339 lower)
 #	in 7 sequences in 1 files
 # %13.21 masked total, %13.21 masked real
 
     #	from the do.log:
     #   June 4 2009 (open-3-2-8) version of RepeatMasker
     #	CC   RELEASE 20090604;   
 
 #########################################################################
 # SIMPLE REPEATS (DONE - 2009-07-22 - Hiram)
     ssh kkstore06
     screen 	#	use screen to control the job
     mkdir /hive/data/genomes/ce7/bed/simpleRepeat
     cd /hive/data/genomes/ce7/bed/simpleRepeat
     time nice -n +19 doSimpleRepeat.pl -smallClusterHub=encodek \
 	-buildDir=`pwd` ce7 > do.log 2>&1 &
     #	about 18 minutes
 
 #########################################################################
 # MASK SEQUENCE WITH RM+TRF (DONE - 2009-07-22 - Hiram)
     # Since both doRepeatMasker.pl and doSimpleRepeats.pl have completed,
     # now it's time to combine the masking into the final ce7.2bit,
     # following the instructions at the end of doSimpleRepeat's output.
     cd /hive/data/genomes/ce7
     twoBitMask ce7.rmsk.2bit -add bed/simpleRepeat/trfMask.bed ce7.2bit
     # You can safely ignore the warning about extra BED columns
     twoBitToFa ce7.2bit stdout | faSize stdin
 # 100286002 bases (0 N's 100286002 real 86863809 upper 13422193 lower)
 #	in 7 sequences in 1 files
 # %13.38 masked total, %13.38 masked real
 
     #	set the symlink on hgwdev to /gbdb/ce7
     rm -f /gbdb/ce7/ce7.2bit
     ln -s /hive/data/genomes/ce7/ce7.2bit /gbdb/ce7/ce7.2bit
 
 #########################################################################
 # MAKE 11.OOC FILE FOR BLAT (DONE - 2009-07-22 - Hiram)
     # Use -repMatch=100 (based on size -- for human we use 1024, and 
     # worm size is ~3.4% of human judging by gapless ce4 vs. hg18 genome 
     # size from featureBits. So we would use 34, but that yields a very
     # high number of tiles to ignore, especially for a small more compact 
     # genome.  Bump that up a bit to be more conservative.
     cd /hive/data/genomes/ce7
     blat ce7.2bit /dev/null /dev/null -tileSize=11 \
       -makeOoc=jkStuff/ce7.11.ooc -repMatch=100
     #	Wrote 8502 overused 11-mers to jkStuff/ce7.11.ooc
     #	copy all of this stuff to the klusters:
     mkdir /hive/data/staging/data/ce7
     cp -p jkStuff/ce7.11.ooc chrom.sizes ce7.2bit /hive/data/staging/data/ce7
 
 #########################################################################
 ## BLASTZ caePb2 (DONE - 2009-07-23 - Hiram)
     screen 	#	use screen to control the job
     mkdir /hive/data/genomes/ce7/bed/lastzCaePb2.2009-07-23
     cd /hive/data/genomes/ce7/bed/lastzCaePb2.2009-07-23
 
     cat << '_EOF_' > DEF
 # ce7 vs caePb2
 BLASTZ_H=2000
 BLASTZ_M=50
 
 # TARGET: elegans Ce7
 SEQ1_DIR=/scratch/data/ce7/ce7.2bit
 SEQ1_LEN=/scratch/data/ce7/chrom.sizes
 SEQ1_CHUNK=1000000
 SEQ1_LAP=10000
 
 # QUERY: C. PB2801 caePb2
 SEQ2_DIR=/scratch/data/caePb2/caePb2.2bit
 SEQ2_LEN=/scratch/data/caePb2/chrom.sizes
 SEQ2_CTGDIR=/scratch/data/caePb2/caePb2.supercontigs.2bit
 SEQ2_CTGLEN=/scratch/data/caePb2/caePb2.supercontigs.sizes
 SEQ2_LIFT=/scratch/data/caePb2/caePb2.supercontigs.lift
 SEQ2_CHUNK=1000000
 SEQ2_LAP=0
 SEQ2_LIMIT=50
 
 BASE=/hive/data/genomes/ce7/bed/lastzCaePb2.2009-07-23
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF -verbose=2 -bigClusterHub=pk -workhorse=hgwdev \
 	-qRepeats=windowmaskerSdust -noLoadChainSplit -smallClusterHub=memk \
 	> do.log 2>&1 &
-    #	about 1h30m
+    #	real   62m19.458s
     #	forgot the -qRepeats=windowmaskerSdust
     rm axtChain/ce7.caePb2.net
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF -verbose=2 -bigClusterHub=pk -workhorse=hgwdev \
 	-qRepeats=windowmaskerSdust -noLoadChainSplit -smallClusterHub=memk \
 	-continue=load > load.log 2>&1 &
     cat fb.ce7.chainCaePb2Link.txt 
     #	40793071 bases of 100286002 (40.677%) in intersection
 
     #	swap, this is also in caePb2.txt
     mkdir /hive/data/genomes/caePb2/bed/blastz.ce7.swap
     cd /hive/data/genomes/caePb2/bed/blastz.ce7.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-workhorse=hgwdev -qRepeats=windowmaskerSdust \
 	/hive/data/genomes/ce7/bed/lastzCaePb2.2009-07-23/DEF \
 	-bigClusterHub=pk -smallClusterHub=memk -swap > swap.log 2>&1 &
     #	real    3m22.808s
     cat fb.caePb2.chainCe7Link.txt
     #	55084634 bases of 170473138 (32.313%) in intersection
 
 #########################################################################
+## BLASTZ caeJap2 (DONE - 2009-07-24 - Hiram)
+    screen 	#	use screen to control the job
+    mkdir /hive/data/genomes/ce7/bed/lastzCaeJap2.2009-07-24
+    cd /hive/data/genomes/ce7/bed/lastzCaeJap2.2009-07-24
+
+    cat << '_EOF_' > DEF
+# ce7 vs caeJap2
+BLASTZ_H=2000
+BLASTZ_M=50
+
+# TARGET: elegans Ce7
+SEQ1_DIR=/scratch/data/ce7/ce7.2bit
+SEQ1_LEN=/scratch/data/ce7/chrom.sizes
+SEQ1_CHUNK=1000000
+SEQ1_LAP=10000
+
+# QUERY: C. japonica caeJap2
+SEQ2_DIR=/scratch/data/caeJap2/caeJap2.2bit
+SEQ2_LEN=/scratch/data/caeJap2/chrom.sizes
+SEQ2_CTGDIR=/scratch/data/caeJap2/caeJap2.supers.2bit
+SEQ2_CTGLEN=/scratch/data/caeJap2/caeJap2.supers.sizes
+SEQ2_LIFT=/scratch/data/caeJap2/caeJap2.chrUn.lift
+SEQ2_CHUNK=1000000
+SEQ2_LAP=0
+SEQ2_LIMIT=50
+
+BASE=/hive/data/genomes/ce7/bed/lastzCaeJap2.2009-07-24
+TMPDIR=/scratch/tmp
+'_EOF_'
+    # << happy emacs
+
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	`pwd`/DEF \
+	-bigClusterHub=pk -noLoadChainSplit -qRepeats=windowmaskerSdust \
+	-workhorse=hgwdev -smallClusterHub=memk > do.log 2>&1 &
+    #	real    87m1.988s
+    cat fb.ce7.chainCaeJap2Link.txt 
+    #	27270064 bases of 100286002 (27.192%) in intersection
+
+    #	swap, this is also in caeJap2.txt
+    mkdir /hive/data/genomes/caeJap2/bed/blastz.ce7.swap
+    cd /hive/data/genomes/caeJap2/bed/blastz.ce7.swap
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	-qRepeats=windowmaskerSdust -bigClusterHub=pk -noLoadChainSplit \
+	/hive/data/genomes/ce7/bed/lastzCaeJap2.2009-07-24/DEF \
+	-smallClusterHub=memk -swap > swap.log 2>&1 &
+    #	real    4m23.124s
+    cat fb.caeJap2.chainCe7Link.txt 
+    #	26441005 bases of 129295754 (20.450%) in intersection
+
+############################################################################
+## BLASTZ cb3 (DONE - 2009-07-24 - Hiram)
+    screen 	#	use screen to control the job
+    mkdir /hive/data/genomes/ce7/bed/lastzCb3.2009-07-24
+    cd /hive/data/genomes/ce7/bed/lastzCb3.2009-07-24
+
+    cat << '_EOF_' > DEF
+# ce7 vs cb3
+BLASTZ_H=2000
+BLASTZ_M=50
+
+# TARGET: elegans Ce7
+SEQ1_DIR=/scratch/data/ce7/ce7.2bit
+SEQ1_LEN=/scratch/data/ce7/chrom.sizes
+SEQ1_CHUNK=1000000
+SEQ1_LAP=10000
+
+# QUERY: C. briggsae cb3
+SEQ2_DIR=/hive/data/genomes/cb3/cb3.rmskTrf.2bit
+SEQ2_LEN=/hive/data/genomes/cb3/chrom.sizes
+SEQ2_CHUNK=1000000
+SEQ2_LAP=0
+
+BASE=/hive/data/genomes/ce7/bed/lastzCb3.2009-07-24
+TMPDIR=/scratch/tmp
+'_EOF_'
+    # << happy emacs
+
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	`pwd`/DEF \
+	-workhorse=hgwdev -bigClusterHub=pk -noLoadChainSplit \
+	-smallClusterHub=memk > do.log 2>&1 &
+    #	 real    50m9.701s
+    cat fb.ce7.chainCb3Link.txt 
+    #	42421335 bases of 100286002 (42.300%) in intersection
+
+    #	swap, this is also in cb3.txt
+    mkdir /hive/data/genomes/cb3/bed/blastz.ce7.swap
+    cd /hive/data/genomes/cb3/bed/blastz.ce7.swap
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	/hive/data/genomes/ce7/bed/lastzCb3.2009-07-24/DEF \
+	-workhorse=hgwdev -bigClusterHub=pk -noLoadChainSplit \
+	-smallClusterHub=memk -swap > swap.log 2>&1 &
+    #	real    3m38.745s
+    cat fb.cb3.chainCe7Link.txt 
+    #	43115929 bases of 108433446 (39.763%) in intersection
+
+############################################################################
+## BLASTZ caeRem3 (DONE - 2009-07-24,09 - Hiram)
+    screen 	#	use screen to control the job
+    mkdir /hive/data/genomes/ce7/bed/lastzCaeRem3.2009-07-24
+    cd /hive/data/genomes/ce7/bed/lastzCaeRem3.2009-07-24
+
+    cat << '_EOF_' > DEF
+# ce7 vs caeRem3
+BLASTZ_H=2000
+BLASTZ_M=50
+
+# TARGET: elegans Ce7
+SEQ1_DIR=/scratch/data/ce7/ce7.2bit
+SEQ1_LEN=/scratch/data/ce7/chrom.sizes
+SEQ1_CHUNK=1000000
+SEQ1_LAP=10000
+
+# QUERY: C. remanei caeRem3
+SEQ2_DIR=/scratch/data/caeRem3/caeRem3.2bit
+SEQ2_LEN=/scratch/data/caeRem3/chrom.sizes
+SEQ2_CTGDIR=/scratch/data/caeRem3/caeRem3.supercontigs.2bit
+SEQ2_CTGLEN=/scratch/data/caeRem3/caeRem3.supercontigs.sizes
+SEQ2_LIFT=/scratch/data/caeRem3/caeRem3.chrUn.lift
+SEQ2_CHUNK=1000000
+SEQ2_LAP=0
+SEQ2_LIMIT=50
+
+BASE=/hive/data/genomes/ce7/bed/lastzCaeRem3.2009-07-24
+TMPDIR=/scratch/tmp
+'_EOF_'
+    # << happy emacs
+
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	`pwd`/DEF \
+	-workhorse=hgwdev -bigClusterHub=swarm -noLoadChainSplit \
+	-qRepeats=windowmaskerSdust -smallClusterHub=memk > do.log 2>&1 &
+    #	real    28m14.168s
+    cat fb.ce7.chainCaeRem3Link.txt 
+    #	41841199 bases of 100286002 (41.722%) in intersection
+
+    #	swap, this is also in caeRem3.txt
+    mkdir /hive/data/genomes/caeRem3/bed/blastz.ce7.swap
+    cd /hive/data/genomes/caeRem3/bed/blastz.ce7.swap
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	-qRepeats=windowmaskerSdust \
+	-workhorse=hgwdev -noLoadChainSplit \
+	/hive/data/genomes/ce7/bed/lastzCaeRem3.2009-07-24/DEF \
+	-bigClusterHub=swarm -smallClusterHub=memk -swap > swap.log 2>&1 &
+    #	real    2m53.936s
+    cat fb.caeRem3.chainCe7Link.txt 
+    #	46320678 bases of 138406388 (33.467%) in intersection
+
+############################################################################