src/hg/makeDb/doc/dp4.txt 1.5

1.5 2009/04/06 17:07:51 angie
Added comment that 12fly dpse_caf1 is the same assembly as FlyBase dpse_r2.1_FB2008_02.
Index: src/hg/makeDb/doc/dp4.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/dp4.txt,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 1000000 -r1.4 -r1.5
--- src/hg/makeDb/doc/dp4.txt	19 Sep 2008 06:24:30 -0000	1.4
+++ src/hg/makeDb/doc/dp4.txt	6 Apr 2009 17:07:51 -0000	1.5
@@ -1,220 +1,221 @@
 # for emacs: -*- mode: sh; -*-
 
 # Drosophila pseudoobscura -- Baylor "CAF1" via Eisen's 12-fly site
+# (Identical to FlyBase dpse_r2.1_FB2008_02)
 
 # THIS IS ONLY TO GET MASKED SEQUENCE -- NOT A BROWSER AT THIS POINT
 
 
 #########################################################################
 # DOWNLOAD SEQUENCE (DONE 9/26/06 angie)
     ssh kkstore05
     mkdir /cluster/store12/dp4
     ln -s /cluster/store12/dp4 /cluster/data/dp4
     mkdir /cluster/data/dp4/downloads
     cd /cluster/data/dp4/downloads
     wget http://rana.lbl.gov/drosophila/caf1/dpse_caf1.tar.gz
     tar xvzf dpse_caf1.tar.gz
     cd dpse
     faSize scaffolds.fa
 #152738921 bases (6681752 N's 146057169 real 146057169 upper 0 lower) in 4896 sequences in 1 files
 #Total size: mean 31196.7 sd 649312.0 min 101 (Unknown_singleton_1691) max 30794189 (Ch2) median 1734
 #N count: mean 1364.7 sd 21011.7
 #U count: mean 29831.9 sd 628747.5
 #L count: mean 0.0 sd 0.0
     # Tweak their funny chromosome names (Ch*) to our pattern (chr*):
     sed -e 's/^>Ch/>chr/' scaffolds.fa > UCSC.fa
     sed -e 's/^Ch/chr/' assembly.agp > UCSC.agp
 
 
 #########################################################################
 # MAKE GENOME DB *UP TO DB STEP ONLY* (DONE 9/27/06 angie)
     ssh kkstore05
     cd /cluster/data/dp4
     cat > dp4.config.ra <<EOF
 # Config parameters for makeGenomeDb.pl:
 db dp4
 clade insect
 scientificName Drosophila pseudoobscura
 assemblyDate Feb. 2006
 assemblyLabel Baylor CAF1
 orderKey 57
 mitoAcc none
 fastaFiles /cluster/data/dp4/downloads/dpse/UCSC.fa
 agpFiles /cluster/data/dp4/downloads/dpse/UCSC.agp
 dbDbSpeciesDir drosophila
 EOF
 
     # Stop at db step so we can use featureBits, but don't do dbDb and trackDb
     # because we're not building an actual browser for now.
     makeGenomeDb.pl dp4.config.ra -stop=db \
       >& makeGenomeDb.log & tail -f makeGenomeDb.log
     # The chromInfo load command failed: the unplaced sequences have extremely
     # long names, and the index string length in chromInfo.sql was too short
     # for them.  So I temporarily modified my ~/kent/src/hg/lib/chromInfo.sql
     # to lengthen the index, dropped the dp4 database and ran -continue db.
 
 
 #########################################################################
 # REPEATMASKER (DONE 9/27/06 angie)
     ssh kkstore05
     # Run -debug to create the dir structure and preview the scripts:
     doRepeatMasker.pl dp4 -verbose 3 -debug
     # Run it for real and tail the log:
     doRepeatMasker.pl dp4 -species drosophila -verbose 3 \
       >& /cluster/data/dp4/bed/RepeatMasker.2006-09-27/do.log &
     tail -f /cluster/data/dp4/bed/RepeatMasker.2006-09-27/do.log
     # RepeatMasker and lib version from do.log:
 #    March 20 2006 (open-3-1-5) version of RepeatMasker
 #CC   RELEASE 20060315;                                            *
     # Compare coverage to previous assembly:
     featureBits -chrom=chr2 dp4 rmsk
 #1245499 bases of 29873196 (4.169%) in intersection
     featureBits -chrom=chr2 dp3 rmsk
 #1140300 bases of 29702756 (3.839%) in intersection
 
 
 #########################################################################
 # SIMPLE REPEATS (TRF) (DONE 9/27/06 angie)
     ssh kolossus
     nice tcsh
     mkdir /cluster/data/dp4/bed/simpleRepeat
     cd /cluster/data/dp4/bed/simpleRepeat
     twoBitToFa ../../dp4.unmasked.2bit stdout \
     | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
       -bedAt=simpleRepeat.bed -tempDir=/tmp \
     >& trf.log & tail -f trf.log
     # Took ~50 minutes (longer than D. melanogaster; must be because of the scaffolds)
 
     # Make a filtered version for sequence masking:
     awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
 
     # Load unfiltered repeats into the database:
     ssh hgwdev
     hgLoadBed dp4 simpleRepeat \
       /cluster/data/dp4/bed/simpleRepeat/simpleRepeat.bed \
       -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
     # Compare coverage to previous assembly:
     featureBits -chrom=chr2 dp4 simpleRepeat
 #530609 bases of 29873196 (1.776%) in intersection
     featureBits -chrom=chr2 dp3 simpleRepeat
 #517077 bases of 29702756 (1.741%) in intersection
 
 
 #########################################################################
 # MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 9/27/06 angie)
     ssh kolossus
     cd /cluster/data/dp4
     time twoBitMask dp4.rmsk.2bit -add bed/simpleRepeat/trfMask.bed dp4.2bit
     # This warning is OK -- the extra fields are not block coordinates.
 #Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks).
 #0.201u 0.367s 0:01.78 31.4%     0+0k 0+0io 1pf+0w
 
     # Because this is a no-browser build (just masking for alignment)
     # I did not make the usual /gbdb/$db/$db.2bit link.
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROANA3 (DONE 10/3/06 angie)
     ssh kkstore05
     mkdir /cluster/data/dp4/bed/blastz.droAna3.2006-10-02
     cd /cluster/data/dp4/bed/blastz.droAna3.2006-10-02
     cat << '_EOF_' > DEF
 # D. pseudoobscura vs. D. ananassae
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. pseudoobscura
 SEQ1_DIR=/iscratch/i/dp4/dp4.2bit
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dp4/chrom.sizes
 
 # QUERY - D. ananassae
 SEQ2_DIR=/iscratch/i/droAna3/droAna3.2bit
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droAna3/chrom.sizes
 
 BASE=/cluster/data/dp4/bed/blastz.droAna3.2006-10-02
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /panasas/store/dp4droAna3 >& do.log &
     tail -f do.log
     ln -s blastz.droAna3.2006-10-02 /cluster/data/dp4/bed/blastz.droAna3
 
 
 ###########################################################################
 # BLASTZ/CHAIN/NET DROWIL1 (DONE 10/3/06 angie)
     ssh kkstore05
     mkdir /cluster/data/dp4/bed/blastz.droWil1.2006-10-02
     cd /cluster/data/dp4/bed/blastz.droWil1.2006-10-02
     cat << '_EOF_' > DEF
 # D. pseudoobscura vs. D. willistoni
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=4000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET - D. pseudoobscura
 SEQ1_DIR=/iscratch/i/dp4/dp4.2bit
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LEN=/cluster/data/dp4/chrom.sizes
 
 # QUERY - D. willistoni
 SEQ2_DIR=/iscratch/i/droWil1/droWil1.2bit
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 SEQ2_LEN=/cluster/data/droWil1/chrom.sizes
 
 BASE=/cluster/data/dp4/bed/blastz.droWil1.2006-10-02
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -blastzOutRoot /panasas/store/dp4droWil1 >& do.log &
     tail -f do.log
     ln -s blastz.droWil1.2006-10-02 /cluster/data/dp4/bed/blastz.droWil1
 
 
 #########################################################################
 # SWAP DM3 CHAIN/NET (DONE 6/30/08 angie)
     ssh kkstore05
     mkdir /cluster/data/dp4/bed/blastz.dm3.swap
     cd /cluster/data/dp4/bed/blastz.dm3.swap
     doBlastzChainNet.pl -swap \
       /cluster/data/dm3/bed/blastz.dp4.2006-12-04/DEF >& do.log &
     tail -f do.log
     ln -s blastz.dm3.swap /cluster/data/dp4/bed/blastz.dm3
 
 
 #########################################################################
 # MAKE 11.OOC FILE FOR BLAT (DONE 9/18/08 angie)
     # Use -repMatch=85 -- increased from dp3's 75 because with 75 this
     # assembly loses 9272 tiles, as opposed to dp3's 6790.  Even if dp4 has
     # more repetitive reads than dp3, that seems like too big of an increase
     # (would lose sensitivity).
     ssh kolossus
     blat /hive/data/genomes/dp4/dp4.2bit /dev/null /dev/null -tileSize=11 \
       -makeOoc=/hive/data/genomes/dp4/11.ooc -repMatch=85
 #Wrote 7218 overused 11-mers to /hive/data/genomes/dp4/11.ooc
 
 
 #########################################################################
 # LIFTOVER TO DP3 (DONE 9/18/08)
     doSameSpeciesLiftOver.pl -bigClusterHub=pk -workhorse=kolossus \
       -ooc=/hive/data/genomes/dp4/11.ooc dp4 dp3 -debug
 # *** Steps were performed in /cluster/data/dp4/bed/blat.dp3.2008-09-18
     cd /cluster/data/dp4/bed/blat.dp3.2008-09-18
 #NOTE FOR NEXT TIME: redirect the output of the real run to a log file!  (oops, not saved here)
     doSameSpeciesLiftOver.pl -bigClusterHub=pk -workhorse=kolossus \
       -ooc=/hive/data/genomes/dp4/11.ooc dp4 dp3
 
 
 #########################################################################