src/hg/makeDb/doc/dp4.txt 1.5
1.5 2009/04/06 17:07:51 angie
Added comment that 12fly dpse_caf1 is the same assembly as FlyBase dpse_r2.1_FB2008_02.
Index: src/hg/makeDb/doc/dp4.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/dp4.txt,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 1000000 -r1.4 -r1.5
--- src/hg/makeDb/doc/dp4.txt 19 Sep 2008 06:24:30 -0000 1.4
+++ src/hg/makeDb/doc/dp4.txt 6 Apr 2009 17:07:51 -0000 1.5
@@ -1,220 +1,221 @@
# for emacs: -*- mode: sh; -*-
# Drosophila pseudoobscura -- Baylor "CAF1" via Eisen's 12-fly site
+# (Identical to FlyBase dpse_r2.1_FB2008_02)
# THIS IS ONLY TO GET MASKED SEQUENCE -- NOT A BROWSER AT THIS POINT
#########################################################################
# DOWNLOAD SEQUENCE (DONE 9/26/06 angie)
ssh kkstore05
mkdir /cluster/store12/dp4
ln -s /cluster/store12/dp4 /cluster/data/dp4
mkdir /cluster/data/dp4/downloads
cd /cluster/data/dp4/downloads
wget http://rana.lbl.gov/drosophila/caf1/dpse_caf1.tar.gz
tar xvzf dpse_caf1.tar.gz
cd dpse
faSize scaffolds.fa
#152738921 bases (6681752 N's 146057169 real 146057169 upper 0 lower) in 4896 sequences in 1 files
#Total size: mean 31196.7 sd 649312.0 min 101 (Unknown_singleton_1691) max 30794189 (Ch2) median 1734
#N count: mean 1364.7 sd 21011.7
#U count: mean 29831.9 sd 628747.5
#L count: mean 0.0 sd 0.0
# Tweak their funny chromosome names (Ch*) to our pattern (chr*):
sed -e 's/^>Ch/>chr/' scaffolds.fa > UCSC.fa
sed -e 's/^Ch/chr/' assembly.agp > UCSC.agp
#########################################################################
# MAKE GENOME DB *UP TO DB STEP ONLY* (DONE 9/27/06 angie)
ssh kkstore05
cd /cluster/data/dp4
cat > dp4.config.ra <<EOF
# Config parameters for makeGenomeDb.pl:
db dp4
clade insect
scientificName Drosophila pseudoobscura
assemblyDate Feb. 2006
assemblyLabel Baylor CAF1
orderKey 57
mitoAcc none
fastaFiles /cluster/data/dp4/downloads/dpse/UCSC.fa
agpFiles /cluster/data/dp4/downloads/dpse/UCSC.agp
dbDbSpeciesDir drosophila
EOF
# Stop at db step so we can use featureBits, but don't do dbDb and trackDb
# because we're not building an actual browser for now.
makeGenomeDb.pl dp4.config.ra -stop=db \
>& makeGenomeDb.log & tail -f makeGenomeDb.log
# The chromInfo load command failed: the unplaced sequences have extremely
# long names, and the index string length in chromInfo.sql was too short
# for them. So I temporarily modified my ~/kent/src/hg/lib/chromInfo.sql
# to lengthen the index, dropped the dp4 database and re-ran
# makeGenomeDb.pl with -continue db.
#########################################################################
# REPEATMASKER (DONE 9/27/06 angie)
ssh kkstore05
# Run -debug to create the dir structure and preview the scripts:
doRepeatMasker.pl dp4 -verbose 3 -debug
# Run it for real and tail the log:
doRepeatMasker.pl dp4 -species drosophila -verbose 3 \
>& /cluster/data/dp4/bed/RepeatMasker.2006-09-27/do.log &
tail -f /cluster/data/dp4/bed/RepeatMasker.2006-09-27/do.log
# RepeatMasker and lib version from do.log:
# March 20 2006 (open-3-1-5) version of RepeatMasker
#CC RELEASE 20060315; *
# Compare coverage to previous assembly:
featureBits -chrom=chr2 dp4 rmsk
#1245499 bases of 29873196 (4.169%) in intersection
featureBits -chrom=chr2 dp3 rmsk
#1140300 bases of 29702756 (3.839%) in intersection
#########################################################################
# SIMPLE REPEATS (TRF) (DONE 9/27/06 angie)
ssh kolossus
nice tcsh
mkdir /cluster/data/dp4/bed/simpleRepeat
cd /cluster/data/dp4/bed/simpleRepeat
twoBitToFa ../../dp4.unmasked.2bit stdout \
| trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
-bedAt=simpleRepeat.bed -tempDir=/tmp \
>& trf.log & tail -f trf.log
# Took ~50 minutes -- longer than D. melanogaster, which must be because of the many scaffolds.
# Make a filtered version for sequence masking (keep only rows whose 5th field is <= 12):
awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
# Load unfiltered repeats into the database:
ssh hgwdev
hgLoadBed dp4 simpleRepeat \
/cluster/data/dp4/bed/simpleRepeat/simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Compare coverage to previous assembly:
featureBits -chrom=chr2 dp4 simpleRepeat
#530609 bases of 29873196 (1.776%) in intersection
featureBits -chrom=chr2 dp3 simpleRepeat
#517077 bases of 29702756 (1.741%) in intersection
#########################################################################
# MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 9/27/06 angie)
ssh kolossus
cd /cluster/data/dp4
time twoBitMask dp4.rmsk.2bit -add bed/simpleRepeat/trfMask.bed dp4.2bit
# This warning is OK -- the extra fields are not block coordinates.
#Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks).
#0.201u 0.367s 0:01.78 31.4% 0+0k 0+0io 1pf+0w
# Because this is a no-browser build (just masking for alignment)
# I did not make the usual /gbdb/$db/$db.2bit link.
###########################################################################
# BLASTZ/CHAIN/NET DROANA3 (DONE 10/3/06 angie)
ssh kkstore05
mkdir /cluster/data/dp4/bed/blastz.droAna3.2006-10-02
cd /cluster/data/dp4/bed/blastz.droAna3.2006-10-02
cat << '_EOF_' > DEF
# D. pseudoobscura vs. D. ananassae
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. pseudoobscura
SEQ1_DIR=/iscratch/i/dp4/dp4.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dp4/chrom.sizes
# QUERY - D. ananassae
SEQ2_DIR=/iscratch/i/droAna3/droAna3.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droAna3/chrom.sizes
BASE=/cluster/data/dp4/bed/blastz.droAna3.2006-10-02
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /panasas/store/dp4droAna3 >& do.log &
tail -f do.log
ln -s blastz.droAna3.2006-10-02 /cluster/data/dp4/bed/blastz.droAna3
###########################################################################
# BLASTZ/CHAIN/NET DROWIL1 (DONE 10/3/06 angie)
ssh kkstore05
mkdir /cluster/data/dp4/bed/blastz.droWil1.2006-10-02
cd /cluster/data/dp4/bed/blastz.droWil1.2006-10-02
cat << '_EOF_' > DEF
# D. pseudoobscura vs. D. willistoni
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=4000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - D. pseudoobscura
SEQ1_DIR=/iscratch/i/dp4/dp4.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dp4/chrom.sizes
# QUERY - D. willistoni
SEQ2_DIR=/iscratch/i/droWil1/droWil1.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/droWil1/chrom.sizes
BASE=/cluster/data/dp4/bed/blastz.droWil1.2006-10-02
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF \
-blastzOutRoot /panasas/store/dp4droWil1 >& do.log &
tail -f do.log
ln -s blastz.droWil1.2006-10-02 /cluster/data/dp4/bed/blastz.droWil1
#########################################################################
# SWAP DM3 CHAIN/NET (DONE 6/30/08 angie)
ssh kkstore05
mkdir /cluster/data/dp4/bed/blastz.dm3.swap
cd /cluster/data/dp4/bed/blastz.dm3.swap
doBlastzChainNet.pl -swap \
/cluster/data/dm3/bed/blastz.dp4.2006-12-04/DEF >& do.log &
tail -f do.log
ln -s blastz.dm3.swap /cluster/data/dp4/bed/blastz.dm3
#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE 9/18/08 angie)
# Use -repMatch=85, increased from the 75 used for dp3: with -repMatch=75
# this assembly loses 9272 tiles (overused 11-mers), as opposed to 6790
# for dp3. Even if dp4 has more repetitive reads than dp3, that seems
# like too big an increase (blat would lose sensitivity).
ssh kolossus
blat /hive/data/genomes/dp4/dp4.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=/hive/data/genomes/dp4/11.ooc -repMatch=85
#Wrote 7218 overused 11-mers to /hive/data/genomes/dp4/11.ooc
#########################################################################
# LIFTOVER TO DP3 (DONE 9/18/08)
doSameSpeciesLiftOver.pl -bigClusterHub=pk -workhorse=kolossus \
-ooc=/hive/data/genomes/dp4/11.ooc dp4 dp3 -debug
# *** Steps were performed in /cluster/data/dp4/bed/blat.dp3.2008-09-18
cd /cluster/data/dp4/bed/blat.dp3.2008-09-18
# NOTE FOR NEXT TIME: save the log file! The run below did not redirect its output to a log (oops).
doSameSpeciesLiftOver.pl -bigClusterHub=pk -workhorse=kolossus \
-ooc=/hive/data/genomes/dp4/11.ooc dp4 dp3
#########################################################################