src/hg/makeDb/doc/droGri2.txt 1.3
1.3 2009/04/07 16:55:46 angie
Swapped chains/nets from dm3 to get liftOver chains to dm3 -- user request.
Index: src/hg/makeDb/doc/droGri2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/droGri2.txt,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/hg/makeDb/doc/droGri2.txt 23 Oct 2006 22:45:39 -0000 1.2
+++ src/hg/makeDb/doc/droGri2.txt 7 Apr 2009 16:55:46 -0000 1.3
@@ -1,156 +1,168 @@
# for emacs: -*- mode: sh; -*-
# Drosophila grimshawi -- Agencourt "CAF1" via Eisen's 12-fly site
# THIS IS ONLY TO GET MASKED SEQUENCE -- NOT A BROWSER AT THIS POINT
#########################################################################
# DOWNLOAD SEQUENCE (DONE 9/26/06 angie)
ssh kkstore05
mkdir /cluster/store12/droGri2
ln -s /cluster/store12/droGri2 /cluster/data/droGri2
mkdir /cluster/data/droGri2/downloads
cd /cluster/data/droGri2/downloads
wget http://rana.lbl.gov/drosophila/caf1/dgri_caf1.tar.gz
tar xvzf dgri_caf1.tar.gz
cd dgri
faSize scaffolds.bases
#200467819 bases (14377150 N's 186090669 real 186090669 upper 0 lower) in 17440 sequences in 1 files
#Total size: mean 11494.7 sd 331870.4 min 80 (scaffold_16070) max 24565398 (scaffold_15110) median 1702
#N count: mean 824.4 sd 8307.0
#U count: mean 10670.3 sd 326457.5
#L count: mean 0.0 sd 0.0
#########################################################################
# MAKE GENOME DB *UP TO DB STEP ONLY* (DONE 9/26/06 angie)
ssh kkstore05
cd /cluster/data/droGri2
cat > droGri2.config.ra <<EOF
# Config parameters for makeGenomeDb.pl:
db droGri2
clade insect
scientificName Drosophila grimshawi
assemblyDate Feb. 2006
assemblyLabel Agencourt CAF1
orderKey 57
mitoAcc none
fastaFiles /cluster/data/droGri2/downloads/dgri/scaffolds.bases
agpFiles /cluster/data/droGri2/downloads/dgri/assembly.agp
dbDbSpeciesDir drosophila
EOF
# Stop at db step so we can use featureBits, but don't do dbDb and trackDb
# because we're not building an actual browser for now.
makeGenomeDb.pl droGri2.config.ra -stop=db \
>& makeGenomeDb.log & tail -f makeGenomeDb.log
#########################################################################
# REPEATMASKER (DONE 9/26/06 angie)
ssh kkstore05
# Run -debug to create the dir structure and preview the scripts:
doRepeatMasker.pl droGri2 -verbose 3 -debug
# Run it for real and tail the log:
doRepeatMasker.pl droGri2 -species drosophila -verbose 3 \
>& /cluster/data/droGri2/bed/RepeatMasker.2006-09-26/do.log &
tail -f /cluster/data/droGri2/bed/RepeatMasker.2006-09-26/do.log
# RepeatMasker and lib version from do.log:
# March 20 2006 (open-3-1-5) version of RepeatMasker
#CC RELEASE 20060315; *
# Compare coverage to the previous assembly (largest scaffold in each, so not an apples-to-apples comparison):
featureBits -chrom=scaffold_15110 droGri2 rmsk
#1755374 bases of 24078664 (7.290%) in intersection
featureBits -chrom=scaffold_25013 droGri1 rmsk
#1231873 bases of 14091279 (8.742%) in intersection
#########################################################################
# SIMPLE REPEATS (TRF) (DONE 9/26/06 angie)
ssh kolossus
nice tcsh
mkdir /cluster/data/droGri2/bed/simpleRepeat
cd /cluster/data/droGri2/bed/simpleRepeat
twoBitToFa ../../droGri2.unmasked.2bit stdout \
| trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
-bedAt=simpleRepeat.bed -tempDir=/tmp \
>& trf.log & tail -f trf.log
# ~120 minutes (longer than for D. melanogaster, presumably because of the many scaffolds)
# Make a filtered version for sequence masking:
awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
# Load unfiltered repeats into the database:
ssh hgwdev
hgLoadBed droGri2 simpleRepeat \
/cluster/data/droGri2/bed/simpleRepeat/simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Compare coverage to the previous assembly (largest scaffold in each, so not an apples-to-apples comparison):
featureBits -chrom=scaffold_15110 droGri2 simpleRepeat
#559069 bases of 24078664 (2.322%) in intersection
featureBits -chrom=scaffold_25013 droGri1 simpleRepeat
#251144 bases of 14091279 (1.782%) in intersection
#########################################################################
# MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 9/26/06 angie)
ssh kolossus
cd /cluster/data/droGri2
time twoBitMask droGri2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed droGri2.2bit
# This warning is OK -- the extra fields are not block coordinates.
#Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks).
#0.345u 0.405s 0:02.33 31.7% 0+0k 0+0io 1pf+0w
# Because this is a no-browser build (just masking for alignment)
# I did not make the usual /gbdb/$db/$db.2bit link.
###########################################################################
# WINDOWMASKER EXPERIMENT (DONE 10/23/06 angie)
# The droVir3-droGri2 blastz run was overwhelmed by its enormous output,
# even with -chainFilterMinScore=10000 and M=50 (trying M=20, but not
# too hopeful)... so let's try a de novo masker before alignment:
ssh kolossus
mkdir /cluster/data/droGri2/bed/windowmasker.2006-10-23
cd /cluster/data/droGri2/bed/windowmasker.2006-10-23
twoBitToFa /cluster/data/droGri2/droGri2.2bit tmp.fa
# First, collect counts:
time /cluster/bin/x86_64/windowmasker -mk_counts true -input tmp.fa \
-output wm.counts
#109.210u 2.980s 1:54.25 98.1% 0+0k 0+0io 0pf+0w
# Then use those counts to mask sequence:
time /cluster/bin/x86_64/windowmasker -ustat wm.counts -input tmp.fa \
-output wm.intervals
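#188.383u 1.445s 3:11.42 99.1% 0+0k 0+0io 0pf+0w
# Convert the interval list to BED: take the sequence name from each
# ">lcl|name" header line (dropping that line) and rewrite each
# "start - end" line as name, start, end+1 (the +1 presumably because
# windowmasker's end is inclusive while BED's end is exclusive -- confirm).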
perl -wpe 'if (s/^>lcl\|(\w+).*\n$//) { $chr = $1; } \
if (/^(\d+) - (\d+)/) { \
$s=$1; $e=$2+1; s/(\d+) - (\d+)/$chr\t$s\t$e/; \
}' wm.intervals > windowmasker.bed
# Quick coverage:
awk '{print $3 - $2;}' windowmasker.bed | total
#76056110
awk '{print $2;}' ../../chrom.sizes | total
#200467819
calc 76056110 / 200467819
#76056110 / 200467819 = 0.379393
# Make a masked .2bit:
twoBitMask ../../droGri2.2bit windowmasker.bed ../../droGri2.WM.2bit
# Now try with -sdust to additionally mask low-complexity sequence:
time /cluster/bin/x86_64/windowmasker -ustat wm.counts -sdust true \
-input tmp.fa -output wm.sdust.intervals
#488.269u 1.617s 8:11.81 99.6% 0+0k 0+0io 0pf+0w
perl -wpe 'if (s/^>lcl\|(\w+).*\n$//) { $chr = $1; } \
if (/^(\d+) - (\d+)/) { \
$s=$1; $e=$2+1; s/(\d+) - (\d+)/$chr\t$s\t$e/; \
}' wm.sdust.intervals > windowmasker.sdust.bed
awk '{print $3 - $2;}' windowmasker.sdust.bed | total
#92936045
calc 92936045 / 200467819
#92936045 / 200467819 = 0.463596
# Make a masked .2bit (even if we don't end up needing it):
twoBitMask ../../droGri2.2bit windowmasker.sdust.bed \
../../droGri2.WMSDust.2bit
rm tmp.fa
+#########################################################################
+# SWAP DM3 CHAIN/NET (DONE 4/3/09 angie)
+ mkdir /hive/data/genomes/droGri2/bed/blastz.dm3.swap
+ cd /hive/data/genomes/droGri2/bed/blastz.dm3.swap
+ doBlastzChainNet.pl -swap -bigClusterHub swarm -smallClusterHub memk \
+ -workhorse kolossus \
+ /hive/data/genomes/dm3/bed/blastz.droGri2/DEF >& do.log &
+ tail -f do.log
+ ln -s blastz.dm3.swap /hive/data/genomes/droGri2/bed/blastz.dm3
+
+
+#########################################################################