src/hg/makeDb/doc/bosTauMd3.txt 1.2
1.2 2009/11/25 00:29:11 galt
make liftover
Index: src/hg/makeDb/doc/bosTauMd3.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/bosTauMd3.txt,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/makeDb/doc/bosTauMd3.txt 21 Nov 2009 01:25:40 -0000 1.1
+++ src/hg/makeDb/doc/bosTauMd3.txt 25 Nov 2009 00:29:11 -0000 1.2
@@ -1,142 +1,154 @@
# Bos Taurus -- University of Maryland UMD Release 3.0 (Aug 25 2009)
#
# "$Id$"
#
# creating minimal build in order to make liftover files between MD3 and Btau4
###########################################################################
# set up main genome directory
ssh hgwdev
cd /hive/data/genomes
mkdir bosTauMd3
cd bosTauMd3
# DOWNLOAD SEQUENCE (DONE - 2009-11-18 - Galt)
mkdir download
cd download
# get sequence from UMD
wget --timestamping -nd 'ftp://ftp.cbcb.umd.edu/pub/data/Bos_taurus/Bos_taurus_UMD_3.0/bos_taurus.agp'
wget --timestamping -nd 'ftp://ftp.cbcb.umd.edu/pub/data/Bos_taurus/Bos_taurus_UMD_3.0/bos_taurus.fa.gz'
# fixup ids for agp
# change a leading "Chr" at the start of each agp line
# to "chr"
mv bos_taurus.agp bos_taurus.orig.agp
cat bos_taurus.orig.agp | sed 's/\(^Chr\)/chr/' > bos_taurus.agp
# back to the main directory
cd /hive/data/genomes/bosTauMd3
# Run automation to make the basic genome
cat << '_EOF_' > bosTauMd3.config.ra
# Config parameters for makeGenomeDb.pl:
db bosTauMd3
clade mammal
scientificName Bos Taurus
assemblyDate Aug. 2009
assemblyLabel Univ. Maryland Release 3.0
orderKey 236
dbDbSpeciesDir cow
mitoAcc 60101824
commonName Cow
taxId 9913
fastaFiles /hive/data/genomes/bosTauMd3/download/bos_taurus.fa.gz
agpFiles /hive/data/genomes/bosTauMd3/download/bos_taurus.agp
subsetLittleIds Y
'_EOF_'
# << happy emacs
time makeGenomeDb.pl bosTauMd3.config.ra > & makeGenomeDb.pl.out &
# took 11 minutes
#note: I added subsetLittleIds option to config.ra
# because I had checked the .agp which was ok,
# but the script complained that there were some extra
# sequences in the fasta file. This option allows
# the system to ignore those, as long as everything
# in column 6 of agp is found in the fasta file(s).
featureBits -countGaps bosTauMd3 gap
#20737263 bases of 2660922743 (0.779%) in intersection
cat chrom.sizes | awk '{sum+=$2;print sum,$0}'
#2660922743
# same total as the genome size reported by featureBits above
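# TODO
# Organism Image
# NOTE(review): the wget below fetches an Aplysia_californica image, which looks
# like a leftover from another build doc -- substitute a cow image before running.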
wget -O /usr/local/apache/htdocs/images/Aplysia_californica.jpg \
'http://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/Aplysia_californica.jpg/250px-Aplysia_californica.jpg'
# TODO
# Edit and check-in templates for description.html, gold.html, gap.html, bosTauMd3/trackDb.ra
# repeat mask
# for next time, put this under the bed/ dir
mkdir repeatMasker
cd repeatMasker
time doRepeatMasker.pl bosTauMd3 -buildDir=`pwd` > & doRepeatMasker.pl.out &
cat faSize.rmsk.txt
#%47.84 masked total, %48.22 masked real
featureBits -countGaps bosTauMd3 rmsk
#1273525727 bases of 2660922743 (47.860%) in intersection
# simple repeat masker trf
cd /hive/data/genomes/bosTauMd3/bed
mkdir simpleRepeat
cd simpleRepeat
time doSimpleRepeat.pl bosTauMd3 -buildDir=`pwd` > & doSimpleRepeat.pl.out &
# failed only on chrM which doesn't matter
# perhaps someone will look into why bigTrf fails on chrM
ssh swarm
cd /hive/data/genomes/bosTauMd3/bed/simpleRepeat/run.cluster
/parasol/bin/para time > run.time
time doSimpleRepeat.pl -continue filter -buildDir `pwd` bosTauMd3 \
>> & doSimpleRepeat.pl.out &
featureBits -countGaps bosTauMd3 simpleRepeat
#29450984 bases of 2660922743 (1.107%) in intersection
# make final masked .2bit
cd /hive/data/genomes/bosTauMd3
twoBitMask bosTauMd3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTauMd3.2bit
#Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it
#might contain block coordinates, but this program uses only the first three
#fields
# (the entire span -- no support for blocks).
# this seems to be generic output we always see.
############################################################################
# prepare cluster data (DONE - 2009-11-20 - Galt)
ssh hgwdev
cd /hive/data/genomes/bosTauMd3
# create gbdb symlink
ln -s `pwd`/bosTauMd3.2bit /gbdb/bosTauMd3/
time blat bosTauMd3.2bit /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
#Wrote 27298 overused 11-mers to 11.ooc
#151.299u 3.434s 3:06.07 83.1% 0+0k 0+0io 3pf+0w
mkdir /hive/data/staging/data/bosTauMd3
cp -p bosTauMd3.2bit /hive/data/staging/data/bosTauMd3
cp -p 11.ooc /hive/data/staging/data/bosTauMd3
cp -p chrom.sizes /hive/data/staging/data/bosTauMd3
# ask admin to sync this directory: /hive/data/staging/data/bosTauMd3/
# to the kluster nodes /scratch/data/bosTauMd3/
+#############################################################################
+# LIFTOVER TO bosTau4 (DONE - 2009-11-23 - Galt )
+ mkdir /hive/data/genomes/bosTauMd3/bed/blat.bosTau4.2009-11-23
+ cd /hive/data/genomes/bosTauMd3/bed/blat.bosTau4.2009-11-23
+ time nice +19 doSameSpeciesLiftOver.pl -verbose=2 \
+ -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
+ bosTauMd3 bosTau4 >& do.log
+ # it actually ran out of space on /scratch/tmp
+ # and I moved the temp dir somewhere else and
+ # finished the net step. I did not do the load or cleanup steps.
+
+