src/hg/makeDb/doc/bosTauMd3.txt 1.1
1.1 2009/11/21 01:25:40 galt
adding MD3 so we can make chains to bosTau4
Index: src/hg/makeDb/doc/bosTauMd3.txt
===================================================================
RCS file: src/hg/makeDb/doc/bosTauMd3.txt
diff -N src/hg/makeDb/doc/bosTauMd3.txt
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/makeDb/doc/bosTauMd3.txt 21 Nov 2009 01:25:40 -0000 1.1
@@ -0,0 +1,142 @@
+# Bos Taurus -- University of Maryland UMD Release 3.0 (Aug 25 2009)
+#
+# "$Id$"
+#
+# creating minimal build in order to make liftover files between MD3 and Btau4
+###########################################################################
+
+# set up main genome directory
+
+ssh hgwdev
+cd /hive/data/genomes
+mkdir bosTauMd3
+cd bosTauMd3
+
+# DOWNLOAD SEQUENCE (DONE - 2009-11-18 - Galt)
+
+mkdir download
+cd download
+
+# get sequence from UMD
+
+wget --timestamping -nd 'ftp://ftp.cbcb.umd.edu/pub/data/Bos_taurus/Bos_taurus_UMD_3.0/bos_taurus.agp'
+wget --timestamping -nd 'ftp://ftp.cbcb.umd.edu/pub/data/Bos_taurus/Bos_taurus_UMD_3.0/bos_taurus.fa.gz'
+
+# fixup ids for agp
+# change a leading "Chr" at the start of each line
+# to "chr"
+mv bos_taurus.agp bos_taurus.orig.agp
+cat bos_taurus.orig.agp | sed 's/\(^Chr\)/chr/' > bos_taurus.agp
+
+# back to the main directory
+cd /hive/data/genomes/bosTauMd3
+
+# Run automation to make the basic genome
+
+ cat << '_EOF_' > bosTauMd3.config.ra
+# Config parameters for makeGenomeDb.pl:
+db bosTauMd3
+clade mammal
+scientificName Bos Taurus
+assemblyDate Aug. 2009
+assemblyLabel Univ. Maryland Release 3.0
+orderKey 236
+dbDbSpeciesDir cow
+mitoAcc 60101824
+commonName Cow
+taxId 9913
+fastaFiles /hive/data/genomes/bosTauMd3/download/bos_taurus.fa.gz
+agpFiles /hive/data/genomes/bosTauMd3/download/bos_taurus.agp
+subsetLittleIds Y
+'_EOF_'
+ # << happy emacs
+
+
+time makeGenomeDb.pl bosTauMd3.config.ra > & makeGenomeDb.pl.out &
+# took 11 minutes
+
+#note: I added subsetLittleIds option to config.ra
+# because I had checked the .agp which was ok,
+# but the script complained that there were some extra
+# sequences in the fasta file. This option allows
+# the system to ignore those, as long as everything
+# in column 6 of agp is found in the fasta file(s).
+
+featureBits -countGaps bosTauMd3 gap
+#20737263 bases of 2660922743 (0.779%) in intersection
+
+cat chrom.sizes | awk '{sum+=$2;print sum,$0}'
+#2660922743
+# final running sum is the same total that featureBits reported above
+
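+# TODO
+# Organism Image
+# NOTE(review): the wget below fetches an Aplysia californica image, which
+# looks like a placeholder carried over from another build -- substitute a
+# cow (Bos taurus) image and file name before running it.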
+wget -O /usr/local/apache/htdocs/images/Aplysia_californica.jpg \
+ 'http://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/Aplysia_californica.jpg/250px-Aplysia_californica.jpg'
+
+# TODO
+# Edit and check-in templates for description.html, gold.html, gap.html, bosTauMd3/trackDb.ra
+
+# repeat mask
+# for next time, put this under the bed/ dir
+mkdir repeatMasker
+cd repeatMasker
+time doRepeatMasker.pl bosTauMd3 -buildDir=`pwd` > & doRepeatMasker.pl.out &
+
+
+cat faSize.rmsk.txt
+#%47.84 masked total, %48.22 masked real
+
+featureBits -countGaps bosTauMd3 rmsk
+#1273525727 bases of 2660922743 (47.860%) in intersection
+
+# simple repeat masker trf
+cd /hive/data/genomes/bosTauMd3/bed
+mkdir simpleRepeat
+cd simpleRepeat
+time doSimpleRepeat.pl bosTauMd3 -buildDir=`pwd` > & doSimpleRepeat.pl.out &
+
+
+# failed only on chrM which doesn't matter
+# perhaps someone will look into why bigTrf fails on chrM
+ssh swarm
+cd /hive/data/genomes/bosTauMd3/bed/simpleRepeat/run.cluster
+/parasol/bin/para time > run.time
+
+time doSimpleRepeat.pl -continue filter -buildDir `pwd` bosTauMd3 \
+ >> & doSimpleRepeat.pl.out &
+
+
+featureBits -countGaps bosTauMd3 simpleRepeat
+#29450984 bases of 2660922743 (1.107%) in intersection
+
+# make final masked .2bit
+cd /hive/data/genomes/bosTauMd3
+twoBitMask bosTauMd3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTauMd3.2bit
+#Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it
+#might contain block coordinates, but this program uses only the first three
+#fields
+# (the entire span -- no support for blocks).
+# this seems to be generic output we always see.
+
+############################################################################
+# prepare cluster data (DONE - 2009-11-20 - Galt)
+
+ssh hgwdev
+cd /hive/data/genomes/bosTauMd3
+
+# create gbdb symlink
+ln -s `pwd`/bosTauMd3.2bit /gbdb/bosTauMd3/
+
+time blat bosTauMd3.2bit /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
+#Wrote 27298 overused 11-mers to 11.ooc
+#151.299u 3.434s 3:06.07 83.1% 0+0k 0+0io 3pf+0w
+
+mkdir /hive/data/staging/data/bosTauMd3
+cp -p bosTauMd3.2bit /hive/data/staging/data/bosTauMd3
+cp -p 11.ooc /hive/data/staging/data/bosTauMd3
+cp -p chrom.sizes /hive/data/staging/data/bosTauMd3
+
+# ask admin to sync this directory: /hive/data/staging/data/bosTauMd3/
+# to the kluster nodes /scratch/data/bosTauMd3/
+