src/hg/makeDb/doc/bosTauMd3.txt 1.2

1.2 2009/11/25 00:29:11 galt
make liftover
Index: src/hg/makeDb/doc/bosTauMd3.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/bosTauMd3.txt,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/makeDb/doc/bosTauMd3.txt	21 Nov 2009 01:25:40 -0000	1.1
+++ src/hg/makeDb/doc/bosTauMd3.txt	25 Nov 2009 00:29:11 -0000	1.2
@@ -1,142 +1,154 @@
 # Bos Taurus -- University of Maryland UMD Release 3.0 (Aug 25 2009)
 #
 #	"$Id$"
 #
 # creating a minimal build in order to make liftover files between MD3 and Btau4
 ###########################################################################
 
 # set up main genome directory
 
 ssh hgwdev
 cd /hive/data/genomes
 mkdir bosTauMd3
 cd bosTauMd3
 
 # DOWNLOAD SEQUENCE (DONE - 2009-11-18 - Galt)
 
 mkdir download
 cd download
 
 # get sequence from UMD
 
 wget --timestamping -nd 'ftp://ftp.cbcb.umd.edu/pub/data/Bos_taurus/Bos_taurus_UMD_3.0/bos_taurus.agp'
 wget --timestamping -nd 'ftp://ftp.cbcb.umd.edu/pub/data/Bos_taurus/Bos_taurus_UMD_3.0/bos_taurus.fa.gz'
 
 # fixup ids for agp
 #  change "Chr" at the start of each line
 #    to   "chr"  (agp lines carry no ">" prefix; the sed anchors on ^Chr)
 mv bos_taurus.agp bos_taurus.orig.agp
 cat bos_taurus.orig.agp | sed 's/\(^Chr\)/chr/' > bos_taurus.agp
 
 # back to the main directory
 cd /hive/data/genomes/bosTauMd3
 
 # Run automation to make the basic genome
 
     cat << '_EOF_' > bosTauMd3.config.ra
 # Config parameters for makeGenomeDb.pl:
 db bosTauMd3
 clade mammal
 scientificName Bos Taurus
 assemblyDate Aug. 2009
 assemblyLabel Univ. Maryland Release 3.0
 orderKey 236
 dbDbSpeciesDir cow
 mitoAcc 60101824
 commonName Cow
 taxId 9913
 fastaFiles /hive/data/genomes/bosTauMd3/download/bos_taurus.fa.gz
 agpFiles /hive/data/genomes/bosTauMd3/download/bos_taurus.agp
 subsetLittleIds Y
 '_EOF_'
     # << happy emacs
 
 
 time makeGenomeDb.pl bosTauMd3.config.ra > & makeGenomeDb.pl.out &
 # took 11 minutes
 
 #note: I added subsetLittleIds option to config.ra 
 # because I had checked the .agp which was ok,
 # but the script complained that there were some extra
 # sequences in the fasta file.  This option allows
 # the system to ignore those, as long as everything
 # in column 6 of agp is found in the fasta file(s).
 
 featureBits -countGaps bosTauMd3 gap
 #20737263 bases of 2660922743 (0.779%) in intersection
 
 cat chrom.sizes | awk '{sum+=$2;print sum,$0}'
 #2660922743
 # same total as the genome size reported by featureBits above
 
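 # TODO
 # Organism Image
 # NOTE: the wget below still fetches an Aplysia californica image, not a cow;
 # replace both the URL and the output filename before running it.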
 wget -O /usr/local/apache/htdocs/images/Aplysia_californica.jpg \
  'http://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/Aplysia_californica.jpg/250px-Aplysia_californica.jpg'
 
 # TODO
 # Edit and check-in templates for description.html, gold.html, gap.html, bosTauMd3/trackDb.ra
 
 # repeat mask
 # for next time, put this under the bed/ dir
 mkdir repeatMasker
 cd repeatMasker
 time doRepeatMasker.pl bosTauMd3 -buildDir=`pwd` > & doRepeatMasker.pl.out &
 
 
 cat faSize.rmsk.txt 
 #%47.84 masked total, %48.22 masked real
 
 featureBits -countGaps bosTauMd3 rmsk
 #1273525727 bases of 2660922743 (47.860%) in intersection
 
 # simple repeat masker trf
 cd /hive/data/genomes/bosTauMd3/bed
 mkdir simpleRepeat
 cd simpleRepeat
 time doSimpleRepeat.pl bosTauMd3 -buildDir=`pwd` > & doSimpleRepeat.pl.out &
 
 
 # failed only on chrM which doesn't matter
 # perhaps someone will look into why bigTrf fails on chrM
 ssh swarm
 cd /hive/data/genomes/bosTauMd3/bed/simpleRepeat/run.cluster
 /parasol/bin/para time > run.time
 
 time doSimpleRepeat.pl -continue filter -buildDir `pwd` bosTauMd3 \
   >> & doSimpleRepeat.pl.out &
 
 
 featureBits -countGaps bosTauMd3 simpleRepeat
 #29450984 bases of 2660922743 (1.107%) in intersection
 
 # make final masked .2bit
 cd /hive/data/genomes/bosTauMd3
 twoBitMask bosTauMd3.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTauMd3.2bit
 #Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it
 #might contain block coordinates, but this program uses only the first three
 #fields
 # (the entire span -- no support for blocks).
 # this seems to be generic output we always see.
 
 ############################################################################
 #	prepare cluster data (DONE - 2009-11-20 - Galt)
 
 ssh hgwdev
 cd /hive/data/genomes/bosTauMd3
 
 # create gbdb symlink
 ln -s `pwd`/bosTauMd3.2bit /gbdb/bosTauMd3/
 
 time blat bosTauMd3.2bit /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
 #Wrote 27298 overused 11-mers to 11.ooc
 #151.299u 3.434s 3:06.07 83.1%   0+0k 0+0io 3pf+0w
 
 mkdir /hive/data/staging/data/bosTauMd3
 cp -p bosTauMd3.2bit /hive/data/staging/data/bosTauMd3
 cp -p 11.ooc /hive/data/staging/data/bosTauMd3
 cp -p chrom.sizes /hive/data/staging/data/bosTauMd3
 
 # ask admin to sync this directory: /hive/data/staging/data/bosTauMd3/
 #	to the kluster nodes /scratch/data/bosTauMd3/
 
+#############################################################################
+# LIFTOVER TO bosTau4 (DONE - 2009-11-23 - Galt)
+    mkdir /hive/data/genomes/bosTauMd3/bed/blat.bosTau4.2009-11-23
+    cd /hive/data/genomes/bosTauMd3/bed/blat.bosTau4.2009-11-23
+    time nice +19 doSameSpeciesLiftOver.pl -verbose=2 \
+	-bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
+	 bosTauMd3 bosTau4 >& do.log
+    # it actually ran out of space on /scratch/tmp
+    # and I moved the temp dir somewhere else and
+    # finished the net step.  I did not do the load or cleanup steps.
+
+