src/hg/makeDb/doc/calJac3.txt 1.11
1.11 2010/05/13 21:35:49 chinhli
Add gap annotation for multiz13way
Index: src/hg/makeDb/doc/calJac3.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/calJac3.txt,v
retrieving revision 1.10
retrieving revision 1.11
diff -b -B -U 4 -r1.10 -r1.11
--- src/hg/makeDb/doc/calJac3.txt 13 May 2010 17:18:14 -0000 1.10
+++ src/hg/makeDb/doc/calJac3.txt 13 May 2010 21:35:49 -0000 1.11
@@ -1248,39 +1248,157 @@
mkdir maf
para -ram=8g create jobList
# put the split mafs back together into a single result
+ ssh hgwdev
+ cd /hive/data/genomes/calJac3/bed/multiz13way/run
+
head -q -n 1 maf/000.maf > calJac3.13way.maf
for F in maf/*.maf
do
grep -h -v "^#" ${F} >> calJac3.13way.maf
done
tail -q -n 1 maf/000.maf >> calJac3.13way.maf
- tail -q -n 1 maf/hg19_${C}.*.maf | sort -u >> ../maf/${C}.maf
-
- # real 13m32.340s
# load tables for a look
mkdir -p /gbdb/calJac3/multiz13way/maf
- cd /hive/data/genomes/calJac3/bed/multiz13way/maf
+ cd /hive/data/genomes/calJac3/bed/multiz13way/run
ln -s `pwd`/calJac3.13way.maf \
/gbdb/calJac3/multiz13way/maf/multiz13way.maf
# this generates an immense multiz13way.tab file in the directory
# where it is running. Best to run this over in scratch.
cd /data/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/calJac3/multiz13way/maf calJac3 multiz13way
- # Loaded 13316945 mafs in 1 files from /gbdb/calJac3/multiz13way/maf
- # real 9m9.365s
+ # Loading multiz13way into database
+ # Loaded 27583585 mafs in 257 files from /gbdb/calJac3/multiz13way/maf
+ # real 34m38.042s
# load summary table
time nice -n +19 cat /gbdb/calJac3/multiz13way/maf/*.maf \
| hgLoadMafSummary calJac3 -minSize=30000 -verbose=2 \
-mergeGap=1500 -maxSize=200000 multiz13waySummary stdin
-# Created 2330531 summary blocks from 99659162 components and
-# 13316945 mafs from stdin
- # real 17m54.685s
+ # Created 4661062 summary blocks from 199318324 components
+ # and 27583585 mafs from stdin
+ # Loading into calJac3 table multiz13waySummary...
+ # Loading complete
+ # real 38m46.339s
+
+ # Gap Annotation
+ # prepare bed files with gap info
+ mkdir /hive/data/genomes/calJac3/bed/multiz13way/anno
+ cd /hive/data/genomes/calJac3/bed/multiz13way/anno
+ mkdir maf run
+
+ # most of these N.bed files will already exist from previous
+ # multiple alignments
+ # the twoBitInfo command below is only echoed (a dry run); once
+ # the loop output looks correct, remove the echo in front of
+ # twoBitInfo so that the missing files actually get created
+ for DB in `cat ../species.list`
+do
+ CDIR="/hive/data/genomes/${DB}"
+ if [ ! -f ${CDIR}/${DB}.N.bed ]; then
+ echo "creating ${DB}.N.bed"
+ echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
+ else
+ ls -og ${CDIR}/${DB}.N.bed
+ fi
+done
+
+ cd run
+ # rm -f nBeds sizes
+ for DB in `sed -e "s/calJac3 //" ../../species.list`
+do
+ echo "${DB} "
+ ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
+ echo ${DB}.bed >> nBeds
+ ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
+ echo ${DB}.len >> sizes
+done
+
+ # the annotation step requires large memory, run on memk nodes
+ ssh memk
+ cd /hive/data/genomes/calJac3/bed/multiz13way/anno/run
+ ls ../../run/maf | sed -e "s/.maf//" > chr.list
+ cat << '_EOF_' > template
+#LOOP
+./anno.csh $(root1) {check out line+ ../maf/$(root1).maf}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ cat << '_EOF_' > anno.csh
+#!/bin/csh -fe
+
+set inMaf = ../../run/maf/$1.maf
+set outMaf = ../maf/$1.maf
+rm -f $outMaf
+mafAddIRows -nBeds=nBeds $inMaf /hive/data/genomes/calJac3/calJac3.2bit $outMaf
+'_EOF_'
+ # << happy emacs
+ chmod +x anno.csh
+
+ gensub2 chr.list single template jobList
+ para -ram=30g create jobList
+ # specify lots of ram to get one job per node
+ para -ram=30g push
+ #
+ # para check
+ # 256 jobs in batch
+ # 0 jobs (including everybody's) in Parasol queue or running.
+ # Checking finished jobs
+ # ...
+ # ranOk: 256
+ # total jobs in batch: 256
+
+
+ ssh hgwdev
+ rm -fr /gbdb/calJac3/multiz13way/maf
+ mkdir /gbdb/calJac3/multiz13way/maf
+ cd /hive/data/genomes/calJac3/bed/multiz13way/anno/maf
+ ln -s `pwd`/*.maf /gbdb/calJac3/multiz13way/maf/
+ # loading into the table multiz13way replaces the previously loaded
+ # table, which held the unannotated mafs
+ # the load creates huge temp files, so run it on local disk
+ cd /data/tmp
+ time nice -n +19 hgLoadMaf \
+ -pathPrefix=/gbdb/calJac3/multiz13way/maf calJac3 multiz13way
+ # Loaded 14259674 mafs in 256 files from /gbdb/calJac3/multiz13way/maf
+ # real 15m41.804s
+
+ time nice -n +19 cat /gbdb/calJac3/multiz13way/maf/*.maf \
+ | hgLoadMafSummary calJac3 -minSize=30000 -mergeGap=1500 \
+ -maxSize=200000 multiz13waySummary stdin
+ # Indexing and tabulating stdin
+ # Created 2330531 summary blocks from 99659162 components
+ # and 14259674 mafs from stdin
+ # Loading into calJac3 table multiz13waySummary...
+ # Loading complete
+ # real 17m17.229s
+
+ # loading into the table multiz13waySummary replaces the
+ # previously loaded summary table, which was built from the
+ # unannotated mafs
+ # remove the multiz13way*.tab files left in this /data/tmp directory
+ # -rw-rw-r-- 1 727501509 May 11 16:48 multiz13way.tab
+ # -rw-rw-r-- 1 113079736 May 12 08:57 multiz13waySummary.tab
+ wc -l multiz13way*.tab
+ # 14259674 multiz13way.tab
+ # 2330531 multiz13waySummary.tab
+ # 16590205 total
+ rm multiz13way*.tab
+
+ # create some downloads
+ mkdir -p /hive/data/genomes/calJac3/bed/multiz13way/download/maf
+ cd /hive/data/genomes/calJac3/bed/multiz13way/download/maf
+ time cp -p ../../anno/maf/*.maf .
+ # real 37m48.902s
+ time gzip --rsyncable *.maf
+ # real 64m55.554s
+ time md5sum *.gz > md5sum.txt
+ # real 2m41.707s
#####################################################################
# all.joiner update, downloads and in pushQ - (DONE - 2010-04-01 - Hiram)
cd $HOME/kent/src/hg/makeDb/schema