src/hg/makeDb/doc/hg19.txt 1.50
1.50 2009/10/21 19:15:36 hiram
Bring the msa_split procedure up to date
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.49
retrieving revision 1.50
diff -b -B -U 4 -r1.49 -r1.50
--- src/hg/makeDb/doc/hg19.txt 21 Oct 2009 18:34:58 -0000 1.49
+++ src/hg/makeDb/doc/hg19.txt 21 Oct 2009 19:15:36 -0000 1.50
@@ -5252,36 +5252,43 @@
# maf files as they were split up during multiz
# split 46way mafs into 10M chunks and generate sufficient statistics
# files for phastCons
- ssh memk
+ ssh swarm
mkdir -p /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split
cd /hive/data/genomes/hg19/bed/multiz46way/mafSplit
./splitRegions.pl mafSplit.bed > \
/hive/data/genomes/hg19/bed/multiz46way/cons/msa.split/region.list
mkdir /hive/data/genomes/hg19/bed/multiz46way/cons/ss
- cd /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split
+ mkdir -p /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split/2009-10-19
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split/2009-10-19
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set c = $1
set MAF = /hive/data/genomes/hg19/bed/multiz46way/splitRun/maf/hg19_$c.maf
-set WINDOWS = /hive/data/genomes/hg19/bed/multiz46way/cons/ss/$c
+set WINDOWS = /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split/2009-10-19/ss/$c
+set WC = `cat $MAF | wc -l`
+set NL = `grep "^#" $MAF | wc -l`
+if ( -s $2 ) then
+ exit 0
+endif
+if ( -s $2.running ) then
+ exit 0
+endif
+
+date >> $2.running
+
rm -fr $WINDOWS
-# set seq = `egrep "${c}"'$' region.list | awk '{printf "-seq=%s -start=%d
-# -end=%d", $1, $2, $3}'`
-set seq = `egrep "${c}"'$' region.list | awk '{printf "-seq=%s", $1}'`
mkdir $WINDOWS
pushd $WINDOWS > /dev/null
-twoBitToFa ${seq} /hive/data/genomes/hg19/hg19.2bit hg19.$c.fa
-set empty = `faSize hg19.$c.fa | egrep " 0 real 0 upper 0 lower|masked total" | wc -l`
-if ( $empty != 2 ) then
- /cluster/bin/phast/$MACHTYPE/msa_split $MAF -i MAF \
- -M hg19.$c.fa -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
+if ( $WC != $NL ) then
+/cluster/bin/phast.build/cornellCVS/phast.2009-10-19/bin/msa_split \
+ $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
endif
-rm -f hg19.$c.fa
popd > /dev/null
date >> $2
+rm -f $2.running
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
@@ -5292,13 +5299,22 @@
'_EOF_'
# << happy emacs
# do the easy ones first to see some immediate results
- ls -1S -r ../../splitRun/maf | sed -e "s/.maf//; s/hg19_//" > maf.list
+ ls -1S -r ../../../splitRun/maf | sed -e "s/.maf//; s/hg19_//" > maf.list
gensub2 maf.list single template jobList
para -ram=32g create jobList
para try ... check ... etc
+# Completed: 503 of 504 jobs
+# Crashed: 1 jobs
+# CPU time in finished jobs: 14171s 236.18m 3.94h 0.16d 0.000 y
+# IO & Wait Time: 188193s 3136.55m 52.28h 2.18d 0.006 y
+# Average job time: 402s 6.71m 0.11h 0.00d
+# Longest finished job: 1597s 26.62m 0.44h 0.02d
+# Submission to last job: 2586s 43.10m 0.72h 0.03d
+ # the one crashed job is hg19_chr18_gl000207_random.00.maf
+
# XXX - this did not work
# this takes a really long time. memk was down to 2 usable
# machines - got it finished manually on a combination of hgwdevnew CPUs
# and other machines
@@ -6848,11 +6864,9 @@
zcat ${F}
done | sort -k1,1 -k2,2n > multiz46wayFrames.bed
featureBits -countGaps hg19 multiz46wayFrames.bed
- # 62315198 bases of 3107677273 (2.005%) in intersection
- featureBits -countGaps hg19 multiz28wayFrames
- # 48236360 bases of 3107677273 (1.552%) in intersection
+ # 57146632 bases of 3137161264 (1.822%) in intersection
# enable the trackDb entries:
# frames multiz46wayFrames
# irows on