src/hg/makeDb/doc/hg19.txt 1.35
1.35 2009/08/11 17:16:53 hiram
done with hg19 tetNig2 chain and net
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.34
retrieving revision 1.35
diff -b -B -U 4 -r1.34 -r1.35
--- src/hg/makeDb/doc/hg19.txt 29 Jul 2009 18:42:22 -0000 1.34
+++ src/hg/makeDb/doc/hg19.txt 11 Aug 2009 17:16:53 -0000 1.35
@@ -4437,17 +4438,17 @@
(((vicPac1:0.105252,(turTru1:0.064182,bosTau4:0.121911):0.025111):0.039691,
((equCab2:0.107726,(felCat3:0.097971,canFam2:0.100888):0.049486):0.006252,
(myoLuc1:0.141155,pteVam1:0.111787):0.033187):0.004179):0.011699,
(eriEur1:0.220580,sorAra1:0.266859):0.056117):0.021065):0.023276,
-(((loxAfr2:0.083775,proCap1:0.152633):0.026190,echTel1:0.240221):0.049905,
+(((loxAfr3:0.083775,proCap1:0.152633):0.026190,echTel1:0.240221):0.049905,
(dasNov2:0.115179,choHof1:0.096272):0.052373):0.006713):0.132748,
macEug1:0.3):0.1,
monDom5:0.325899):0.072430,ornAna1:0.453916):0.109903,
((galGal3:0.166386,taeGut1:0.170717):0.199763,
anoCar1:0.509545):0.108130):0.166150,xenTro2:0.852482):0.300396,
(((tetNig1:0.224774,fr2:0.205294):0.191836,
(gasAcu1:0.313967,oryLat2:0.478451):0.058404):0.322824,
-danRer5:0.731166):0.155214):0.511293,petMar1:0.511293);
+danRer6:0.731166):0.155214):0.511293,petMar1:0.511293);
'_EOF_'
# << happy emacs
# Use this specification in the phyloGif tool:
@@ -4528,9 +4529,9 @@
# 13 0.3500 - Squirrel speTri1 (% 35.713) (N/A)
# 14 0.3611 - Alpaca vicPac1 (% 39.399) (N/A)
# 15 0.3620 - Sloth choHof1 (% 34.377) (N/A)
# 16 0.3653 - Megabat pteVam1 (% 45.414) (N/A)
-# 17 0.3732 - Elephant loxAfr2 (% 35.153) (N/A)
+# 17 0.3732 - Elephant loxAfr3 (% 46.636) (% 42.430)
# 18 0.3740 - Cat felCat3 (% 35.713) (% 61.104)
# 19 0.3769 - Dog canFam2 (% 52.879) (% 62.055)
# 20 0.3809 - Armadillo dasNov2 (% 33.543) (N/A)
# 21 0.3941 - Rabbit oryCun1 (% 33.676) (N/A)
@@ -4553,9 +4554,9 @@
# 38 1.2394 - Lizard anoCar1 (% 3.591) (% 5.146)
# 39 1.6403 - X. tropicalis xenTro2 (% 3.176) (% 6.773)
# 40 1.9387 - Stickleback gasAcu1 (% 1.916) (% 11.175)
# 41 1.9634 - Fugu fr2 (% 1.702) (% 10.929)
-# 42 1.9746 - Zebrafish danRer5 (% 2.562) (% 5.144)
+# 42 1.9746 - Zebrafish danRer6 (% 3.051) (% 6.399)
# 43 1.9829 - Tetraodon tetNig1 (% 2.003) (% 14.443)
# 44 2.1031 - Medaka oryLat2 (% 1.849) (% 6.705)
# 45 2.1108 - Lamprey petMar1 (% 1.082) (% 3.200)
@@ -4668,35 +4669,35 @@
cd splitRun
mkdir maf run
cd run
mkdir penn
- cp -p /cluster/bin/penn/multiz.2009-01-21/multiz penn
- cp -p /cluster/bin/penn/multiz.2009-01-21/maf_project penn
- cp -p /cluster/bin/penn/multiz.2009-01-21/autoMZ penn
+ cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn
+ cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn
+ cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn
# set the db and pairs directories here
cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = hg19
set c = $1
set result = $2
-set run = `pwd`
+set run = `/bin/pwd`
set tmp = $run/tmp/$db/multiz.$c
set pairs = /hive/data/genomes/hg19/bed/multiz46way/mafSplit
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../../tree.nh ../../species.list $tmp
pushd $tmp
-foreach s (`sed -e "s/ $db//" species.list`)
+foreach s (`/bin/sed -e "s/ $db//" species.list`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if (-e $in.gz) then
/bin/zcat $in.gz > $out
if (! -s $out) then
echo "##maf version=1 scoring=autoMZ" > $out
endif
else if (-e $in) then
- ln -s $in $out
+ /bin/ln -s $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
@@ -4718,8 +4719,10 @@
#ENDLOOP
'_EOF_'
# << happy emacs
+ find ../../mafSplit -type f | grep hg19_ | xargs -L 1 basename \
+ | sed -e "s/.gz//" | sort -u > chr.part.list
gensub2 chr.part.list single template jobList
para -ram=8g create jobList
# initial run experience suggest some of the big jobs reach 8 Gb
# of memory usage, so, tell parasol to limit the number of jobs per
@@ -4725,15 +4728,14 @@
# of memory usage, so, tell parasol to limit the number of jobs per
# node to avoid thrashing
para -ram=8g try
para -ram=8g push
-
# Completed: 504 of 504 jobs
-# CPU time in finished jobs: 3013519s 50225.31m 837.09h 34.88d 0.096 y
-# IO & Wait Time: 1450670s 24177.84m 402.96h 16.79d 0.046 y
-# Average job time: 8858s 147.63m 2.46h 0.10d
-# Longest finished job: 30846s 514.10m 8.57h 0.36d
-# Submission to last job: 47247s 787.45m 13.12h 0.55d
+# CPU time in finished jobs: 1313981s 21899.69m 364.99h 15.21d 0.042 y
+# IO & Wait Time: 675855s 11264.24m 187.74h 7.82d 0.021 y
+# Average job time: 3948s 65.80m 1.10h 0.05d
+# Longest finished job: 19767s 329.45m 5.49h 0.23d
+# Submission to last job: 128616s 2143.60m 35.73h 1.49d
# put the split maf results back together into a single maf file
# eliminate duplicate comments
ssh hgwdev
@@ -4745,10 +4747,8 @@
# almost all chrom files over 1 Gb, up to almost 10 Gb for chr2
# HOWEVER, this is actually not necessary to maintain these comments,
# they are lost during the mafAddIRows
- # plus, these things shouldn't be gzipped, they need to be
- # ordinary files for loading into database
cat << '_EOF_' >> runOne
#!/bin/csh -fe
set C = $1
if ( -s ../maf/${C}.maf.gz ) then
@@ -4759,9 +4759,8 @@
sed -e "s#${C}.[0-9][0-9]*#${C}#g; s#_MZ_[^ ]* # #g;" \
| sort -u >> ../maf/${C}.maf
grep -h -v "^#" `ls maf/hg19_${C}.*.maf | sort -t. -k2,2n` >> ../maf/${C}.maf
tail -q -n 1 maf/hg19_${C}.*.maf | sort -u >> ../maf/${C}.maf
-gzip ../maf/${C}.maf
'_EOF_'
# << happy emacs
chmod +x runOne
@@ -4777,20 +4776,18 @@
cd /hive/data/genomes/hg19/bed/multiz46way/splitRun
gensub2 chr.list single template jobList
para create jobList
para try ... check ... push ... etc ...
-XXX - running Fri Jun 12 16:09:24 PDT 2009
# Completed: 92 of 93 jobs
# Crashed: 1 jobs
-# CPU time in finished jobs: 15960s 266.01m 4.43h 0.18d 0.001 y
-# IO & Wait Time: 61004s 1016.73m 16.95h 0.71d 0.002 y
-# Average job time: 837s 13.94m 0.23h 0.01d
-# Longest finished job: 5799s 96.65m 1.61h 0.07d
-# Submission to last job: 8608s 143.47m 2.39h 0.10d
- # one of the results is completely empty and didn't make any final answer:
- # ../maf/chrUn_gl000226.maf.gz does not exist
- # that small contig is completely repeat masked out, it has no alignments
- # it was there, but it wasn't gzipped, all it has is comments
+# CPU time in finished jobs: 1027s 17.12m 0.29h 0.01d 0.000 y
+# IO & Wait Time: 20761s 346.01m 5.77h 0.24d 0.001 y
+# Average job time: 237s 3.95m 0.07h 0.00d
+# Longest finished job: 1629s 27.15m 0.45h 0.02d
+# Submission to last job: 1640s 27.33m 0.46h 0.02d
+
+ # one of the results is completely empty, the grep for results failed
+ # this file ../maf/chrUn_gl000226.maf only has header comments, no result
# load tables for a look
ssh hgwdev
mkdir -p /gbdb/hg19/multiz46way/maf
@@ -4801,35 +4798,22 @@
# where it is running. Best to run this over in scratch.
cd /data/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/hg19/multiz46way/maf hg19 multiz46way
+ # real 44m9.177s
+ # Loaded 33540773 mafs in 93 files from /gbdb/hg19/multiz46way/maf
+
# real 81m26.382s
# Loaded 74158519 mafs in 93 files from /gbdb/hg19/multiz46way/maf
# load summary table
time nice -n +19 cat /gbdb/hg19/multiz46way/maf/*.maf \
- | hgLoadMafSummary hg19 -minSize=30000 -mergeGap=1500 \
- -maxSize=200000 multiz46waySummary stdin
-XXX - running with limits in place to avoid out of memory situation
-Got up to about 50 Gb in memory, I wonder if it has a memory leak
-XXX - Thu Jun 18 14:43:52 PDT 2009
-XXX - failed the first time:
-Created 9192683 summary blocks from 1296420257 components and 74158519 mafs from stdin
-Loading into hg19 table multiz46waySummary...
-Loading completeAdvisory lock has been released
-
-real 67m5.748s
-user 56m27.129s
-sys 4m30.870s
-
-# Indexing and tabulating stdin
-# needLargeMem: Out of memory - request size 4128 bytes, errno: 12
-# real 43m32.011s
-# user 39m33.083s
-# sys 3m32.477s
-
- # real 2m39.822
- # Created 353577 summary blocks from 2852890 components and 1197504 mafs
- # from stdin
+ | $HOME/bin/$MACHTYPE/hgLoadMafSummary hg19 -minSize=30000 -verbose=2 \
+ -mergeGap=1500 -maxSize=200000 multiz46waySummary stdin
+ # real 65m2.993s
+# flushSummaryBlocks: output 45 blocks
+# Created 8869916 summary blocks from 642016936 components
+# and 33540773 mafs from stdin
+# blocks too small to be used: 27359
# Gap Annotation
# prepare bed files with gap info
mkdir /hive/data/genomes/hg19/bed/multiz46way/anno
@@ -5318,19 +5302,85 @@
cat fb.danRer6.chainHg19Link.txt
# 96424507 bases of 1506896106 (6.399%) in intersection
##############################################################################
-############################################################################
+# LASTZ Elephant LoxAfr3 (DONE - 2009-07-21,23 - Hiram)
+ mkdir /hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21
+ cd /hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21
+
+ cat << '_EOF_' > DEF
+# Human vs. Elephant
+BLASTZ_M=50
+
+# TARGET: Human Hg19
+SEQ1_DIR=/scratch/data/hg19/nib
+SEQ1_LEN=/scratch/data/hg19/chrom.sizes
+SEQ1_CHUNK=10000000
+SEQ1_LAP=10000
+
+# QUERY: Elephant
+SEQ2_DIR=/scratch/data/loxAfr3/loxAfr3.2bit
+SEQ2_LEN=/scratch/data/loxAfr3/chrom.sizes
+SEQ2_CHUNK=20000000
+SEQ2_LIMIT=50
+SEQ2_LAP=0
+
+BASE=/hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21
+TMPDIR=/scratch/tmp
+'_EOF_'
+ # << happy emacs
+
+ # establish a screen to control this job
+ screen
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
+ > do.log 2>&1 &
+ # real 317m32.664s
+ # broken when it went to chaining on encodek, finish the chain then:
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
+ -continue=chainMerge > chainMerge.log 2>&1 &
+ # real 217m25.159s
+
+ # time about 3h23m
+ cat fb.hg19.chainLoxAfr3Link.txt
+ # 1351200080 bases of 2897316137 (46.636%) in intersection
+
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -syntenicNet -continue=syntenicNet -stop=syntenicNet \
+ -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ > synNet.log 2>&1 &
+ # real 32m40.554s
+ # XXX - running Mon Aug 3 13:36:26 PDT 2009
+
+ time doRecipBest.pl -buildDir=`pwd` hg19 loxAfr3 > rbest.log 2>&1
+ # real 184m3.435s
+
+ mkdir /hive/data/genomes/loxAfr3/bed/blastz.hg19.swap
+ cd /hive/data/genomes/loxAfr3/bed/blastz.hg19.swap
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ /hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21/DEF \
+ -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
+ -swap > swap.log 2>&1 &
+ # real 220m16.839s
+ cat fb.loxAfr3.chainHg19Link.txt
+ # 1323201500 bases of 3118565340 (42.430%) in intersection
+
+##############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
-############################################################################
-testing
-
############################################################################
# AGILENT PROBES LIFTED FROM HG18 (DONE, 2009-07-28 Andy)
@@ -5359,4 +5409,86 @@
mkdir agilentProbes
cd agilentProbes/
ln -s /hive/data/genomes/hg19/bed/agilentProbes/agilentProbesHg18Unmapped beds
ln -s /hive/data/genomes/hg19/bed/agilentProbes/agilentProbesHg18Unmapped.tar.gz
+
+##############################################################################
+# LASTZ Tetraodon TetNig2 (DONE - 2009-08-10,11 - Hiram)
+ # This is the incorrect date/time stamp on this directory,
+ # it should be 2009-08-10
+ mkdir /hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10
+ cd /hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10
+
+ cat << '_EOF_' > DEF
+# human vs tetraodon
+BLASTZ_Y=3400
+BLASTZ_L=6000
+BLASTZ_K=2200
+BLASTZ_M=50
+BLASTZ_Q=/scratch/data/blastz/HoxD55.q
+
+# TARGET: Human Hg19
+SEQ1_DIR=/scratch/data/hg19/hg19.2bit
+SEQ1_LEN=/scratch/data/hg19/chrom.sizes
+SEQ1_CHUNK=10000000
+SEQ1_LAP=10000
+SEQ1_LIMIT=5
+
+# QUERY: Tetraodon TetNig2 - single chunk big enough to hold the single largest item
+SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit
+SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes
+SEQ2_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit
+SEQ2_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes
+SEQ2_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift
+SEQ2_CHUNK=20000000
+SEQ2_LAP=0
+SEQ2_LIMIT=50
+
+BASE=/hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10
+TMPDIR=/scratch/tmp
+'_EOF_'
+ # << happy emacs
+
+ # establish a screen to control this job
+ screen
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -qRepeats=windowmaskerSdust \
+ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ > do.log 2>&1 &
+ # real 220m36.068s
+ # forgot the qRepeats for tetNig2
+ rm axtChain/hg19.tetNig2.net
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -continue=load -qRepeats=windowmaskerSdust \
+ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ > load.log 2>&1 &
+ # real 5m53.096s
+ cat fb.hg19.chainTetNig2Link.txt
+ # 49611132 bases of 2897316137 (1.712%) in intersection
+
+ # running the swap
+ mkdir /hive/data/genomes/tetNig2/bed/blastz.hg19.swap
+ cd /hive/data/genomes/tetNig2/bed/blastz.hg19.swap
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ /hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10/DEF \
+ -qRepeats=windowmaskerSdust \
+ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ -swap > swap.log 2>&1 &
+ # real 13m21.591s
+ # forgot the qRepeats for tetNig2
+ rm axtChain/tetNig2.hg19.net
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ /hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10/DEF \
+ -continue=load -qRepeats=windowmaskerSdust \
+ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ -swap > load.log 2>&1 &
+ # real 4m7.559s
+ cat fb.tetNig2.chainHg19Link.txt
+ # 42910930 bases of 302314788 (14.194%) in intersection
+
+##############################################################################