src/hg/makeDb/doc/hg19.txt 1.35

1.35 2009/08/11 17:16:53 hiram
done with hg19 tetNig2 chain and net
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.34
retrieving revision 1.35
diff -b -B -U 4 -r1.34 -r1.35
--- src/hg/makeDb/doc/hg19.txt	29 Jul 2009 18:42:22 -0000	1.34
+++ src/hg/makeDb/doc/hg19.txt	11 Aug 2009 17:16:53 -0000	1.35
@@ -4437,17 +4438,17 @@
 (((vicPac1:0.105252,(turTru1:0.064182,bosTau4:0.121911):0.025111):0.039691,
 ((equCab2:0.107726,(felCat3:0.097971,canFam2:0.100888):0.049486):0.006252,
 (myoLuc1:0.141155,pteVam1:0.111787):0.033187):0.004179):0.011699,
 (eriEur1:0.220580,sorAra1:0.266859):0.056117):0.021065):0.023276,
-(((loxAfr2:0.083775,proCap1:0.152633):0.026190,echTel1:0.240221):0.049905,
+(((loxAfr3:0.083775,proCap1:0.152633):0.026190,echTel1:0.240221):0.049905,
 (dasNov2:0.115179,choHof1:0.096272):0.052373):0.006713):0.132748,
 macEug1:0.3):0.1,
 monDom5:0.325899):0.072430,ornAna1:0.453916):0.109903,
 ((galGal3:0.166386,taeGut1:0.170717):0.199763,
 anoCar1:0.509545):0.108130):0.166150,xenTro2:0.852482):0.300396,
 (((tetNig1:0.224774,fr2:0.205294):0.191836,
 (gasAcu1:0.313967,oryLat2:0.478451):0.058404):0.322824,
-danRer5:0.731166):0.155214):0.511293,petMar1:0.511293);
+danRer6:0.731166):0.155214):0.511293,petMar1:0.511293);
 '_EOF_'
     # << happy emacs
 
     #	Use this specification in the phyloGif tool:
@@ -4528,9 +4529,9 @@
 # 13  0.3500 - Squirrel speTri1 (% 35.713) (N/A)
 # 14  0.3611 - Alpaca vicPac1   (% 39.399) (N/A)
 # 15  0.3620 - Sloth choHof1    (% 34.377) (N/A)
 # 16  0.3653 - Megabat pteVam1  (% 45.414) (N/A)
-# 17  0.3732 - Elephant loxAfr2 (% 35.153) (N/A)
+# 17  0.3732 - Elephant loxAfr3 (% 46.636) (% 42.430)
 # 18  0.3740 - Cat felCat3      (% 35.713) (% 61.104)
 # 19  0.3769 - Dog canFam2      (% 52.879) (% 62.055)
 # 20  0.3809 - Armadillo dasNov2        (% 33.543) (N/A)
 # 21  0.3941 - Rabbit oryCun1   (% 33.676) (N/A)
@@ -4553,9 +4554,9 @@
 # 38  1.2394 - Lizard anoCar1   (% 3.591) (% 5.146)
 # 39  1.6403 - X. tropicalis xenTro2    (% 3.176) (% 6.773)
 # 40  1.9387 - Stickleback gasAcu1      (% 1.916) (% 11.175)
 # 41  1.9634 - Fugu fr2 (% 1.702) (% 10.929)
-# 42  1.9746 - Zebrafish danRer5        (% 2.562) (% 5.144)
+# 42  1.9746 - Zebrafish danRer6        (% 3.051) (% 6.399)
 # 43  1.9829 - Tetraodon tetNig1        (% 2.003) (% 14.443)
 # 44  2.1031 - Medaka oryLat2   (% 1.849) (% 6.705)
 # 45  2.1108 - Lamprey petMar1  (% 1.082) (% 3.200)
 
@@ -4668,35 +4669,35 @@
     cd splitRun
     mkdir maf run
     cd run
     mkdir penn
-    cp -p /cluster/bin/penn/multiz.2009-01-21/multiz penn
-    cp -p /cluster/bin/penn/multiz.2009-01-21/maf_project penn
-    cp -p /cluster/bin/penn/multiz.2009-01-21/autoMZ penn
+    cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn 
+    cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn 
+    cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn 
 
     #	set the db and pairs directories here
     cat > autoMultiz.csh << '_EOF_'
 #!/bin/csh -ef
 set db = hg19
 set c = $1
 set result = $2
-set run = `pwd`
+set run = `/bin/pwd`
 set tmp = $run/tmp/$db/multiz.$c
 set pairs = /hive/data/genomes/hg19/bed/multiz46way/mafSplit
 /bin/rm -fr $tmp
 /bin/mkdir -p $tmp
 /bin/cp -p ../../tree.nh ../../species.list $tmp
 pushd $tmp
-foreach s (`sed -e "s/ $db//" species.list`)
+foreach s (`/bin/sed -e "s/ $db//" species.list`)
     set in = $pairs/$s/$c.maf
     set out = $db.$s.sing.maf
     if (-e $in.gz) then
         /bin/zcat $in.gz > $out
 	if (! -s $out) then
 	    echo "##maf version=1 scoring=autoMZ" > $out
 	endif
     else if (-e $in) then
-        ln -s $in $out
+        /bin/ln -s $in $out
     else
         echo "##maf version=1 scoring=autoMZ" > $out
     endif
 end
@@ -4718,8 +4719,10 @@
 #ENDLOOP
 '_EOF_'
 # << happy emacs
 
+    find ../../mafSplit -type f | grep hg19_ | xargs -L 1 basename \
+	| sed -e "s/.gz//" | sort -u > chr.part.list
     gensub2 chr.part.list single template jobList
     para -ram=8g create jobList
     #	initial run experience suggest some of the big jobs reach 8 Gb
     #	of memory usage, so, tell parasol to limit the number of jobs per
@@ -4725,15 +4728,14 @@
     #	of memory usage, so, tell parasol to limit the number of jobs per
     #	node to avoid thrashing
     para -ram=8g try
     para -ram=8g push
-
 # Completed: 504 of 504 jobs
-# CPU time in finished jobs:    3013519s   50225.31m   837.09h   34.88d  0.096 y
-# IO & Wait Time:               1450670s   24177.84m   402.96h   16.79d  0.046 y
-# Average job time:                8858s     147.63m     2.46h    0.10d
-# Longest finished job:           30846s     514.10m     8.57h    0.36d
-# Submission to last job:         47247s     787.45m    13.12h    0.55d
+# CPU time in finished jobs:    1313981s   21899.69m   364.99h   15.21d  0.042 y
+# IO & Wait Time:                675855s   11264.24m   187.74h    7.82d  0.021 y
+# Average job time:                3948s      65.80m     1.10h    0.05d
+# Longest finished job:           19767s     329.45m     5.49h    0.23d
+# Submission to last job:        128616s    2143.60m    35.73h    1.49d
 
     # put the split maf results back together into a single maf file
     #	eliminate duplicate comments
     ssh hgwdev
@@ -4745,10 +4747,8 @@
     #	almost all chrom files over 1 Gb, up to almost 10 Gb for chr2
     #	HOWEVER, this is actually not necessary to maintain these comments,
     #	they are lost during the mafAddIRows
 
-    #	plus, these things shouldn't be gzipped, they need to be
-    #	ordinary files for loading into database
     cat << '_EOF_' >> runOne
 #!/bin/csh -fe
 set C = $1
 if ( -s ../maf/${C}.maf.gz ) then
@@ -4759,9 +4759,8 @@
     sed -e "s#${C}.[0-9][0-9]*#${C}#g; s#_MZ_[^ ]* # #g;" \
         | sort -u >> ../maf/${C}.maf 
 grep -h -v "^#" `ls maf/hg19_${C}.*.maf | sort -t. -k2,2n` >> ../maf/${C}.maf
 tail -q -n 1 maf/hg19_${C}.*.maf | sort -u >> ../maf/${C}.maf
-gzip ../maf/${C}.maf
 '_EOF_'
     # << happy emacs
     chmod +x runOne
 
@@ -4777,20 +4776,18 @@
     cd /hive/data/genomes/hg19/bed/multiz46way/splitRun
     gensub2 chr.list single template jobList
     para create jobList
     para try ... check ... push ... etc ...
-XXX - running Fri Jun 12 16:09:24 PDT 2009
 # Completed: 92 of 93 jobs
 # Crashed: 1 jobs
-# CPU time in finished jobs:      15960s     266.01m     4.43h    0.18d  0.001 y
-# IO & Wait Time:                 61004s    1016.73m    16.95h    0.71d  0.002 y
-# Average job time:                 837s      13.94m     0.23h    0.01d
-# Longest finished job:            5799s      96.65m     1.61h    0.07d
-# Submission to last job:          8608s     143.47m     2.39h    0.10d
-    #	one of the results is completely empty and didn't make any final answer:
-    #	../maf/chrUn_gl000226.maf.gz does not exist
-    #	that small contig is completely repeat masked out, it has no alignments
-    #	it was there, but it wasn't gzipped, all it has is comments
+# CPU time in finished jobs:       1027s      17.12m     0.29h    0.01d  0.000 y
+# IO & Wait Time:                 20761s     346.01m     5.77h    0.24d  0.001 y
+# Average job time:                 237s       3.95m     0.07h    0.00d
+# Longest finished job:            1629s      27.15m     0.45h    0.02d
+# Submission to last job:          1640s      27.33m     0.46h    0.02d
+
+    #	one of the results is completely empty; the grep for results failed:
+    #	the file ../maf/chrUn_gl000226.maf has only header comments, no results
 
     # load tables for a look
     ssh hgwdev
     mkdir -p /gbdb/hg19/multiz46way/maf
@@ -4801,35 +4798,22 @@
     #	where it is running.  Best to run this over in scratch.
     cd /data/tmp
     time nice -n +19 hgLoadMaf \
 	-pathPrefix=/gbdb/hg19/multiz46way/maf hg19 multiz46way
+    #	real    44m9.177s
+    #	Loaded 33540773 mafs in 93 files from /gbdb/hg19/multiz46way/maf
+
     #	real    81m26.382s
     #	Loaded 74158519 mafs in 93 files from /gbdb/hg19/multiz46way/maf
     # load summary table
     time nice -n +19 cat /gbdb/hg19/multiz46way/maf/*.maf \
-	| hgLoadMafSummary hg19 -minSize=30000 -mergeGap=1500 \
-	 -maxSize=200000  multiz46waySummary stdin
-XXX - running with limits in place to avoid out of memory situation
-Got up to about 50 Gb in memory, I wonder if it has a memory leak
-XXX - Thu Jun 18 14:43:52 PDT 2009
-XXX - failed the first time:
-Created 9192683 summary blocks from 1296420257 components and 74158519 mafs from stdin
-Loading into hg19 table multiz46waySummary...
-Loading completeAdvisory lock has been released
-
-real    67m5.748s
-user    56m27.129s
-sys     4m30.870s
-
-# Indexing and tabulating stdin
-# needLargeMem: Out of memory - request size 4128 bytes, errno: 12
-# real    43m32.011s
-# user    39m33.083s
-# sys     3m32.477s
-
-    #	real    2m39.822
-    #	Created 353577 summary blocks from 2852890 components and 1197504 mafs
-    #	from stdin
+	| $HOME/bin/$MACHTYPE/hgLoadMafSummary hg19 -minSize=30000 -verbose=2 \
+		-mergeGap=1500 -maxSize=200000  multiz46waySummary stdin
+    #	real    65m2.993s
+# flushSummaryBlocks: output 45 blocks
+# Created 8869916 summary blocks from 642016936 components
+#	and 33540773 mafs from stdin
+# blocks too small to be used: 27359
 
     # Gap Annotation
     # prepare bed files with gap info
     mkdir /hive/data/genomes/hg19/bed/multiz46way/anno
@@ -5318,19 +5302,85 @@
     cat fb.danRer6.chainHg19Link.txt 
     #	96424507 bases of 1506896106 (6.399%) in intersection
 
 ##############################################################################
-############################################################################
+# LASTZ Elephant LoxAfr3 (DONE - 2009-07-21,23 - Hiram)
+    mkdir /hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21
+    cd /hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21
+
+    cat << '_EOF_' > DEF
+# Human vs. Elephant
+BLASTZ_M=50
+
+# TARGET: Human Hg19
+SEQ1_DIR=/scratch/data/hg19/nib
+SEQ1_LEN=/scratch/data/hg19/chrom.sizes
+SEQ1_CHUNK=10000000
+SEQ1_LAP=10000
+
+# QUERY: Elephant
+SEQ2_DIR=/scratch/data/loxAfr3/loxAfr3.2bit
+SEQ2_LEN=/scratch/data/loxAfr3/chrom.sizes
+SEQ2_CHUNK=20000000
+SEQ2_LIMIT=50
+SEQ2_LAP=0
+
+BASE=/hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21
+TMPDIR=/scratch/tmp
+'_EOF_'
+    # << happy emacs
+
+    #	establish a screen to control this job
+    screen
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	`pwd`/DEF \
+	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
+	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
+	> do.log 2>&1 &
+    #	real    317m32.664s
+    #	broken when it went to chaining on encodek, finish the chain then:
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	`pwd`/DEF \
+	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
+	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
+	-continue=chainMerge > chainMerge.log 2>&1 &
+    #	real    217m25.159s
+
+    # time about 3h23m
+    cat fb.hg19.chainLoxAfr3Link.txt 
+    #	1351200080 bases of 2897316137 (46.636%) in intersection
+
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	`pwd`/DEF \
+	-syntenicNet -continue=syntenicNet -stop=syntenicNet \
+	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
+	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+	> synNet.log 2>&1 &
+    #	real    32m40.554s
+    #	status note: was running Mon Aug  3 13:36:26 PDT 2009; section is now DONE
+
+    time doRecipBest.pl -buildDir=`pwd` hg19 loxAfr3 > rbest.log 2>&1
+    #	real    184m3.435s
+
+    mkdir /hive/data/genomes/loxAfr3/bed/blastz.hg19.swap
+    cd /hive/data/genomes/loxAfr3/bed/blastz.hg19.swap
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	/hive/data/genomes/hg19/bed/lastzLoxAfr3.2009-07-21/DEF \
+	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
+	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
+	-swap > swap.log 2>&1 &
+    #	real    220m16.839s
+    cat fb.loxAfr3.chainHg19Link.txt 
+    #	1323201500 bases of 3118565340 (42.430%) in intersection
+
+##############################################################################
 # TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
 
 see doc/builds.txt for specific details.
-############################################################################
-testing
-
 
 ############################################################################
 # AGILENT PROBES LIFTED FROM HG18 (DONE, 2009-07-28 Andy)
 
@@ -5359,4 +5409,86 @@
 mkdir agilentProbes
 cd agilentProbes/
 ln -s /hive/data/genomes/hg19/bed/agilentProbes/agilentProbesHg18Unmapped beds
 ln -s /hive/data/genomes/hg19/bed/agilentProbes/agilentProbesHg18Unmapped.tar.gz
+
+##############################################################################
+# LASTZ Tetraodon TetNig2 (DONE - 2009-08-10,11 - Hiram)
+    #	NOTE: the date stamp in this directory name (2009-10-10) is incorrect,
+    #	it should have been 2009-08-10; the directory name is used as-is below
+    mkdir /hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10
+    cd /hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10
+
+    cat << '_EOF_' > DEF
+# human vs tetraodon
+BLASTZ_Y=3400
+BLASTZ_L=6000
+BLASTZ_K=2200
+BLASTZ_M=50
+BLASTZ_Q=/scratch/data/blastz/HoxD55.q
+
+# TARGET: Human Hg19
+SEQ1_DIR=/scratch/data/hg19/hg19.2bit
+SEQ1_LEN=/scratch/data/hg19/chrom.sizes
+SEQ1_CHUNK=10000000
+SEQ1_LAP=10000
+SEQ1_LIMIT=5
+
+# QUERY: Tetraodon TetNig2 - single chunk big enough to hold the single largest item
+SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit
+SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes
+SEQ2_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit
+SEQ2_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes
+SEQ2_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift
+SEQ2_CHUNK=20000000
+SEQ2_LAP=0
+SEQ2_LIMIT=50
+
+BASE=/hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10
+TMPDIR=/scratch/tmp
+'_EOF_'
+    # << happy emacs
+
+    #	establish a screen to control this job
+    screen
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	`pwd`/DEF \
+	-qRepeats=windowmaskerSdust \
+	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
+	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+	> do.log 2>&1 &
+    #	real    220m36.068s
+    #	forgot the -qRepeats for tetNig2 on the first run, redo the load step:
+    rm axtChain/hg19.tetNig2.net
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	`pwd`/DEF \
+	-continue=load -qRepeats=windowmaskerSdust \
+	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
+	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+	> load.log 2>&1 &
+    #	real    5m53.096s
+    cat fb.hg19.chainTetNig2Link.txt 
+    #	49611132 bases of 2897316137 (1.712%) in intersection
+
+    #	running the swap
+    mkdir /hive/data/genomes/tetNig2/bed/blastz.hg19.swap
+    cd /hive/data/genomes/tetNig2/bed/blastz.hg19.swap
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	/hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10/DEF \
+	-qRepeats=windowmaskerSdust \
+	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
+	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+	-swap > swap.log 2>&1 &
+    #	real    13m21.591s
+    #	forgot the -qRepeats for tetNig2 on the swap run, redo the load step:
+    rm axtChain/tetNig2.hg19.net
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	/hive/data/genomes/hg19/bed/lastzTetNig2.2009-10-10/DEF \
+	-continue=load -qRepeats=windowmaskerSdust \
+	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
+	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+	-swap > load.log 2>&1 &
+    #	real    4m7.559s
+    cat fb.tetNig2.chainHg19Link.txt 
+    #	42910930 bases of 302314788 (14.194%) in intersection
+
+##############################################################################