src/hg/makeDb/doc/hg19.txt 1.39
1.39 2009/09/24 20:57:52 hiram
Working on the 46way conservation, oryCun2 lastz chain net cavPor3 synNet and reload cavPor3 extra large chain tables
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.38
retrieving revision 1.39
diff -b -B -U 4 -r1.38 -r1.39
--- src/hg/makeDb/doc/hg19.txt 20 Sep 2009 17:16:44 -0000 1.38
+++ src/hg/makeDb/doc/hg19.txt 24 Sep 2009 20:57:52 -0000 1.39
@@ -3582,8 +3582,17 @@
# 104045950 bases of 2897316137 (3.591%) in intersection
time doRecipBest.pl -buildDir=`pwd` hg19 anoCar1 > rbest.log 2>&1
# real 45m58.001s
+ # running syntenic Net 2009-08-27 - Hiram
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
+ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
+ -continue=syntenicNet -syntenicNet \
+ -qRepeats=windowmaskerSdust > syntenicNet.log 2>&1 &
+ # real 6m13.304s
+
# running the swap - DONE - 2009-06-02
mkdir /hive/data/genomes/anoCar1/bed/blastz.hg19.swap
cd /hive/data/genomes/anoCar1/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
@@ -4433,9 +4442,9 @@
(micMur1:0.091452,otoGar1:0.128984):0.035463):0.015304,
tupBel1:0.183583):0.004688,(((((mm9:0.083220,rn4:0.090564):0.196605,
dipOrd1:0.209532):0.022555,cavPor3:0.223415):0.009828,
speTri1:0.146894):0.025042,
-(oryCun1:0.116009,ochPri2:0.198295):0.100037):0.015355):0.020666,
+(oryCun2:0.116009,ochPri2:0.198295):0.100037):0.015355):0.020666,
(((vicPac1:0.105252,(turTru1:0.064182,bosTau4:0.121911):0.025111):0.039691,
((equCab2:0.107726,(felCat3:0.097971,canFam2:0.100888):0.049486):0.006252,
(myoLuc1:0.141155,pteVam1:0.111787):0.033187):0.004179):0.011699,
(eriEur1:0.220580,sorAra1:0.266859):0.056117):0.021065):0.023276,
@@ -4444,9 +4453,9 @@
macEug1:0.3):0.1,
monDom5:0.325899):0.072430,ornAna1:0.453916):0.109903,
((galGal3:0.166386,taeGut1:0.170717):0.199763,
anoCar1:0.509545):0.108130):0.166150,xenTro2:0.852482):0.300396,
-(((tetNig1:0.224774,fr2:0.205294):0.191836,
+(((tetNig2:0.224774,fr2:0.205294):0.191836,
(gasAcu1:0.313967,oryLat2:0.478451):0.058404):0.322824,
danRer6:0.731166):0.155214):0.511293,petMar1:0.511293);
'_EOF_'
# << happy emacs
@@ -4533,9 +4542,9 @@
# 17 0.3732 - Elephant loxAfr3 (% 46.636) (% 42.430)
# 18 0.3740 - Cat felCat3 (% 35.713) (% 61.104)
# 19 0.3769 - Dog canFam2 (% 52.879) (% 62.055)
# 20 0.3809 - Armadillo dasNov2 (% 33.543) (N/A)
-# 21 0.3941 - Rabbit oryCun1 (% 33.676) (N/A)
+# 21  0.3941 - Rabbit oryCun2 (% 44.317) (% 58.405)
# 22 0.3946 - Microbat myoLuc1 (% 33.174) (N/A)
# 23 0.4028 - Cow bosTau4 (% 46.506) (% 50.297)
# 24 0.4363 - Guinea Pig cavPor3 (% 43.680) (N/A)
# 25 0.4421 - Rock hyrax proCap1 (% 30.864) (N/A)
@@ -4555,9 +4564,9 @@
# 39 1.6403 - X. tropicalis xenTro2 (% 3.176) (% 6.773)
# 40 1.9387 - Stickleback gasAcu1 (% 1.916) (% 11.175)
# 41 1.9634 - Fugu fr2 (% 1.702) (% 10.929)
# 42 1.9746 - Zebrafish danRer6 (% 3.051) (% 6.399)
-# 43 1.9829 - Tetraodon tetNig1 (% 2.003) (% 14.443)
+# 43 1.9829 - Tetraodon tetNig2 (% 1.712) (% 14.194)
# 44 2.1031 - Medaka oryLat2 (% 1.849) (% 6.705)
# 45 2.1108 - Lamprey petMar1 (% 1.082) (% 3.200)
# create species list and stripped down tree for autoMZ
@@ -4651,16 +4660,16 @@
done > species.list
cut -f 1 ../../../chrom.sizes > chr.list
gensub2 species.list chr.list template jobList
- para create jobList
+ para -ram=8g create jobList
para try ... check ... push ... etc...
# Completed: 4185 of 4185 jobs
-# CPU time in finished jobs: 27549s 459.15m 7.65h 0.32d 0.001 y
-# IO & Wait Time: 15763s 262.71m 4.38h 0.18d 0.000 y
-# Average job time: 10s 0.17m 0.00h 0.00d
-# Longest finished job: 158s 2.63m 0.04h 0.00d
-# Submission to last job: 1647s 27.45m 0.46h 0.02d
+# CPU time in finished jobs: 25547s 425.78m 7.10h 0.30d 0.001 y
+# IO & Wait Time: 268664s 4477.73m 74.63h 3.11d 0.009 y
+# Average job time: 70s 1.17m 0.02h 0.00d
+# Longest finished job: 1234s 20.57m 0.34h 0.01d
+# Submission to last job: 3048s 50.80m 0.85h 0.04d
# the autoMultiz cluster run
ssh swarm
cd /hive/data/genomes/hg19/bed/multiz46way/
@@ -4680,14 +4689,14 @@
set db = hg19
set c = $1
set result = $2
set run = `/bin/pwd`
-set tmp = $run/tmp/$db/multiz.$c
+set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /hive/data/genomes/hg19/bed/multiz46way/mafSplit
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../../tree.nh ../../species.list $tmp
-pushd $tmp
+pushd $tmp > /dev/null
foreach s (`/bin/sed -e "s/ $db//" species.list`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if (-e $in.gz) then
@@ -4701,15 +4710,15 @@
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
-$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
-popd
+$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf \
+ > /dev/null
+popd > /dev/null
/bin/rm -f $result
/bin/cp -p $tmp/$c.maf $result
/bin/rm -fr $tmp
-/bin/rmdir --ignore-fail-on-non-empty $run/tmp/$db
-/bin/rmdir --ignore-fail-on-non-empty $run/tmp
+/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
'_EOF_'
# << happy emacs
chmod +x autoMultiz.csh
@@ -4729,13 +4738,13 @@
# node to avoid thrashing
para -ram=8g try
para -ram=8g push
# Completed: 504 of 504 jobs
-# CPU time in finished jobs: 1313981s 21899.69m 364.99h 15.21d 0.042 y
-# IO & Wait Time: 675855s 11264.24m 187.74h 7.82d 0.021 y
-# Average job time: 3948s 65.80m 1.10h 0.05d
-# Longest finished job: 19767s 329.45m 5.49h 0.23d
-# Submission to last job: 128616s 2143.60m 35.73h 1.49d
+# CPU time in finished jobs: 1342039s 22367.32m 372.79h 15.53d 0.043 y
+# IO & Wait Time: 63835s 1063.91m 17.73h 0.74d 0.002 y
+# Average job time: 2789s 46.49m 0.77h 0.03d
+# Longest finished job: 12625s 210.42m 3.51h 0.15d
+# Submission to last job: 15300s 255.00m 4.25h 0.18d
# put the split maf results back together into a single maf file
# eliminate duplicate comments
ssh hgwdev
@@ -4765,26 +4774,26 @@
chmod +x runOne
cat << '_EOF_' >> template
#LOOP
-runOne $(root1) {check out exists+ ../maf/$(root1).maf.gz}
+runOne $(root1) {check out exists+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 ../../../chrom.sizes > chr.list
- ssh memk
+ ssh encodek
cd /hive/data/genomes/hg19/bed/multiz46way/splitRun
gensub2 chr.list single template jobList
para create jobList
para try ... check ... push ... etc ...
# Completed: 92 of 93 jobs
# Crashed: 1 jobs
-# CPU time in finished jobs: 1027s 17.12m 0.29h 0.01d 0.000 y
-# IO & Wait Time: 20761s 346.01m 5.77h 0.24d 0.001 y
-# Average job time: 237s 3.95m 0.07h 0.00d
-# Longest finished job: 1629s 27.15m 0.45h 0.02d
-# Submission to last job: 1640s 27.33m 0.46h 0.02d
+# CPU time in finished jobs: 412s 6.86m 0.11h 0.00d 0.000 y
+# IO & Wait Time: 21187s 353.12m 5.89h 0.25d 0.001 y
+# Average job time: 235s 3.91m 0.07h 0.00d
+# Longest finished job: 1529s 25.48m 0.42h 0.02d
+# Submission to last job: 1542s 25.70m 0.43h 0.02d
# one of the results is completely empty, the grep for results failed
# this file ../maf/chrUn_gl000226.maf only has header comments, no result
@@ -4798,22 +4807,21 @@
# where it is running. Best to run this over in scratch.
cd /data/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/hg19/multiz46way/maf hg19 multiz46way
- # real 44m9.177s
- # Loaded 33540773 mafs in 93 files from /gbdb/hg19/multiz46way/maf
+ # Loaded 33558634 mafs in 93 files from /gbdb/hg19/multiz46way/maf
+ # real 512m8.053s
- # real 81m26.382s
- # Loaded 74158519 mafs in 93 files from /gbdb/hg19/multiz46way/maf
# load summary table
time nice -n +19 cat /gbdb/hg19/multiz46way/maf/*.maf \
| $HOME/bin/$MACHTYPE/hgLoadMafSummary hg19 -minSize=30000 -verbose=2 \
-mergeGap=1500 -maxSize=200000 multiz46waySummary stdin
- # real 65m2.993s
+ # real 92m30.700s
# flushSummaryBlocks: output 45 blocks
-# Created 8869916 summary blocks from 642016936 components
-# and 33540773 mafs from stdin
-# blocks too small to be used: 27359
+# Created 8766427 summary blocks from 645238409 components and
+# 33558634 mafs from stdin
+# blocks too small to be used: 29456
+# Loading into hg19 table multiz46waySummary...
# Gap Annotation
# prepare bed files with gap info
mkdir /hive/data/genomes/hg19/bed/multiz46way/anno
@@ -4867,11 +4875,18 @@
# << happy emacs
chmod +x anno.csh
gensub2 chr.list single template jobList
- para create jobList
+ para -ram=30g create jobList
# specify lots of ram to get one job per node
para -ram=30g push
+ #
+# Completed: 93 of 93 jobs
+# CPU time in finished jobs: 10371s 172.85m 2.88h 0.12d 0.000 y
+# IO & Wait Time: 3365s 56.09m 0.93h 0.04d 0.000 y
+# Average job time: 148s 2.46m 0.04h 0.00d
+# Longest finished job: 1153s 19.22m 0.32h 0.01d
+# Submission to last job: 7402s 123.37m 2.06h 0.09d
ssh hgwdev
rm -fr /gbdb/hg19/multiz46way/maf
mkdir /gbdb/hg19/multiz46way/maf
@@ -4882,20 +4897,11 @@
# huge temp files are made, do them on local disk
cd /data/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/hg19/multiz46way/maf hg19 multiz46way
- # with final set of quality annotated files:
- # Loaded 33320838 mafs in 49 files from /gbdb/hg19/multiz46way/maf
- # real 91m46.889s
- # running on Irow annotated mafs Fri Nov 28 00:28:09 PST 2008
- # Loaded 33320675 mafs in 49 files from /gbdb/hg19/multiz46way/maf
- # real 236m15.279s
- # running on bare bones mafs Thu Nov 27 19:29:44 PST 2008
- # Loaded 33273351 mafs in 49 files from /gbdb/hg19/multiz46way/maf
- # real 198m55.761s - while swarm busy with rebalancing
- # from before the fixed multiz:
- # Loaded 35154852 mafs in 49 files from /gbdb/hg19/multiz46way/maf
- # real 71m5.594s
+ # real 113m11.709s
+ # Loaded 33612571 mafs in 93 files from /gbdb/hg19/multiz46way/maf
+XXX - done to here
time nice -n +19 cat /gbdb/hg19/multiz46way/maf/*.maf \
| hgLoadMafSummary hg19 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz46waySummary stdin
@@ -5239,8 +5245,279 @@
$PHASTBIN/tree_doctor \
--prune=`cat notPlacentals.list` \
tree_4d.46way.nh > tree_4d.46way.placental.nh
+#############################################################################
+# phastCons 46-way (WORKING - 2009-09-21 - Hiram)
+
+ # split 46way mafs into 10M chunks and generate sufficient statistics
+ # files for phastCons
+ ssh memk
+ mkdir -p /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split
+ mkdir /hive/data/genomes/hg19/bed/multiz46way/cons/ss
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split
+
+ cat << '_EOF_' > doSplit.csh
+#!/bin/csh -ef
+set c = $1
+set MAF = /hive/data/genomes/hg19/bed/multiz46way/maf/$c.maf
+set WINDOWS = /hive/data/genomes/hg19/bed/multiz46way/cons/ss/$c
+rm -fr $WINDOWS
+mkdir $WINDOWS
+pushd $WINDOWS > /dev/null
+twoBitToFa -seq=$c /hive/data/genomes/hg19/hg19.2bit hg19.$c.fa
+/cluster/bin/phast/$MACHTYPE/msa_split $MAF -i MAF \
+ -M hg19.$c.fa -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
+rm -f hg19.$c.fa
+popd > /dev/null
+date >> $c.done
+'_EOF_'
+ # << happy emacs
+ chmod +x doSplit.csh
+
+ cat << '_EOF_' > template
+#LOOP
+doSplit.csh $(root1) {check out line+ $(root1).done}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ # do the easy ones first to see some immediate results
+ ls -1S -r ../maf | sed -e "s/.maf//" > maf.list
+
+ gensub2 maf.list single template jobList
+ para -ram=32g create jobList
+ para try ... check ... etc
+ # this takes a really long time. memk was down to 2 usable
+ # machines - got it finished manually on a combination of hgwdevnew CPUs
+ # and other machines
+
+ # Estimate phastCons parameters
+ # experimented with this as a parasol job on hgwdevnew to try a number
+ # of SS files. With a command of:
+
+/cluster/bin/phast/x86_64/phyloFit -i SS ${SS} \
+--tree "(((((((((((((((((hg19,panTro2),gorGor1),ponAbe2),rheMac2),calJac1),tarSyr1),(micMur1,otoGar1)),tupBel1),(((((mm9,rn4),dipOrd1),cavPor3),speTri1),(oryCun1,ochPri2))),(((vicPac1,(turTru1,bosTau4)),((equCab2,(felCat3,canFam2)),(myoLuc1,pteVam1))),(eriEur1,sorAra1))),(((loxAfr2,proCap1),echTel1),(dasNov2,choHof1))),monDom4),ornAna1),((galGal3,taeGut1),anoCar1)),xenTro2),(((tetNig1,fr2),(gasAcu1,oryLat2)),danRer5)),petMar1)" \
+--out-root=$OUT/starting_tree
+
+ # running over the input files ../ss/*/*.ss results to
+#.../genomes/hg19/bed/multiz46way/cons/startingTree/result/*/starting-tree.mod
+
+ # add up the C and G:
+ find ./result -type f | xargs ls -rt | while read F
+do
+ D=`dirname $F`
+ echo -n `basename $D`" - "
+ grep BACKGROUND ${F} | awk '{printf "%0.3f\n", $3 + $4;}'
+done
+ # counting number of species seen in the maf file:
+ find ./result -type f | xargs ls -rt | while read F
+do
+ D=`dirname $F`
+ echo -n `basename $D`" - "
+ grep TREE $F | sed -e \
+"s/TREE: //; s/(//g; s/)//g; s/[0-9].[0-9][0-9][0-9][0-9][0-9][0-9]//g; s/://g" | tr ',' '\n' | wc -l
+done
+
+ # Run phastCons
+ # This job is I/O intensive in its output files, thus it is all
+ # working over in /scratch/tmp/
+ ssh swarm
+ mkdir -p /hive/data/genomes/hg19/bed/multiz46way/cons/run.cons
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/run.cons
+
+ # there are going to be several different phastCons runs using
+ # this same script. They trigger off of the current working directory
+ # $cwd:t which is the "grp" in this script. It is one of:
+ # all euarchontogliers placentals
+
+ cat << '_EOF_' > doPhast.csh
+#!/bin/csh -fe
+set PHASTBIN = /cluster/bin/phast/x86_64
+set c = $1
+set f = $2
+set len = $3
+set cov = $4
+set rho = $5
+set grp = $cwd:t
+set cons = /hive/data/genomes/hg19/bed/multiz46way/cons
+set tmp = $cons/tmp/$f
+mkdir -p $tmp
+set ssSrc = $cons
+if (-s $cons/$grp/$grp.non-inf) then
+ ln -s $cons/$grp/$grp.mod $tmp
+ ln -s $cons/$grp/$grp.non-inf $tmp
+ ln -s $ssSrc/ss/$c/$f.ss $tmp
+ ln -s $cons/$grp/$grp.mod $tmp
+ ln -s $cons/$grp/$grp.non-inf $tmp
+else
+ ln -s $ssSrc/ss/$c/$f.ss $tmp
+ ln -s $cons/$grp/$grp.mod $tmp
+endif
+pushd $tmp > /dev/null
+if (-s $grp.non-inf) then
+ $PHASTBIN/phastCons $f.ss $grp.mod \
+ --rho $rho --expected-length $len --target-coverage $cov --quiet \
+ --not-informative `cat $grp.non-inf` \
+ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
+else
+ $PHASTBIN/phastCons $f.ss $grp.mod \
+ --rho $rho --expected-length $len --target-coverage $cov --quiet \
+ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
+endif
+popd > /dev/null
+mkdir -p pp/$c bed/$c
+sleep 4
+touch pp/$c bed/$c
+rm -f pp/$c/$f.pp
+rm -f bed/$c/$f.bed
+mv $tmp/$f.pp pp/$c
+mv $tmp/$f.bed bed/$c
+rm -fr $tmp
+'_EOF_'
+ # << happy emacs
+ chmod a+x doPhast.csh
+
+ # this template will serve for all runs
+ # root1 == chrom name, file1 == ss file name without .ss suffix
+ cat << '_EOF_' > template
+#LOOP
+../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ bed/$(root1)/$(file1).bed}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ # Create parasol batch and run it
+ ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > ss.list
+
+ # run for all species
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons
+ mkdir -p all
+ cd all
+ # Using Kate's .mod tree
+ cp -p ../../4d/46way.all.mod ./all.mod
+
+ gensub2 ../run.cons/ss.list single ../run.cons/template jobList
+ para -ram=8g create jobList
+ para try ... check ... push ... etc.
+XXX - running Tue Jan 13 22:19:21 PST 2009
+# Completed: 322 of 322 jobs
+# CPU time in finished jobs: 47406s 790.10m 13.17h 0.55d 0.002 y
+# IO & Wait Time: 29902s 498.37m 8.31h 0.35d 0.001 y
+# Average job time: 240s 4.00m 0.07h 0.00d
+# Longest finished job: 354s 5.90m 0.10h 0.00d
+# Submission to last job: 536s 8.93m 0.15h 0.01d
+
+ # create Most Conserved track
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons
+ cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
+ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
+ /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
+ # ~ 1 minute
+
+ # load into database
+ ssh hgwdev
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
+ time nice -n +19 hgLoadBed hg19 phastConsElements46way mostConserved.bed
+ # Loaded 4878296 elements of size 5
+ # real 2m3.414s
+
+ # Try for 5% overall cov, and 70% CDS cov
+ # --rho 0.3 --expected-length 45 --target-coverage 0.3
+ featureBits hg19 -enrichment refGene:cds phastConsElements46way
+ # refGene:cds 1.144%, mostConserved.bed 4.973%,
+ # both 0.854%, cover 74.62%, enrich 15.01x
+
+ # --rho .31 --expected-length 45 --target-coverage .3
+ # refGene:cds 1.144%, phastConsElements46way 4.706%,
+ # both 0.824%, cover 72.07%, enrich 15.31x
+
+ # --rho 0.3 --expected-length 45 --target-coverage 0.3
+ featureBits hg19 -enrichment knownGene:cds phastConsElements46way
+ # knownGene:cds 1.205%, mostConserved.bed 4.973%,
+ # both 0.874%, cover 72.55%, enrich 14.59x
+
+ # --rho .31 --expected-length 45 --target-coverage .3
+ # knownGene:cds 1.205%, phastConsElements46way 4.706%,
+ # both 0.844%, cover 70.05%, enrich 14.88x
+
+ featureBits hg19 -enrichment refGene:cds phastConsElements28way
+ # refGene:cds 1.144%, phastConsElements28way 4.920%,
+ # both 0.858%, cover 74.96%, enrich 15.24x
+ featureBits hg19 -enrichment knownGene:cds phastConsElements28way
+ # knownGene:cds 1.205%, phastConsElements28way 4.920%,
+ # both 0.878%, cover 72.88%, enrich 14.81x
+
+ # Create merged posterior probability file and wiggle track data files
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
+ cat << '_EOF_' > gzipAscii.sh
+#!/bin/sh
+
+TOP=`pwd`
+export TOP
+
+mkdir -p downloads
+
+for D in pp/chr*
+do
+ C=${D/pp\/}
+ out=downloads/${C}.phastCons46way.wigFix.gz
+ echo "${D} > ${C}.phastCons46way.wigFix.gz"
+ ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \
+ gzip > ${out}
+done
+'_EOF_'
+ # << happy emacs
+ chmod +x gzipAscii.sh
+ time nice -n +19 ./gzipAscii.sh
+ # real 30m7.228s
+
+ # encode those files into wiggle data
+ zcat downloads/*.wigFix.gz \
+ | wigEncode stdin phastCons46way.wig phastCons46way.wib
+ # Converted stdin, upper limit 1.00, lower limit 0.00
+ # real 22m54.291s
+
+ # Load gbdb and database with wiggle.
+ ssh hgwdev
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
+ ln -s `pwd`/phastCons46way.wib /gbdb/hg19/multiz46way/phastCons46way.wib
+ time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
+ phastCons46way phastCons46way.wig
+ # real 1m13.681s
+
+ # Create histogram to get an overview of all the data
+ ssh hgwdev
+ cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
+ time nice -n +19 hgWiggle -doHistogram \
+ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
+ -db=hg19 phastCons46way > histogram.data 2>&1
+ # real 8m6.841s
+
+ # create plot of histogram:
+
+ cat << '_EOF_' | gnuplot > histo.png
+set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
+set size 1.4, 0.8
+set key left box
+set grid noxtics
+set grid ytics
+set title " Human Hg19 Histogram phastCons46way track"
+set xlabel " phastCons46way score"
+set ylabel " Relative Frequency"
+set y2label " Cumulative Relative Frequency (CRF)"
+set y2range [0:1]
+set y2tics
+set yrange [0:0.02]
+
+plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
+ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
+'_EOF_'
+ # << happy emacs
+
+ display histo.png &
+
+
#########################################################################
# LASTZ Zebrafish DanRer6 (DONE - 2009-07-08,10 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
cd /hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
@@ -5355,9 +5632,8 @@
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> synNet.log 2>&1 &
# real 32m40.554s
-XXX - running Mon Aug 3 13:36:26 PDT 2009
time doRecipBest.pl -buildDir=`pwd` hg19 loxAfr3 > rbest.log 2>&1
# real 184m3.435s
@@ -5793,8 +6069,131 @@
rm -r run*/split tmp.txt *.orthoGlom.txt
##############################################################################
+
+# LASTZ Rabbit OryCun2 (DONE - 2009-08-12 - Hiram)
+ mkdir /hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12
+ cd /hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12
+
+ cat << '_EOF_' > DEF
+# Human vs. Rabbit
+BLASTZ_M=50
+
+# TARGET: Human Hg19
+SEQ1_DIR=/scratch/data/hg19/nib
+SEQ1_LEN=/scratch/data/hg19/chrom.sizes
+SEQ1_CHUNK=10000000
+SEQ1_LAP=10000
+
+# QUERY: Rabbit at chunk 20,000,000 all but 36 contigs can fit in a single job
+SEQ2_DIR=/scratch/data/oryCun2/oryCun2.2bit
+SEQ2_LEN=/scratch/data/oryCun2/chrom.sizes
+SEQ2_CTGDIR=/scratch/data/oryCun2/oryCun2.contigs.2bit
+SEQ2_CTGLEN=/scratch/data/oryCun2/oryCun2.contigs.sizes
+SEQ2_LIFT=/hive/data/genomes/oryCun2/contigs/oryCun2.contigs.lift
+SEQ2_CHUNK=20000000
+SEQ2_LIMIT=400
+SEQ2_LAP=0
+
+BASE=/hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12
+TMPDIR=/scratch/tmp
+'_EOF_'
+ # << happy emacs
+
+ # establish a screen to control this job
+ screen
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
+ > do.log 2>&1 &
+ # real 516m41.981s
+ cat fb.hg19.chainOryCun2Link.txt
+ # 1283994337 bases of 2897316137 (44.317%) in intersection
+ # should have run syntenicNet in that first run
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
+ -continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
+ # about 1 hour
+
+ mkdir /hive/data/genomes/oryCun2/bed/blastz.hg19.swap
+ cd /hive/data/genomes/oryCun2/bed/blastz.hg19.swap
+
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ /hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12/DEF \
+ -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
+ -swap -syntenicNet > swap.log 2>&1 &
+ # real 176m35.932s
+ cat fb.oryCun2.chainHg19Link.txt
+ # 1260477501 bases of 2604023284 (48.405%) in intersection
+
+##############################################################################
+# running syntenicNet on CavPor3 lastz (DONE - 2009-08-27 - Hiram)
+ cd /hive/data/genomes/hg19/bed/lastzCavPor3.2009-06-04
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
+ -continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
+ # about 44 minutes
+
+##############################################################################
+# loading the lastz tables on cavPor3 - (DONE - 2009-08-28 - Hiram)
+ # the chain.tab and link.tab files are left over from the failed load
+ cd /hive/data/genomes/cavPor3/bed/blastz.hg19.swap/axtChain
+
+ # find out their sizes, average and total:
+ awk '{print length($0)}' chain.tab | ave stdin
+Q1 92.000000 median 93.000000 Q3 96.000000
+average 93.651267
+min 64.000000 max 109.000000
+count 27186468
+total 2546047186.000000
+ awk '{print length($0)}' link.tab | ave stdin
+Q1 45.000000 median 47.000000 Q3 48.000000
+average 46.731871
+min 22.000000 max 52.000000
+count 240602108
+total 11243786622.000000
+
+ cat << '_EOF_' > chainHg19Link.sql
+CREATE TABLE chainHg19Link (
+ bin smallint(5) unsigned NOT NULL default 0,
+ tName varchar(255) NOT NULL default '',
+ tStart int(10) unsigned NOT NULL default 0,
+ tEnd int(10) unsigned NOT NULL default 0,
+ qStart int(10) unsigned NOT NULL default 0,
+ chainId int(10) unsigned NOT NULL default 0,
+ KEY tName (tName(13),bin),
+ KEY chainId (chainId)
+) ENGINE=MyISAM max_rows=241000000 avg_row_length=50 pack_keys=1 CHARSET=latin1;
+'_EOF_'
+ # << happy emacs
+ hgsql cavPor3 < chainHg19Link.sql
+
+ time hgsql -e \
+ 'load data local infile "link.tab" into table chainHg19Link;' cavPor3
+ # real 405m15.956s
+
+ cd /hive/data/genomes/cavPor3/bed/blastz.hg19.swap/axtChain
+
+ # and the net tracks were not loaded:
+ time netClass -verbose=0 -noAr noClass.net cavPor3 hg19 cavPor3.hg19.net
+ # real 40m25.078s
+
+ netFilter -minGap=10 cavPor3.hg19.net \
+ | hgLoadNet -verbose=0 cavPor3 netHg19 stdin
+ # real 33m24.972s (plus the featureBits below)
+
+ featureBits cavPor3 chainHg19Link > fb.cavPor3.chainHg19Link.txt 2>&1
+ cat fb.cavPor3.chainHg19Link.txt
+ # 1279572660 bases of 2663369733 (48.043%) in intersection
+
+##############################################################################
# DBSNP CODING ANNOTATIONS (DONE 9/1/09 angie)
# Repeat the coord-remapping performed for snp130 on the hg18 coding anno table.
cd /hive/data/outside/dbSNP/130/human/hg19
@@ -5845,10 +6244,8 @@
/data/tmp/hg19.snp130Coding.bed
#Loaded 198493 elements of size 11
mv /data/tmp/hg19.snp130Coding.bed hg19.snp130CodingDbSnp.bed
-
-##############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded