src/hg/makeDb/doc/hg19.txt 1.49

1.49 2009/10/21 18:34:58 hiram
done with phastCons runs for the 46-way
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.48
retrieving revision 1.49
diff -b -B -U 4 -r1.48 -r1.49
--- src/hg/makeDb/doc/hg19.txt	16 Oct 2009 17:17:44 -0000	1.48
+++ src/hg/makeDb/doc/hg19.txt	21 Oct 2009 18:34:58 -0000	1.49
@@ -5247,30 +5247,41 @@
                 tree_4d.46way.nh > tree_4d.46way.placental.nh
 
 #############################################################################
 # phastCons 46-way (WORKING - 2009-09-21 - Hiram)
+    #	was unable to split the full chrom MAF files, now working on the
+    #	maf files as they were split up during multiz
 
     # split 46way mafs into 10M chunks and generate sufficient statistics 
     # files for # phastCons
     ssh memk
     mkdir -p /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split
+    cd /hive/data/genomes/hg19/bed/multiz46way/mafSplit
+    ./splitRegions.pl mafSplit.bed > \
+	/hive/data/genomes/hg19/bed/multiz46way/cons/msa.split/region.list
     mkdir /hive/data/genomes/hg19/bed/multiz46way/cons/ss
     cd /hive/data/genomes/hg19/bed/multiz46way/cons/msa.split
 
     cat << '_EOF_' > doSplit.csh
 #!/bin/csh -ef
 set c = $1
-set MAF = /hive/data/genomes/hg19/bed/multiz46way/maf/$c.maf
+set MAF = /hive/data/genomes/hg19/bed/multiz46way/splitRun/maf/hg19_$c.maf
 set WINDOWS = /hive/data/genomes/hg19/bed/multiz46way/cons/ss/$c
 rm -fr $WINDOWS
+# set seq = `egrep "${c}"'$' region.list | awk '{printf "-seq=%s -start=%d
+# -end=%d", $1, $2, $3}'`
+set seq = `egrep "${c}"'$' region.list | awk '{printf "-seq=%s", $1}'`
 mkdir $WINDOWS
 pushd $WINDOWS > /dev/null
-twoBitToFa -seq=$c /hive/data/genomes/hg19/hg19.2bit hg19.$c.fa
-/cluster/bin/phast/$MACHTYPE/msa_split $MAF -i MAF \
+twoBitToFa ${seq} /hive/data/genomes/hg19/hg19.2bit hg19.$c.fa
+set empty = `faSize hg19.$c.fa | egrep " 0 real 0 upper 0 lower|masked total" | wc -l`
+if ( $empty != 2 ) then
+    /cluster/bin/phast/$MACHTYPE/msa_split $MAF -i MAF \
     -M hg19.$c.fa -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
+endif
 rm -f hg19.$c.fa
 popd > /dev/null
-date >> $c.done
+date >> $2
 '_EOF_'
     # << happy emacs
     chmod +x doSplit.csh
 
@@ -5281,13 +5292,14 @@
 '_EOF_'
     # << happy emacs
 
     #	do the easy ones first to see some immediate results
-    ls -1S -r ../maf | sed -e "s/.maf//" > maf.list
+    ls -1S -r ../../splitRun/maf | sed -e "s/.maf//; s/hg19_//" > maf.list
 
     gensub2 maf.list single template jobList
     para -ram=32g create jobList
     para try ... check ... etc
+    #	XXX - this did not work
     #	this takes a really long time.  memk was down to 2 usable
     #	machines - got it finished manually on a combination of hgwdevnew CPUs
     #	and other machines
 
@@ -5318,18 +5330,18 @@
 "s/TREE: //; s/(//g; s/)//g; s/[0-9].[0-9][0-9][0-9][0-9][0-9][0-9]//g; s/://g"  | tr ',' '\n' | wc -l
 done
 
     # Run phastCons
-    #	This job is I/O intensive in its output files, thus it is all
-    #	working over in /scratch/tmp/
+    #	This job is I/O intensive in its output files, beware where this
+    #	takes place or do not run too many at once.
     ssh swarm
     mkdir -p /hive/data/genomes/hg19/bed/multiz46way/cons/run.cons
     cd /hive/data/genomes/hg19/bed/multiz46way/cons/run.cons
 
     #	there are going to be several different phastCons runs using
     #	this same script.  They trigger off of the current working directory
     #	$cwd:t which is the "grp" in this script.  It is one of:
-    #	all euarchontogliers placentals
+    #	all primates placentals
 
     cat << '_EOF_' > doPhast.csh
 #!/bin/csh -fe
 set PHASTBIN = /cluster/bin/phast/x86_64
@@ -5386,10 +5398,8 @@
 '_EOF_'
     # << happy emacs
 
     # Create parasol batch and run it
-    ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > ss.list
-
     # run for all species
     cd /hive/data/genomes/hg19/bed/multiz46way/cons
     mkdir -p all
     cd all
@@ -5398,101 +5408,422 @@
 
     gensub2 ../run.cons/ss.list single ../run.cons/template jobList
     para -ram=8g create jobList
     para try ... check ... push ... etc.
-XXX - running Tue Jan 13 22:19:21 PST 2009
-# Completed: 322 of 322 jobs
-# CPU time in finished jobs:      47406s     790.10m    13.17h    0.55d  0.002 y
-# IO & Wait Time:                 29902s     498.37m     8.31h    0.35d  0.001 y
-# Average job time:                 240s       4.00m     0.07h    0.00d
-# Longest finished job:             354s       5.90m     0.10h    0.00d
-# Submission to last job:           536s       8.93m     0.15h    0.01d
+
+# second run on swarm parasol:  the failed jobs have empty bed file results
+# Completed: 575 of 580 jobs
+# Crashed: 5 jobs
+# CPU time in finished jobs:      42049s     700.81m    11.68h    0.49d  0.001 y
+# IO & Wait Time:                 19735s     328.92m     5.48h    0.23d  0.001 y
+# Average job time:                 107s       1.79m     0.03h    0.00d
+# Longest finished job:             267s       4.45m     0.07h    0.00d
+# Submission to last job:           479s       7.98m     0.13h    0.01d
+
+# first run on hgwdev parasol:
+# Completed: 574 of 579 jobs
+# Crashed: 5 jobs
+# CPU time in finished jobs:      53050s     884.17m    14.74h    0.61d  0.002 y
+# IO & Wait Time:                  6633s     110.55m     1.84h    0.08d  0.000 y
+# Average job time:                 104s       1.73m     0.03h    0.00d
+# Longest finished job:             248s       4.13m     0.07h    0.00d
+# Submission to last job:          4121s      68.68m     1.14h    0.05d
 
     # create Most Conserved track
-    cd /hive/data/genomes/hg19/bed/multiz46way/cons
-    cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
-        awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
-            /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
+    cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
+    cut -f1 ../../../../chrom.sizes | while read C
+do
+    ls -d bed/${C}.[0-9][0-9] 2> /dev/null | while read D
+    do
+        cat ${D}/${C}*.bed
+    done | awk 'BEGIN{ ID=1 }{printf "%s\t%d\t%d\t%s.%d\t%d\t%s\n", "'${C}'", $2, $3, "'${C}'", ID, $5, $6; ++ID}'
+done > mostConserved.bed
     #	~ 1 minute
 
     # load into database
     ssh hgwdev
     cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
     time nice -n +19 hgLoadBed hg19 phastConsElements46way mostConserved.bed
-    #	Loaded 4878296 elements of size 5
-    #	real     2m3.414s
+    #	Loaded 5868432 elements of size 6
+    #	real     1m14.357s
 
     # Try for 5% overall cov, and 70% CDS cov 
-    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
     featureBits hg19 -enrichment refGene:cds phastConsElements46way
-    #	refGene:cds 1.144%, mostConserved.bed 4.973%,
-    #	both 0.854%, cover 74.62%, enrich 15.01x
-
-    #	--rho .31 --expected-length 45 --target-coverage .3
-    #	refGene:cds 1.144%, phastConsElements46way 4.706%,
-    #	both 0.824%, cover 72.07%, enrich 15.31x
-
     #	--rho 0.3 --expected-length 45 --target-coverage 0.3
-    featureBits hg19 -enrichment knownGene:cds phastConsElements46way
-    #	knownGene:cds 1.205%, mostConserved.bed 4.973%,
-    #	both 0.874%, cover 72.55%, enrich 14.59x
-
-    #	--rho .31 --expected-length 45 --target-coverage .3
-    #	knownGene:cds 1.205%, phastConsElements46way 4.706%,
-    #	both 0.844%, cover 70.05%, enrich 14.88x
-
-    featureBits hg19 -enrichment refGene:cds phastConsElements28way
-    #	refGene:cds 1.144%, phastConsElements28way 4.920%,
-    #	both 0.858%, cover 74.96%, enrich 15.24x
-    featureBits hg19 -enrichment knownGene:cds phastConsElements28way
-    #	knownGene:cds 1.205%, phastConsElements28way 4.920%,
-    #	both 0.878%, cover 72.88%, enrich 14.81x
+    #	refGene:cds 1.186%, phastConsElements46way 5.621%,
+    #	both 0.878%, cover 73.98%, enrich 13.16x
 
     # Create merged posterier probability file and wiggle track data files
     cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
-    cat << '_EOF_' > gzipAscii.sh
+    mkdir downloads
+    cat << '_EOF_' > phastCat.sh
 #!/bin/sh
 
-TOP=`pwd`
-export TOP
+set -beEu -o pipefail
 
 mkdir -p downloads
-
-for D in pp/chr*
+cut -f1 ../../../../chrom.sizes | while read C
 do
-    C=${D/pp\/}
-    out=downloads/${C}.phastCons46way.wigFix.gz
-    echo "${D} > ${C}.phastCons46way.wigFix.gz"
-    ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \
-	gzip > ${out}
+    echo -n "${C} ... working ... "
+    ls -d pp/${C}.[0-9][0-9] 2> /dev/null | while read D
+    do
+        cat ${D}/${C}*.pp | sed -e "s/chrom=${C}.[0-9][0-9]/chrom=${C}/"
+    done | gzip > downloads/${C}.phastCons46way.wigFix.gz
+    echo "done"
 done
 '_EOF_'
     #	<< happy emacs
-    chmod +x gzipAscii.sh
-    time nice -n +19 ./gzipAscii.sh
-    #	real    30m7.228s
+    chmod +x phastCat.sh
+    time nice -n +19 ./phastCat.sh
+    #	real    30m2.623s
 
     #	encode those files into wiggle data
     zcat downloads/*.wigFix.gz \
 	| wigEncode stdin phastCons46way.wig phastCons46way.wib
     #	Converted stdin, upper limit 1.00, lower limit 0.00
-    #	real    22m54.291s
+    #	real    18m37.881s
+    du -hsc *.wi?
+    #	2.7G    phastCons46way.wib
+    #	271M    phastCons46way.wig
+    #	3.0G    total
+
+    #	encode into a bigWig file:
+    #	(warning wigToBigWig process grows to about 36 Gb)
+    #	in bash, to avoid the 32 Gb memory limit:
+sizeG=188743680
+export sizeG
+ulimit -d $sizeG 
+ulimit -v $sizeG
+    zcat downloads/*.wigFix.gz \
+        | wigToBigWig stdin ../../../../chrom.sizes phastCons46way.bw
+    #	real    52m36.142s
+# -rw-rw-r--   1 21667535139 Oct 20 13:59 phastCons46way.bw
+    mkdir /gbdb/hg19/bbi
+    ln -s `pwd`/phastCons46way.bw /gbdb/hg19/bbi
+    #	loading bigWig table:
+    hgsql hg19 -e 'drop table if exists phastCons46way; \
+            create table phastCons46way (fileName varchar(255) not null); \
+            insert into phastCons46way values
+	("/gbdb/hg19/bbi/phastCons46way.bw");'
 
+    #	Using the bigWig file instead of this database table:
     # Load gbdb and database with wiggle.
+#    ssh hgwdev
+#    cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
+#    ln -s `pwd`/phastCons46way.wib /gbdb/hg19/multiz46way/phastCons46way.wib
+#    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
+#	phastCons46way phastCons46way.wig
+
+    #  Create histogram to get an overview of all the data
     ssh hgwdev
     cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
-    ln -s `pwd`/phastCons46way.wib /gbdb/hg19/multiz46way/phastCons46way.wib
-    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
-	phastCons46way phastCons46way.wig
-    #	real    1m13.681s
+    time nice -n +19 hgWiggle -doHistogram -db=hg19 \
+	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
+	    pc46 > histogram.data 2>&1
+    #	real    7m37.212s
+
+    #	create plot of histogram:
+
+    cat << '_EOF_' | gnuplot > histo.png
+set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
+set size 1.4, 0.8
+set key left box
+set grid noxtics
+set grid ytics
+set title " Human Hg19 Histogram phastCons46way track"
+set xlabel " phastCons46way score"
+set ylabel " Relative Frequency"
+set y2label " Cumulative Relative Frequency (CRF)"
+set y2range [0:1]
+set y2tics
+set yrange [0:0.02]
+
+plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
+        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
+'_EOF_'
+    #	<< happy emacs
+
+    display histo.png &
+
+    ########################################################################
+    ### Create a phastCons data set for Primates
+
+    # setup primates-only run
+    ssh swarm
+    mkdir /hive/data/genomes/hg19/bed/multiz46way/cons/primates
+    cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
+    # primates-only: exclude all but these for phastCons tree:
+
+    /cluster/bin/phast/x86_64/tree_doctor ../all/all.mod \
+	--prune-all-but=hg19,panTro2,gorGor1,ponAbe2,rheMac2,papHam1,calJac1,tarSyr1,micMur1,otoGar1 \
+	> primates.mod
+    #	and place the removed ones in the non-inf file so phastCons will
+    #	truly ignore them:
+    echo "tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun2,ochPri2,vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr3,proCap1,echTel1,dasNov2,choHof1,macEug1,monDom5,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig2,fr2,gasAcu1,oryLat2,danRer6,petMar1" \
+	> primates.non-inf
+
+    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
+    para -ram=8g create jobList
+    para try ... check ... push ... etc.
+# Completed: 539 of 580 jobs
+# Crashed: 41 jobs
+# CPU time in finished jobs:      19518s     325.30m     5.42h    0.23d  0.001 y
+# IO & Wait Time:                 19782s     329.70m     5.50h    0.23d  0.001 y
+# Average job time:                  73s       1.22m     0.02h    0.00d
+# Longest finished job:             157s       2.62m     0.04h    0.00d
+# Submission to last job:          1989s      33.15m     0.55h    0.02d
+
+    # the 41 crashed jobs are due to empty bed file results.
+# bed/chrUn_gl000237.00/chrUn_gl000237.00.1-45866.bed is empty
+# ... etc
+
+    # create Most Conserved track
+    cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
+    ../all/bedCat.sh > mostConserved.bed
+    featureBits hg19 mostConserved.bed
+    #	146285948 bases of 2897316137 (5.049%) in intersection
+
+    # load into database
+    ssh hgwdev
+    cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
+    time nice -n +19 hgLoadBed hg19 phastConsElements46wayPrimates \
+	mostConserved.bed
+    #	Loaded 1109918 elements of size 6
+    #	real    0m15.498s
+    # verify coverage
+    featureBits hg19 phastConsElements46wayPrimates
+    #	146285948 bases of 2897316137 (5.049%) in intersection
+
+    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
+    featureBits hg19 -enrichment refGene:cds phastConsElements46wayPrimates
+    #	refGene:cds 1.186%, phastConsElements46wayPrimates 5.049%,
+    #	both 0.771%, cover 64.95%, enrich 12.86x
+
+    featureBits hg19 -enrichment knownGene:cds phastConsElements46wayPrimates
+    #	knownGene:cds 1.252%, phastConsElements46wayPrimates 5.049%,
+    #	both 0.784%, cover 62.65%, enrich 12.41x
+
+    #	Create the downloads .pp files, from which the phastCons wiggle data
+    #	is calculated
+    # sort by chromName, chromStart so that items are in numerical order 
+    #  for wigEncode
+    cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
+    mkdir downloads
+    cat << '_EOF_' > phastCat.sh
+#!/bin/sh
+
+mkdir -p downloads
+cut -f1 ../../../../chrom.sizes | while read C
+do
+    echo -n "${C} ... working ... "
+    if [ -d "pp/${C}.00" ]; then
+        ls -d pp/${C}.[0-9][0-9] 2> /dev/null | while read D
+        do
+            cat ${D}/${C}*.pp | sed -e "s/chrom=${C}.[0-9][0-9]/chrom=${C}/"
+        done | gzip > downloads/${C}.phastCons46way.primates.wigFix.gz
+    fi
+    echo "done"
+done
+'_EOF_'
+    # << happy emacs
+    chmod +x ./phastCat.sh
+    time nice -n +19 ./phastCat.sh
+    #	real    39m47.189s
+
+    # Create merged posterier probability file and wiggle track data files
+    zcat downloads/chr*.wigFix.gz \
+	 | wigEncode stdin phastCons46wayPrimates.wig phastCons46wayPrimates.wib
+    # Converted stdin, upper limit 1.00, lower limit 0.00
+    #	real    17m20.601s
+
+    #	encode to bigWig
+    #	(warning wigToBigWig process grows to about 36 Gb)
+    #	in bash, to avoid the 32 Gb memory limit:
+sizeG=188743680
+export sizeG
+ulimit -d $sizeG 
+ulimit -v $sizeG
+
+    zcat downloads/*.wigFix.gz \
+        | wigToBigWig stdin ../../../../chrom.sizes phastCons46wayPrimates.bw
+
+    ln -s `pwd`/phastCons46wayPrimates.bw /gbdb/hg19/bbi
+    #	loading bigWig table:
+    hgsql hg19 -e 'drop table if exists phastCons46wayPrimates; \
+            create table phastCons46wayPrimates \
+		(fileName varchar(255) not null); \
+            insert into phastCons46wayPrimates values
+	("/gbdb/hg19/bbi/phastCons46wayPrimates.bw");'
+
+    ## load table with wiggle data
+    ## not done now, using the bigWig file instead
+#    ssh hgwdev
+#    cd /hive/data/genomes/hg19/bed/multiz46way/cons/primates
+#    ln -s `pwd`/phastCons46wayPrimates.wib \
+#	/gbdb/hg19/multiz46way/phastCons46wayPrimates.wib
+#    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
+#	phastCons46wayPrimates phastCons46wayPrimates.wig
+    #	Instead, temporary load into a table so we can do the histogram
+    ln -s `pwd`/phastCons46wayPrimates.wib /gbdb/hg19/wib/pc46.wib
+    hgLoadWiggle hg19 pc46 phastCons46wayPrimates.wig
 
     #  Create histogram to get an overview of all the data
+    time nice -n +19 hgWiggle -doHistogram \
+	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
+	    -db=hg19 pc46 > histogram.data 2>&1
+    #	real    5m30.086s
+
+    #	create plot of histogram:
+
+    cat << '_EOF_' | gnuplot > histo.png
+set terminal png small color \
+        x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
+set size 1.4, 0.8
+set key left box
+set grid noxtics
+set grid ytics
+set title " Mouse Hg19 Histogram phastCons46wayPrimates track"
+set xlabel " phastCons46wayPrimates score"
+set ylabel " Relative Frequency"
+set y2label " Cumulative Relative Frequency (CRF)"
+set y2range [0:1]
+set y2tics
+set yrange [0:0.02]
+
+plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
+        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
+'_EOF_'
+    #	<< happy emacs
+
+    display histo.png &
+
+    ########################################################################
+    ### Create a phastCons data set for Placentals
+    # setup placental-only run
+    ssh swarm
+    mkdir /hive/data/genomes/hg19/bed/multiz46way/cons/placental
+    cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
+
+    # placental-only: exclude all but these for phastCons tree:
+    /cluster/bin/phast/x86_64/tree_doctor ../all/all.mod \
+	--prune-all-but=hg19,panTro2,gorGor1,ponAbe2,rheMac2,papHam1,calJac1,tarSyr1,micMur1,otoGar1,tupBel1,mm9,rn4,dipOrd1,cavPor3,speTri1,oryCun2,ochPri2,vicPac1,turTru1,bosTau4,equCab2,felCat3,canFam2,myoLuc1,pteVam1,eriEur1,sorAra1,loxAfr3,proCap1,echTel1,dasNov2,choHof1 \
+	> placental.mod
+    #	and place the removed ones in the non-inf file so phastCons will
+    #	truly ignore them:
+    echo "macEug1,monDom5,ornAna1,galGal3,taeGut1,anoCar1,xenTro2,tetNig2,fr2,gasAcu1,oryLat2,danRer6,petMar1" \
+        > placental.non-inf
+
+    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
+    para -ram=8g create jobList
+    para try ... check ... push ... etc.
+# Completed: 562 of 580 jobs
+# Crashed: 18 jobs
+# CPU time in finished jobs:      33874s     564.57m     9.41h    0.39d  0.001 y
+# IO & Wait Time:                 12493s     208.21m     3.47h    0.14d  0.000 y
+# Average job time:                  83s       1.38m     0.02h    0.00d
+# Longest finished job:             193s       3.22m     0.05h    0.00d
+# Submission to last job:         62872s    1047.87m    17.46h    0.73d
+
+    #	The crashed jobs produce zero length bed files: e.g.
+    #	bed/chrUn_gl000246.00/chrUn_gl000246.00.1-38144.bed is empty
+
+    # create Most Conserved track
+    ../all/bedCat.sh > mostConserved.bed
+
+    # load into database
     ssh hgwdev
-    cd /hive/data/genomes/hg19/bed/multiz46way/cons/all
+    cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
+    time nice -n +19 hgLoadBed hg19 phastConsElements46wayPlacental \
+	mostConserved.bed
+    #	Loaded 4785089 elements of size 6
+    #	real    0m58.367s
+    # verify coverage
+    featureBits hg19 phastConsElements46wayPlacental
+    #	146457699 bases of 2897316137 (5.055%) in intersection
+    #	119635433 bases of 2881515245 (4.152%) in intersection
+
+    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
+    featureBits hg19 -enrichment refGene:cds phastConsElements46wayPlacental
+    #	refGene:cds 1.186%, phastConsElements46wayPlacental 5.055%,
+    #	both 0.847%, cover 71.42%, enrich 14.13x
+    featureBits hg19 -enrichment knownGene:cds phastConsElements46wayPlacental
+    #	knownGene:cds 1.252%, phastConsElements46wayPlacental 5.055%,
+    #	both 0.865%, cover 69.10%, enrich 13.67x
+
+    #	Create the downloads .pp files, from which the phastCons wiggle data
+    #	is calculated
+    # sort by chromName, chromStart so that items are in numerical order 
+    #  for wigEncode
+    cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
+    mkdir downloads
+    cat << '_EOF_' > phastCat.sh
+#!/bin/sh
+
+mkdir -p downloads
+cut -f1 ../../../../chrom.sizes | while read C
+do
+    echo -n "${C} ... working ... "
+    if [ -d "pp/${C}.00" ]; then
+        ls -d pp/${C}.[0-9][0-9] 2> /dev/null | while read D
+        do
+            cat ${D}/${C}*.pp | sed -e "s/chrom=${C}.[0-9][0-9]/chrom=${C}/"
+        done | gzip > downloads/${C}.phastCons46way.placental.wigFix.gz
+    fi
+    echo "done"
+done
+'_EOF_'
+    # << happy emacs
+    chmod +x ./phastCat.sh
+    time nice -n +19 ./phastCat.sh
+
+    # Create merged posterier probability file and wiggle track data files
+    zcat downloads/chr*.wigFix.gz \
+	| wigEncode stdin phastCons46wayPlacental.wig \
+		phastCons46wayPlacental.wib
+    #	Converted stdin, upper limit 1.00, lower limit 0.00
+    #	real    14m53.395s
+
+    #	encode to bigWig
+    #	(warning wigToBigWig process grows to about 36 Gb)
+    #	in bash, to avoid the 32 Gb memory limit:
+sizeG=188743680
+export sizeG
+ulimit -d $sizeG 
+ulimit -v $sizeG
+
+    zcat downloads/*.wigFix.gz \
+        | wigToBigWig stdin ../../../../chrom.sizes phastCons46wayPlacental.bw
+    #	real    40m55.568s
+
+    ln -s `pwd`/phastCons46wayPlacental.bw /gbdb/hg19/bbi
+    #	loading bigWig table:
+    hgsql hg19 -e 'drop table if exists phastCons46wayPlacental; \
+            create table phastCons46wayPlacental \
+		(fileName varchar(255) not null); \
+            insert into phastCons46wayPlacental values
+	("/gbdb/hg19/bbi/phastCons46wayPlacental.bw");'
+
+
+    ## load table with wiggle data
+    ## no longer load this data, using the bigWig file instead
+#    ssh hgwdev
+#    cd /hive/data/genomes/hg19/bed/multiz46way/cons/placental
+#    ln -s `pwd`/phastCons46wayPlacental.wib \
+#	/gbdb/hg19/multiz46way/phastCons46wayPlacental.wib
+#    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/hg19/multiz46way hg19 \
+#	phastCons46wayPlacental phastCons46wayPlacental.wig
+
+    #	Instead, temporary load into a table so we can do the histogram
+    ln -s `pwd`/phastCons46wayPlacental.wib /gbdb/hg19/wib/pc46.wib
+    hgLoadWiggle hg19 pc46 phastCons46wayPlacental.wig
+
+    #  Create histogram to get an overview of all the data
     time nice -n +19 hgWiggle -doHistogram \
 	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-	    -db=hg19 phastCons46way > histogram.data 2>&1
-    #	real    8m6.841s
+	    -db=hg19 pc46 > histogram.data 2>&1
+    #	real    8m15.623s
+    hgsql -e "drop table pc46;" hg19
+    rm /gbdb/hg19/wib/pc46.wib
 
     #	create plot of histogram:
 
     cat << '_EOF_' | gnuplot > histo.png
@@ -5500,10 +5831,10 @@
 set size 1.4, 0.8
 set key left box
 set grid noxtics
 set grid ytics
-set title " Human Hg18 Histogram phastCons46way track"
-set xlabel " phastCons46way score"
+set title " Human Hg19 Histogram phastCons46wayPlacental track"
+set xlabel " phastCons46wayPlacental score"
 set ylabel " Relative Frequency"
 set y2label " Cumulative Relative Frequency (CRF)"
 set y2range [0:1]
 set y2tics
@@ -6067,11 +6398,9 @@
     cd /hive/data/genomes/hg19/bed/snp130Ortho
     gzip snp130Simple.bed snp130ExcludeIds.txt snp130ForLiftOver.bed &
     rm -r run*/split tmp.txt *.orthoGlom.txt
 
-
 ##############################################################################
-<<<<<<< hg19.txt
 # LASTZ Rabbit OryCun2 (DONE - 2009-08-12 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12
     cd /hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12
 
@@ -6284,8 +6613,253 @@
 # Make mapping between known genes and allenBrain
    hgMapToGene hg19 allenBrainAli -type=psl knownGene knownToAllenBrain
 
 ############################################################################
+## Annotate 46-way multiple alignment with gene annotations
+##		(DONE - 2008-12-08,23 - Hiram)
+    # Gene frames
+    ## survey all genomes to see what type of gene track to use
+    ssh hgwdev
+    mkdir /hive/data/genomes/hg19/bed/multiz46way/frames
+    cd /hive/data/genomes/hg19/bed/multiz46way/frames
+    #
+    #	survey all the genomes to find out what kinds of gene tracks they have
+    cat << '_EOF_' > showGenes.csh
+#!/bin/csh -fe
+foreach db (`cat ../species.list`)
+    echo -n "${db}: "
+    set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
+    foreach table ($tables)
+	if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
+	    $table == "knownGene" || $table == "xenoRefGene" ) then
+		set count = `hgsql $db -N -e "select count(*) from $table"`
+		echo -n "${table}: ${count}, "
+	endif
+    end
+    set orgName = `hgsql hgcentraltest -N -e \
+	    "select scientificName from dbDb where name='$db'"`
+    set orgId = `hgsql hg19 -N -e \
+	    "select id from organism where name='$orgName'"`
+    if ($orgId == "") then
+	echo "Mrnas: 0"
+    else
+	set count = `hgsql hg19 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
+	echo "Mrnas: ${count}"
+    endif
+end
+'_EOF_'
+    # << happy emacs
+    chmod +x ./showGenes.csh
+    #	rearrange that output to create four sections:
+    #	1. knownGenes for hg19, mm9, rn4
+    #	2. ensGene for almost everything else
+    #	3. xenoRefGene for calJac1, petMar1, loxAfr3, papHam1, macEug1, oryCun2
+
+    mkdir genes
+    # knownGene
+    for DB in hg19 mm9 rn4
+do
+    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
+      | genePredSingleCover stdin stdout | gzip -2c \
+        > /scratch/tmp/${DB}.tmp.gz
+    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
+    echo "${DB} done"
+done
+
+    echo "panTro2 gorGor1 ponAbe2 rheMac2 tarSyr1 micMur1 otoGar1 \
+	tupBel1 dipOrd1 cavPor3 speTri1 ochPri2 vicPac1 turTru1 \
+	bosTau4 equCab2 felCat3 canFam2 myoLuc1 pteVam1 eriEur1 sorAra1 \
+	proCap1 echTel1 dasNov2 choHof1 monDom5 ornAna1 galGal3 \
+	taeGut1 anoCar1 xenTro2 tetNig2 fr2 gasAcu1 oryLat2 danRer6" \
+    | sed -e "s/  */ /g" > ensGene.list
+
+
+do
+    # ensGene
+    for DB in panTro2 gorGor1 ponAbe2 rheMac2 tarSyr1 micMur1 otoGar1 \
+	tupBel1 dipOrd1 cavPor3 speTri1 ochPri2 vicPac1 turTru1 \
+	bosTau4 equCab2 felCat3 canFam2 myoLuc1 pteVam1 eriEur1 sorAra1 \
+	proCap1 echTel1 dasNov2 choHof1 monDom5 ornAna1 galGal3 \
+	taeGut1 anoCar1 xenTro2 tetNig2 fr2 gasAcu1 oryLat2 danRer6
+do
+    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
+      | genePredSingleCover stdin stdout | gzip -2c \
+        > /scratch/tmp/${DB}.tmp.gz
+    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
+    echo "${DB} done"
+done
+
+    echo "calJac1 petMar1 loxAfr3 papHam1 macEug1 oryCun2" > xenoRef.list
+
+    # xenoRefGene
+    for DB in calJac1 petMar1 loxAfr3 papHam1 macEug1 oryCun2
+do
+    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from xenoRefGene" ${DB} \
+      | genePredSingleCover stdin stdout | gzip -2c \
+        > /scratch/tmp/${DB}.tmp.gz
+    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
+    echo "${DB} done"
+done
+
+    #	the following single command doesn't work on any 32 Gb computer,
+    #	requires much more memory, turn it into a kluster job, see below ...
+
+    #	Create this command with this script:
+    cat << '_EOF_' > mkCmd.sh
+#!/bin/sh
+
+echo "time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames hg19 stdin stdout \\"
+for G in mm9 rn4
+do
+    if [ ! -s genes/${G}.gp.gz ]; then
+	echo "missing genes/${G}.gp.gz"
+	exit 255
+    fi
+    echo -n "${G} genes/${G}.gp.gz "
+done
+echo "\\"
+for D in `sort ensGene.list`
+do
+    if [ ! -s genes/${D}.gp.gz ]; then
+        echo "missing genes/${D}.gp.gz"
+        exit 255
+    fi
+    echo -n "${D} genes/${D}.gp.gz "
+done
+echo "\\"
+for D in `sort xenoRef.list`
+do
+    if [ ! -s genes/${D}.gp.gz ]; then
+        echo "missing genes/${D}.gp.gz"
+        exit 255
+    fi
+    echo -n "${D} genes/${D}.gp.gz "
+done
+echo "\\"
+echo "    | gzip > multiz46way.mafFrames.gz) > frames.log 2>&1"
+'_EOF_'
+    # << happy emacs
+    chmod +x ./mkCmd.sh
+
+    time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames hg19 stdin stdout \
+mm9 genes/mm9.gp.gz rn4 genes/rn4.gp.gz \
+panTro2 genes/panTro2.gp.gz gorGor1 genes/gorGor1.gp.gz ponAbe2 genes/ponAbe2.gp.gz rheMac2 genes/rheMac2.gp.gz tarSyr1 genes/tarSyr1.gp.gz micMur1 genes/micMur1.gp.gz otoGar1 genes/otoGar1.gp.gz tupBel1 genes/tupBel1.gp.gz dipOrd1 genes/dipOrd1.gp.gz cavPor3 genes/cavPor3.gp.gz speTri1 genes/speTri1.gp.gz ochPri2 genes/ochPri2.gp.gz vicPac1 genes/vicPac1.gp.gz turTru1 genes/turTru1.gp.gz bosTau4 genes/bosTau4.gp.gz equCab2 genes/equCab2.gp.gz felCat3 genes/felCat3.gp.gz canFam2 genes/canFam2.gp.gz myoLuc1 genes/myoLuc1.gp.gz pteVam1 genes/pteVam1.gp.gz eriEur1 genes/eriEur1.gp.gz sorAra1 genes/sorAra1.gp.gz proCap1 genes/proCap1.gp.gz echTel1 genes/echTel1.gp.gz dasNov2 genes/dasNov2.gp.gz choHof1 genes/choHof1.gp.gz monDom5 genes/monDom5.gp.gz ornAna1 genes/ornAna1.gp.gz galGal3 genes/galGal3.gp.gz taeGut1 genes/taeGut1.gp.gz anoCar1 genes/anoCar1.gp.gz xenTro2 genes/xenTro2.gp.gz tetNig2 genes/tetNig2.gp.gz fr2 genes/fr2.gp.gz gasAcu1 genes/gasAcu1.gp.gz oryLat2 genes/oryLat2.gp.gz danRer6 genes/danRer6.gp.gz \
+calJac1 genes/calJac1.gp.gz petMar1 genes/petMar1.gp.gz loxAfr3 genes/loxAfr3.gp.gz papHam1 genes/papHam1.gp.gz macEug1 genes/macEug1.gp.gz oryCun2 genes/oryCun2.gp.gz \
+    | gzip > multiz46way.mafFrames.gz) > frames.log 2>&1
+
+    #	that doesn't work on any 32 Gb computer, requires much more memory
+    #	turn it into a kluster job
+    ssh swarm
+    cd /hive/data/genomes/hg19/bed/multiz46way/frames
+    cat << '_EOF_' > runOne
+#!/bin/csh -fe
+
+set C = $1
+set G = $2
+
+cat ../maf/${C}.maf | genePredToMafFrames hg19 stdin stdout \
+        ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz
+'_EOF_'
+    # << happy emacs
+    chmod +x runOne
+
+    ls ../maf | sed -e "s/.maf//" > chr.list
+    ls genes | sed -e "s/.gp.gz//" | grep -v hg19 > gene.list
+
+    cat << '_EOF_' > template
+#LOOP
+runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz}
+#ENDLOOP
+'_EOF_'
+    # << happy emacs
+
+    mkdir parts
+    gensub2 chr.list gene.list template jobList
+    para -ram=8g create jobList
+    para try ... check ... push
+# Completed: 4185 of 4185 jobs
+# CPU time in finished jobs:      72491s    1208.19m    20.14h    0.84d  0.002 y
+# IO & Wait Time:               1462162s   24369.36m   406.16h   16.92d  0.046 y
+# Average job time:                 367s       6.11m     0.10h    0.00d
+# Longest finished job:            3165s      52.75m     0.88h    0.04d
+# Submission to last job:          6364s     106.07m     1.77h    0.07d
+
+    # see what it looks like in terms of number of annotations per DB:
+    find ./parts -type f | while read F
+do
+    zcat ${F}
+done | cut -f4 | sort | uniq -c | sort -n > annotation.survey.txt
+  79191 rn4
+ 108287 petMar1
+ 139581 gorGor1
+ 140487 taeGut1
+ 143058 choHof1
+ 143233 vicPac1
+ 150073 anoCar1
+ 154462 tarSyr1
+ 163930 sorAra1
+ 164575 galGal3
+ 171191 macEug1
+ 174221 felCat3
+ 175831 dasNov2
+ 177622 ornAna1
+ 190729 eriEur1
+ 192285 tupBel1
+ 198052 speTri1
+ 199639 micMur1
+ 201731 papHam1
+ 201961 panTro2
+ 206170 oryCun2
+ 209327 ponAbe2
+ 209504 otoGar1
+ 210860 rheMac2
+ 212533 proCap1
+ 212848 myoLuc1
+ 213146 dipOrd1
+ 213479 calJac1
+ 215995 echTel1
+ 220341 ochPri2
+ 225132 loxAfr3
+ 226689 turTru1
+ 230903 monDom5
+ 232025 pteVam1
+ 232831 equCab2
+ 236945 cavPor3
+ 238167 bosTau4
+ 239857 mm9
+ 255727 canFam2
+ 316850 xenTro2
+ 359507 danRer6
+ 375156 oryLat2
+ 390076 fr2
+ 426532 gasAcu1
+ 434619 tetNig2
+
+    #	load the resulting file
+    ssh hgwdev
+    cd /cluster/data/hg19/bed/multiz46way/frames
+    find ./parts -type f | while read F
+do
+    zcat ${F}
+done | sort -k1,1 -k2,2n | hgLoadMafFrames hg19 multiz46wayFrames stdin
+    #	real    5m47.840s
+
+    find ./parts -type f | while read F
+do
+    zcat ${F}
+done | sort -k1,1 -k2,2n > multiz46wayFrames.bed
+
+    featureBits -countGaps hg19 multiz46wayFrames.bed
+    #	62315198 bases of 3107677273 (2.005%) in intersection
+    featureBits -countGaps hg19 multiz28wayFrames
+    #	48236360 bases of 3107677273 (1.552%) in intersection
+
+    #	enable the trackDb entries:
+# frames multiz46wayFrames
+# irows on
+    #	appears to work OK
+
+#############################################################################
 # AFFY U133AB (Done - 2009-09-30 - Jim)
     # Align probes 
     ssh swarm
     cd /cluster/data/hg19/bed