src/hg/makeDb/doc/calJac3.txt 1.5
1.5 2010/04/01 17:28:00 hiram
done and in the pushQ except for the multiz track
Index: src/hg/makeDb/doc/calJac3.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/calJac3.txt,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 4 -r1.4 -r1.5
--- src/hg/makeDb/doc/calJac3.txt 18 Feb 2010 22:15:04 -0000 1.4
+++ src/hg/makeDb/doc/calJac3.txt 1 Apr 2010 17:28:00 -0000 1.5
@@ -14,9 +14,9 @@
mkdir genbank
cd genbank
wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \
--no-remove-listing -np \
-"ftp.ncbi.nlm.nih.gov:genbank/genomes/Eukaryotes/vertebrates_mammals/Callithrix_jacchus/Callithrix_jacchus-3.2/*"
+"ftp://ftp.ncbi.nlm.nih.gov:/genbank/genomes/Eukaryotes/vertebrates_mammals/Callithrix_jacchus/Callithrix_jacchus-3.2/*"
mkdir ucscChr
cd ucscChr
# fixup the accession names to become UCSC chrom names
@@ -88,9 +88,10 @@
makeGenomeDb.pl -continue=agp -stop=agp calJac3.config.ra > agp.out 2>&1
# real 0m20.968s
makeGenomeDb.pl -continue=db -stop=db calJac3.config.ra > db.out 2>&1
# real 5m39.181s
-XXX - chromInfo doesn't have large enough fields for the name keys
+ # XXX - chromInfo doesn't have large enough fields for the name keys
+ # this has been fixed in later versions of makeGenomeDb.pl
makeGenomeDb.pl -continue=dbDb -stop=dbDb calJac3.config.ra > dbDb.out 2>&1
makeGenomeDb.pl -continue=trackDb -stop=trackDb calJac3.config.ra > trackDb.out 2>&1
##########################################################################
@@ -645,42 +646,45 @@
`pwd`/DEF \
-verbose=2 -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
-XXX - running Mon Feb 15 21:49:52 PST 2010
+ # failed lastz run, finished manually
# real 287m24.258s
+ time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
+ -continue=cat \
+ -verbose=2 -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ > cat.log 2>&1 &
+ # real 158m17.502s
cat fb.calJac3.chainPapHam1Link.txt
- # 2047068864 bases of 2897316137 (70.654%) in intersection
+ # 1928203329 bases of 2752505800 (70.053%) in intersection
+ time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
+ calJac3 papHam1 > rbest.log 2>&1
+ # real 232m
mkdir /hive/data/genomes/papHam1/bed/blastz.calJac3.swap
cd /hive/data/genomes/papHam1/bed/blastz.calJac3.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
- /hive/data/genomes/calJac3/bed/lastzPapHam1.2010-02-11/DEF \
+ /hive/data/genomes/calJac3/bed/lastzPapHam1.2010-02-15/DEF \
-swap -syntenicNet \
- -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
-chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 &
- # real 120m42.991s
+ # real 791m46.765s
cat fb.papHam1.chainCalJac3Link.txt
- # 2030475813 bases of 2752505800 (73.768%) in intersection
+ # 1908519637 bases of 2741867288 (69.607%) in intersection
##############################################################################
-# tarSyr1 Tarsier LASTZ/CHAIN/NET (DONE - 2010-02-15 - Hiram)
+# tarSyr1 Tarsier LASTZ/CHAIN/NET (DONE - 2010-02-21 - Hiram)
screen # use a screen to manage this multi-day job
- mkdir /hive/data/genomes/calJac3/bed/lastzTarSyr1.2010-02-15
- cd /hive/data/genomes/calJac3/bed/lastzTarSyr1.2010-02-15
+ mkdir /hive/data/genomes/calJac3/bed/lastzTarSyr1.2010-02-21
+ cd /hive/data/genomes/calJac3/bed/lastzTarSyr1.2010-02-21
cat << '_EOF_' > DEF
# tarsier vs. marmoset
-# same paramters as human hg19 vs marmoset calJac3
+# same parameters as human hg19 vs tarsier tarSyr1
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
-BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
-BLASTZ_O=600
-BLASTZ_E=150
-BLASTZ_K=4500
-BLASTZ_Y=15000
-BLASTZ_T=2
# TARGET: Marmoset (calJac3)
SEQ1_DIR=/scratch/data/calJac3/calJac3.2bit
SEQ1_LEN=/scratch/data/calJac3/chrom.sizes
@@ -694,59 +698,56 @@
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
-BASE=/hive/data/genomes/calJac3/bed/lastzTarSyr1.2010-02-15
+BASE=/hive/data/genomes/calJac3/bed/lastzTarSyr1.2010-02-21
+TMPDIR=/scratch/tmp
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
- time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
+ time nice -n +19 doBlastzChainNet.pl \
`pwd`/DEF \
- -verbose=2 -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \
- -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ -verbose=2 -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
> do.log 2>&1 &
-XXX - running Mon Feb 15 22:00:23 PST 2010
- # real 287m24.258s
cat fb.calJac3.chainTarSyr1Link.txt
- # 2047068864 bases of 2897316137 (70.654%) in intersection
+ # 1286219755 bases of 2752505800 (46.729%) in intersection
+ time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
+ calJac3 tarSyr1 > rbest.log 2>&1 &
+ # real 532m
mkdir /hive/data/genomes/tarSyr1/bed/blastz.calJac3.swap
cd /hive/data/genomes/tarSyr1/bed/blastz.calJac3.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
- /hive/data/genomes/calJac3/bed/lastzTarSyr1.2010-02-11/DEF \
+ /hive/data/genomes/calJac3/bed/lastzTarSyr1.2010-02-21/DEF \
-swap -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
- -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 &
+ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
+ # XXX - running Wed Feb 24 14:36:31 PST 2010
# real 120m42.991s
cat fb.tarSyr1.chainCalJac3Link.txt
# 2030475813 bases of 2752505800 (73.768%) in intersection
#####################################################################
-# micMur1 Mouse lemur LASTZ/CHAIN/NET (DONE - 2010-02-15 - Hiram)
+# micMur1 Mouse lemur LASTZ/CHAIN/NET (DONE - 2010-02-17,22 - Hiram)
# Mouse lemur ( Microcebus murinus)
screen # use a screen to manage this multi-day job
- mkdir /hive/data/genomes/calJac3/bed/lastzMicMur1.2010-02-15
- cd /hive/data/genomes/calJac3/bed/lastzMicMur1.2010-02-15
+ mkdir /hive/data/genomes/calJac3/bed/lastzMicMur1.2010-02-17
+ cd /hive/data/genomes/calJac3/bed/lastzMicMur1.2010-02-17
cat << '_EOF_' > DEF
# mouse lemur vs. marmoset
-# same paramters as human hg19 vs marmoset calJac3
+# same parameters as human hg19 vs Mouse lemur micMur1
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
-BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
-BLASTZ_O=600
-BLASTZ_E=150
-BLASTZ_K=4500
-BLASTZ_Y=15000
-BLASTZ_T=2
# TARGET: Marmoset (calJac3)
SEQ1_DIR=/scratch/data/calJac3/calJac3.2bit
SEQ1_LEN=/scratch/data/calJac3/chrom.sizes
-SEQ1_LIMIT=50
-SEQ1_CHUNK=20000000
+SEQ1_LIMIT=5
+SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Mouse lemur micMur1
SEQ2_DIR=/hive/data/genomes/micMur1/micMur1.2bit
@@ -754,9 +755,9 @@
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
-BASE=/hive/data/genomes/calJac3/bed/lastzMicMur1.2010-02-15
+BASE=/hive/data/genomes/calJac3/bed/lastzMicMur1.2010-02-17
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
@@ -764,62 +765,50 @@
`pwd`/DEF \
-verbose=2 -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
-BASE=/hive/data/genomes/calJac3/bed/lastzMicMur1.2010-02-17
-XXX - running Wed Feb 17 13:57:47 PST 2010
-
- time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
- `pwd`/DEF \
- -verbose=2 -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \
- -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
- > do.log 2>&1 &
- # failed first kluster job, finished manually
- # real 287m24.258s
+ # real 5502m6.707s
+ # some kluster difficulties, finished cat run manually, then:
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ -continue=chainRun `pwd`/DEF \
+ -verbose=2 -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
+ > chainRun.log 2>&1 &
+ # real 374m19.587s calJac3 micMur1 02-17
cat fb.calJac3.chainMicMur1Link.txt
- # 2047068864 bases of 2897316137 (70.654%) in intersection
- time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
- -continue=cat `pwd`/DEF \
- -verbose=2 -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \
- -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
- > cat.log 2>&1 &
-XXX - running Wed Feb 17 10:36:56 PST 2010
+ # 1258616069 bases of 2752505800 (45.726%) in intersection
+ time doRecipBest.pl -buildDir=`pwd` calJac3 micMur1 > rbest.log 2>&1
+ # real 235m55.179s calJac3 micMur1
mkdir /hive/data/genomes/micMur1/bed/blastz.calJac3.swap
cd /hive/data/genomes/micMur1/bed/blastz.calJac3.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
- /hive/data/genomes/calJac3/bed/lastzMicMur1.2010-02-11/DEF \
+ /hive/data/genomes/calJac3/bed/lastzMicMur1.2010-02-17/DEF \
-swap -syntenicNet \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
- -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 &
- # real 120m42.991s
+ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
+ # real 455m11.215s micMur1 calJac3 swap
cat fb.micMur1.chainCalJac3Link.txt
- # 2030475813 bases of 2752505800 (73.768%) in intersection
+ # 1243785262 bases of 1852394361 (67.145%) in intersection
#####################################################################
-# otoGar1 Bushbaby LASTZ/CHAIN/NET (DONE - 2010-02-15 - Hiram)
+# otoGar1 Bushbaby LASTZ/CHAIN/NET (DONE - 2010-02-17,22 - Hiram)
screen # use a screen to manage this multi-day job
- mkdir /hive/data/genomes/calJac3/bed/lastzOtoGar1.2010-02-15
- cd /hive/data/genomes/calJac3/bed/lastzOtoGar1.2010-02-15
+ mkdir /hive/data/genomes/calJac3/bed/lastzOtoGar1.2010-02-17
+ cd /hive/data/genomes/calJac3/bed/lastzOtoGar1.2010-02-17
cat << '_EOF_' > DEF
# bushbaby vs. marmoset
-# same paramters as human hg19 vs marmoset calJac3
+# same parameters as human hg19 vs Bushbaby otoGar1
BLASTZ=lastz
# maximum M allowed with lastz is only 254
BLASTZ_M=254
-BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
-BLASTZ_O=600
-BLASTZ_E=150
-BLASTZ_K=4500
-BLASTZ_Y=15000
-BLASTZ_T=2
# TARGET: Marmoset (calJac3)
SEQ1_DIR=/scratch/data/calJac3/calJac3.2bit
SEQ1_LEN=/scratch/data/calJac3/chrom.sizes
-SEQ1_LIMIT=50
-SEQ1_CHUNK=20000000
+SEQ1_LIMIT=5
+SEQ1_CHUNK=200000000
SEQ1_LAP=10000
# QUERY: Bushbaby otoGar1
SEQ2_DIR=/scratch/data/otoGar1/otoGar1.rmsk.2bit
@@ -827,9 +816,9 @@
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
-BASE=/hive/data/genomes/calJac3/bed/lastzOtoGar1.2010-02-15
+BASE=/hive/data/genomes/calJac3/bed/lastzOtoGar1.2010-02-17
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
@@ -837,35 +826,476 @@
`pwd`/DEF \
-verbose=2 -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
-BASE=/hive/data/genomes/calJac3/bed/lastzOtoGar1.2010-02-17
-XXX - running Wed Feb 17 13:57:47 PST 2010
-
- time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
- `pwd`/DEF \
- -verbose=2 -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \
- -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
- > do.log 2>&1 &
- # failed first kluster job, finished manually
- # real 287m24.258s
+ # real 4722m38.163s
+ # memk failed at the cat run, finished it manually, then continued:
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ -continue=chainRun `pwd`/DEF \
+ -verbose=2 -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
+ > chainRun.log 2>&1 &
+ # real 285m58.314s
cat fb.calJac3.chainOtoGar1Link.txt
- # 2047068864 bases of 2897316137 (70.654%) in intersection
- time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
- -continue=cat `pwd`/DEF \
- -verbose=2 -syntenicNet -chainMinScore=5000 -chainLinearGap=medium \
- -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
- > cat.log 2>&1 &
-XXX - running Wed Feb 17 10:36:21 PST 2010
+ # 1176505967 bases of 2752505800 (42.743%) in intersection
+ time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
+ calJac3 otoGar1 > rbest.log 2>&1 &
+ # real 332m14.375s calJac3 otoGar1
mkdir /hive/data/genomes/otoGar1/bed/blastz.calJac3.swap
cd /hive/data/genomes/otoGar1/bed/blastz.calJac3.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
- /hive/data/genomes/calJac3/bed/lastzOtoGar1.2010-02-11/DEF \
+ /hive/data/genomes/calJac3/bed/lastzOtoGar1.2010-02-17/DEF \
-swap -syntenicNet \
- -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
- -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 &
- # real 120m42.991s
+ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
+ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
+ # real 310m4.077s
cat fb.otoGar1.chainCalJac3Link.txt
- # 2030475813 bases of 2752505800 (73.768%) in intersection
+ # 1158531484 bases of 1969052059 (58.837%) in intersection
+
+#####################################################################
+## 8-Way Multiz (DONE - 2009-06-09,2009-11-10 - Hiram)
+ mkdir /hive/data/genomes/calJac3/bed/multiz8way
+ cd /hive/data/genomes/calJac3/bed/multiz8way
+
+ /cluster/bin/phast/tree_doctor \
+ --prune-all-but=calJac1,hg19,panTro2,rheMac2,ponAbe2,mm9,canFam2,monDom5 \
+ --rename="calJac1 -> calJac3 " \
+/hive/data/genomes/hg19/bed/multiz46way/fixedTree/46wayFixed.nh > 8way.nh
+ # *carefully* edit 8way.nh to get calJac3 at the top of this picture
+ # resulting in this tree:
+
+(calJac3:0.066389,((rheMac2:0.057695,(ponAbe2:0.018342,
+(hg19:0.006591,panTro2:0.006639):0.012126):0.014256):0.010000,
+(mm9:0.352605,(canFam2:0.193569,monDom5:0.581923):0.020666)
+:0.088210):0.000001);
+
+ # Use this specification in the phyloGif tool:
+ # http://genome.ucsc.edu/cgi-bin/phyloGif
+ # to obtain a gif image for htdocs/images/phylo/calJac3_8way.gif
+
+ /cluster/bin/phast/all_dists 8way.nh > 8way.distances.txt
+ # Use this output to create the table below, with this perl script:
+ cat << '_EOF_' > sizeStats.pl
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+open (FH, "grep -y calJac3 8way.distances.txt | sort -k3,3n|") or
+ die "can not read 8way.distances.txt";
+
+my $count = 0;
+while (my $line = <FH>) {
+ chomp $line;
+ my ($calJac3, $D, $dist) = split('\s+', $line);
+ my $chain = "chain" . ucfirst($D);
+ my $B="/hive/data/genomes/calJac3/bed/lastz.$D/fb.calJac3." .
+ $chain . "Link.txt";
+ my $chainLinkMeasure =
+ `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
+ chomp $chainLinkMeasure;
+ $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
+ $chainLinkMeasure =~ s/\%//;
+ my $swapFile="/hive/data/genomes/${D}/bed/lastz.calJac3/fb.${D}.chainCalJac3Link.txt";
+ my $swapMeasure = "N/A";
+ if ( -s $swapFile ) {
+ $swapMeasure =
+ `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`;
+ chomp $swapMeasure;
+ $swapMeasure = 0.0 if (length($swapMeasure) < 1);
+ $swapMeasure =~ s/\%//;
+ }
+ my $orgName=
+ `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`;
+ chomp $orgName;
+ if (length($orgName) < 1) {
+ $orgName="N/A";
+ }
+ ++$count;
+ if ($swapMeasure eq "N/A") {
+ printf "# %02d %.4f - %s %s\t(%% %.3f) (%s)\n", $count, $dist,
+ $orgName, $D, $chainLinkMeasure, $swapMeasure
+ } else {
+ printf "# %02d %.4f - %s %s\t(%% %.3f) (%% %.3f)\n", $count, $dist,
+ $orgName, $D, $chainLinkMeasure, $swapMeasure
+ }
+}
+close (FH);
+'_EOF_'
+ # << happy emacs
+ chmod +x ./sizeStats.pl
+ ./sizeStats.pl
+#
+# If you can fill in all the numbers in this table, you are ready for
+# the multiple alignment procedure
+#
+# featureBits chainLink measures
+# chainCalJac3Link
+# distance on calJac3 on other
+# 01 0.1090 - Orangutan ponAbe2 (% 71.893) (% 67.448)
+# 02 0.1094 - Human hg19 (% 73.768) (% 70.654)
+# 03 0.1094 - Chimp panTro2 (% 72.304) (% 69.302)
+# 04 0.1341 - Rhesus rheMac2 (% 69.625) (% 70.711)
+# 05 0.3688 - Dog canFam2 (% 50.766) (% 57.162)
+# 06 0.5072 - Mouse mm9 (% 31.310) (% 32.815)
+# 07 0.7572 - Opossum monDom5 (% 7.906) (% 6.174)
+
+ # create species list and stripped down tree for autoMZ
+ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
+ 8way.nh > tmp.nh
+ echo `cat tmp.nh` > tree-commas.nh
+ echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
+ sed 's/[()]//g; s/,/ /g' tree.nh > species.list
+
+ # bash shell syntax here ...
+ mkdir -p mafLinks
+H="/hive/data/genomes/calJac3/bed/"
+for S in `sed -e "s/calJac3 //" species.list`
+do
+ echo $S
+ ls -og ${H}/lastz.${S}/axtChain/calJac3.${S}.synNet.maf.gz
+ ln -s ${H}/lastz.${S}/axtChain/calJac3.${S}.synNet.maf.gz \
+ mafLinks/${S}.maf.gz
+done
+
+HERE=`pwd`
+export HERE
+PATH=${HERE}/penn:${PATH}
+export PATH
+rm -fr tmp
+mkdir -p tmp
+cd mafLinks
+time ../penn/autoMZ + T=${HERE}/tmp \
+ E=calJac3 "`cat ../tree.nh`" *.sing.maf result.maf
+ # real 3584m8.094s
+ mkdir /gbdb/calJac3/multiz8way
+ ln -s `pwd`/mafLinks/result.maf /gbdb/calJac3/multiz8way/multiz8way.maf
+ cd /scratch/tmp
+# Loaded 7475045 mafs in 1 files from /gbdb/calJac3/multiz8way
+
+ time nice -n +19 hgLoadMaf calJac3 multiz8way
+ time nice -n +19 hgLoadMafSummary -minSize=30000 -mergeGap=1500 \
+ -maxSize=200000 calJac3 multiz8waySummary multiz8way.maf
+ | hgLoadMafSummary calJac1 -minSize=30000 -mergeGap=1500 \
+ -maxSize=200000 multiz9waySummary stdin
+ # Created 1313222 summary blocks from 34128178 components
+ # and 7475045 mafs from multiz8way.maf
+ # real 8m36.016s
+
+##############################################################################
+# gorGor2 Gorilla LASTZ/CHAIN/NET (DONE - 2010-02-22,24 - Hiram)
+ screen # use a screen to manage this multi-day job
+ mkdir /hive/data/genomes/calJac3/bed/lastzGorGor2.2010-02-22
+ cd /hive/data/genomes/calJac3/bed/lastzGorGor2.2010-02-22
+
+ cat << '_EOF_' > DEF
+# Gorilla vs. marmoset
+# same parameters as human hg19 vs other nearby primates
+# without all the extra blastz parameters
+BLASTZ=lastz
+# maximum M allowed with lastz is only 254
+BLASTZ_M=254
+
+# TARGET: Marmoset (calJac3)
+SEQ1_DIR=/scratch/data/calJac3/calJac3.2bit
+SEQ1_LEN=/scratch/data/calJac3/chrom.sizes
+SEQ1_LIMIT=20
+SEQ1_CHUNK=200000000
+SEQ1_LAP=10000
+
+# QUERY: Gorilla gorGor2
+SEQ2_DIR=/scratch/data/gorGor2/gorGor2.2bit
+SEQ2_LEN=/scratch/data/gorGor2/chrom.sizes
+SEQ2_CHUNK=12000000
+SEQ2_LIMIT=300
+SEQ2_LAP=0
+
+BASE=/hive/data/genomes/calJac3/bed/lastzGorGor2.2010-02-22
+TMPDIR=/scratch/tmp
+'_EOF_'
+ # << this line keeps emacs coloring happy
+
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -verbose=2 -syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ > do.log 2>&1 &
+ # Elapsed time: 1956m3.678s
+ cat fb.calJac3.chainGorGor2Link.txt
+ # 2101356280 bases of 2752505800 (76.343%) in intersection
+ time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \
+ calJac3 gorGor2 > rbest.log 2>&1
+ # about 4h16m
+
+ mkdir /hive/data/genomes/gorGor2/bed/blastz.calJac3.swap
+ cd /hive/data/genomes/gorGor2/bed/blastz.calJac3.swap
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ /hive/data/genomes/calJac3/bed/lastzGorGor2.2010-02-22/DEF \
+ -swap -syntenicNet \
+ -workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
+ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
+ # real 250m57.089s
+ cat fb.gorGor2.chainCalJac3Link.txt
+ # 2135885920 bases of 2829687208 (75.481%) in intersection
+
+#####################################################################
+## 13-Way Multiz (DONE - 2010-02-23 - Hiram)
+ mkdir /hive/data/genomes/calJac3/bed/multiz13way
+ cd /hive/data/genomes/calJac3/bed/multiz13way
+
+ /cluster/bin/phast/tree_doctor \
+ --prune-all-but=calJac1,hg19,panTro2,rheMac2,ponAbe2,gorGor1,micMur1,otoGar1,papHam1,tarSyr1,mm9,canFam2,monDom5 \
+ --rename="calJac1 -> calJac3 ; gorGor1 -> gorGor2 " \
+/hive/data/genomes/hg19/bed/multiz46way/fixedTree/46wayFixed.nh > 13way.nh
+
+ # rearrange calJac3 to the top, get some help from tree_doctor:
+ /cluster/bin/phast/tree_doctor --name-ancestors --reroot calJac3 \
+ --with-branch 13way.nh
+ # edit out the ancestors, and move calJac3 from the bottom to
+ # the top, resulting in this tree:
+
+(calJac3:0.066389,(((((hg19:0.006591,panTro2:0.006639):0.002184,
+gorGor2:0.009411):0.009942,ponAbe2:0.018342):0.014256,
+(rheMac2:0.036199,papHam1:0.040000):0.021496):0.010000,
+((((monDom5:0.581923,canFam2:0.193569):0.020666,mm9:0.352605):0.019992,
+(micMur1:0.091452,otoGar1:0.128984):0.035463):0.011307,
+tarSyr1:0.135169):0.056911):0.000001);
+
+ # more rearranging after seeing what the distance table looks like
+ # below to get them appearing as much as possible in their
+ # distance order top to bottom:
+(calJac3:0.066389,(((ponAbe2:0.018342,
+((hg19:0.006591,panTro2:0.006639):0.002184,
+gorGor2:0.009411):0.009942):0.014256,
+(rheMac2:0.036199,papHam1:0.040000):0.021496):0.010000,
+(tarSyr1:0.135169,((micMur1:0.091452,otoGar1:0.128984):0.035463,
+(mm9:0.352605,
+(canFam2:0.193569,monDom5:0.581923):0.020666):0.019992):0.011307):0.056911)
+:0.000001);
+
+ # Use this specification in the phyloGif tool after changing the names:
+ /cluster/bin/phast/tree_doctor \
+--rename="calJac3 -> Marmoset ; ponAbe2 -> Orangutan ; hg19 -> Human ; panTro2 -> Chimp ; gorGor2 -> Gorilla ; rheMac2 -> Rhesus ; papHam1 -> Baboon ; tarSyr1 -> Tarsier ; micMur1 -> Mouse_lemur ; otoGar1 -> Bushbaby ; canFam2 -> Dog ; mm9 -> Mouse ; monDom5 -> Opossum " 13way.nh
+ # http://genome.ucsc.edu/cgi-bin/phyloGif
+ # to obtain a gif image for htdocs/images/phylo/calJac3_13way.gif
+
+ /cluster/bin/phast/all_dists 13way.nh > 13way.distances.txt
+ # make sure all symlinks lastz.DB -> lastzDb-date
+ # exist here and at the swap locations, the perl script expects this
+ # in order to find featureBits numbers.
+ # Use 13way.distances.txt to create the table below
+ # with this perl script:
+
+ cat << '_EOF_' > sizeStats.pl
+#!/usr/bin/env perl
+use strict;
+use warnings;
+
+
+open (FH, "grep -y calJac3 13way.distances.txt | sort -k3,3n|") or
+ die "can not read 13way.distances.txt";
+
+my $count = 0;
+while (my $line = <FH>) {
+ chomp $line;
+ my ($calJac3, $D, $dist) = split('\s+', $line);
+ my $chain = "chain" . ucfirst($D);
+ my $B="/hive/data/genomes/calJac3/bed/lastz.$D/fb.calJac3." .
+ $chain . "Link.txt";
+ my $chainLinkMeasure =
+ `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
+ chomp $chainLinkMeasure;
+ $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
+ $chainLinkMeasure =~ s/\%//;
+ my $swapFile="/hive/data/genomes/${D}/bed/lastz.calJac3/fb.${D}.chainCalJac3Link.txt";
+ my $swapMeasure = "N/A";
+ if ( -s $swapFile ) {
+ $swapMeasure =
+ `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`;
+ chomp $swapMeasure;
+ $swapMeasure = 0.0 if (length($swapMeasure) < 1);
+ $swapMeasure =~ s/\%//;
+ }
+ my $orgName=
+ `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`;
+ chomp $orgName;
+ if (length($orgName) < 1) {
+ $orgName="N/A";
+ }
+ ++$count;
+ if ($swapMeasure eq "N/A") {
+ printf "# %02d %.4f - %s %s\t(%% %.3f) (%s)\n", $count, $dist,
+ $orgName, $D, $chainLinkMeasure, $swapMeasure
+ } else {
+ printf "# %02d %.4f - %s %s\t(%% %.3f) (%% %.3f)\n", $count, $dist,
+ $orgName, $D, $chainLinkMeasure, $swapMeasure
+ }
+}
+close (FH);
+'_EOF_'
+ # << happy emacs
+ chmod +x ./sizeStats.pl
+ ./sizeStats.pl
+
+# 01 0.1090 - Orangutan ponAbe2 (% 71.893) (% 67.448)
+# 02 0.1094 - Human hg19 (% 73.768) (% 70.654)
+# 03 0.1094 - Chimp panTro2 (% 72.304) (% 69.302)
+# 04 0.1100 - Gorilla gorGor2 (% 76.343) (% 75.481)
+# 05 0.1341 - Rhesus rheMac2 (% 69.625) (% 70.711)
+# 06 0.1379 - Baboon papHam1 (% 70.053) (% 69.607)
+# 07 0.2585 - Tarsier tarSyr1 (% 46.729) (N/A)
+# 08 0.2615 - Mouse lemur micMur1 (% 45.726) (% 67.145)
+# 09 0.2991 - Bushbaby otoGar1 (% 42.743) (% 58.837)
+# 10 0.3688 - Dog canFam2 (% 50.766) (% 57.162)
+# 11 0.5072 - Mouse mm9 (% 31.310) (% 32.815)
+# 12 0.7572 - Opossum monDom5 (% 7.906) (% 6.174)
+
+ # create species list and stripped down tree for autoMZ
+ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
+ 13way.nh > tmp.nh
+ echo `cat tmp.nh` > tree-commas.nh
+ echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
+ sed 's/[()]//g; s/,/ /g' tree.nh > species.list
+
+ # collect the single whole mafs into one place for splitting:
+ mkdir singleMafs
+ cd singleMafs
+ ln -s ../../lastz.ponAbe2/axtChain/calJac3.ponAbe2.synNet.maf.gz .
+ ln -s ../../lastz.hg19/axtChain/calJac3.hg19.synNet.maf.gz .
+ ln -s ../../lastz.panTro2/axtChain/calJac3.panTro2.synNet.maf.gz .
+ ln -s ../../lastz.gorGor2/axtChain/calJac3.gorGor2.synNet.maf.gz .
+ ln -s ../../lastz.rheMac2/axtChain/calJac3.rheMac2.synNet.maf.gz .
+ ln -s ../../lastz.papHam1/mafRBestNet/calJac3.papHam1.rbest.maf.gz .
+ ln -s ../../lastz.tarSyr1/mafRBestNet/calJac3.tarSyr1.rbest.maf.gz .
+ ln -s ../../lastz.micMur1/mafRBestNet/calJac3.micMur1.rbest.maf.gz .
+ ln -s ../../lastz.otoGar1/mafRBestNet/calJac3.otoGar1.rbest.maf.gz .
+ ln -s ../../lastz.mm9/axtChain/calJac3.mm9.synNet.maf.gz .
+ ln -s ../../lastz.canFam2/axtChain/calJac3.canFam2.synNet.maf.gz .
+ ln -s ../../lastz.monDom5/axtChain/calJac3.monDom5.synNet.maf.gz .
+
+ cd /hive/data/genomes/calJac3/bed/multiz13way
+ mkdir penn
+ cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn
+ cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn
+ cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn
+
+ # set the db and pairs directories here
+ cat > autoMultiz.csh << '_EOF_'
+#!/bin/csh -ef
+set db = calJac3
+set topDir = /hive/data/genomes/$db/bed/multiz13way
+set c = $1
+set result = $2
+set pennBin = $topDir/penn
+set run = `/bin/pwd`
+set tmp = /scratch/tmp/$db/multiz.$c
+set pairs = $topDir/splitMaf
+/bin/rm -fr $tmp
+/bin/mkdir -p $tmp
+/bin/cp -p $topDir/tree.nh $topDir/species.list $tmp
+pushd $tmp > /dev/null
+foreach s (`/bin/sed -e "s/^$db //" species.list`)
+ set in = $pairs/$s/$c.maf
+ set out = $db.$s.sing.maf
+ if (-e $in.gz) then
+ /bin/zcat $in.gz > $out
+ if (! -s $out) then
+ echo "##maf version=1 scoring=autoMZ" > $out
+ endif
+ else if (-e $in) then
+ /bin/ln -s $in $out
+ else
+ echo "##maf version=1 scoring=autoMZ" > $out
+ endif
+end
+set path = ($pennBin $path); rehash
+$pennBin/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf \
+ > /dev/null
+popd > /dev/null
+/bin/rm -f $result
+/bin/cp -p $tmp/$c.maf $result
+/bin/rm -fr $tmp
+/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
+'_EOF_'
+# << happy emacs
+ chmod +x autoMultiz.csh
+
+ cat << '_EOF_' > template
+#LOOP
+./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/calJac3/bed/multiz13way/run/maf/$(root1).maf}
+#ENDLOOP
+'_EOF_'
+# << happy emacs
+
+ find ../splitMaf -type f | grep "/[0-9][0-9][0-9].maf" \
+ | xargs -L 1 basename | sort -u > chr.part.list
+ gensub2 chr.part.list single template jobList
+ para -ram=8g create jobList
+
+ # put the split mafs back together into a single result
+ head -q -n 1 maf/000.maf > calJac3.13way.maf
+ for F in maf/*.maf
+do
+ grep -h -v "^#" ${F} >> calJac3.13way.maf
+done
+ tail -q -n 1 maf/000.maf >> calJac3.13way.maf
+ tail -q -n 1 maf/hg19_${C}.*.maf | sort -u >> ../maf/${C}.maf
+
+ # real 13m32.340s
+
+ # load tables for a look
+ mkdir -p /gbdb/calJac3/multiz13way/maf
+ cd /hive/data/genomes/calJac3/bed/multiz13way/maf
+ ln -s `pwd`/calJac3.13way.maf \
+ /gbdb/calJac3/multiz13way/maf/multiz13way.maf
+
+ # this generates an immense multiz13way.tab file in the directory
+ # where it is running. Best to run this over in scratch.
+ cd /data/tmp
+ time nice -n +19 hgLoadMaf \
+ -pathPrefix=/gbdb/calJac3/multiz13way/maf calJac3 multiz13way
+ # Loaded 13316945 mafs in 1 files from /gbdb/calJac3/multiz13way/maf
+ # real 9m9.365s
+
+ # load summary table
+ time nice -n +19 cat /gbdb/calJac3/multiz13way/maf/*.maf \
+ | hgLoadMafSummary calJac3 -minSize=30000 -verbose=2 \
+ -mergeGap=1500 -maxSize=200000 multiz13waySummary stdin
+# Created 2330531 summary blocks from 99659162 components and
+# 13316945 mafs from stdin
+ # real 17m54.685s
#####################################################################
+# all.joiner update, downloads and in pushQ - (DONE - 2010-04-01 - Hiram)
+ cd $HOME/kent/src/hg/makeDb/schema
+ # fixup all.joiner until this is a clean output
+ joinerCheck -database=calJac3 -all all.joiner
+
+ mkdir /hive/data/genomes/calJac3/goldenPath
+ cd /hive/data/genomes/calJac3/goldenPath
+ time nice -n +19 makeDownloads.pl calJac3 > do.log 2>&1
+ # real 22m30.329s
+
+ # now ready for pushQ entry
+ mkdir /hive/data/genomes/calJac3/pushQ
+ cd /hive/data/genomes/calJac3/pushQ
+ time nice -n +19 makePushQSql.pl calJac3 > calJac3.pushQ.sql 2> stderr.out
+ # real 2m52.193s
+
+ # check for errors in stderr.out, some are OK, e.g.:
+# WARNING: calJac3 does not have seq
+# WARNING: calJac3 does not have extFile
+
+# WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of
+# supporting and genbank tables) which tracks to assign these tables to:
+# bosTau4ChainPileUp
+
+ # copy it to hgwbeta
+ scp -p calJac3.pushQ.sql hgwbeta:/tmp
+ ssh hgwbeta
+ cd /tmp
+ hgsql qapushq < calJac3.pushQ.sql
+ # in that pushQ entry walk through each entry and see if the
+ # sizes will set properly
+
+############################################################################