src/hg/makeDb/doc/hg19.txt 1.17
1.17 2009/05/22 21:31:45 hiram
Fugu Fr2, Bushbaby OtoGar1 lastz done, Baboon PapHam1 running, SGP and GeneID Genes loaded. 4-way multiz done for UCSC gene build
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.16
retrieving revision 1.17
diff -b -B -U 4 -r1.16 -r1.17
--- src/hg/makeDb/doc/hg19.txt 14 May 2009 23:44:55 -0000 1.16
+++ src/hg/makeDb/doc/hg19.txt 22 May 2009 21:31:45 -0000 1.17
@@ -2415,9 +2415,9 @@
cat fb.hg19.chainPetMar1Link.txt
# 31347143 bases of 2897316137 (1.082%) in intersection
##############################################################################
-# LASTZ Fugu Fr2 (WORKING - 2009-05-14 - Hiram)
+# LASTZ Fugu Fr2 (DONE - 2009-05-14 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzFr2.2009-05-14
cat << '_EOF_' > DEF
@@ -2460,9 +2460,18 @@
-qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
> do.log 2>&1 &
-XXX - running Thu May 14 10:18:26 PDT 2009
+ # real 5797m9.288s
+ # had a small problem finishing the fundamental batch run, continuing:
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -continue=cat -qRepeats=windowmaskerSdust \
+ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=encodek \
+ > cat.log 2>&1 &
+ cat fb.hg19.chainFr2Link.txt
+ # 49309456 bases of 2897316137 (1.702%) in intersection
##############################################################################
# LASTZ Tetraodon TetNig1 (DONE - 2009-05-14 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzTetNig1.2009-05-14
@@ -2629,13 +2638,74 @@
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
-XXX - running
-Thu May 14 16:30:50 PDT 2009
+ # real 1724m48.032s
+XXX # need to load the chain table manually:
+ # mySQL error 1114: The table 'chainTarSyr1Link' is full
+ cd /hive/data/genomes/hg19/bed/lastzTarSyr1.2009-05-14/axtChain
+ wc -l *.tab
+ # 21882142 chain.tab
+ # 165017606 link.tab
+ # 186899748 total
+ awk '{print length($0)}' link.tab | sort | uniq -c | less
+ 4 23
+ 9 24
+ 27 25
+ 105 26
+ 767 27
+ 1401 28
+ 5020 29
+ 8472 30
+ 24390 31
+ 117666 32
+ 264774 33
+ 776095 34
+1632393 35
+2672187 36
+7125988 37
+16831901 38
+34905113 39
+45218159 40
+31570706 41
+13746548 42
+5868689 43
+2460114 44
+1118556 45
+ 420826 46
+ 106674 47
+ 36770 48
+ 40719 49
+ 36955 50
+ 19389 51
+ 5571 52
+ 1557 53
+ 61 54
+
+ time nice -n +19 hgsql -e "DROP TABLE chainTarSyr1Link;" hg19
+
+ cat << '_EOF_' | hgsql hg19
+ CREATE TABLE chainTarSyr1Link (
+ bin smallint(5) unsigned NOT NULL default 0,
+ tName varchar(255) NOT NULL default '',
+ tStart int(10) unsigned NOT NULL default 0,
+ tEnd int(10) unsigned NOT NULL default 0,
+ qStart int(10) unsigned NOT NULL default 0,
+ chainId int(10) unsigned NOT NULL default 0,
+ KEY tName (tName(16),bin),
+ KEY chainId (chainId)
+ ) ENGINE=MyISAM max_rows=166000000 avg_row_length=42 pack_keys=1 CHARSET=latin1;
+'_EOF_'
+ # << happy emacs
+
+ time nice -n +19 hgsql -e \
+ "load data local infile \"link.tab\" into table chainTarSyr1Link;" hg19
+ # this one took a number of hours
+ # real 272m44.943s
+
#########################################################################
-# LASTZ Bushbaby OtoGar1 (WORKING - 2009-05-14 - Hiram)
+# LASTZ Bushbaby OtoGar1 (DONE - 2009-05-14,15 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
cd /hive/data/genomes/hg19/bed/lastzOtoGar1.2009-05-14
cat << '_EOF_' > DEF
@@ -2665,9 +2735,11 @@
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
-XXX - running Thu May 14 16:36:50 PDT 2009
+ # real 762m56.055s
+ cat fb.hg19.chainOtoGar1Link.txt
+ # 1264492372 bases of 2897316137 (43.644%) in intersection
#########################################################################
# LASTZ Mouse lemur MicMur1 (WORKING - 2009-05-14 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzMicMur1.2009-05-14
@@ -2700,7 +2772,258 @@
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
> do.log 2>&1 &
-XXX - running Thu May 14 16:36:50 PDT 2009
+ # real 5429m52.082s
+ # there is one unusual long running job having trouble
+XXX - running Wed May 20 16:13:58 PDT 2009
+
+#########################################################################
+# LASTZ Baboon PapHam1 (DONE - 2009-05-13 - Hiram)
+ mkdir /hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
+ cd /hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
+
+ cat << '_EOF_' > DEF
+# human vs baboon
+BLASTZ=lastz
+# maximum M allowed with lastz is only 254
+BLASTZ_M=254
+BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
+# and place those items here
+BLASTZ_O=600
+BLASTZ_E=150
+# other parameters from panTro2 vs hg18 lastz on advice from Webb
+BLASTZ_K=4500
+BLASTZ_Y=15000
+BLASTZ_T=2
+
+# TARGET: Human Hg19
+SEQ1_DIR=/scratch/data/hg19/hg19.2bit
+SEQ1_LEN=/scratch/data/hg19/chrom.sizes
+SEQ1_CHUNK=100000000
+SEQ1_LAP=10000
+SEQ1_IN_CONTIGS=0
+
+# QUERY: Baboon papHam1
+SEQ2_DIR=/scratch/data/papHam1/papHam1.2bit
+SEQ2_LEN=/scratch/data/papHam1/chrom.sizes
+SEQ2_CHUNK=20000000
+SEQ2_LIMIT=300
+SEQ2_LAP=0
+SEQ2_IN_CONTIGS=0
+
+BASE=/hive/data/genomes/hg19/bed/lastzPapHam1.2009-05-20
+TMPDIR=/scratch/tmp
+'_EOF_'
+ # << happy emacs
+
+ # forgot that the synNet was not needed here, use recip best as below
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -syntenicNet \
+ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
+ > do.log 2>&1 &
+ cat fb.hg19.chainPapHam1Link.txt
+ # 2399269031 bases of 2897316137 (82.810%) in intersection
+
+ time doRecipBest.pl -buildDir=`pwd` hg19 papHam1 > rbest.log 2>&1
+XXX - running Fri May 22 11:25:41 PDT 2009
+
+#########################################################################
+# SGP GENES (DONE - 2009-05-22 - Hiram)
+ mkdir /hive/data/genomes/hg19/bed/sgpGene
+ cd /hive/data/genomes/hg19/bed/sgpGene
+ mkdir download
+ cd download
+for C in `cut -f1 ../../../chrom.sizes`
+do
+ echo $C
+ wget --timestamping \
+http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902_x_mm9/SGP/${C}.gtf
+ wget --timestamping \
+http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902_x_mm9/SGP/${C}.prot
+done
+
+ cd ..
+ cat download/*.gtf | ldHgGene -gtf -genePredExt hg19 sgpGene stdin
+
+ # Read 33994 transcripts in 291782 lines in 1 files
+ # 33994 groups 85 seqs 1 sources 3 feature types
+ # 33994 gene predictions
+ nice -n +19 featureBits -enrichment hg19 refGene:CDS sgpGene
+# refGene:CDS 1.181%, sgpGene 1.295%, both 1.011%, cover 85.59%, enrich 66.08x
+
+###########################################################################
+# GENEID GENE PREDICTIONS (DONE - 2009-05-22 - Hiram)
+ ssh hgwdev
+ mkdir /hive/data/genomes/hg19/bed/geneid
+ cd /hive/data/genomes/hg19/bed/geneid
+ mkdir download
+ cd download
+ for C in `cut -f1 ../../../chrom.sizes`
+ do
+ echo $C
+ wget --timestamping \
+http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902/geneid_v1.3/${C}.gtf
+ wget --timestamping \
+http://genome.crg.es/genepredictions/H.sapiens/golden_path_200902/geneid_v1.3/${C}.prot
+ done
+
+ cd ..
+ cat download/*.gtf | ldHgGene -gtf -genePredExt hg19 geneid stdin
+ # Read 33428 transcripts in 277332 lines in 1 files
+ # 33428 groups 92 seqs 1 sources 3 feature types
+ # 33428 gene predictions
+
+##########################################################################
+## 4-Way Multiz for UCSC Genes construction (DONE - 2009-05-22 - Hiram)
+ ssh hgwdev
+ mkdir /hive/data/genomes/hg19/bed/multiz4way
+ cd /hive/data/genomes/hg19/bed/multiz4way
+
+ # extract our 4 organisms from the 44-way on hg18:
+ ln -s /hive/data/genomes/hg18/bed/multiz44way/44way.4d.nh ./44way.nh
+
+ /cluster/bin/phast/tree_doctor \
+ --prune-all-but hg18,mm9,canFam2,rheMac2 44way.nh \
+ | sed -e "s/hg18/hg19/" > 4way.nh
+
+ # this looks like:
+ cat 4way.nh
+(((hg19:0.032973,rheMac2:0.036199):0.109706,mm9:0.352605):0.020666,canFam2:0.193569);
+
+
+ # Use this specification in the phyloGif tool:
+ # http://genome.ucsc.edu/cgi-bin/phyloGif
+ # to obtain a gif image for htdocs/images/phylo/hg19_4way.gif
+
+ /cluster/bin/phast/all_dists 4way.nh > 4way.distances.txt
+ # Use this output to create the table below
+ grep -y hg19 4way.distances.txt | sort -k3,3n
+#
+# If you can fill in all the numbers in this table, you are ready for
+# the multiple alignment procedure
+#
+# featureBits chainLink measures
+# chainOryLat1Link chain linearGap
+# distance on hg19 on other minScore
+# 1 0.069172 - rhesus rheMac2 (% 82.744) (% xx.xxx) 5000 medium
+# 2 0.356914 - dog canFam2 (% 52.879) (% xx.xxx) 3000 medium
+# 3 0.495284 - mouse mm9 (% 35.299) (% 38.693) 3000 medium
+
+ # using the syntenic nets
+ cd /cluster/data/hg19/bed/multiz4way
+ mkdir mafLinks
+ cd mafLinks
+ mkdir rheMac2 canFam2 mm9
+
+ cd mm9
+ ln -s ../../../lastz.mm9/mafSynNet/*.maf.gz .
+ cd ../canFam2
+ ln -s ../../../lastz.canFam2/mafSynNet/*.maf.gz .
+ cd ../rheMac2
+ ln -s ../../../lastz.rheMac2/mafSynNet/*.maf.gz .
+
+ # determine what is the newest version of multiz and use that
+ cd /hive/data/genomes/hg19/bed/multiz4way
+ mkdir penn
+ cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn
+ cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn
+ cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn
+
+ # the autoMultiz cluster run
+ ssh swarm
+ cd /hive/data/genomes/hg19/bed/multiz4way
+
+ # create species list and stripped down tree for autoMZ
+ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
+ 4way.nh > tmp.nh
+ echo `cat tmp.nh` | sed 's/ //g; s/,/ /g' > tree.nh
+ sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
+
+ mkdir run maf
+ cd run
+
+ # NOTE: you need to set the db and multiz dirname properly in this script
+ cat > autoMultiz << '_EOF_'
+#!/bin/csh -ef
+set db = hg19
+set c = $1
+set maf = $2
+set binDir = /hive/data/genomes/hg19/bed/multiz4way/penn
+set tmp = /scratch/tmp/$db/multiz.$c
+set pairs = /hive/data/genomes/hg19/bed/multiz4way/mafLinks
+rm -fr $tmp
+mkdir -p $tmp
+cp ../{tree.nh,species.lst} $tmp
+pushd $tmp
+foreach s (`cat species.lst`)
+ set in = $pairs/$s/$c.maf
+ set out = $db.$s.sing.maf
+ if ($s == $db) then
+ continue
+ endif
+ if (-e $in.gz) then
+ zcat $in.gz > $out
+ else if (-e $in) then
+ cp $in $out
+ else
+ echo "##maf version=1 scoring=autoMZ" > $out
+ endif
+end
+set path = ($binDir $path); rehash
+$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
+popd
+cp $tmp/$c.maf $maf
+rm -fr $tmp
+'_EOF_'
+ # << happy emacs
+ chmod +x autoMultiz
+
+cat << '_EOF_' > template
+#LOOP
+./autoMultiz $(root1) {check out line+ /hive/data/genomes/hg19/bed/multiz4way/maf/$(root1).maf}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ cut -f1 /cluster/data/hg19/chrom.sizes > chrom.lst
+ gensub2 chrom.lst single template jobList
+ para create jobList
+ # 93 jobs
+ para try ... check ... push ... etc ...
+# Completed: 93 of 93 jobs
+# CPU time in finished jobs: 24282s 404.70m 6.75h 0.28d 0.001 y
+# IO & Wait Time: 2362s 39.36m 0.66h 0.03d 0.000 y
+# Average job time: 286s 4.77m 0.08h 0.00d
+# Longest finished job: 2235s 37.25m 0.62h 0.03d
+# Submission to last job: 2241s 37.35m 0.62h 0.03d
+
+ # combine results into a single file for loading and gbdb reference
+ cd /hive/data/genomes/hg19/bed/multiz4way
+ time nice -n +19 catDir maf > multiz4way.maf
+ # real 3m27.561s
+
+ # makes a 8.5 Gb file:
+ # -rw-rw-r-- 1 9026080732 May 22 11:11 multiz4way.maf
+
+ # Load into database
+ ssh hgwdev
+ cd /hive/data/genomes/hg19/bed/multiz4way
+ mkdir /gbdb/hg19/multiz4way
+ ln -s /hive/data/genomes/hg19/bed/multiz4way/multiz4way.maf \
+ /gbdb/hg19/multiz4way
+ # the hgLoadMaf generates huge tmp files, locate them in /scratch/tmp/
+ cd /scratch/tmp
+ time nice -n +19 hgLoadMaf hg19 multiz4way
+ # real 5m31.883s
+ # Loaded 5788627 mafs in 1 files from /gbdb/hg19/multiz4way
+
+ cd /hive/data/genomes/hg19/bed/multiz4way
+ time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
+ -maxSize=50000 hg19 multiz4waySummary multiz4way.maf
+ # Created 1238721 summary blocks from 11959676 components
+ # and 5788627 mafs from multiz4way.maf
+ # real 6m33.936s
#########################################################################