src/hg/makeDb/doc/hg19.txt 1.26
1.26 2009/06/19 21:37:32 hiram
Cytobands done, Venter Poodle chain/net done, starting 46-way conservation, exoniphy lifted from hg18, liftOver to hg18 done.
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.25
retrieving revision 1.26
diff -b -B -U 4 -r1.25 -r1.26
--- src/hg/makeDb/doc/hg19.txt 11 Jun 2009 18:17:32 -0000 1.25
+++ src/hg/makeDb/doc/hg19.txt 19 Jun 2009 21:37:32 -0000 1.26
@@ -1942,12 +1942,12 @@
# database. Had to work on this program to get it past what is
# evidently a bad entry in hbrc.fixed where columns of information
# are missing for one clone in particular
time fishClones -verbose=2 -fhcrc=fhcrc.sts -noBin hg19 \
- /hive/data/outside/ncbi/fishClones/fishClones.2009-04/fixed.hbrc.txt \
+ /hive/data/genomes/hg19/bed/ncbiCytoBand/contig/fixed.hbrc.txt \
/hive/data/outside/ncbi/fishClones/fishClones.2009-04/clac.out \
./cl_acc_gi_len \
- /hive/data/genomes/hg19/bed/bacends/bacEnds.lifted.psl \
+ /hive/data/genomes/hg19/bed/bacends/bacEnds.load.psl \
fishClones
# real 2m4.708s
# Reading Fish Clones file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/hbrc.fixed
# reading fishInfo file /hive/data/genomes/ncbi/fishClones/fishClones.2006-01/fixed.hbrc.txt
@@ -1971,8 +1971,57 @@
hg19 fishClones fishClones.bed
# Loaded 9461 elements of size 16
##############################################################################
+# CytoBands from Wonhee Jang at NCBI (DONE - 2009-06-10 - Hiram)
+
+ mkdir /hive/data/genomes/hg19/bed/ncbiCytoBand
+ cd /hive/data/genomes/hg19/bed/ncbiCytoBand
+ # received the following files via email:
+ ls -ogrt
+# -rw-rw-r-- 1 187930 Jun 10 13:53 ideogram
+# -rw-rw-r-- 1 672327 Jun 8 09:55 fish.markers.bed
+
+ # created cytobands.bed from the ideogram file with:
+ cat << '_EOF_' > ideoToCytoBand.pl
+#!/usr/bin/env perl
+# Convert the NCBI "ideogram" file in the current directory into five column
+# bed lines on stdout: chrom, start, end, band name (arm followed by
+# location), stain.
+
+use strict;
+use warnings;
+
+open (FH,"<ideogram") or die "can not read ideogram";
+
+while (my $line = <FH>) {
+    # lines starting with '#' carry no band data
+    next if $line =~ m/^#/;
+    chomp $line;
+    # columns four and five of the ideogram file are not used here
+    my ($chr, $arm, $location, undef, undef, $start, $end, $stain) =
+        split('\s+',$line);
+    # rows whose location ends in a lower case letter are left out
+    next if ($location =~ m/[a-z]$/);
+    # remove any carriage return remaining in the stain field
+    $stain =~ s/\r//g;
+    # a start coordinate of 1 is written as 0, all others are unchanged
+    $start -= 1 if ($start == 1);
+    printf "chr%s\t%d\t%d\t%s%s\t%s\n", $chr, $start, $end, $arm, $location,
+        $stain;
+}
+
+close (FH);
+'_EOF_'
+ # << happy emacs
+ chmod +x ideoToCytoBand.pl
+ ./ideoToCytoBand.pl > cytobands.bed
+
+ hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBand.sql \
+ hg19 cytoBand cytobands.bed
+
+ hgLoadBed -noBin -tab -sqlTable=${HOME}/src/hg/lib/cytoBandIdeo.sql \
+ hg19 cytoBandIdeo cytobands.bed
+ # checking coverage:
+ featureBits -noRandom -noHap -countGaps hg19 cytoBand
+ # 3095677412 bases of 3095693983 (99.999%) in intersection
+ # that is everything except chrM:
+ echo 3095693983-3095677412 | bc -q
+ # 16571
+
+##############################################################################
# UCSC to Ensembl chr name mapping (DONE - 2009-05-08 - Hiram)
mkdir /hive/data/genomes/hg19/ensembl
cd /hive/data/genomes/hg19/ensembl
wget --timestamping \
@@ -3294,8 +3343,19 @@
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
# real 20m29.049s
+ mkdir /hive/data/genomes/monDom5/bed/blastz.hg19.swap
+ cd /hive/data/genomes/monDom5/bed/blastz.hg19.swap
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ /hive/data/genomes/hg19/bed/lastzMonDom5.2009-05-23/DEF \
+ -noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ -swap -syntenicNet > swap.log 2>&1 &
+ # real 297m13.041s
+ cat fb.monDom5.chainHg19Link.txt
+ # 406727849 bases of 3501660299 (11.615%) in intersection
+
##############################################################################
# LASTZ Armadillo DasNov2 (DONE - 2009-05-23,28 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
cd /hive/data/genomes/hg19/bed/lastzDasNov2.2009-05-23
@@ -4041,8 +4101,21 @@
time doRecipBest.pl -buildDir=`pwd` hg19 dipOrd1 > rbest.log 2>&1
# real 140m42.014s
##############################################################################
+# LIFTOVER TO Hg18 (DONE - 2009-06-04 - Hiram )
+ mkdir /hive/data/genomes/hg19/bed/blat.hg18.2009-06-04
+ cd /hive/data/genomes/hg19/bed/blat.hg18.2009-06-04
+ # -debug run to create run dir, preview scripts...
+ # verifies files can be found
+ doSameSpeciesLiftOver.pl -debug hg19 hg18
+ # Real run:
+ time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
+ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
+ hg19 hg18 > do.log 2>&1
+ # real 115m26.071s
+
+#############################################################################
# BLASTZ/CHAIN/NET/ETC 11 GENOMES TO HG19 (DONE, Andy 2009-06-06)
ssh hgwdev
cd /hive/data/genomes/hg19/bed
mkdir lastz{SpeTri1,FelCat3,CavPor3,BosTau4,PteVam1,EquCab2,VicPac1,MyoLuc1,TurTru1,ChoHof1}.2009-06-04
@@ -4284,9 +4357,594 @@
-chainMinScore=3000 -chainLinearGap=medium -swap cavPor3.DEF >& cavPor3.doSwap.log
# [detach screen]
real 192m39.792s
+##########################################################################
+# LASTZ Venter's Poodle canFamPoodle1 (DONE - 2009-06-05,10 - Hiram)
+ mkdir /hive/data/genomes/hg19/bed/lastzCanFamPoodle1.2009-06-05
+ cd /hive/data/genomes/hg19/bed/lastzCanFamPoodle1.2009-06-05
+
+ cat << '_EOF_' > DEF
+# human vs Venter's poodle
+# TARGET: Human Hg19
+SEQ1_DIR=/scratch/data/hg19/nib
+SEQ1_LEN=/scratch/data/hg19/chrom.sizes
+SEQ1_CHUNK=10000000
+SEQ1_LAP=10000
+
+# QUERY: Dog canFamPoodle1 (Venter's poodle)
+SEQ2_DIR=/scratch/data/canFamPoodle1/canFamPoodle1.2bit
+SEQ2_LEN=/scratch/data/canFamPoodle1/chrom.sizes
+SEQ2_CHUNK=40000000
+SEQ2_LAP=0
+SEQ2_LIMIT=600
+
+BASE=/hive/data/genomes/hg19/bed/lastzCanFamPoodle1.2009-06-05
+TMPDIR=/scratch/tmp
+'_EOF_'
+ # << happy emacs
+
+ # establish a screen to control this job
+ screen
+ time nice -n +19 doBlastzChainNet.pl \
+ -verbose=2 \
+ `pwd`/DEF \
+ -noDbNameCheck -noLoadChainSplit \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ -chainMinScore=3000 -chainLinearGap=medium
+ # real 5162m58.743s
+ cat fb.hg19.chainCanFamPoodle1Link.txt
+ # 898034247 bases of 2897316137 (30.995%) in intersection
+ # the original canFam2 measured:
+ # 1532073507 bases of 2897316137 (52.879%) in intersection
+
+ time nice -n +19 doRecipBest.pl -buildDir=`pwd` \
+ hg19 canFamPoodle1 > rbest.log 2>&1 &
+ # real 811m27.965s
+
+##############################################################################
+## 46-Way Multiz (WORKING - 2009-06-09 - Hiram)
+ mkdir /hive/data/genomes/hg19/bed/multiz46way
+ cd /hive/data/genomes/hg19/bed/multiz46way
+
+ # starting with the 46way tree created from 44 way tree
+ cat << '_EOF_' > 46way.nh
+(((((((((((((((((
+((hg19:0.006591,panTro2:0.006639):0.002184,gorGor1:0.009411):0.009942,
+ponAbe2:0.018342):0.014256,rheMac2:0.036199):0.021496,papHam1:0.04):0.02,
+calJac1:0.066389):0.056911,tarSyr1:0.135169):0.011307,
+(micMur1:0.091452,otoGar1:0.128984):0.035463):0.015304,
+tupBel1:0.183583):0.004688,(((((mm9:0.083220,rn4:0.090564):0.196605,
+dipOrd1:0.209532):0.022555,cavPor3:0.223415):0.009828,
+speTri1:0.146894):0.025042,
+(oryCun1:0.116009,ochPri2:0.198295):0.100037):0.015355):0.020666,
+(((vicPac1:0.105252,(turTru1:0.064182,bosTau4:0.121911):0.025111):0.039691,
+((equCab2:0.107726,(felCat3:0.097971,canFam2:0.100888):0.049486):0.006252,
+(myoLuc1:0.141155,pteVam1:0.111787):0.033187):0.004179):0.011699,
+(eriEur1:0.220580,sorAra1:0.266859):0.056117):0.021065):0.023276,
+(((loxAfr2:0.083775,proCap1:0.152633):0.026190,echTel1:0.240221):0.049905,
+(dasNov2:0.115179,choHof1:0.096272):0.052373):0.006713):0.132748,
+macEug1:0.3):0.1,
+monDom5:0.325899):0.072430,ornAna1:0.453916):0.109903,
+((galGal3:0.166386,taeGut1:0.170717):0.199763,
+anoCar1:0.509545):0.108130):0.166150,xenTro2:0.852482):0.300396,
+(((tetNig1:0.224774,fr2:0.205294):0.191836,
+(gasAcu1:0.313967,oryLat2:0.478451):0.058404):0.322824,
+danRer5:0.731166):0.155214):0.511293,petMar1:0.511293);
+'_EOF_'
+ # << happy emacs
+
+ # Use this specification in the phyloGif tool:
+ # http://genome.ucsc.edu/cgi-bin/phyloGif
+ # to obtain a gif image for htdocs/images/phylo/hg19_46way.gif
+
+ /cluster/bin/phast/all_dists 46way.nh > 46way.distances.txt
+ # Use this output to create the table below, with this perl script:
+ cat << '_EOF_' > sizeStats.pl
+#!/usr/bin/env perl
+# Print one table row for each line of 46way.distances.txt that mentions
+# hg19, ordered by increasing distance (third column): rank, distance,
+# organism name from hgcentraltest.dbDb, database name, the chainLink
+# featureBits percentage on hg19, and the same measure from the swap run
+# on the other assembly, or N/A when no swap measurement file is found.
+
+use strict;
+use warnings;
+
+# -i is the current spelling of the obsolete grep option -y
+open (FH, "grep -i hg19 46way.distances.txt | sort -k3,3n|") or
+    die "can not read 46way.distances.txt";
+
+my $count = 0;
+while (my $line = <FH>) {
+    chomp $line;
+    # the first column is not used, $D is the other database name
+    my (undef, $D, $dist) = split('\s+', $line);
+    my $chain = "chain" . ucfirst($D);
+    my $B="/hive/data/genomes/hg19/bed/lastz.$D/fb.hg19." .
+        $chain . "Link.txt";
+    # fifth word of the featureBits line is "(NN.NNN%)", strip the parens
+    my $chainLinkMeasure =
+        `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
+    chomp $chainLinkMeasure;
+    # a missing or empty measurement file is reported as zero
+    $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
+    $chainLinkMeasure =~ s/\%//;
+    my $swapFile="/hive/data/genomes/${D}/bed/blastz.hg19.swap/fb.${D}.chainHg19Link.txt";
+    my $swapMeasure = "N/A";
+    if ( -s $swapFile ) {
+        $swapMeasure =
+            `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`;
+        chomp $swapMeasure;
+        $swapMeasure = 0.0 if (length($swapMeasure) < 1);
+        $swapMeasure =~ s/\%//;
+    }
+    my $orgName=
+        `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`;
+    chomp $orgName;
+    if (length($orgName) < 1) {
+        $orgName="N/A";
+    }
+    ++$count;
+    # N/A is text, so that row needs %s where the other row uses %.3f
+    if ($swapMeasure eq "N/A") {
+        printf "# %02d %.4f - %s %s\t(%% %.3f) (%s)\n", $count, $dist,
+            $orgName, $D, $chainLinkMeasure, $swapMeasure;
+    } else {
+        printf "# %02d %.4f - %s %s\t(%% %.3f) (%% %.3f)\n", $count, $dist,
+            $orgName, $D, $chainLinkMeasure, $swapMeasure;
+    }
+}
+close (FH);
+'_EOF_'
+ # << happy emacs
+ chmod +x ./sizeStats.pl
+ ./sizeStats.pl
+#
+# If you can fill in all the numbers in this table, you are ready for
+# the multiple alignment procedure
+#
+# featureBits chainLink measures
+# chainOryLat1Link chain linearGap
+# distance on hg19 on other minScore
+# 01 0.0132 - Chimp panTro2 (% 94.846) (% 94.908)
+# 02 0.0182 - Gorilla gorGor1 (% 59.484) (N/A)
+# 03 0.0371 - Orangutan ponAbe2 (% 91.350) (% 89.617)
+# 04 0.0692 - Rhesus rheMac2 (% 82.744) (% 87.422)
+# 05 0.0945 - Baboon papHam1 (% 82.810) (N/A)
+# 06 0.1409 - Marmoset calJac1 (% 70.860) (% 71.897)
+# 07 0.2665 - Tarsier tarSyr1 (% 47.830) (N/A)
+# 08 0.2696 - Mouse lemur micMur1 (% 46.519) (N/A)
+# 09 0.3071 - Bushbaby otoGar1 (% 43.644) (N/A)
+# 10 0.3343 - Horse equCab2 (% 57.050) (% 66.774)
+# 11 0.3416 - TreeShrew tupBel1 (% 36.156) (N/A)
+# 12 0.3451 - Dolphin turTru1 (% 48.398) (N/A)
+# 13 0.3500 - Squirrel speTri1 (% 35.713) (N/A)
+# 14 0.3611 - Alpaca vicPac1 (% 39.399) (N/A)
+# 15 0.3620 - Sloth choHof1 (% 34.377) (N/A)
+# 16 0.3653 - Megabat pteVam1 (% 45.414) (N/A)
+# 17 0.3732 - Elephant loxAfr2 (% 35.153) (N/A)
+# 18 0.3740 - Cat felCat3 (% 35.713) (% 61.104)
+# 19 0.3769 - Dog canFam2 (% 52.879) (% 62.055)
+# 20 0.3809 - Armadillo dasNov2 (% 33.543) (N/A)
+# 21 0.3941 - Rabbit oryCun1 (% 33.676) (N/A)
+# 22 0.3946 - Microbat myoLuc1 (% 33.174) (N/A)
+# 23 0.4028 - Cow bosTau4 (% 46.506) (% 50.297)
+# 24 0.4363 - Guinea Pig cavPor3 (% 43.680) (N/A)
+# 25 0.4421 - Rock hyrax proCap1 (% 30.864) (N/A)
+# 26 0.4450 - Kangaroo rat dipOrd1 (% 27.161) (N/A)
+# 27 0.4764 - Pika ochPri2 (% 27.768) (N/A)
+# 28 0.4811 - Hedgehog eriEur1 (% 19.362) (N/A)
+# 29 0.5035 - Tenrec echTel1 (% 23.120) (N/A)
+# 30 0.5153 - Mouse mm9 (% 35.299) (% 38.693)
+# 31 0.5226 - Rat rn4 (% 32.879) (% 36.860)
+# 32 0.5274 - Shrew sorAra1 (% 19.760) (N/A)
+# 33 0.6394 - Wallaby macEug1 (% 6.011) (N/A)
+# 34 0.7653 - Opossum monDom5 (% 14.358) (N/A)
+# 35 0.9657 - Platypus ornAna1 (% 7.627) (% 11.259)
+# 36 1.0960 - Chicken galGal3 (% 3.591) (% 8.786)
+# 37 1.1003 - Zebra finch taeGut1 (% 3.496) (% 7.795)
+# 38 1.2394 - Lizard anoCar1 (% 3.591) (% 5.146)
+# 39 1.6403 - X. tropicalis xenTro2 (% 3.176) (% 6.773)
+# 40 1.9387 - Stickleback gasAcu1 (% 1.916) (% 11.175)
+# 41 1.9634 - Fugu fr2 (% 1.702) (% 10.929)
+# 42 1.9746 - Zebrafish danRer5 (% 2.562) (% 5.144)
+# 43 1.9829 - Tetraodon tetNig1 (% 2.003) (% 14.443)
+# 44 2.1031 - Medaka oryLat2 (% 1.849) (% 6.705)
+# 45 2.1108 - Lamprey petMar1 (% 1.082) (% 3.200)
+
+ # create species list and stripped down tree for autoMZ
+ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
+ 46way.nh > tmp.nh
+ echo `cat tmp.nh` > tree-commas.nh
+ echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
+ sed 's/[()]//g; s/,/ /g' tree.nh > species.list
+
+ cd /hive/data/genomes/hg19/bed/multiz46way
+ # bash shell syntax here ...
+ export H=/hive/data/genomes/hg19/bed
+ mkdir mafLinks
+ for G in `sed -e "s/hg19 //" species.list`
+ do
+ mkdir mafLinks/$G
+ if [ -s ${H}/lastz.${G}/mafRBestNet/chr1.maf.gz ]; then
+ echo "$G - recipBest"
+ ln -s ${H}/lastz.$G/mafRBestNet/*.maf.gz ./mafLinks/$G
+ else
+ if [ -s ${H}/lastz.${G}/mafSynNet/chr1.maf.gz ]; then
+ echo "$G - synNet"
+ ln -s ${H}/lastz.$G/mafSynNet/*.maf.gz ./mafLinks/$G
+ else
+ if [ -s ${H}/lastz.${G}/mafNet/chr1.maf.gz ]; then
+ echo "$G - mafNet"
+ ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G
+ else
+ echo "missing directory lastz.${G}/*Net"
+ fi
+ fi
+ fi
+ done
+
+ # verify the alignment type is correct:
+ for D in `cat /hive/users/hiram/bigWayHg19/ordered.list`
+do
+ ls -l mafLinks/$D/chr1.maf.gz | awk '{print $NF}'
+done
+ # compare to the list at:
+ # http://genomewiki.ucsc.edu/index.php/Hg19_Genome_size_statistics
+
+ # need to split these things up into smaller pieces for
+ # efficient kluster run.
+ cd /hive/data/genomes/hg19/bed/multiz46way
+ mkdir mafSplit
+ cd mafSplit
+ # mafSplitPos splits on gaps or repeat areas that will not have
+ # any chains, approx 5 Mbp intervals, gaps at least 10,000
+ mafSplitPos -minGap=10000 hg19 5 stdout | sort -u \
+ | sort -k1,1 -k2,2n > mafSplit.bed
+ # There is a splitRegions.pl script here (copied from previous 44way)
+ # that can create a custom track from this mafSplit.bed file.
+ # Take a look at that in the browser and see if it looks OK,
+ # check the number of sections on each chrom to verify none are
+ # too large. Despite the claim above, it does appear that some
+ # areas are split where actual chains exist.
+
+ # run a small kluster job to split them all
+ ssh memk
+ cd /hive/data/genomes/hg19/bed/multiz46way/mafSplit
+ cat << '_EOF_' > runOne
+#!/bin/csh -ef
+# usage: runOne <species db> <chrom>
+# Works inside a directory named for the species.  When the linked pairwise
+# maf for the chrom is missing or empty, a single empty piece
+# hg19_<chrom>.00.maf is made; otherwise the maf is cut up with mafSplit
+# at the positions in ../mafSplit.bed.  The pieces are gzipped either way.
+set species = $1
+set chrom = $2
+set srcMaf = ../../mafLinks/${species}/${chrom}.maf.gz
+mkdir -p $species
+pushd $species > /dev/null
+if ( ! -s $srcMaf ) then
+    touch hg19_${chrom}.00.maf
+    gzip hg19_${chrom}.00.maf
+else
+    rm -f hg19_${chrom}.*.maf
+    mafSplit ../mafSplit.bed hg19_ $srcMaf
+    gzip hg19_${chrom}.*.maf
+endif
+popd > /dev/null
+'_EOF_'
+ # << happy emacs
+ chmod +x runOne
+
+ # runOne gzips every piece it makes, so the file to check is the .maf.gz
+ cat << '_EOF_' > template
+#LOOP
+runOne $(root1) $(root2) {check out exists+ $(root1)/hg19_$(root2).00.maf.gz}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ for G in `sed -e "s/hg19 //" ../species.list`
+do
+ echo $G
+done > species.list
+ cut -f 1 ../../../chrom.sizes > chr.list
+
+ gensub2 species.list chr.list template jobList
+ para create jobList
+ para try ... check ... push ... etc...
+# Completed: 4185 of 4185 jobs
+# CPU time in finished jobs: 27549s 459.15m 7.65h 0.32d 0.001 y
+# IO & Wait Time: 15763s 262.71m 4.38h 0.18d 0.000 y
+# Average job time: 10s 0.17m 0.00h 0.00d
+# Longest finished job: 158s 2.63m 0.04h 0.00d
+# Submission to last job: 1647s 27.45m 0.46h 0.02d
+
+ # the autoMultiz cluster run
+ ssh swarm
+ cd /hive/data/genomes/hg19/bed/multiz46way/
+
+ mkdir splitRun
+ cd splitRun
+ mkdir maf run
+ cd run
+ mkdir penn
+ cp -p /cluster/bin/penn/multiz.2009-01-21/multiz penn
+ cp -p /cluster/bin/penn/multiz.2009-01-21/maf_project penn
+ cp -p /cluster/bin/penn/multiz.2009-01-21/autoMZ penn
+
+ # set the db and pairs directories here
+ cat > autoMultiz.csh << '_EOF_'
+#!/bin/csh -ef
+# usage: autoMultiz.csh <maf piece name> <result maf>
+# Collects the pairwise maf of each species in species.list under a private
+# tmp directory, runs autoMZ with the tree from tree.nh, and copies the
+# resulting maf to the result path before removing the tmp directory.
+set db = hg19
+set c = $1
+set result = $2
+set run = `pwd`
+set tmp = $run/tmp/$db/multiz.$c
+set pairs = /hive/data/genomes/hg19/bed/multiz46way/mafSplit
+/bin/rm -fr $tmp
+/bin/mkdir -p $tmp
+/bin/cp -p ../../tree.nh ../../species.list $tmp
+pushd $tmp
+# $db is the first name in species.list, so the pattern to remove is
+# "$db " (name then space), the same as the other loops over this list
+foreach s (`sed -e "s/$db //" species.list`)
+    set in = $pairs/$s/$c.maf
+    set out = $db.$s.sing.maf
+    if (-e $in.gz) then
+        /bin/zcat $in.gz > $out
+        # an empty uncompressed result is replaced by a bare maf header
+        if (! -s $out) then
+            echo "##maf version=1 scoring=autoMZ" > $out
+        endif
+    else if (-e $in) then
+        ln -s $in $out
+    else
+        # no pairwise file at all for this piece: bare maf header
+        echo "##maf version=1 scoring=autoMZ" > $out
+    endif
+end
+# put the local copies of the penn binaries first in the path
+set path = ($run/penn $path); rehash
+$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
+popd
+/bin/rm -f $result
+/bin/cp -p $tmp/$c.maf $result
+/bin/rm -fr $tmp
+/bin/rmdir --ignore-fail-on-non-empty $run/tmp/$db
+/bin/rmdir --ignore-fail-on-non-empty $run/tmp
+'_EOF_'
+# << happy emacs
+ chmod +x autoMultiz.csh
+
+ cat << '_EOF_' > template
+#LOOP
+./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/hg19/bed/multiz46way/splitRun/maf/$(root1).maf}
+#ENDLOOP
+'_EOF_'
+# << happy emacs
+
+ gensub2 chr.part.list single template jobList
+ para -ram=8g create jobList
+ # initial run experience suggest some of the big jobs reach 8 Gb
+ # of memory usage, so, tell parasol to limit the number of jobs per
+ # node to avoid thrashing
+ para -ram=8g try
+ para -ram=8g push
+
+# Completed: 504 of 504 jobs
+# CPU time in finished jobs: 3013519s 50225.31m 837.09h 34.88d 0.096 y
+# IO & Wait Time: 1450670s 24177.84m 402.96h 16.79d 0.046 y
+# Average job time: 8858s 147.63m 2.46h 0.10d
+# Longest finished job: 30846s 514.10m 8.57h 0.36d
+# Submission to last job: 47247s 787.45m 13.12h 0.55d
+
+ # put the split maf results back together into a single maf file
+ # eliminate duplicate comments
+ ssh hgwdev
+ cd /hive/data/genomes/hg19/bed/multiz46way/splitRun
+ mkdir ../maf
+ # the sed edits take out partitioning name information from the comments
+ # so the multiple parts will condense to smaller number of lines
+ # this takes almost 2 hours of time, resulting in a bit over 150 Gb,
+ # almost all chrom files over 1 Gb, up to almost 10 Gb for chr2
+ # HOWEVER, this is actually not necessary to maintain these comments,
+ # they are lost during the mafAddIRows
+
+ # plus, these things shouldn't be gzipped, they need to be
+ # ordinary files for loading into database
+ cat << '_EOF_' > runOne
+#!/bin/csh -fe
+# usage: runOne <chrom>
+# Joins the split multiz results maf/hg19_<chrom>.*.maf into the single
+# file ../maf/<chrom>.maf and then gzips that file.
+set C = $1
+if ( -s ../maf/${C}.maf.gz ) then
+    rm -f ../maf/${C}.maf.gz
+endif
+# first line of each piece, duplicates removed
+head -q -n 1 maf/hg19_${C}.*.maf | sort -u > ../maf/${C}.maf
+# other comment lines, with the piece number and _MZ_ names edited out
+grep -h "^#" maf/hg19_${C}.*.maf | egrep -v "maf version=1|eof maf" | \
+    sed -e "s#${C}.[0-9][0-9]*#${C}#g; s#_MZ_[^ ]* # #g;" \
+    | sort -u >> ../maf/${C}.maf
+# the non-comment lines, pieces taken in numeric order of piece number
+grep -h -v "^#" `ls maf/hg19_${C}.*.maf | sort -t. -k2,2n` >> ../maf/${C}.maf
+# last line of each piece, duplicates removed
+tail -q -n 1 maf/hg19_${C}.*.maf | sort -u >> ../maf/${C}.maf
+gzip ../maf/${C}.maf
+'_EOF_'
+ # << happy emacs
+ chmod +x runOne
+
+ # write the template with > so a repeated run does not append a second loop
+ cat << '_EOF_' > template
+#LOOP
+runOne $(root1) {check out exists+ ../maf/$(root1).maf.gz}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ cut -f1 ../../../chrom.sizes > chr.list
+ ssh memk
+ cd /hive/data/genomes/hg19/bed/multiz46way/splitRun
+ gensub2 chr.list single template jobList
+ para create jobList
+ para try ... check ... push ... etc ...
+XXX - running Fri Jun 12 16:09:24 PDT 2009
+# Completed: 92 of 93 jobs
+# Crashed: 1 jobs
+# CPU time in finished jobs: 15960s 266.01m 4.43h 0.18d 0.001 y
+# IO & Wait Time: 61004s 1016.73m 16.95h 0.71d 0.002 y
+# Average job time: 837s 13.94m 0.23h 0.01d
+# Longest finished job: 5799s 96.65m 1.61h 0.07d
+# Submission to last job: 8608s 143.47m 2.39h 0.10d
+ # one of the results is completely empty and didn't make any final answer:
+ # ../maf/chrUn_gl000226.maf.gz does not exist
+ # that small contig is completely repeat masked out, it has no alignments
+ # it was there, but it wasn't gzipped, all it has is comments
+
+ # load tables for a look
+ ssh hgwdev
+ mkdir -p /gbdb/hg19/multiz46way/maf
+ cd /hive/data/genomes/hg19/bed/multiz46way/maf
+ ln -s `pwd`/*.maf /gbdb/hg19/multiz46way/maf
+
+ # this generates an immense multiz46way.tab file in the directory
+ # where it is running. Best to run this over in scratch.
+ cd /data/tmp
+ time nice -n +19 hgLoadMaf \
+ -pathPrefix=/gbdb/hg19/multiz46way/maf hg19 multiz46way
+ # real 81m26.382s
+ # Loaded 74158519 mafs in 93 files from /gbdb/hg19/multiz46way/maf
+ # load summary table
+ time nice -n +19 cat /gbdb/hg19/multiz46way/maf/*.maf \
+ | hgLoadMafSummary hg19 -minSize=30000 -mergeGap=1500 \
+ -maxSize=200000 multiz46waySummary stdin
+XXX - running with limits in place to avoid out of memory situation
+Got up to about 50 Gb in memory, I wonder if it has a memory leak
+XXX - Thu Jun 18 14:43:52 PDT 2009
+XXX - failed the first time:
+Created 9192683 summary blocks from 1296420257 components and 74158519 mafs from stdin
+Loading into hg19 table multiz46waySummary...
+Loading completeAdvisory lock has been released
+
+real 67m5.748s
+user 56m27.129s
+sys 4m30.870s
+
+# Indexing and tabulating stdin
+# needLargeMem: Out of memory - request size 4128 bytes, errno: 12
+# real 43m32.011s
+# user 39m33.083s
+# sys 3m32.477s
+
+ # real 2m39.822
+ # Created 353577 summary blocks from 2852890 components and 1197504 mafs
+ # from stdin
+
+ # Gap Annotation
+ # prepare bed files with gap info
+ mkdir /hive/data/genomes/hg19/bed/multiz46way/anno
+ cd /hive/data/genomes/hg19/bed/multiz46way/anno
+ mkdir maf run
+
+ # most of these will already exist from previous multiple alignments
+ # remove the echo from in front of the twoBitInfo command to get them
+ # to run if this loop appears to be correct
+ for DB in `cat ../species.list`
+do
+ CDIR="/hive/data/genomes/${DB}"
+ if [ ! -f ${CDIR}/${DB}.N.bed ]; then
+ echo "creating ${DB}.N.bed"
+ echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
+ else
+ ls -og ${CDIR}/${DB}.N.bed
+ fi
+done
+
+ cd run
+ rm -f nBeds sizes
+ for DB in `sed -e "s/hg19 //" ../../species.list`
+do
+ echo "${DB} "
+ ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
+ echo ${DB}.bed >> nBeds
+ ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
+ echo ${DB}.len >> sizes
+done
+
+ # the annotation step requires large memory, run on memk nodes
+ ssh memk
+ cd /hive/data/genomes/hg19/bed/multiz46way/anno/run
+ ls ../../maf | sed -e "s/.maf//" > chr.list
+ cat << '_EOF_' > template
+#LOOP
+./anno.csh $(root1) {check out line+ ../maf/$(root1).maf}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ cat << '_EOF_' > anno.csh
+#!/bin/csh -fe
+# usage: anno.csh <chrom>
+# Runs mafAddIRows on ../../maf/<chrom>.maf with the bed files listed in
+# nBeds and the hg19 2bit file, writing ../maf/<chrom>.maf.  Any earlier
+# copy of the output is removed first.
+set chrom = $1
+set plain = ../../maf/${chrom}.maf
+set annotated = ../maf/${chrom}.maf
+rm -f $annotated
+mafAddIRows -nBeds=nBeds $plain /hive/data/genomes/hg19/hg19.2bit $annotated
+'_EOF_'
+ # << happy emacs
+ chmod +x anno.csh
+
+ gensub2 chr.list single template jobList
+ para create jobList
+ # specify lots of ram to get one job per node
+ para -ram=30g push
+
+ ssh hgwdev
+ rm -fr /gbdb/hg19/multiz46way/maf
+ mkdir /gbdb/hg19/multiz46way/maf
+ cd /hive/data/genomes/hg19/bed/multiz46way/anno/maf
+ ln -s `pwd`/*.maf /gbdb/hg19/multiz46way/maf/
+ # by loading this into the table multiz46way, it will replace the
+ # previously loaded table with the unannotated mafs
+ # huge temp files are made, do them on local disk
+ cd /data/tmp
+ time nice -n +19 hgLoadMaf \
+ -pathPrefix=/gbdb/hg19/multiz46way/maf hg19 multiz46way
+ # with final set of quality annotated files:
+ # Loaded 33320838 mafs in 49 files from /gbdb/hg19/multiz46way/maf
+ # real 91m46.889s
+ # running on Irow annotated mafs Fri Nov 28 00:28:09 PST 2008
+ # Loaded 33320675 mafs in 49 files from /gbdb/hg19/multiz46way/maf
+ # real 236m15.279s
+ # running on bare bones mafs Thu Nov 27 19:29:44 PST 2008
+ # Loaded 33273351 mafs in 49 files from /gbdb/hg19/multiz46way/maf
+ # real 198m55.761s - while swarm busy with rebalancing
+ # from before the fixed multiz:
+ # Loaded 35154852 mafs in 49 files from /gbdb/hg19/multiz46way/maf
+ # real 71m5.594s
+
+ time nice -n +19 cat /gbdb/hg19/multiz46way/maf/*.maf \
+ | hgLoadMafSummary hg19 -minSize=30000 -mergeGap=1500 \
+ -maxSize=200000 multiz46waySummary stdin
+ # with the quality annotated mafs, and mem interference on hgwdev:
+ # Created 8514381 summary blocks from 600504256 components \
+ # and 33320838 mafs from stdin
+ # real 169m56.936s
+
+ # with the Irow annotations after the multiz fix:
+ # Created 8514380 summary blocks from 600499937
+ # components and 33298894 mafs from stdin
+ # real 184m42.893s
+ # user 70m44.431s
+ # sys 8m7.970s
+
+ # Created 8514078 summary blocks from 604683213 components
+ # and 35125649 mafs from stdin
+ # real 130m55.115s
+ # user 71m37.409s
+ # sys 8m5.110s
+
+ # by loading this into the table multiz46waySummary, it will replace
+ # the previously loaded table with the unannotated mafs
+ # remove the multiz46way*.tab files in this /data/tmp directory
+# -rw-rw-r-- 1 1949221892 Nov 15 14:04 multiz46way.tab
+# -rw-rw-r-- 1 417994189 Nov 15 20:57 multiz46waySummary.tab
+ wc -l multiz46way*.tab
+ # 33964377 multiz46way.tab
+ # 8514078 multiz46waySummary.tab
+ # 42478455 total
+ rm multiz46way*.tab
+
+ # create some downloads
+ mkdir -p /hive/data/genomes/hg19/bed/multiz46way/download/maf
+ cd /hive/data/genomes/hg19/bed/multiz46way/download/maf
+ time cp -p ../../anno/maf/chr*.maf .
+ # real 72m46.514s
+ # user 0m1.293s
+ # sys 5m15.981s
+ time gzip --rsyncable *.maf
+ # real 185m37.884s
+ # user 179m51.161s
+ # sys 3m48.016s
+ time md5sum *.gz > md5sum.txt
+ # real 3m59.009s
+ # user 1m19.338s
+ # sys 0m18.976s
##############################################################################
# LASTZ Sea Hare aplCal1 (STARTING - 2009-06-08 - Galt)
mkdir /hive/data/genomes/hg19/bed/lastzAplCal1.2009-06-08
@@ -4376,4 +5034,39 @@
cat fb.aplCal1.chainHg19Link.txt
# 14163455 bases of 619228098 (2.287%) in intersection
+#########################################################################
+# EXONIPHY Hg19, lifted from hg18 (DONE - 2009-06-19 - Hiram)
+# needed for ucscGenes11 building
+ # create a syntenic liftOver chain file
+ cd /hive/data/genomes/hg18/bed/blat.hg19.2009-03-06
+ time nice -n +19 netSyntenic run.chain/hg18.hg19.noClass.net.gz stdout \
+ | netFilter -syn stdin | netChainSubset -verbose=0 stdin \
+ run.chain/hg18.hg19.all.chain.gz stdout \
+ | chainStitchId stdin stdout | gzip -c > hg18.hg19.syn.chain.gz
+ # memory usage 55492608, utime 3 s/100, stime 3
+ # real 2m35.613s
+
+ # real 5m55.575s
+ # slightly smaller than the ordinary liftOver chain file:
+# -rw-rw-r-- 1 137245 Mar 6 17:37 hg18ToHg19.over.chain.gz
+# -rw-rw-r-- 1 96115 Jun 19 14:30 hg18.hg19.syn.chain.gz
+
+ # exoniphyHg19.gp is prepared as follows
+ mkdir /cluster/data/hg19/bed/exoniphy
+ cd /cluster/data/hg19/bed/exoniphy
+ hgsql hg18 -e "select * from exoniphy" -N > exoniphyHg18.gp
+ time nice -n +19 liftOver -genePred exoniphyHg18.gp \
+ /hive/data/genomes/hg18/bed/blat.hg19.2009-03-06/hg18.hg19.syn.chain.gz \
+ exoniphyHg19.gp unmapped
+ wc -l *
+ # 178162 exoniphyHg18.gp
+ # 178109 exoniphyHg19.gp
+ # 106 unmapped
+
+ nice -n +19 hgLoadGenePred -genePredExt hg19 exoniphy exoniphyHg19.gp
+ nice -n +19 featureBits hg19 exoniphy
+ # 27421336 bases of 2897316137 (0.946%) in intersection
+ nice -n +19 featureBits hg18 exoniphy
+ # 27475705 bases of 2881515245 (0.954%) in intersection
+#########################################################################