src/hg/makeDb/doc/calJac1.txt 1.22
1.22 2010/02/12 23:42:33 hiram
liftOver to calJac3 completed
Index: src/hg/makeDb/doc/calJac1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/calJac1.txt,v
retrieving revision 1.21
retrieving revision 1.22
diff -b -B -U 1000000 -r1.21 -r1.22
--- src/hg/makeDb/doc/calJac1.txt 25 Nov 2009 21:48:38 -0000 1.21
+++ src/hg/makeDb/doc/calJac1.txt 12 Feb 2010 23:42:33 -0000 1.22
@@ -1,1656 +1,1668 @@
# for emacs: -*- mode: sh; -*-
# This file describes browser build for the Marmoset
# genome, July 2007
#
# "$Id$"
#
######################################################################
## DOWNLOAD SEQUENCE (DONE - 2007-08-21 - Hiram)
ssh kkstore06
mkdir /cluster/store4/calJac1
ln -s /cluster/store4/calJac1 /cluster/data/calJac1
mkdir /cluster/data/calJac1/wustl
cd /cluster/data/calJac1/wustl
for F in supercontigs.agp.gz supercontigs.fa.gz contigs.fa.gz contigs.fa.qual.gz
do
wget --timestamping \
/pub/organism/Primates/Callithrix_jacchus/assembly/Callithrix_jacchus-2.0.2/output/${F} \
-O ${F}
done
# real 50m13.535s
ls -ogrt
# -rw-rw-r-- 1 6656649 Jun 19 17:03 supercontigs.agp.gz
# -rw-rw-r-- 1 521109271 Jun 19 17:03 contigs.fa.qual.gz
# -rw-rw-r-- 1 781437003 Jun 19 18:30 contigs.fa.gz
# -rw-rw-r-- 1 851641082 Aug 21 13:29 supercontigs.fa.gz
##########################################################################
# fetch photograph (DONE - 2007-08-21 - Hiram)
mkdir /cluster/data/calJac1/photo
cd /cluster/data/calJac1/photo
wget --timestamping \
http://www.genome.gov/Images/press_photos/highres/82-300.jpg \
-O nhgri.original.82-300.jpg
convert -geometry 300x200 -quality 80 nhgri.original.82-300.jpg \
Callithrix_jacchus.jpg
# check this .jpg image into the source tree browser/images/ directory
#######################################################################
## create config.ra and run makeGenomeDb.pl
ssh kkstore06
cd /cluster/data/calJac1
cat << '_EOF_' > calJac1.config.ra
# Config parameters for makeGenomeDb.pl:
db calJac1
scientificName Callithrix jacchus
commonName Marmoset
assemblyDate Jun. 2007
assemblyLabel WUSTL 2.0.2
orderKey 40
clade mammal
genomeCladePriority 16
mitoAcc none
fastaFiles /cluster/data/calJac1/wustl/supercontigs.fa.gz
agpFiles /cluster/data/calJac1/wustl/supercontigs.agp.gz
# qualFiles /dev/null
dbDbSpeciesDir marmoset
'_EOF_'
# << happy emacs
time nice -n +19 ~/kent/src/hg/utils/automation/makeGenomeDb.pl \
-stop=agp calJac1.config.ra > makeGenomeDb.out 2>&1 &
# real 24m24.468s
time nice -n +19 ~/kent/src/hg/utils/automation/makeGenomeDb.pl \
-continue=db calJac1.config.ra > db.continue.out 2>&1 &
# add the trackDb files to the source tree and to the trackDb/makefile
##########################################################################
## Repeat masker (DONE - 2007-08-21 - Hiram)
ssh kkstore06
## use screen for this
mkdir /cluster/data/calJac1/bed/RepeatMasker
cd /cluster/data/calJac1/bed/RepeatMasker
time nice -n +19 ~/kent/src/hg/utils/automation/doRepeatMasker.pl \
-bigClusterHub=kk \
-buildDir=/cluster/data/calJac1/bed/RepeatMasker calJac1 > do.out 2>&1 &
##############################################################################
## simpleRepeat masking (DONE - 2007-09-05 - Hiram)
## create a kki kluster run
ssh kkr1u00
mkdir /iscratch/i/calJac1
cd /iscratch/i/calJac1
cp -p /cluster/data/calJac1/calJac1.unmasked.2bit .
cp -p /cluster/data/calJac1/chrom.sizes .
twoBitToFa calJac1.unmasked.2bit calJac1.unmasked.fa
mkdir split
# split sequence into about 1000 files, each about 3,000,000 bases
time nice -n +19 faSplit about calJac1.unmasked.fa 3000000 split/cj1_
for R in 2 3 4 5 6 7 8
do
rsync -a --progress /iscratch/i/calJac1/ kkr${R}u00:/iscratch/i/calJac1/
done
ssh kki
mkdir -p /cluster/data/calJac1/bed/simpleRepeat/trf
cd /cluster/data/calJac1/bed/simpleRepeat/trf
# runTrf: csh wrapper for one cluster job.  It takes a single argument
# (a split file name) and strips the extension to get the chunk name.
# It copies /iscratch/i/calJac1/split/<chunk>.fa into a private
# /scratch/tmp/<chunk> directory and runs trfBig there with -bedAt=<chunk>.bed.
# The resulting <chunk>.bed is copied back into the directory the job was
# started from (replacing any earlier copy), and the scratch directory is
# removed.
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set C = $1:r
set SRC = /iscratch/i/calJac1/split/$C.fa
mkdir -p /scratch/tmp/$C
cp -p $SRC /scratch/tmp/$C/$C.fa
pushd /scratch/tmp/$C
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $C.fa \
/dev/null -bedAt=$C.bed -tempDir=/scratch/tmp/$C
popd
rm -f $C.bed
cp -p /scratch/tmp/$C/$C.bed .
rm -fr /scratch/tmp/$C
'_EOF_'
# << happy emacs
chmod +x runTrf
cat << '_EOF_' > template
#LOOP
./runTrf $(path1) {check out line $(root1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
ls /iscratch/i/calJac1/split > part.list
gensub2 part.list single template jobList
para create jobList
para try ... check ... push ... etc ...
# Completed: 947 of 947 jobs
# CPU time in finished jobs: 37242s 620.70m 10.35h 0.43d 0.001 y
# IO & Wait Time: 2842s 47.36m 0.79h 0.03d 0.000 y
# Average job time: 42s 0.71m 0.01h 0.00d
# Longest finished job: 1318s 21.97m 0.37h 0.02d
# Submission to last job: 3572s 59.53m 0.99h 0.04d
cat *.bed > ../simpleRepeat.bed
cd ..
awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
ssh hgwdev
cd /cluster/data/calJac1/bed/simpleRepeat
time nice -n +19 hgLoadBed calJac1 simpleRepeat \
simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Loaded 846105 elements of size 16
# real 0m24.710s
nice -n +19 featureBits calJac1 simpleRepeat \
> fb.simpleRepeat.calJac1.txt 2>&1
cat fb.simpleRepeat.calJac1.txt
# 100489601 bases of 2929139385 (3.431%) in intersection
# add the trfMask to the rmsk masked sequence to get our final
# masked sequence
ssh kkstore06
cd /cluster/data/calJac1
time nice -n +19 cat bed/simpleRepeat/trfMask.bed \
| twoBitMask -add -type=.bed calJac1.rmsk.2bit stdin calJac1.2bit
# measure it
time nice -n +19 twoBitToFa calJac1.2bit stdout \
| faSize stdin > faSize.calJac1.2bit.txt 2>&1
grep masked faSize.calJac1.2bit.txt
# %45.93 masked total, %47.50 masked real
## clean up the /iscratch/i/calJac1/ directory
ssh kkr1u00
cd /iscratch/i/calJac1
rm -fr *
for R in 2 3 4 5 6 7 8
do
rsync -a --progress --delete --stats /iscratch/i/calJac1/ kkr${R}u00:/iscratch/i/calJac1/
done
cd ..
rmdir calJac1
for R in 2 3 4 5 6 7 8
do
ssh kkr${R}u00 rmdir /iscratch/i/calJac1
done
############################################################################
# BLATSERVERS ENTRY (DONE - 2007-09-06 - Hiram)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("calJac1", "blat13", "17786", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("calJac1", "blat13", "17787", "0", "1");' \
hgcentraltest
# test it with some sequence
############################################################################
## BLASTZ swap from hg18 alignments (2007-11-11 - markd)
ssh hgwdev
mkdir /cluster/data/calJac1/bed/blastz.hg18.swap
cd /cluster/data/calJac1/bed/blastz.hg18.swap
ln -s blastz.hg18.swap ../blastz.hg18
/cluster/bin/scripts/doBlastzChainNet.pl \
-swap /cluster/data/hg18/bed/blastz.calJac1.2007-10-07/DEF >& swap.out&
# fb.calJac1.chainHg18Link.txt:
# 2426684781 bases of 2929139385 (82.846%) in intersection
# running syntenic net (DONE - 2007-12-14 - Hiram)
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
/cluster/data/hg18/bed/blastz.calJac1.2007-10-07/DEF \
-bigClusterHub=pk -continue=syntenicNet -syntenicNet \
-swap -chainMinScore=3000 -chainLinearGap=medium > syntenicNet.log 2>&1 &
# real 8m24.277s
# failed during a chainSplit:
# Can't open chain/Contig836.chain to append: Too many open files
# create reciprocal best chains/nets for 9-way maf alignments
ssh hgwdev
cd /cluster/data/calJac1/bed/blastz.hg18.swap
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl calJac1 hg18 \
> rbest.log 2>&1 &
#########################################################################
## genscan run (DONE - 2007-11-08 - Hiram)
## create hard masked sequence
ssh kkstore06
cd /cluster/data/calJac1
twoBitToFa calJac1.2bit stdout \
| maskOutFa stdin hard stdout | faToTwoBit stdin calJac1.hard.2bit
# And, make sure there aren't any sequences in this lot that have
# become all N's with no sequence left in them. This drives genscan nuts
twoBitToFa calJac1.hard.2bit stdout \
| faCount stdin > faCount.hard.txt
# the lowest three are:
egrep -v "^#|^total" faCount.hard.txt \
| awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3
# there are a lot of these that turned into zero sequence left
# will sort this out when making the sequence to use on the Iservers
ssh kkr1u00
mkdir /iscratch/i/calJac1/hardMasked
cd /iscratch/i/calJac1/hardMasked
twoBitToFa /cluster/data/calJac1/calJac1.hard.2bit stdout \
| faSplit byname stdin ./
# more than 128 bases of sequence results in the removal of 1,208
# sequences that are too short, leaving 48,516 sequences
egrep -v "^#|^total" /cluster/data/calJac1/faCount.hard.txt \
| awk '{size=$2-$7; if (size < 128) {print $1}}' | while read F
do
rm -f "${F}.fa"
echo "${F}.fa"
done
mkdir ../hardChunks
cd ../hardChunks
# chunk them up into 4,000,000 base packages, no sequence is broken
catDir ../hardMasked \
| faSplit about stdin 4000000 c_
rm -fr ../hardMasked
for R in 2 3 4 5 6 7 8
do
rsync -a --progress ./ kkr${R}u00:/iscratch/i/calJac1/hardChunks/
done
ssh hgwdev
mkdir /cluster/data/calJac1/bed/genscan
cd /cluster/data/calJac1/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Run on small cluster (more mem than big cluster).
ssh kki
cd /cluster/data/calJac1/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
# Since we split on gaps, we have no chunks like that. You can
# verify with faCount on the chunks.
ls -1Sr /iscratch/i/calJac1/hardChunks/c_*.fa > genome.list
# Create script to run gsBig
# runGsBig: csh wrapper for one genscan cluster job.
#   $1 - source directory holding the hard-masked chunk files
#   $2 - chunk file name within that directory
# The chunk is copied into /scratch/tmp/<root> (file name minus extension)
# and gsBig is run there, using the genscan executable and HumanIso.smat
# parameter file from the hg3rdParty/genscanlinux checkout under the
# directory the job was started in.  The .gtf, .pep and .bed results are
# copied into the gtf/, pep/ and subopt/ subdirectories of that start
# directory, and the scratch directory is then removed.
cat << '_EOF_' > runGsBig
#!/bin/csh -fe
set runDir = `pwd`
set srcDir = $1
set inFile = $2
set fileRoot = $inFile:r
mkdir /scratch/tmp/$fileRoot
cp -p $srcDir/$inFile /scratch/tmp/$fileRoot
pushd /scratch/tmp/$fileRoot
/cluster/bin/x86_64/gsBig $inFile $fileRoot.gtf -trans=$fileRoot.pep -subopt=$fileRoot.bed -exe=$runDir/hg3rdParty/genscanlinux/genscan -par=$runDir/hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000
popd
cp -p /scratch/tmp/$fileRoot/$fileRoot.gtf gtf
cp -p /scratch/tmp/$fileRoot/$fileRoot.pep pep
cp -p /scratch/tmp/$fileRoot/$fileRoot.bed subopt
rm -fr /scratch/tmp/$fileRoot
'_EOF_'
# << happy emacs
chmod +x runGsBig
# template file for gensub2
cat << '_EOF_' > template
#LOOP
runGsBig /iscratch/i/calJac1/hardChunks $(file1) {check out line gtf/$(root1).gtf} {check out line pep/$(root1).pep} {check out line subopt/$(root1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 genome.list single template jobList
para create jobList
para try, check, push, check, ...
# Completed: 720 of 720 jobs
# CPU time in finished jobs: 55325s 922.09m 15.37h 0.64d 0.002 y
# IO & Wait Time: 2063s 34.38m 0.57h 0.02d 0.000 y
# Average job time: 80s 1.33m 0.02h 0.00d
# Longest finished job: 132s 2.20m 0.04h 0.00d
# Submission to last job: 65396s 1089.93m 18.17h 0.76d
# cat and lift the results into single files
ssh kkstore06
cd /cluster/data/calJac1/bed/genscan
cat gtf/c_*.gtf > genscan.gtf
cat subopt/c_*.bed > genscanSubopt.bed
cat pep/c_*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/calJac1/bed/genscan
ldHgGene calJac1 -gtf genscan genscan.gtf
# Read 64005 transcripts in 344791 lines in 1 files
# 64005 groups 23602 seqs 1 sources 1 feature types
# 64005 gene predictions
hgPepPred calJac1 generic genscanPep genscan.pep
hgLoadBed calJac1 genscanSubopt genscanSubopt.bed
# Loaded 576960 elements of size 6
# check the numbers
time nice -n +19 featureBits calJac1 genscan
# 59205113 bases of 2929139385 (2.021%) in intersection
# the next closest genome with a genscan track
time nice -n +19 featureBits panTro2 genscan
# 53758386 bases of 2909485072 (1.848%) in intersection
time nice -n +19 featureBits mm9 genscan
# 55293837 bases of 2620346127 (2.110%) in intersection
############################################################################
# GENBANK AUTO UPDATE (DONE - 2007-11-21 - Hiram)
# Create a lift file as per the procedures for Chimp from the AGP:
ssh kolossus
cd /cluster/data/calJac1
# MAKE 11.OOC FILE FOR BLAT
blat calJac1.2bit \
/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
# Wrote 34303 overused 11-mers to 11.ooc
# align with latest genbank process.
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add calJac1 just after panTro2
# calJac1
# Marmoset
calJac1.serverGenome = /cluster/data/calJac1/calJac1.2bit
calJac1.clusterGenome = /scratch/data/calJac1/calJac1.2bit
calJac1.ooc = /cluster/data/calJac1/calJac1/11.ooc
calJac1.lift = no
calJac1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
calJac1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
calJac1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
calJac1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
calJac1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
calJac1.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter}
calJac1.downloadDir = calJac1
calJac1.genbank.est.xeno.load = no
calJac1.refseq.mrna.native.load = yes
calJac1.refseq.mrna.xeno.load = yes
calJac1.refseq.mrna.xeno.loadDesc = yes
cvs ci -m "Added calJac1." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
# Edit src/lib/gbGenome.c to add new species. With these two lines:
# static char *calJacNames[] = {"Callithrix jacchus", NULL};
# {"calJac", calJacNames},
cvs ci -m "Added Callithrix jacchus (Marmoset)." src/lib/gbGenome.c
make install-server
ssh genbank
screen # control this business with a screen since it takes a while
cd /cluster/data/genbank
# This is a call to a script that will push our jobs out to the cluster
# since it's a big job.
time nice -n +19 bin/gbAlignStep -initial calJac1 &
# logFile: var/build/logs/2007.11.20-11:31:54.calJac1.initalign.log
# real 607m38.957s
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad calJac1
# logFile: var/dbload/hgwdev/logs/2007.11.25-10:26:23.dbload.log
# real 26m30.926s
# enable daily alignment and update of hgwdev (DONE - 2007-11-21 - Hiram)
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add calJac1 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added calJac1." etc/align.dbs etc/hgwdev.dbs
make etc-update
#########################################################################
# Blastz Platypus ornAna1 (DONE - 2007-11-14 - Hiram)
# this was done a second time, see ornAna1.txt for the second run
# since this run produced a null result for some unknown reason.
ssh kkstore06
screen # use screen to control this job
mkdir /cluster/data/calJac1/bed/blastzOrnAna1.2007-11-14
cd /cluster/data/calJac1/bed/blastzOrnAna1.2007-11-14
cat << '_EOF_' > DEF
# Marmoset vs. platypus
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50
# TARGET: Marmoset calJac1
SEQ1_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit
SEQ1_LEN=/cluster/data/calJac1/chrom.sizes
SEQ1_CHUNK=20000000
SEQ2_LIMIT=400
SEQ1_LAP=0
# QUERY: Platypus ornAna1
SEQ2_DIR=/cluster/bluearc/scratch/data/ornAna1/ornAna1.2bit
SEQ2_LEN=/cluster/data/ornAna1/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=400
SEQ2_LAP=0
BASE=/cluster/data/calJac1/bed/blastzOrnAna1.2007-11-14
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-chainLinearGap=loose -bigClusterHub=pk -verbose=2 > do.log 2>&1 &
# real 1927m20.962s - to the first pk crash
# this was a tough job to get finished. Several pk crashes,
# problems with garbage in the para.results file, and so forth.
# But, it did finish as of Monday afternoon 2007-11-19
# Completed: 899536 of 900180 jobs
# Crashed: 644 jobs
# CPU time in finished jobs: 131663141s 2194385.68m 36573.09h 1523.88d 4.175 y
# IO & Wait Time: 12592457s 209874.29m 3497.90h 145.75d 0.399 y
# Average job time: 160s 2.67m 0.04h 0.00d
# Longest finished job: 1795s 29.92m 0.50h 0.02d
# Submission to last job: 440290s 7338.17m 122.30h 5.10d
# despite the '644 jobs' crashed, they are actually done and all results
# are complete
# continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-continue=cat -bigClusterHub=pk > cat.log 2>&1 &
# real 31m45.069s
cat fb.calJac1.chainOrnAna1Link.txt
# 0 bases of 2929139385 (0.000%) in intersection
# This error was fixed in the script. It failed on a command in one of
# the ssh scripts that happened to run under the bash shell which did
# not detect the error in a set of piped commands. It was a shell
# wild-card expansion problem, changed to a 'find' to avoid that.
###########################################################################
## BLASTZ Mouse Mm9 swap (DONE - 2007-09-07 - Hiram)
ssh kkstore06
# use a screen to control this job
screen
# the original alignment
cd /cluster/data/mm9/bed/blastzCalJac1.2007-09-06
cat fb.mm9.chainCalJac1Link.txt
# 863961573 bases of 2620346127 (32.971%) in intersection
# the swap
mkdir /cluster/data/calJac1/bed/blastz.mm9.swap
cd /cluster/data/calJac1/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastzCalJac1.2007-09-06/DEF \
-stop=load -chainMinScore=3000 \
-swap -chainLinearGap=medium -bigClusterHub=pk > swap.log 2>&1 &
# real 217m10.835s
cat fb.calJac1.chainMm9Link.txt
# 887586922 bases of 2929139385 (30.302%) in intersection
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 /cluster/data/mm9/bed/blastzCalJac1.2007-09-06/DEF \
-continue=download -chainMinScore=3000 \
-swap -chainLinearGap=medium -bigClusterHub=pk > download.log 2>&1 &
# real 1m9.876s
# run the syntenic net for multiple alignment (DONE - 2007-12-14 - Hiram)
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 /cluster/data/mm9/bed/blastzCalJac1.2007-09-06/DEF \
-syntenicNet -continue=syntenicNet -chainMinScore=3000 \
-swap -chainLinearGap=medium -bigClusterHub=pk > syntenicNet.log 2>&1 &
# real 7m23.683s
# failed during a chainSplit:
# Can't open chain/Contig1203.chain to append: Too many open files
# create reciprocal best chains/nets for 9-way multiple alignment
ssh hgwdev
cd /cluster/data/calJac1/bed/blastz.mm9.swap
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl calJac1 mm9 \
> rbest.log 2>&1 &
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl calJac1 mm9 \
-continue=download > rbest.download.log 2>&1 &
###########################################################################
# Blastz swap Chimp panTro2 (DONE - 2007-11-14 - Hiram)
ssh kkstore06
screen # use screen to manage this job
cd /cluster/data/panTro2/bed/blastzCalJac1.2007-11-13
cat fb.panTro2.chainCalJac1Link.txt
# 2220169777 bases of 2909485072 (76.308%) in intersection
mkdir /cluster/data/calJac1/bed/blastz.panTro2.swap
cd /cluster/data/calJac1/bed/blastz.panTro2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/panTro2/bed/blastzCalJac1.2007-11-13/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=pk > swap.log 2>&1 &
# real 320m14.293s
cat fb.calJac1.chainPanTro2Link.txt
# 2264115411 bases of 2929139385 (77.296%) in intersection
# create reciprocal best chains/nets for 9-way maf alignments
ssh hgwdev
cd /cluster/data/calJac1/bed/blastz.panTro2.swap
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl calJac1 panTro2 \
> rbest.log 2>&1 &
###########################################################################
# SWAP BLASTZ Orangutan ponAbe2 (DONE - 2007-11-29 - Hiram)
# primary blastz result
cd /cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18
cat fb.ponAbe2.chainCalJac1Link.txt
# 2310720863 bases of 3093572278 (74.694%) in intersection
# and for the swap
ssh kkstore02
mkdir /cluster/data/calJac1/bed/blastz.ponAbe2.swap
cd /cluster/data/calJac1/bed/blastz.ponAbe2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=kk > swap.log 2>&1 &
# real 341m54.548s
cat fb.calJac1.chainPonAbe2Link.txt
# 2253236255 bases of 2929139385 (76.925%) in intersection
# reciprocal best for 9-way maf alignments
ssh hgwdev
cd /cluster/data/calJac1/bed/blastz.ponAbe2.swap
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl calJac1 ponAbe2 \
> rbest.log 2>&1 &
# real 96m17.285s
###########################################################################
# SWAP BLASTZ Dog canFam2 (DONE - 2007-11-30 - Hiram)
# primary blastz result
cd /cluster/data/canFam2/bed/blastzCalJac1.2007-11-28
cat fb.canFam2.chainCalJac1Link.txt
# 1369690756 bases of 2384996543 (57.429%) in intersection
mkdir /cluster/data/calJac1/bed/blastz.canFam2.swap
cd /cluster/data/calJac1/bed/blastz.canFam2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/canFam2/bed/blastzCalJac1.2007-11-28/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=kk > swap.log 2>&1 &
# encountered difficulties with /scratch/data/ on kolossus
# had to finish the netChains.csh script manually, then continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/canFam2/bed/blastzCalJac1.2007-11-28/DEF \
-continue=load -chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=kk > load.log 2>&1 &
# real 56m44.375s
cat fb.calJac1.chainCanFam2Link.txt
# 1451345669 bases of 2929139385 (49.549%) in intersection
# reciprocal best for 9-way maf alignments
ssh hgwdev
# expects blastz.canFam2 to exist
cd /cluster/data/calJac1/bed
ln -s blastz.canFam2.swap blastz.canFam2
cd /cluster/data/calJac1/bed/blastz.canFam2
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl calJac1 canFam2 \
> rbest.log 2>&1 &
# real 70m45.324s
###########################################################################
# SWAP BLASTZ Rhesus rheMac2 (DONE - 2007-11-18 - Hiram)
# primary blastz result
cd /cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16
cat fb.rheMac2.chainCalJac1Link.txt
# 2055107003 bases of 2646704109 (77.648%) in intersection
# and for the swap
mkdir /cluster/data/calJac1/bed/blastz.rheMac2.swap
cd /cluster/data/calJac1/bed/blastz.rheMac2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/rheMac2/bed/blastzCalJac1.2007-11-16/DEF \
-swap -chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=kk > swap.log 2>&1 &
# real 349m36.073s
cat fb.calJac1.chainRheMac2Link.txt
# 2191300051 bases of 2929139385 (74.810%) in intersection
# reciprocal best for 9-way maf alignments
ssh hgwdev
# expects blastz.rheMac2 to exist
cd /cluster/data/calJac1/bed
ln -s blastz.rheMac2.swap blastz.rheMac2
cd /cluster/data/calJac1/bed/blastz.rheMac2
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl calJac1 rheMac2 \
> rbest.log 2>&1 &
# real 87m53.651s
#########################################################################
## 9-Way Multiz (DONE - 2007-12-21 - Hiram)
##
ssh hgwdev
mkdir /cluster/data/calJac1/bed/multiz9way
cd /cluster/data/calJac1/bed/multiz9way
# take the 30-way tree from mm9 and eliminate genomes not in
# this alignment
# rearrange to get calJac1 on the top of the graph
# paste this tree into the on-line phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to create the image for the tree diagram
# select the 9 organisms from the 30-way recently done on mouse mm9
/cluster/bin/phast/tree_doctor \
--prune-all-but Human_hg18,Mouse_mm9,Chimp_panTro2,Marmoset_calJac1,Rhesus_rheMac2,Orangutan_ponAbe2,Dog_canFam2,Platypus_ornAna1,Opossum_monDom4 \
/cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \
> 9-way.fullNames.nh
# looks something like this:
((((Mouse_mm9:0.325818,
((((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037,
Orangutan_ponAbe2:0.020000):0.013037,
Rhesus_rheMac2:0.031973):0.036500,
Marmoset_calJac1:0.070000):0.058454):0.019763,
Dog_canFam2:0.187963):0.243550,
Opossum_monDom4:0.320721):0.088647,Platypus_ornAna1:0.488110);
(((
(Mouse_mm9:0.325818,
(Marmoset_calJac1:0.070000,
(((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037,
Orangutan_ponAbe2:0.020000):0.013037,
Rhesus_rheMac2:0.031973):0.036500):0.058454):0.019763,
Dog_canFam2:0.187963):0.243550,
Opossum_monDom4:0.320721):0.088647,Platypus_ornAna1:0.488110);
# rearrange to get Marmoset at the top:
# this leaves us with:
cat << '_EOF_' > calJac1.9-way.nh
(((((Marmoset_calJac1:0.070000,
(((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037,
Orangutan_ponAbe2:0.020000):0.013037,
Rhesus_rheMac2:0.031973):0.036500):0.058454,
Mouse_mm9:0.325818):0.019763,
Dog_canFam2:0.187963):0.243550,
Opossum_monDom4:0.320721):0.088647,Platypus_ornAna1:0.488110);
'_EOF_'
# << happy emacs
# create a species list from that file:
sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' calJac1.9-way.nh \
| sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
| sed -e "s/.*_//; s/:.*//" | sort > species.list
# verify that has 9 db names in it
# create a stripped down nh file for use in autoMZ run
echo \
`sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' calJac1.9-way.nh \
| sed -e "s/ / /g"` > tree.9.nh
# that looks like, as a single line:
# (((((calJac1(((hg18 panTro2)ponAbe2)rheMac2)) mm9) canFam2) monDom4) ornAna1)
# verify all blastz's exists
cat << '_EOF_' > listMafs.csh
#!/bin/csh -fe
cd /cluster/data/calJac1/bed/multiz9way
foreach db (`grep -v calJac1 species.list`)
set bdir = /cluster/data/calJac1/bed/blastz.$db
if (-e $bdir/mafRBestNet/calJac1.$db.rbest.maf.gz) then
echo "$db mafRBestNet"
else if (-e $bdir/mafSynNet/calJac1.$db.net.maf.gz) then
echo "$db mafSynNet"
else if (-e $bdir/mafNet/calJac1.$db.net.maf.gz) then
echo "$db mafNet"
else
echo "$db mafs not found"
endif
end
'_EOF_'
# << happy emacs
chmod +x ./listMafs.csh
# see what it says; "mafs not found" should not show up for any species
# (calJac1 itself is skipped by the grep -v in the script)
./listMafs.csh
# canFam2 mafRBestNet
# hg18 mafRBestNet
# mm9 mafRBestNet
# monDom4 mafNet
# ornAna1 mafNet
# panTro2 mafRBestNet
# ponAbe2 mafRBestNet
# rheMac2 mafRBestNet
/cluster/bin/phast/all_dists calJac1.9-way.nh > 9way.distances.txt
grep -i caljac 9way.distances.txt | sort -k3,3n
Marmoset_calJac1 Human_hg18 0.138447
Marmoset_calJac1 Rhesus_rheMac2 0.138473
Marmoset_calJac1 Orangutan_ponAbe2 0.139537
Marmoset_calJac1 Chimp_panTro2 0.140242
Marmoset_calJac1 Dog_canFam2 0.336180
Marmoset_calJac1 Mouse_mm9 0.454272
Marmoset_calJac1 Opossum_monDom4 0.712488
Marmoset_calJac1 Platypus_ornAna1 0.968524
# use the calculated
# distances in the table below to order the organisms and check
# the button order on the browser. Zebrafish ends up before
# tetraodon and fugu on the browser despite its distance.
# And if you can fill in the table below entirely, you have
# succeeded in finishing all the alignments required.
#
# featureBits chainLink measures
# chainCalJac1Link chain linearGap
# distance on CalJac1 on other minScore
# 1 0.138447 Human_hg18 (% 82.846) (% 78.351) 3000 medium
# 2 0.138473 Rhesus_rheMac2 (% 74.810) (% 77.648) 3000 medium
# 3 0.139537 Orangutan_ponAbe2 (% 76.925) (% 74.694) 3000 medium
# 4 0.140242 Chimp_panTro2 (% 77.296) (% 76.308) 3000 medium
# 5 0.336180 Dog_canFam2 (% 49.549) (% 57.429) 3000 medium
# 6 0.454272 Mouse_mm9 (% 30.302) (% 32.971) 3000 medium
# 7 0.712488 Opossum_monDom4 (% 13.357) (% 11.050) 5000 loose
# 8 0.968524 Platypus_ornAna1 (% 7.221) (% 10.619) 5000 loose
# copy net mafs to cluster-friendly storage, splitting chroms
# Create the per-species symlinks from the multiz9way directory itself.
# Do NOT cd into mafLinks first: the link names below already carry the
# mafLinks/ prefix, so running them from inside mafLinks would aim at a
# non-existent mafLinks/mafLinks/ directory and every ln would fail.
# The relative ../../blastz.* targets are resolved from the directory that
# holds the link (multiz9way/mafLinks), i.e. they point into
# /cluster/data/calJac1/bed/.
mkdir -p mafLinks
# hint: obtained these links by altering listMafs.csh above
# add an echo statement to output these commands
ln -s ../../blastz.canFam2/mafRBestNet/calJac1.canFam2.rbest.maf.gz \
mafLinks/canFam2.maf.gz
ln -s ../../blastz.hg18/mafRBestNet/calJac1.hg18.rbest.maf.gz \
mafLinks/hg18.maf.gz
ln -s ../../blastz.mm9/mafRBestNet/calJac1.mm9.rbest.maf.gz \
mafLinks/mm9.maf.gz
ln -s ../../blastz.monDom4/mafNet/calJac1.monDom4.net.maf.gz \
mafLinks/monDom4.maf.gz
ln -s ../../blastz.ornAna1/mafNet/calJac1.ornAna1.net.maf.gz \
mafLinks/ornAna1.maf.gz
ln -s ../../blastz.panTro2/mafRBestNet/calJac1.panTro2.rbest.maf.gz \
mafLinks/panTro2.maf.gz
ln -s ../../blastz.ponAbe2/mafRBestNet/calJac1.ponAbe2.rbest.maf.gz \
mafLinks/ponAbe2.maf.gz
ln -s ../../blastz.rheMac2/mafRBestNet/calJac1.rheMac2.rbest.maf.gz \
mafLinks/rheMac2.maf.gz
# need to split these things up by Contig number for efficient kluster run
ssh kkstore06
cd /cluster/data/calJac1/bed/multiz9way/mafLinks
mkdir -p /san/sanvol1/scratch/calJac1/multiz9way/contigMaf
cd /scratch/tmp
for D in `grep -v calJac1 /cluster/data/calJac1/bed/multiz9way/species.list`
do
mkdir /scratch/tmp/${D}
cd /scratch/tmp/${D}
mafSplit -verbose=2 /dev/null -byTarget -useSequenceName Contig \
/cluster/data/calJac1/bed/multiz9way/mafLinks/${D}.maf.gz -outDirDepth=2
rsync -a --progress ./ \
/san/sanvol1/scratch/calJac1/multiz9way/contigMaf/${D}
cd /scratch/tmp
rm -fr ${D}
done
# create a run-time list of contigs to operate on, not all contigs
# exist in all alignments, but we want all contig names used in any
# alignment:
cd /san/sanvol1/scratch/calJac1/multiz9way/contigMaf
for D in *
do
cd "${D}"
find . -type f
cd ..
done | sort -u > /tmp/9-way.contig.list
wc -l /tmp/9-way.contig.list
# 36707 /tmp/9-way.contig.list
# ready for the multiz run
ssh pk
mkdir /cluster/data/calJac1/bed/multiz9way/splitRun
cd /cluster/data/calJac1/bed/multiz9way/splitRun
scp -p kkstore06:/tmp/9-way.contig.list .
mkdir -p maf run
cd run
mkdir penn
# use latest penn utilities
P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
cp -p $P/{autoMZ,multiz,maf_project} penn
# set the db and pairs directories here
# autoMultiz.csh: csh wrapper for one multiz cluster job.
#   $1 - subdirectory (under each species' contigMaf directory) of the contig
#   $2 - contig name, used as the maf file root
#   $3 - path of the result maf file; its directory is created if missing
# Work is done in /scratch/tmp/calJac1/multiz.<contig>, which is recreated
# from scratch; tree.9.nh and species.list are copied in from ../../ .
# For every species in species.list other than calJac1, the pairwise maf is
# taken from $pairs/<species>/<subdir>/<contig>.maf (zcat if only the .gz
# exists, plain copy otherwise).  A header-only maf is written when neither
# file exists, so autoMZ always gets one input per species.
# The penn/ directory under the run directory is put first in the path
# before autoMZ is started.  Afterwards <contig>.maf is copied to the
# result path, the work directory is removed, and /scratch/tmp/calJac1 is
# removed if it is empty.
cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = calJac1
set subdir = $1
set c = $2
set result = $3
set resultDir = $result:h
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz9way/contigMaf
rm -fr $tmp
mkdir -p $tmp
mkdir -p $resultDir
cp ../../tree.9.nh ../../species.list $tmp
pushd $tmp
foreach s (`grep -v $db species.list`)
set in = $pairs/$s/$subdir/$c.maf
set out = $db.$s.sing.maf
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.9.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $result
rm -fr $tmp
rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
'_EOF_'
# << happy emacs
chmod +x autoMultiz.csh
cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(dir1) $(root1) {check out line+ /cluster/data/calJac1/bed/multiz9way/splitRun/maf/$(dir1)/$(root1).maf}
#ENDLOOP
'_EOF_'
# << emacs
# it is a single job since everything is in the same maf file
time nice -n +19 ./autoMultiz.csh calJac1
# XXX - running 2007-12-21 16:32 on mkr0u3
sed -e "s/^\.\///" ../9-way.contig.list \
| gensub2 stdin single template jobList
para create jobList
para try ... check ... push ... etc
# Completed: 36707 of 36707 jobs
# CPU time in finished jobs: 244659s 4077.65m 67.96h 2.83d 0.008 y
# IO & Wait Time: 115457s 1924.29m 32.07h 1.34d 0.004 y
# Average job time: 10s 0.16m 0.00h 0.00d
# Longest finished job: 249s 4.15m 0.07h 0.00d
# Submission to last job: 2454s 40.90m 0.68h 0.03d
# put the split maf results back together into a single maf file
# eliminate duplicate comments
ssh kkstore06
cd /cluster/data/calJac1/bed/multiz9way
mkdir togetherMaf
grep "^##maf version" splitRun/maf/0/0/Contig00000.maf \
| sort -u > togetherMaf/calJac1.9way.maf
for F in `find ./splitRun/maf -type f -depth`
do
grep -h "^#" "${F}" | egrep -v "maf version=1|eof maf" \
| sed -e "s#/_MZ_[^ ]* # #g; s#__[0-9]##g"
done | sort -u >> togetherMaf/calJac1.9way.maf
for F in `find ./splitRun/maf -type f -depth`
do
grep -v -h "^#" "${F}"
done >> togetherMaf/calJac1.9way.maf
grep "^##eof maf" splitRun/maf/0/0/Contig00000.maf \
| sort -u >> togetherMaf/calJac1.9way.maf
# load tables for a look
ssh hgwdev
mkdir -p /gbdb/calJac1/multiz9way/maf
ln -s /cluster/data/calJac1/bed/multiz9way/togetherMaf/*.maf \
/gbdb/calJac1/multiz9way/maf/multiz9way.maf
# this generates an immense multiz9way.tab file in the directory
# where it is running. Best to run this over in scratch.
cd /scratch/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/calJac1/multiz9way/maf calJac1 multiz9way
# real 5m6.330s
# Loaded 8484286 mafs in 1 files from /gbdb/calJac1/multiz9way/maf
# load summary table
time nice -n +19 cat /gbdb/calJac1/multiz9way/maf/*.maf \
| hgLoadMafSummary calJac1 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz9waySummary stdin
# real 5m58.150s
# Created 121083 summary blocks from 3410157 components
# and 693943 mafs from stdin
# Gap Annotation
# prepare bed files with gap info
ssh kkstore02
mkdir /cluster/data/calJac1/bed/multiz9way/anno
cd /cluster/data/calJac1/bed/multiz9way/anno
mkdir maf run
# these actually already all exist from previous multiple alignments
for DB in `cat ../species.list`
do
CDIR="/cluster/data/${DB}"
if [ ! -f ${CDIR}/${DB}.N.bed ]; then
echo "creating ${DB}.N.bed"
echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
else
ls -og ${CDIR}/${DB}.N.bed
fi
done
cd run
rm -f nBeds sizes
for DB in `grep -v calJac1 ../../species.list`
do
echo "${DB} "
ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
echo ${DB}.len >> sizes
done
ssh memk
# temporarily copy the calJac1.9way.maf file onto the memk
# nodes /scratch/data/calJac1/maf/ directory
for R in 0 1 2 3 4 5 6 7
do
ssh mkr0u${R} rsync -a --progress \
/cluster/data/calJac1/bed/multiz9way/togetherMaf/calJac1.9way.maf.gz \
/scratch/data/calJac1/maf/
done
mkdir /cluster/data/calJac1/bed/multiz9way/anno/splitMaf
# need to split up the single maf file into individual
# per-scaffold maf files to run annotation on
cd /cluster/data/calJac1/bed/multiz9way/anno/splitMaf
# create bed files to list approximately 1553 scaffolds in
# a single list, approximately 33 lists
cat << '_EOF_' > mkBedLists.pl
#!/usr/bin/env perl
# Split /cluster/data/calJac1/chrom.sizes into a numbered series of bed
# files (file_0.bed, file_1.bed, ...) written to the current directory.
# Each input line "name size" becomes the bed line "name 0 size name".
# A new bed file is started whenever the 1-based input line number is a
# multiple of $linesPerList; the line that triggers the switch is echoed
# to stdout as a progress marker and is written to the new file.
use strict;
use warnings;

my $sizes = "/cluster/data/calJac1/chrom.sizes";
my $linesPerList = 1553;

my $bedCount = 0;
my $i = 0;
my $bedFile = sprintf("file_%d.bed", $bedCount);
open (BF,">$bedFile") or die "can not write to $bedFile $!";
open (FH,"<$sizes") or
    die "can not read $sizes $!";
while (my $line = <FH>) {
    chomp $line;
    if ( (($i + 1) % $linesPerList) == 0 ) {
        printf "%s\n", $line;
        # finish the current list before opening the next one
        close (BF) or die "can not close $bedFile $!";
        ++$bedCount;
        $bedFile = sprintf("file_%d.bed", $bedCount);
        open (BF,">$bedFile") or die "can not write to $bedFile $!";
    }
    ++$i;
    my ($chr, $size) = split('\s+',$line);
    printf BF "%s\t0\t%d\t%s\n", $chr, $size, $chr;
}
close (FH);
# close the last bed file; the original closed the never-opened handle
# BH here, leaving BF to be flushed only at program exit
close (BF) or die "can not close $bedFile $!";
'_EOF_'
# << happy emacs
chmod +x mkBedLists.pl
./mkBedLists.pl
# now, run a mafsInRegion on each one of those lists
cat << '_EOF_' > runOne
#!/bin/csh -fe
# Run mafsInRegion for one bed list.
#   $1 - name of the bed list without its .bed suffix; also used as the
#        name of the result directory created below the launch directory
set runDir = "/cluster/data/calJac1/bed/multiz9way/anno/splitMaf"
set resultDir = $1
set bedFile = $resultDir.bed
mkdir -p $resultDir
# mafsInRegion writes its output into a work directory under /scratch/tmp
mkdir -p /scratch/tmp/calJac1/$resultDir
pushd /scratch/tmp/calJac1/$resultDir
mafsInRegion $runDir/$bedFile -outDir . \
/scratch/data/calJac1/maf/calJac1.9way.maf
popd
# copy the results back to the result directory, then clean up; the
# shared parent directory is removed only when it is empty
rsync -q -a /scratch/tmp/calJac1/$resultDir/ ./$resultDir/
rm -fr /scratch/tmp/calJac1/$resultDir
rmdir --ignore-fail-on-non-empty /scratch/tmp/calJac1
'_EOF_'
# << happy emacs
chmod +x runOne
cat << '_EOF_' > template
#LOOP
./runOne $(root1)
#ENDLOOP
'_EOF_'
# << happy emacs
ls file*.bed > runList
gensub2 runList single template jobList
para create jobList
para try ... check ... push ... etc
# Completed: 33 of 33 jobs
# CPU time in finished jobs: 11075s 184.58m 3.08h 0.13d 0.000 y
# IO & Wait Time: 22992s 383.20m 6.39h 0.27d 0.001 y
# Average job time: 1032s 17.21m 0.29h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 2019s 33.65m 0.56h 0.02d
# Submission to last job: 22051s 367.52m 6.13h 0.26d
cd /cluster/data/calJac1/bed/multiz9way/anno/run
cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set outDir = ../maf/$2
set result = $3
set input = $1
mkdir -p $outDir
cat $input | \
nice mafAddIRows -nBeds=nBeds stdin /scratch/data/calJac1/calJac1.2bit $result
'_EOF_'
# << happy emacs
chmod +x doAnno.csh
cat << '_EOF_' > template
#LOOP
./doAnno.csh $(path1) $(lastDir1) {check out line+ ../maf/$(lastDir1)/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
# list every split maf file; each line of maf.list becomes one job
find ../splitMaf -type f -name "*.maf" > maf.list
gensub2 maf.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 36707 of 36707 jobs
# CPU time in finished jobs: 94093s 1568.22m 26.14h 1.09d 0.003 y
# IO & Wait Time: 94674s 1577.90m 26.30h 1.10d 0.003 y
# Average job time: 5s 0.09m 0.00h 0.00d
# Longest finished job: 12s 0.20m 0.00h 0.00d
# Submission to last job: 6129s 102.15m 1.70h 0.07d
ssh kkstore06
cd /cluster/data/calJac1/bed/multiz9way/anno
grep "^##maf version" maf/file_0/Contig0.maf \
| sort -u > calJac1.anno.9way.maf
find ./maf -type f -depth -name "*.maf" | while read F
do
grep -v -h "^#" "${F}"
done >> calJac1.anno.9way.maf
echo "##eof maf" >> calJac1.anno.9way.maf
ssh hgwdev
cd /cluster/data/calJac1/bed/multiz9way/anno
mkdir -p /gbdb/calJac1/multiz9way/anno
ln -s `pwd`/calJac1.anno.9way.maf \
/gbdb/calJac1/multiz9way/anno/multiz9way.maf
# by loading this into the table multiz9way, it will replace the
# previously loaded table with the unannotated mafs
# huge temp files are made, do them on local disk
cd /scratch/tmp
time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/calJac1/multiz9way/anno \
calJac1 multiz9way
# Loaded 9243378 mafs in 1 files from /gbdb/calJac1/multiz9way/anno
# real 5m39.367s
# normally filter this for chrom size > 1,000,000 and only load
# those chroms. But this is a scaffold assembly, load everything:
hgLoadMafSummary calJac1 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz9waySummary \
/gbdb/calJac1/multiz9way/anno/multiz9way.maf
# Created 121083 summary blocks from 3410157 components and 749940 mafs
# from /gbdb/calJac1/multiz9way/anno/multiz9way.maf
# by loading this into the table multiz9waySummary, it will replace
# the previously loaded table with the unannotated mafs
# remove the multiz9way*.tab files in this /scratch/tmp directory
rm multiz9way*.tab
# And, you can remove the previously loaded non-annotated maf file link:
rm /gbdb/calJac1/multiz9way/maf/multiz9way.maf
rmdir /gbdb/calJac1/multiz9way/maf
###########################################################################
## Annotate 9-way multiple alignment with gene annotations
## (DONE - 2008-01-08 - Hiram)
# Gene frames
## given previous survey done for 8-way alignment on Orangutan,
## try using the following tables for this gene annotation
# use knownGene for hg18, mm9
# use ensGene for monDom4, ornAna1, panTro2, rheMac2
# new try with xenoMrna for ponAbe2, canFam2 and calJac1
ssh hgwdev
mkdir /cluster/data/calJac1/bed/multiz9way/frames
cd /cluster/data/calJac1/bed/multiz9way/frames
mkdir genes
# knownGene
for DB in hg18 mm9
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# ensGene
for DB in monDom4 ornAna1 panTro2 rheMac2
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# use xenoMrna for ponAbe2, canFam2, calJac1
# loxAfr1 oryCun1 ponAbe2
for DB in ponAbe2 canFam2 calJac1
do
tmpExt=`mktemp temp.XXXXXX`
tmpMrnaCds=${DB}.mrna-cds.${tmpExt}
tmpMrna=${DB}.mrna.${tmpExt}
tmpCds=${DB}.cds.${tmpExt}
hgsql -N -e 'select xenoMrna.qName,cds.name,xenoMrna.* \
from xenoMrna,gbCdnaInfo,cds \
where (xenoMrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
$DB > ${tmpMrnaCds}
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz
rm -f $tmpExt
echo "${DB} done"
done
ls -og genes
# -rw-rw-r-- 1 2697811 Jan 8 15:43 calJac1.gp.gz
# -rw-rw-r-- 1 2551552 Jan 8 15:41 canFam2.gp.gz
# -rw-rw-r-- 1 2008806 Jan 8 15:33 hg18.gp.gz
# -rw-rw-r-- 1 1965274 Jan 8 15:33 mm9.gp.gz
# -rw-rw-r-- 1 1751726 Jan 8 15:33 monDom4.gp.gz
# -rw-rw-r-- 1 1232719 Jan 8 15:33 ornAna1.gp.gz
# -rw-rw-r-- 1 1980696 Jan 8 15:33 panTro2.gp.gz
# -rw-rw-r-- 1 2703247 Jan 8 15:39 ponAbe2.gp.gz
# -rw-rw-r-- 1 1935916 Jan 8 15:33 rheMac2.gp.gz
ssh kkstore06
cd /cluster/data/calJac1/bed/multiz9way/frames
# anything to annotate is in a pair, e.g.: calJac1 genes/calJac1.gp.gz
time (cat ../anno/calJac1.anno.9way.maf | nice -n +19 genePredToMafFrames calJac1 stdin stdout calJac1 genes/calJac1.gp.gz hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz rheMac2 genes/rheMac2.gp.gz ponAbe2 genes/ponAbe2.gp.gz panTro2 genes/panTro2.gp.gz canFam2 genes/canFam2.gp.gz monDom4 genes/monDom4.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz9way.mafFrames.gz) > frames.log 2>&1
# see what it looks like in terms of number of annotations per DB:
zcat multiz9way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
# 206370 hg18
# 208834 panTro2
# 211731 rheMac2
# 224988 calJac1
# 225518 canFam2
# 225632 mm9
# 261163 ponAbe2
# 417544 ornAna1
# 462890 monDom4
# load the resulting file
ssh hgwdev
cd /cluster/data/calJac1/bed/multiz9way/frames
time nice -n +19 hgLoadMafFrames calJac1 multiz9wayFrames \
multiz9way.mafFrames.gz
# real 0m38.282s
# enable the trackDb entries:
# frames multiz9wayFrames
# irows on
#############################################################################
# phastCons 9-way (DONE - 2007-10-16 - Hiram)
# split 9way mafs into 10M chunks and generate sufficient statistics
# files for phastCons
ssh memk
mkdir /cluster/data/calJac1/bed/multiz9way/msa.split
cd /cluster/data/calJac1/bed/multiz9way/msa.split
mkdir -p /san/sanvol1/scratch/calJac1/multiz9way/cons/ss
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
# Run msa_split on one annotated contig maf, producing SS output.
#   $1 - result subdirectory, also the subdirectory of $MAFS holding the maf
#   $2 - contig name, the maf file name without its .maf suffix
set MAFS = /cluster/data/calJac1/bed/multiz9way/anno/maf
set WINDOWS = /san/sanvol1/scratch/calJac1/multiz9way/cons/ss
# the split itself runs inside $WINDOWS
pushd $WINDOWS
set resultDir = $1
set c = $2
# remove any earlier result for this contig
rm -fr $resultDir/$c
mkdir -p $resultDir
# extract the contig sequence to a scratch fasta file for msa_split -M
twoBitToFa -seq=$c /scratch/data/calJac1/calJac1.2bit /scratch/tmp/calJac1.$c.fa
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$resultDir/$c.maf -i MAF \
-M /scratch/tmp/calJac1.$c.fa \
-o SS -r $resultDir/$c -w 10000000,0 -I 1000 -B 5000
rm -f /scratch/tmp/calJac1.$c.fa
popd
# back in the launch directory: this second mkdir creates $resultDir here,
# not in $WINDOWS, to hold the .out marker file named on the job line
mkdir -p $resultDir
date > $resultDir/$c.out
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
cat << '_EOF_' > template
#LOOP
doSplit.csh $(dir1) $(root1) {check out line+ $(dir1)/$(root1).out}
#ENDLOOP
'_EOF_'
# << happy emacs
# create list of maf files:
(cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list
gensub2 maf.list single template jobList
para create jobList
para try ... check ... etc
# CPU time in finished jobs: 5250s 87.50m 1.46h 0.06d 0.000 y
# IO & Wait Time: 94631s 1577.18m 26.29h 1.10d 0.003 y
# Average job time: 3s 0.05m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 9s 0.15m 0.00h 0.00d
# Submission to last job: 3697s 61.62m 1.03h 0.04d
# take the cons and noncons trees from the mouse 30-way
# Estimates are not easy to make, probably more correctly,
# take the 30-way .mod file, and re-use it here.
ssh hgwdev
cd /cluster/data/calJac1/bed/multiz9way
cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod .
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
ssh memk
mkdir -p /cluster/data/calJac1/bed/multiz9way/cons/run.cons
cd /cluster/data/calJac1/bed/multiz9way/cons/run.cons
# there are going to be several different phastCons runs using
# this same script. They trigger off of the current working directory
# $cwd:t which is the "grp" in this script. It is one of:
# all gliers placentals
# Well, that's what it was when used in the Mm9 30-way,
# in this instance, there is only the directory "all"
cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
# Run phastCons on one SS file and store its .pp and .bed output on the san.
#   $1 - subdirectory of the ss directory holding the SS file
#   $2 - SS file name without the .ss suffix
#   $3 - value passed to --expected-length
#   $4 - value passed to --target-coverage
#   $5 - value passed to --rho
# The group name is the last component of the current working directory.
set PHASTBIN = /cluster/bin/phast.2007-05-04
set subDir = $1
set f = $2
# :r removes the last dot-suffix of $2; used for --seqname and --idpref
set c = $2:r
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set tmp = /scratch/tmp/$f
set cons = /cluster/data/calJac1/bed/multiz9way/cons
mkdir -p $tmp
set san = /san/sanvol1/scratch/calJac1/multiz9way/cons
# stage the inputs in $tmp; in each branch the second cp repeats the files
# of the first cp and adds the .ss file
if (-s $cons/$grp/$grp.non-inf) then
cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
else
cp -p $cons/$grp/$grp.mod $tmp
cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
# pass --not-informative only when a non-empty .non-inf file was staged
if (-s $grp.non-inf) then
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative `cat $grp.non-inf` \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p $san/$grp/pp/$subDir $san/$grp/bed/$subDir
# NOTE(review): the sleep and touch presumably give the shared file system
# time to show the new directories - confirm
sleep 4
touch $san/$grp/pp/$subDir $san/$grp/bed/$subDir
# replace any earlier results, then remove the work directory
rm -f $san/$grp/pp/$subDir/$f.pp
rm -f $san/$grp/bed/$subDir/$f.bed
mv $tmp/$f.pp $san/$grp/pp/$subDir
mv $tmp/$f.bed $san/$grp/bed/$subDir
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod a+x doPhast.csh
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/calJac1/multiz9way/cons/all/pp/$(root1)/$(file1).pp}
#ENDLOOP
'_EOF_'
# << happy emacs
# Create parasol batch and run it
# List every SS file on the san, relative to the cons directory and
# without the .ss suffix, as the job list for the phastCons batch.
pushd /san/sanvol1/scratch/calJac1/multiz9way/cons
find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" \
    > /cluster/data/calJac1/bed/multiz9way/cons/ss.list
# return to /cluster/data/calJac1/bed/multiz9way/cons/run.cons; without
# this popd the cd .. below lands in /san/sanvol1/scratch/calJac1/multiz9way
# and ../../mm9.30way.mod does not resolve to the copy made earlier in
# /cluster/data/calJac1/bed/multiz9way
popd
# run for all species
cd ..
mkdir -p all run.cons/all
cd all
# prune the mm9 30-way model down to the nine species of this alignment;
# doPhast.csh reads the result as $cons/$grp/$grp.mod with grp = all
/cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \
    --prune-all-but=calJac1,hg18,panTro2,rheMac2,ponAbe2,mm9,canFam2,monDom4,ornAna1 \
    > all.mod
# NOTE(review): the template and gensub2 call that follow use
# ../run.cons/doPhast.csh and ../ss.list, which resolve from cons/all
# rather than from cons/run.cons/all - confirm which directory the batch
# was actually run from
cd ../run.cons/all
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create template file for "all" run
cat << '_EOF_' > template
#LOOP
../run.cons/doPhast.csh $(lastDir1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/calJac1/multiz9way/cons/all/pp/$(lastDir1)/$(file1).pp}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../ss.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 28485 of 28485 jobs
# CPU time in finished jobs: 14082s 234.70m 3.91h 0.16d 0.000 y
# IO & Wait Time: 188534s 3142.23m 52.37h 2.18d 0.006 y
# Average job time: 7s 0.12m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 17s 0.28m 0.00h 0.00d
# Submission to last job: 72420s 1207.00m 20.12h 0.84d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/calJac1/multiz9way/cons/all
find ./bed -type f -name "Contig*.bed" | xargs cat \
| sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 3 minutes
cp -p mostConserved.bed /cluster/data/calJac1/bed/multiz9way/cons/all
# load into database
ssh hgwdev
cd /cluster/data/calJac1/bed/multiz9way/cons/all
time nice -n +19 hgLoadBed calJac1 phastConsElements9way mostConserved.bed
# Loaded 1297014 elements of size 5
# Try for 5% overall cov, and 70% CDS cov
# We don't have any gene tracks to compare CDS coverage
# --rho .31 --expected-length 45 --target-coverage .3
featureBits calJac1 phastConsElements9way
# 141561229 bases of 2929139385 (4.833%) in intersection
# Create merged posterior probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /san/sanvol1/scratch/calJac1/multiz9way/cons/all
mkdir -p phastCons9wayScores
for D in `ls -1d pp/file* | sort -t_ -k2n`
do
F=${D/pp\/}
out=phastCons9wayScores/${F}.data.gz
echo "${D} > ${F}.data.gz"
ls -S ${D}/*.pp | xargs cat | gzip > ${out}
done
# real 38m22.760s
# copy the phastCons9wayScores to:
# /cluster/data/calJac1/bed/multiz9way/downloads/phastCons9way/phastConsScores
# for hgdownload downloads
# Create merged posterior probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
cd /san/sanvol1/scratch/calJac1/multiz9way/cons/all
ls -1 phastCons9wayScores/*.data.gz | sort -t_ -k2n | xargs zcat \
| wigEncode -noOverlap stdin phastCons9way.wig phastCons9way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
time nice -n +19 cp -p *.wi? /cluster/data/calJac1/bed/multiz9way/cons/all
# real 1m4.483s
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/calJac1/bed/multiz9way/cons/all
ln -s `pwd`/phastCons9way.wib /gbdb/calJac1/multiz9way/phastCons9way.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/calJac1/multiz9way calJac1 \
phastCons9way phastCons9way.wig
# real 0m56.271s
# remove garbage
rm wiggle.tab
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/calJac1/bed/multiz9way/cons/all
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=calJac1 phastCons9way > histogram.data 2>&1
# real 5m0.608s
# create plot of histogram:
# Feed a gnuplot script on stdin and save the PNG it emits as histo.png.
# The script plots histogram.data twice: columns 2:5 as impulses titled
# RelFreq on the left axis, and columns 2:7 as a line titled CRF on the
# right (y2) axis.  The title names this assembly (Marmoset calJac1); it
# previously carried the Orangutan PonAbe2 label from another build doc.
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Marmoset calJac1 Histogram phastCons9way track"
set xlabel " phastCons9way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
    "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
# These trackDb entries turn on the wiggle phastCons data track:
# type wigMaf 0.0 1.0
# maxHeightPixels 100:40:11
# wiggle phastCons9way
# spanList 1
# autoScale Off
# windowingFunction mean
# pairwiseHeight 12
# yLineOnOff Off
#############################################################################
# Downloads (DONE - 2008-01-11 - Hiram)
# Let's see if the downloads will work
ssh hgwdev
# work from the assembly directory: the symlink and the jkStuff/ log path
# below are relative to it (the cd was missing from this line)
cd /cluster/data/calJac1
# expecting to find repeat masker .out file here:
ln -s bed/RepeatMasker/calJac1.fa.out .
time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \
    -workhorse=hgwdev calJac1 > jkStuff/downloads.log 2>&1
# real 24m3.210s
# failed making upstream sequences:
# featureBits calJac1 mgcGenes:upstream:1000 -fa=stdout
# setpriority: Permission denied.
# the 'nice' from my bash shell causes trouble inside the csh
# script which uses nice. Finish off the install step manually
# with the mgcGenes upstreams ...
#############################################################################
# PushQ entries (DONE - 2008-01-11 - Hiram)
ssh hgwdev
# work from the assembly directory: jkStuff/pushQ.sql is relative to it
# (the cd was missing from this line)
cd /cluster/data/calJac1
/cluster/bin/scripts/makePushQSql.pl calJac1 > jkStuff/pushQ.sql
# output warnings:
# calJac1 does not have seq
# calJac1 does not have gbMiscDiff
# Could not tell (from trackDb, all.joiner and hardcoded lists of supporting
# and genbank tables) which tracks to assign these tables to:
# genscanPep
#############################################################################
# Create 9-way downloads (DONE - 2008-03-28 - Hiram)
ssh hgwdev
mkdir -p /cluster/data/calJac1/bed/multiz9way/downloads/phastCons9way
cd /cluster/data/calJac1/bed/multiz9way/downloads/phastCons9way
cp -p \
/san/sanvol1/scratch/calJac1/multiz9way/cons/all/phastCons9wayScores/* .
ln -s ../../cons/all/all.mod ./9way.mod
cp /cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/README.txt .
# edit that README.txt to be correct for this 9-way alignment
cd ..
mkdir multiz9way
cd multiz9way
cp -p /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/README.txt .
# edit that README.txt to be correct for this 9-way alignment
ssh kkstore06
mkdir -p /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way
cd /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way
ln -s ../../calJac1.9-way.nh ./9way.nh
# compress the annotated maf built in /cluster/data/calJac1/bed/multiz9way/anno;
# from this directory that is ../../anno (the path previously had one
# ../ too many and pointed at /cluster/data/calJac1/bed/anno)
time nice -n +19 gzip -c ../../anno/calJac1.anno.9way.maf \
    > calJac1.9way.maf.gz
# real 310m12.800s
# unusually long time due to nice +19 and conflict with other long-running
# jobs on kkstore06
ssh hgwdev
cd /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way
# creating upstream files from xenoRefGene, bash script:
cat << '_EOF_' > mkUpstream.sh
#!/bin/bash
# Build upstream1000.maf.gz, upstream2000.maf.gz and upstream5000.maf.gz
# in the current directory from the xenoRefGene track of calJac1 and the
# multiz9way alignment.
DB=calJac1
GENE=xenoRefGene
NWAY=multiz9way
# only DB and GENE are exported; NWAY is used solely within this script
export DB GENE
for S in 1000 2000 5000
do
echo "making upstream${S}.maf"
# featureBits emits the upstream regions as bed on stdout (the fasta
# output is discarded); the perl step replaces everything from "_up" to
# the next tab with a tab and a 0; the sorted bed is then fed to mafFrags,
# whose output is gzipped.
# NOTE(review): presumably the perl edit turns the name suffix into a
# score column as mafFrags expects - confirm against the mafFrags usage
featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
| $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \
stdin stdout \
-orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \
| gzip -c > upstream${S}.maf.gz
echo "done upstream${S}.maf.gz"
done
'_EOF_'
# << happy emacs
chmod +x ./mkUpstream.sh
time nice -n +19 ./mkUpstream.sh
# real 119m5.562s
# -rw-rw-r-- 1 42975041 Mar 28 14:27 upstream1000.maf.gz
# -rw-rw-r-- 1 76363192 Mar 28 15:03 upstream2000.maf.gz
# -rw-rw-r-- 1 303870318 Mar 28 15:42 upstream5000.maf.gz
# check the names in these upstream files to ensure sanity:
zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \
| sort | uniq -c | sort -rn | less
# should be a list of the other 8 species with a high count,
# then xenoRefGene names, e.g.:
# 51588 rheMac2
# 51588 ponAbe2
# 51588 panTro2
# 51588 ornAna1
# 51588 monDom4
# 51588 mm9
# 51588 hg18
# 51588 canFam2
# 18 NM_001033610
# 17 NM_016957
# 17 NM_000992
# 16 NM_181722
ssh kkstore06
cd /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way
md5sum *.maf.gz > md5sum.txt
cd ../phastCons9way
md5sum *.data.gz *.mod > md5sum.txt
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/calJac1/multiz9way
mkdir /usr/local/apache/htdocs/goldenPath/calJac1/phastCons9way
cd /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way
ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/calJac1/multiz9way
cd ../phastCons9way
ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/calJac1/phastCons9way
# if your ln -s `pwd`/* made extra links to files you don't want there,
# check the goldenPath locations and remove those extra links
#############################################################################
# N-SCAN gene predictions (nscanGene) - (2008-04-03 markd)
# obtained NSCAN predictions from michael brent's group
# at WUSTL
cd /cluster/data/calJac1/bed/nscan/
wget http://mblab.wustl.edu/predictions/marmoset/calJac1/calJac1.gtf
wget http://mblab.wustl.edu/predictions/marmoset/calJac1/calJac1.prot.fa
wget http://mblab.wustl.edu/predictions/marmoset/calJac1/readme.html
bzip2 calJac1.*
chmod a-w *
# load track
gtfToGenePred -genePredExt calJac1.gtf.bz2 stdout | hgLoadGenePred -bin -genePredExt calJac1 nscanGene stdin
hgPepPred calJac1 generic nscanPep calJac1.prot.fa.bz2
rm *.tab
# update trackDb; need a calJac1-specific page to describe informants
marmoset/calJac1/nscanGene.html (copy from readme.html)
marmoset/calJac1/trackDb.ra
# set search regex to
termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9]
#############################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
calJac1.upstreamGeneTbl = xenoRefGene
calJac1.upstreamMaf = multiz9way /hive/data/genomes/calJac1/bed/multiz9way/species.list
############################################################################
# QUALITY TRACK (DONE - 2008-11-25 - Hiram)
mkdir /hive/data/genomes/calJac1/bed/qual
cd /hive/data/genomes/calJac1/bed/qual
# the qac file was created by Rico during 28-way annotations
qacToWig -fixed ../quality/calJac1.qac stdout \
| wigEncode stdin qual.wig qual.wib
ln -s `pwd`/qual.wib /gbdb/calJac1/wib
hgLoadWiggle -pathPrefix=/gbdb/calJac1/wib calJac1 quality qual.wig
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
+
############################################################################
+# LIFTOVER TO calJac3 (DONE - 2010-02-11 - Hiram )
+ mkdir /hive/data/genomes/calJac1/bed/blat.calJac3.2010-02-11
+ cd /hive/data/genomes/calJac1/bed/blat.calJac3.2010-02-11
+ # -debug run to create run dir, preview scripts...
+ doSameSpeciesLiftOver.pl -debug calJac1 calJac3
+ # Real run:
+ time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
+ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \
+ calJac1 calJac3 > do.log 2>&1
+ # real 36m16.693s
+#############################################################################