# for emacs: -*- mode: sh; -*-
# This file describes browser build for the Orangutan
# genome, July 31 2007
#
# "$Id$"
#
######################################################################
## DOWNLOAD SEQUENCE (DONE - 2007-09-06 - Hiram)
ssh kkstore02
mkdir /cluster/store5/ponAbe2
ln -s /cluster/store5/ponAbe2 /cluster/data/ponAbe2
mkdir /cluster/data/ponAbe2/wustl
cd /cluster/data/ponAbe2/wustl
# LaDeana Hillier provided us with a private copy before it got to the
# public site
wget --timestamping \
"http://genome.wustl.edu/pub/user/lhillier/private/P_abelii/orang_070731.tar.gz"
tar xvzf orang_070731.tar.gz
# creates a directory FINAL_070731
ls -ogrt
# drwxrwsr-x 2 4096 Sep 6 03:29 FINAL_070731
# -rw-rw-r-- 1 906641840 Sep 6 05:01 orang_070731.tar.gz
# chrX.agp and chr20.agp had blank lines at the end, remove those manually
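#	the edit was done by hand; an equivalent command would be
#	something like (a sketch, assuming GNU sed is available):
#	sed -i -e '/^$/d' FINAL_070731/chrX.agp FINAL_070731/chr20.agp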
#######################################################################
## create config.ra and run makeGenomeDb.pl (DONE - 2007-09-12 - Hiram)
ssh kkstore02
cd /cluster/data/ponAbe2
cat << '_EOF_' > ponAbe2.config.ra
# Config parameters for makeGenomeDb.pl:
db ponAbe2
scientificName Pongo pygmaeus abelii
commonName Orangutan
assemblyDate Jul. 2007
assemblyLabel WUSTL 3.0
orderKey 31
# clade already on in ponAbe1
# clade mammal
# genomeCladePriority 11
# mitoAcc gi:1743294
mitoAcc X97707.1
fastaFiles /cluster/data/ponAbe2/wustl/FINAL_070731/chr*.fa
agpFiles /cluster/data/ponAbe2/wustl/FINAL_070731/chr*.agp
# qualFiles /dev/null
dbDbSpeciesDir orangutan
'_EOF_'
# << happy emacs
time nice -n +19 ~/kent/src/hg/utils/automation/makeGenomeDb.pl \
ponAbe2.config.ra > makeGenomeDb.out 2>&1 &
# real 19m49.887s
# failed in the makeDb due to chrM vs. AGP confusion. Fixed the code,
# and ran the jkStuff/makeDb.csh manually. Then continuing:
time nice -n +19 ~/kent/src/hg/utils/automation/makeGenomeDb.pl \
-continue=dbDb -stop=dbDb ponAbe2.config.ra > dbDb.out 2>&1 &
# no need to run the trackDb business since we can simply
# copy the ponAbe1 directory in the source tree
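#	a rough sketch of that copy (paths per the usual trackDb layout; the
#	exact edits made afterwards are not recorded here):
#	cd ~/kent/src/hg/makeDb/trackDb/orangutan
#	cp -pR ponAbe1 ponAbe2
#	then edit ponAbe2/description.html to describe this assembly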
##########################################################################
## Repeat masker (DONE - 2007-09-15 - Hiram)
ssh kkstore02
## use screen for this
mkdir /cluster/data/ponAbe2/bed/RepeatMasker
cd /cluster/data/ponAbe2/bed/RepeatMasker
time nice -n +19 ~/kent/src/hg/utils/automation/doRepeatMasker.pl \
-bigClusterHub=pk \
-buildDir=/cluster/data/ponAbe2/bed/RepeatMasker ponAbe2 > do.out 2>&1 &
# real 1831m46.301s
##############################################################################
## simpleRepeat masking (DONE - 2007-09-19 - Hiram)
## create a kki kluster run
ssh kkr1u00
mkdir /iscratch/i/ponAbe2
cd /iscratch/i/ponAbe2
cp -p /cluster/data/ponAbe2/ponAbe2.unmasked.2bit .
cp -p /cluster/data/ponAbe2/chrom.sizes .
twoBitToFa ponAbe2.unmasked.2bit ponAbe2.unmasked.fa
mkdir split
# split sequence into individual chrom sequences
faSplit sequence ponAbe2.unmasked.fa 100 split/s_
rm ponAbe2.unmasked.fa ponAbe2.unmasked.2bit
for R in 2 3 4 5 6 7 8
do
rsync -a --progress /iscratch/i/ponAbe2/ kkr${R}u00:/iscratch/i/ponAbe2/
done
ssh kki
mkdir -p /cluster/data/ponAbe2/bed/simpleRepeat/trf
cd /cluster/data/ponAbe2/bed/simpleRepeat/trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set C = $1:r
set SRC = /iscratch/i/ponAbe2/split/$C.fa
mkdir -p /scratch/tmp/$C
cp -p $SRC /scratch/tmp/$C/$C.fa
pushd /scratch/tmp/$C
/cluster/bin/x86_64/trfBig -trf=/cluster/bin/x86_64/trf $C.fa \
/dev/null -bedAt=$C.bed -tempDir=/scratch/tmp/$C
popd
rm -f $C.bed
cp -p /scratch/tmp/$C/$C.bed .
rm -fr /scratch/tmp/$C
'_EOF_'
# << happy emacs
chmod +x runTrf
cat << '_EOF_' > template
#LOOP
./runTrf $(path1) {check out line $(root1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
ls /iscratch/i/ponAbe2/split > part.list
gensub2 part.list single template jobList
para create jobList
para try ... check ... push ... etc ...
# Completed: 55 of 55 jobs
# CPU time in finished jobs: 57854s 964.23m 16.07h 0.67d 0.002 y
# IO & Wait Time: 473s 7.88m 0.13h 0.01d 0.000 y
# Average job time: 1060s 17.67m 0.29h 0.01d
# Longest finished job: 32538s 542.30m 9.04h 0.38d
# Submission to last job: 32538s 542.30m 9.04h 0.38d
ssh kkstore02
cd /cluster/data/ponAbe2/bed/simpleRepeat/trf
cat s_*.bed > ../simpleRepeat.bed
cd ..
awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
ssh hgwdev
cd /cluster/data/ponAbe2/bed/simpleRepeat
time nice -n +19 hgLoadBed ponAbe2 simpleRepeat \
simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Loaded 1036457 elements of size 16
nice -n +19 featureBits ponAbe2 simpleRepeat \
> fb.simpleRepeat.ponAbe2.txt 2>&1 &
cat fb.simpleRepeat.ponAbe2.txt
# 120386097 bases of 3093572278 (3.891%) in intersection
# add the trfMask to the rmsk masked sequence to get our final
# masked sequence
ssh kkstore02
cd /cluster/data/ponAbe2
time nice -n +19 cat bed/simpleRepeat/trfMask.bed \
| twoBitMask -add -type=.bed ponAbe2.rmsk.2bit stdin ponAbe2.2bit
# can safely ignore the warning about >= 13 fields
# measure it
time nice -n +19 twoBitToFa ponAbe2.2bit stdout \
| faSize stdin > faSize.ponAbe2.2bit.txt 2>&1
grep masked faSize.ponAbe2.2bit.txt
# %45.68 masked total, %50.89 masked real
## clean up the /iscratch/i/ponAbe2/ directory
ssh kkr1u00
cd /iscratch/i/ponAbe2
rm -fr *
for R in 2 3 4 5 6 7 8
do
rsync -a --progress --delete --stats /iscratch/i/ponAbe2/ kkr${R}u00:/iscratch/i/ponAbe2/
ssh kkr${R}u00 rmdir /iscratch/i/ponAbe2
done
cd ..
rmdir ponAbe2
############################################################################
# BLATSERVERS ENTRY (DONE - 2007-09-19 - Hiram)
# Replaced the ponAbe1 blat server with this sequence
ssh hgwdev
hgsql -e 'update blatServers set db="ponAbe2" where db="ponAbe1"' \
hgcentraltest
# Do not need to do this insert, just the update above
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("ponAbe2", "blat13", "17784", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("ponAbe2", "blat13", "17785", "0", "1");' \
hgcentraltest
# test it with some sequence, i.e. the GABRA3 protein from Hg18
# to establish the defaultPos next:
############################################################################
# Reset default position to GABRA3 gene location (like Hg18)
hgsql -e \
'update dbDb set defaultPos="chrX:152299762-152501081" where name="ponAbe2";' \
hgcentraltest
############################################################################
# SWAP BLASTZ Mouse Mm9 (DONE - 2007-09-20 - Hiram)
# the original
ssh kkstore02
screen # control with a screen
cd /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19
cat fb.mm9.chainPonAbe2Link.txt
# 914561309 bases of 2620346127 (34.902%) in intersection
# And, for the swap
mkdir /cluster/data/ponAbe2/bed/blastz.mm9.swap
cd /cluster/data/ponAbe2/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastzPonAbe2.2007-09-19/DEF \
-chainMinScore=3000 -swap -chainLinearGap=medium \
-bigClusterHub=pk > swap.log 2>&1 &
# real 102m23.209s
cat fb.ponAbe2.chainMm9Link.txt
# 948458190 bases of 3093572278 (30.659%) in intersection
# create syntenicNets for 7-way conservation maf use
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastzPonAbe2.2007-09-19/DEF \
-continue=syntenicNet -syntenicNet \
-chainMinScore=3000 -swap -chainLinearGap=medium \
-bigClusterHub=pk > synNet.log 2>&1 &
# real 80m44.564s
############################################################################
# GENBANK AUTO UPDATE (DONE - 2008-02-20 - Hiram)
# Create a lift file as per the procedures for Chimp from the AGP:
ssh kolossus
cd /cluster/data/ponAbe2
cat ponAbe2.agp | /cluster/bin/scripts/agpToLift -revStrand \
> jkStuff/genbank.lft
# MAKE 11.OOC FILE FOR BLAT
blat ponAbe2.2bit \
/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1024
# Wrote 36396 overused 11-mers to 11.ooc
# copy that over to /iscratch/i/ponAbe2/ and get that directory copied
# to all the Iservers
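# the copy itself is not recorded here; roughly, following the rsync
# loops used earlier in this file (a sketch):
ssh kkr1u00
mkdir -p /iscratch/i/ponAbe2
cp -p /cluster/data/ponAbe2/ponAbe2.2bit /cluster/data/ponAbe2/11.ooc \
	/iscratch/i/ponAbe2/
for R in 2 3 4 5 6 7 8
do
    rsync -a --progress /iscratch/i/ponAbe2/ kkr${R}u00:/iscratch/i/ponAbe2/
done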
# align with latest genbank process.
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add ponAbe2 just after panTro2
# ponAbe2
# Orangutan
ponAbe2.serverGenome = /cluster/data/ponAbe2/ponAbe2.2bit
ponAbe2.clusterGenome = /iscratch/i/ponAbe2/ponAbe2.2bit
ponAbe2.ooc = /iscratch/i/ponAbe2/11.ooc
ponAbe2.align.unplacedChroms = chrUn,chr*_random
ponAbe2.lift = /cluster/data/ponAbe2/jkStuff/genbank.lft
ponAbe2.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
ponAbe2.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
ponAbe2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
ponAbe2.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
ponAbe2.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
ponAbe2.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter}
ponAbe2.downloadDir = ponAbe2
ponAbe2.genbank.est.xeno.load = no
ponAbe2.refseq.mrna.native.load = yes
ponAbe2.refseq.mrna.xeno.load = yes
ponAbe2.refseq.mrna.xeno.loadDesc = yes
cvs ci -m "Added ponAbe2." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
# Edit src/lib/gbGenome.c to add new species. With these two lines:
# static char *ponAbeNames[] = {"Pongo abelii", "Pongo pygmaeus",
# "Pongo pygmaeus pygmaeus", "Pongo pygmaeus abelii", NULL};
cvs ci -m "Added Pongo pygmaeus abelii (Orangutan)." src/lib/gbGenome.c
make install-server
ssh genbank
screen # control this business with a screen since it takes a while
cd /cluster/data/genbank
# 2008-02-14 12:28 add other Pongo names to gbGenome.c,
# ssh to genbank machine, remove data/aligned/*/ponAbe2
# and start this build again.
# This is a call to a script that will push our jobs out to the cluster
# since it's a big job.
time nice -n +19 bin/gbAlignStep -initial ponAbe2 &
# logFile: var/build/logs/2008.02.14-12:28:36.ponAbe2.initalign.log
# real 702m19.137s
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad ponAbe2
# logFile: var/dbload/hgwdev/logs/2008.02.22-17:08:01.dbload.log
# real 21m12.354s
# enable daily alignment and update of hgwdev (DONE - 2007-11-09 - Hiram)
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add ponAbe2 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added ponAbe2." etc/align.dbs etc/hgwdev.dbs
make etc-update
#########################################################################
#########################################################################
# MAKE DOWNLOADABLE / GOLDENPATH FILES (SCRIPT DONE 10/1/07 angie -- README EDITS TODO)
cd /cluster/data/ponAbe2
# Thought this was done in most chrom-based assemblies:
ssh -x kkstore02 splitFileByColumn \
`pwd`/bed/simpleRepeat/trfMask.bed `pwd`/bed/simpleRepeat/trfMaskChrom
# And these aren't quite where expected, so make some symbolic links:
foreach c (5_h2_hap1 \
6_cox_hap1 6_qbl_hap2 6_cox_hap1_random 6_qbl_hap2_random)
set cOldBase = `echo $c | sed -e 's/_random$//;'`
set cNewBase = `echo $c | sed -e 's/_.*//;'`
ln -s `pwd`/$cOldBase/chr$c.agp $cNewBase/chr$c.agp
ln -s `pwd`/$cOldBase/chr$c.fa.out $cNewBase/chr$c.fa.out
end
ln -s bed/RepeatMasker/ponAbe2.fa.out .
makeDownloads.pl ponAbe2 -verbose=2 \
>& jkStuff/downloads.log & tail -f jkStuff/downloads.log
#TODO:
# *** Edit each README.txt to resolve any notes marked with "***":
# /cluster/data/ponAbe2/goldenPath/database/README.txt
# /cluster/data/ponAbe2/goldenPath/bigZips/README.txt
# /cluster/data/ponAbe2/goldenPath/chromosomes/README.txt
# (The htdocs/goldenPath/ponAbe2/*/README.txt "files" are just links to those.)
# *** If you have to make any edits that would always apply to future
# assemblies from the same sequencing center, please edit them into
# ~/kent/src/hg/utils/automation/makeDownloads.pl (or ask Angie for help).
#########################################################################
## genscan run (DONE - 2007-11-08 - Hiram)
## create hard masked sequence
ssh kkstore02
cd /cluster/data/ponAbe2
for C in `cut -f1 chrom.sizes`
do
CN=${C/_*}
CN=${CN/chr}
echo "${CN}/${C}.hard.fa"
twoBitToFa -seq=${C} ponAbe2.2bit stdout \
| maskOutFa stdin hard ${CN}/${C}.hard.fa
ls -ld "${CN}/${C}.hard.fa"
done
# And, make sure there aren't any sequences in this lot that have
# become all N's with no sequence left in them. This drives genscan nuts
faCount ?/*hard.fa ??/*hard.fa > faCount.hard.txt
# the lowest three are:
egrep -v "^#|^total" faCount.hard.txt \
| awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3
# chr6_qbl_hap2_random 44531
# chr5_h2_hap1 25615
# chrM 16079
# this is OK, the numbers mean there is plenty of sequence in those bits
ssh kkr1u00
mkdir /iscratch/i/ponAbe2/hardChunks
cd /iscratch/i/ponAbe2/hardChunks
# creating 4,000,000 sized chunks, the chroms stay together as
# single pieces. The contigs get grouped together into 4,000,000
# sized fasta files. You don't want to break these things up
# because genscan will be doing its own internal 2.4 million
# window on these pieces, and the gene names are going to be
# constructed from the sequence name in these fasta files.
cat /cluster/data/ponAbe2/?/*.hard.fa /cluster/data/ponAbe2/??/*.hard.fa \
| faSplit about stdin 4000000 c_
for R in 2 3 4 5 6 7 8
do
rsync -a --progress ./ kkr${R}u00:/iscratch/i/ponAbe2/hardChunks/
done
ssh hgwdev
mkdir /cluster/data/ponAbe2/bed/genscan
cd /cluster/data/ponAbe2/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Run on small cluster (more mem than big cluster).
ssh kki
cd /cluster/data/ponAbe2/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
# Since we split on gaps, we have no chunks like that. You can
# verify with faCount on the chunks.
ls -1Sr /iscratch/i/ponAbe2/hardChunks/c_*.fa > genome.list
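# a quick sketch of the faCount check mentioned above, listing the
# chunks with the least non-N sequence (all should be well above zero):
faCount /iscratch/i/ponAbe2/hardChunks/c_*.fa | egrep -v "^#|^total" \
	| awk '{print $1,$2-$7}' | sort -k2,2n | head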
# Create run-time script to operate gsBig in a cluster safe manner
cat << '_EOF_' > runGsBig
#!/bin/csh -fe
set runDir = `pwd`
set srcDir = $1
set inFile = $2
set fileRoot = $inFile:r
mkdir /scratch/tmp/$fileRoot
cp -p $srcDir/$inFile /scratch/tmp/$fileRoot
pushd /scratch/tmp/$fileRoot
/cluster/bin/x86_64/gsBig $inFile $fileRoot.gtf -trans=$fileRoot.pep -subopt=$fileRoot.bed -exe=$runDir/hg3rdParty/genscanlinux/genscan -par=$runDir/hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000
popd
cp -p /scratch/tmp/$fileRoot/$fileRoot.gtf gtf
cp -p /scratch/tmp/$fileRoot/$fileRoot.pep pep
cp -p /scratch/tmp/$fileRoot/$fileRoot.bed subopt
rm -fr /scratch/tmp/$fileRoot
'_EOF_'
# << happy emacs
chmod +x runGsBig
cat << '_EOF_' > template
#LOOP
runGsBig /iscratch/i/ponAbe2/hardChunks $(file1) {check out line gtf/$(root1).gtf} {check out line pep/$(root1).pep} {check out line subopt/$(root1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 genome.list single template jobList
para create jobList
para try, check, push, check, ...
# Completed: 44 of 48 jobs
# Crashed: 4 jobs
# CPU time in finished jobs: 327299s 5454.99m 90.92h 3.79d 0.010 y
# IO & Wait Time: 1091s 18.18m 0.30h 0.01d 0.000 y
# Average job time: 7463s 124.39m 2.07h 0.09d
# Longest finished job: 93797s 1563.28m 26.05h 1.09d
# Submission to last job: 104573s 1742.88m 29.05h 1.21d
# four jobs will not finish. They don't finish on kolossus either.
# The sequence involved is:
# c_12.fa -> chr8
# c_08.fa -> chr6
# c_06.fa -> chr5
# c_04.fa -> chr4
# So, split up these chroms on a couple of the non-bridged gaps
ssh kkr1u00
cd /iscratch/i/ponAbe2/hardChunks
mkdir ../genscanSplit
# examine the gap sizes to see if there is a good size range to
# break them up on, c_12 turns out to be about 30,000
faGapSizes c_12.fa
# this makes 6 or 7 pieces for each of these
faSplit -minGapSize=29999 -lift=../genscanSplit/c_12.lift gap c_12.fa \
40000000 ../genscanSplit/c_12_
faSplit -minGapSize=29999 -lift=../genscanSplit/c_08.lift gap c_08.fa \
40000000 ../genscanSplit/c_08_
faSplit -minGapSize=29999 -lift=../genscanSplit/c_06.lift gap c_06.fa \
40000000 ../genscanSplit/c_06_
faSplit -minGapSize=29999 -lift=../genscanSplit/c_04.lift gap c_04.fa \
40000000 ../genscanSplit/c_04_
# copy the genscanSplit directory to the other Iservers
cd ../genscanSplit
for R in 2 3 4 5 6 7 8
do
rsync -a --progress ./ kkr${R}u00:/iscratch/i/ponAbe2/genscanSplit/
done
# and run these four items in a small kluster job
# can re-use the runGsBig script from before, just a different
# source directory location
ssh kki
mkdir /cluster/data/ponAbe2/bed/genscan/split
cd /cluster/data/ponAbe2/bed/genscan/split
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
cat << '_EOF_' > template
#LOOP
../runGsBig /iscratch/i/ponAbe2/genscanSplit $(file1) {check out line gtf/$(root1).gtf} {check out line pep/$(root1).pep} {check out line subopt/$(root1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
ls -1Sr /iscratch/i/ponAbe2/genscanSplit/c_*.fa > genome.list
gensub2 genome.list single template jobList
para create jobList
# even after this split, three of these jobs would not complete
# Completed: 22 of 25 jobs
# Crashed: 3 jobs
# CPU time in finished jobs: 17154s 285.89m 4.76h 0.20d 0.001 y
# IO & Wait Time: 77s 1.29m 0.02h 0.00d 0.000 y
# Average job time: 783s 13.05m 0.22h 0.01d
# Longest finished job: 1135s 18.92m 0.32h 0.01d
# Submission to last job: 22398s 373.30m 6.22h 0.26d
# so, take those three jobs and split them up and run them like this
# after all that is done, lifting all results back to their chroms
for F in c_04 c_08 c_12
do
sort -k1,1 -k4,4n gtf/${F}_*.gtf ../secondSplit/${F}_?.gtf | liftUp -type=.gtf stdout split.lift error stdin \
| sed -e "s/${F}_\([0-9][0-9]*\)\./chr${F}.\1/g" \
| sed -e "s/chrc_04/chr4/g; s/chrc_08/chr6/g; s/chrc_12/chr8/g" \
> ${F}.gtf
sort -k1,1 -k2,2n subopt/${F}_*.bed ../secondSplit/${F}_?.bed | liftUp -type=.bed stdout split.lift error stdin \
| sed -e "s/${F}_\([0-9][0-9]*\)\./chr${F}.\1/g" \
| sed -e "s/chrc_04/chr4/g; s/chrc_08/chr6/g; s/chrc_12/chr8/g" \
> ${F}.bed
cat pep/${F}_*.pep ../secondSplit/${F}_?.pep \
| sed -e "s/${F}_\([0-9][0-9]*\)\./chr${F}.\1/g" \
| sed -e "s/chrc_04/chr4/g; s/chrc_08/chr6/g; s/chrc_12/chr8/g" \
> ${F}.pep
echo $F
done
# that includes the secondary splits for three jobs too.
# the result of all this is copied to ../gtf ../pep and ../subopt
# to fully populate the original result, now continuing with the
# processing of all results
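# that copy was roughly (a sketch, run from this split directory):
cp -p c_04.gtf c_08.gtf c_12.gtf ../gtf
cp -p c_04.pep c_08.pep c_12.pep ../pep
cp -p c_04.bed c_08.bed c_12.bed ../subopt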
# cat and lift the results into single files
ssh kkstore02
cd /cluster/data/ponAbe2/bed/genscan
sort -k1,1 -k4,4n gtf/c_*.gtf > genscan.gtf
sort -k1,1 -k2,2n subopt/c_*.bed > genscanSubopt.bed
cat pep/c_*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/ponAbe2/bed/genscan
ldHgGene ponAbe2 -gtf genscan genscan.gtf
# Read 42190 transcripts in 314789 lines in 1 files
# 42190 groups 54 seqs 1 sources 1 feature types
# 42190 gene predictions
hgPepPred ponAbe2 generic genscanPep genscan.pep
hgLoadBed ponAbe2 genscanSubopt genscanSubopt.bed
# Loaded 517874 elements of size 6
# check the numbers
time nice -n +19 featureBits ponAbe2 genscan
# 52587049 bases of 3093572278 (1.700%) in intersection
###########################################################################
# Blastz Platypus ornAna1 (DONE - 2007-11-20 - Hiram)
ssh kkstore05
screen # use screen to control this job
mkdir /cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13
cd /cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13
cat << '_EOF_' > DEF
# Orangutan vs. platypus
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50
# TARGET: Orangutan PonAbe2
SEQ1_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
SEQ1_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Platypus ornAna1
SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit
SEQ2_LEN=/cluster/data/ornAna1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 &
# real 2585m50.826s
# some of the jobs got confused during kk parasol hub reboots, where they
# think they aren't done, but they actually are. A 'para recover' finds
# nothing to re-run, all the result files are present.
# Completed: 263242 of 263488 jobs
# Crashed: 246 jobs
# CPU time in finished jobs: 48351127s 805852.12m 13430.87h 559.62d 1.533 y
# IO & Wait Time: 7609407s 126823.45m 2113.72h 88.07d 0.241 y
# Average job time: 213s 3.54m 0.06h 0.00d
# Longest finished job: 2146s 35.77m 0.60h 0.02d
# Submission to last job: 259048s 4317.47m 71.96h 3.00d
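# the 'para recover' check mentioned above was along these lines
# (a sketch; run in the batch directory, typically $BASE/run.blastz):
#	para recover jobList jobList.recover
#	wc -l jobList.recover
#	# 0 jobList.recover == nothing left to re-run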
# after verifying all jobs are actually finished, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-continue=cat -bigClusterHub=kk > cat.log 2>&1 &
# real 53m18.757s
cat fb.ponAbe2.chainOrnAna1Link.txt
# 214557704 bases of 3093572278 (6.936%) in intersection
mkdir /cluster/data/ornAna1/bed/blastz.ponAbe2.swap
cd /cluster/data/ornAna1/bed/blastz.ponAbe2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-swap -bigClusterHub=kk > swap.log 2>&1 &
# real 120m20.860s
cat fb.ornAna1.chainPonAbe2Link.txt
# 201368027 bases of 1842236818 (10.931%) in intersection
###########################################################################
# Blastz Marmoset calJac1 (DONE - 2007-11-19 - Hiram)
ssh kkstore02
screen # use screen to control this job
mkdir /cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18
cd /cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18
cat << '_EOF_' > DEF
# Orangutan vs. Marmoset
BLASTZ_M=50
# TARGET: Orangutan PonAbe2
SEQ1_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
SEQ1_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=100
# QUERY: Marmoset calJac1
SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=kk > do.log 2>&1 &
# real 1441m
# Completed: 97520 of 97520 jobs
# CPU time in finished jobs: 21651347s 360855.79m 6014.26h 250.59d 0.687 y
# IO & Wait Time: 10087025s 168117.08m 2801.95h 116.75d 0.320 y
# Average job time: 325s 5.42m 0.09h 0.00d
# Longest finished job: 5645s 94.08m 1.57h 0.07d
# Submission to last job: 67173s 1119.55m 18.66h 0.78d
cat fb.ponAbe2.chainCalJac1Link.txt
# 2310720863 bases of 3093572278 (74.694%) in intersection
# reciprocal best net mafs for 7-way multiz
time nice -n +19 doRecipBest.pl -workhorse=hgwdev ponAbe2 calJac1 \
> rbest.log 2>&1
# this failed at the download step, expects to run on hgwdev
# So, let's see what it makes on hgwdev
ssh hgwdev
cd /cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18
time nice -n +19 doRecipBest.pl -continue=download -workhorse=hgwdev \
ponAbe2 calJac1 > rbest.download.log 2>&1 &
# it populates the directory:
# /usr/local/apache/htdocs/goldenPath/ponAbe2/vsCalJac1/reciprocalBest
ssh kkstore02
mkdir /cluster/data/calJac1/bed/blastz.ponAbe2.swap
cd /cluster/data/calJac1/bed/blastz.ponAbe2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/ponAbe2/bed/blastzCalJac1.2007-11-18/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=kk > swap.log 2>&1 &
# real 341m54.548s
cat fb.calJac1.chainPonAbe2Link.txt
# 2253236255 bases of 2929139385 (76.925%) in intersection
#########################################################################
## 8-Way Multiz (DONE - 2007-11-27 - Hiram)
##
ssh hgwdev
mkdir /cluster/data/ponAbe2/bed/multiz8way
cd /cluster/data/ponAbe2/bed/multiz8way
# take the 30-way tree from mm9 and eliminate genomes not in
# this alignment
# rearrange to get ponAbe2 on the top of the graph
# paste this tree into the on-line phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to create the image for the tree diagram
# select the 8 organisms from the 30-way recently done on mouse mm9
/cluster/bin/phast/tree_doctor \
--prune-all-but Human_hg18,Mouse_mm9,Chimp_panTro2,Orangutan_ponAbe2,Rhesus_rheMac2,Marmoset_calJac1,Platypus_ornAna1,Opossum_monDom4 \
/cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \
> 8-way.fullNames.nh
# looks something like this:
(((Mouse_mm9:0.325818,
((((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037,
Orangutan_ponAbe2:0.020000):0.013037,Rhesus_rheMac2:0.031973):0.036500,
Marmoset_calJac1:0.070000):0.058454):0.263313,
Opossum_monDom4:0.320721):0.088647,Platypus_ornAna1:0.488110);
# rearrange to get Orangutan at the top:
# this leaves us with:
cat << '_EOF_' > ponAbe2.8-way.nh
((((((Orangutan_ponAbe2:0.020000,
(Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037):0.013037,
Rhesus_rheMac2:0.031973):0.036500,
Marmoset_calJac1:0.070000):0.058454,
Mouse_mm9:0.325818):0.263313,
Opossum_monDom4:0.320721):0.088647,
Platypus_ornAna1:0.488110);
'_EOF_'
# << happy emacs
# create a species list from that file:
sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' ponAbe2.8-way.nh \
| sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
| sed -e "s/.*_//; s/:.*//" | sort > species.list
# verify that it has 8 db names in it
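# for example:
wc -l species.list
#	8 species.list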
# create a stripped down nh file for use in autoMZ run
echo \
`sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' ponAbe2.8-way.nh \
| sed -e "s/  */ /g"` > tree.8.nh
# that looks like, as a single line:
# ((((((ponAbe2 (hg18 panTro2)) rheMac2) calJac1) mm9) monDom4) ornAna1)
# verify all blastz's exists
cat << '_EOF_' > listMafs.csh
#!/bin/csh -fe
cd /cluster/data/ponAbe2/bed/multiz8way
foreach db (`cat species.list`)
set bdir = /cluster/data/ponAbe2/bed/blastz.$db
if (-e $bdir/mafRBestNet/chr1.maf.gz) then
echo "$db mafRBestNet"
else if (-e $bdir/mafSynNet/chr1.maf.gz) then
echo "$db mafSynNet"
else if (-e $bdir/mafNet/chr1.maf.gz) then
echo "$db mafNet"
else
echo "$db mafs not found"
endif
end
'_EOF_'
# << happy emacs
chmod +x ./listMafs.csh
# see what it says, the "mafs not found" should only show up on ponAbe2
./listMafs.csh
# calJac1 mafRBestNet
# hg18 mafSynNet
# mm9 mafSynNet
# monDom4 mafNet
# ornAna1 mafNet
# panTro2 mafSynNet
# ponAbe2 mafs not found
# rheMac2 mafSynNet
/cluster/bin/phast/all_dists ponAbe2.8-way.nh > 8way.distances.txt
grep -i ponabe 8way.distances.txt | sort -k3,3n
# Orangutan_ponAbe2 Human_hg18 0.038910
# Orangutan_ponAbe2 Chimp_panTro2 0.040705
# Orangutan_ponAbe2 Rhesus_rheMac2 0.065010
# Orangutan_ponAbe2 Marmoset_calJac1 0.139537
# Orangutan_ponAbe2 Mouse_mm9 0.453809
# Orangutan_ponAbe2 Opossum_monDom4 0.712025
# Orangutan_ponAbe2 Platypus_ornAna1 0.968061
# use the calculated
# distances in the table below to order the organisms and check
# the button order on the browser. Zebrafish ends up before
# tetraodon and fugu on the browser despite its distance.
# And if you can fill in the table below entirely, you have
# succeeded in finishing all the alignments required.
#
# featureBits chainLink measures
# chainPonAbe2Link chain linearGap
# distance on PonAbe2 on other minScore
# 1 0.038910 Human_hg18 (% 91.302) (% 92.892) 3000 medium
# 2 0.040705 Chimp_panTro2 (% 89.431) (% 91.033) 3000 medium
# 3 0.065010 Rhesus_rheMac2 (% 82.623) (% 88.157) 3000 medium
# 4 0.139537 Marmoset_calJac1 (% 74.694) (% 76.925) 3000 medium
# 5 0.453809 Mouse_mm9 (% 30.659) (% 34.902) 3000 medium
# 6 0.712025 Opossum_monDom4 (% 11.306) (% 13.137) 5000 loose
# 7 0.968061 Platypus_ornAna1 (% 6.936) (% 10.931) 5000 loose
# copy net mafs to cluster-friendly storage, splitting chroms
# into 50MB chunks to improve run-time
# NOTE: splitting will be different for scaffold-based reference assemblies
ssh hgwdev
mkdir /cluster/data/ponAbe2/bed/multiz8way/run.split
cd /cluster/data/ponAbe2/bed/multiz8way/run.split
# this works by examining the rmsk table for likely repeat areas
# that won't be used in blastz
mafSplitPos ponAbe2 50 mafSplit.bed
ssh kki
cd /cluster/data/ponAbe2/bed/multiz8way/run.split
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set targDb = "ponAbe2"
set db = $1
set sdir = /san/sanvol1/scratch/$targDb/splitStrictMafNet
mkdir -p $sdir
if (-e $sdir/$db) then
echo "directory $sdir/$db already exists -- remove and retry"
exit 1
endif
set bdir = /cluster/data/$targDb/bed/blastz.$db
if (! -e $bdir) then
echo "directory $bdir not found"
exit 1
endif
mkdir -p $sdir/$db
if (-e $bdir/mafRBestNet) then
set mdir = $bdir/mafRBestNet
else if (-e $bdir/mafSynNet) then
set mdir = $bdir/mafSynNet
else if (-e $bdir/mafNet) then
set mdir = $bdir/mafNet
else
echo "$bdir maf dir not found"
exit 1
endif
echo $mdir
foreach f ($mdir/*)
set c = $f:t:r:r
echo " $c"
nice mafSplit mafSplit.bed $sdir/$db/ $f
end
echo "gzipping $sdir/$db mafs"
nice gzip $sdir/$db/*
echo $mdir > $db.done
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
grep -v ponAbe2 ../species.list > split.list
cat << '_EOF_' > template
#LOOP
doSplit.csh $(path1) {check out line+ $(path1).done}
#ENDLOOP
'_EOF_'
gensub2 split.list single template jobList
para create jobList
# 7 jobs
# start these gently, this is a good load on the san filesystem
para -maxPush=3 push
# wait a while, verify these are running OK
para push
# let that run to a couple completions, a few minutes, then again:
para try
# etc ...
# Completed: 7 of 7 jobs
# CPU time in finished jobs: 6048s 100.81m 1.68h 0.07d 0.000 y
# IO & Wait Time: 593s 9.88m 0.16h 0.01d 0.000 y
# Average job time: 949s 15.81m 0.26h 0.01d
# Longest finished job: 1451s 24.18m 0.40h 0.02d
# Submission to last job: 1471s 24.52m 0.41h 0.02d
# ready for the multiz run
ssh pk
cd /cluster/data/ponAbe2/bed/multiz8way
# actually, the result directory here should be maf.split instead of maf
mkdir -p maf run
cd run
mkdir penn
# use latest penn utilities
P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
cp -p $P/{autoMZ,multiz,maf_project} penn
# list chrom chunks, any db dir will do; better would be for the
# splitter to generate this file
# We temporarily use __ instead of . to delimit chunk in filename
# so we can use $(root) to get basename
find /san/sanvol1/scratch/ponAbe2/splitStrictMafNet -type f \
| while read F; do basename $F; done \
| sed -e 's/.maf.gz//' -e 's/\./__/' | sort -u > chromChunks.list
wc -l chromChunks.list
# 104
cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = ponAbe2
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/splitStrictMafNet
rm -fr $tmp
mkdir -p $tmp
cp ../tree.8.nh ../species.list $tmp
pushd $tmp
foreach s (`cat species.list`)
set c2 = `echo $c | sed 's/__/./'`
set in = $pairs/$s/$c2.maf
set out = $db.$s.sing.maf
if ($s == ponAbe2) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.8.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz.csh
cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/ponAbe2/bed/multiz8way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << emacs
gensub2 chromChunks.list single template jobList
para create jobList
# the 104 jobs run time:
# Completed: 104 of 104 jobs
# CPU time in finished jobs: 174748s 2912.47m 48.54h 2.02d 0.006 y
# IO & Wait Time: 2952s 49.20m 0.82h 0.03d 0.000 y
# Average job time: 1709s 28.48m 0.47h 0.02d
# Longest finished job: 3340s 55.67m 0.93h 0.04d
# Submission to last job: 4413s 73.55m 1.23h 0.05d
# put the split maf results back together into single chroms
ssh kkstore02
cd /cluster/data/ponAbe2/bed/multiz8way
# here is where the result directory maf should have already been maf.split
mv maf maf.split
mkdir maf
# going to sort out the redundant header garbage to leave a cleaner maf
for C in `ls maf.split | sed -e "s#__.*##" | sort -u`
do
echo ${C}
head -q -n 1 maf.split/${C}__*.maf | sort -u > maf/${C}.maf
grep -h "^#" maf.split/${C}__*.maf | egrep -v "maf version=1|eof maf" | \
sed -e "s#_MZ_[^ ]* # #g; s#__[0-9]##g" | sort -u >> maf/${C}.maf
grep -h -v "^#" maf.split/${C}__*.maf >> maf/${C}.maf
tail -q -n 1 maf.split/${C}__*.maf | sort -u >> maf/${C}.maf
done
# load tables for a look
ssh hgwdev
mkdir -p /gbdb/ponAbe2/multiz8way/maf
ln -s /cluster/data/ponAbe2/bed/multiz8way/maf/*.maf \
/gbdb/ponAbe2/multiz8way/maf
# this generates a large 1 Gb multiz8way.tab file in the directory
# where it is running. Best to run this over in scratch.
cd /scratch/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/ponAbe2/multiz8way/maf ponAbe2 multiz8way
# real 6m37.534s
# Loaded 6981959 mafs in 55 files from /gbdb/ponAbe2/multiz8way/maf
# load summary table
time nice -n +19 cat /gbdb/ponAbe2/multiz8way/maf/*.maf \
| hgLoadMafSummary ponAbe2 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz8waySummary stdin
# Created 1417364 summary blocks from 29928557 components
# and 6981421 mafs from stdin
# real 21m35.057s
# Gap Annotation
# prepare bed files with gap info
ssh kkstore02
mkdir /cluster/data/ponAbe2/bed/multiz8way/anno
cd /cluster/data/ponAbe2/bed/multiz8way/anno
mkdir maf run
# these actually already all exist from previous multiple alignments
for DB in `cat ../species.list`
do
CDIR="/cluster/data/${DB}"
if [ ! -f ${CDIR}/${DB}.N.bed ]; then
echo "creating ${DB}.N.bed"
echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
else
ls -og ${CDIR}/${DB}.N.bed
fi
done
cd run
rm -f nBeds sizes
for DB in `grep -v ponAbe2 ../../species.list`
do
echo "${DB} "
ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
echo ${DB}.len >> sizes
done
ssh kki
cd /cluster/data/ponAbe2/bed/multiz8way/anno/run
cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set dir = /cluster/data/ponAbe2/bed/multiz8way
set c = $1
cat $dir/maf/${c}.maf | \
nice mafAddIRows -nBeds=nBeds stdin /cluster/data/ponAbe2/ponAbe2.2bit $2
'_EOF_'
# << happy emacs
chmod +x doAnno.csh
cat << '_EOF_' > template
#LOOP
./doAnno.csh $(root1) {check out line+ /cluster/data/ponAbe2/bed/multiz8way/anno/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 /cluster/data/ponAbe2/chrom.sizes > chrom.list
gensub2 chrom.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 55 of 55 jobs
# CPU time in finished jobs: 2278s 37.97m 0.63h 0.03d 0.000 y
# IO & Wait Time: 9909s 165.15m 2.75h 0.11d 0.000 y
# Average job time: 222s 3.69m 0.06h 0.00d
# Longest finished job: 3340s 55.67m 0.93h 0.04d
# Submission to last job: 3897s 64.95m 1.08h 0.05d
ssh hgwdev
cd /cluster/data/ponAbe2/bed/multiz8way/anno
mkdir -p /gbdb/ponAbe2/multiz8way/anno/maf
ln -s /cluster/data/ponAbe2/bed/multiz8way/anno/maf/*.maf \
/gbdb/ponAbe2/multiz8way/anno/maf
# by loading this into the table multiz8way, it will replace the
# previously loaded table with the unannotated mafs
# huge temp files are made, do them on local disk
cd /scratch/tmp
time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/ponAbe2/multiz8way/anno/maf \
ponAbe2 multiz8way
# Loaded 7331265 mafs in 55 files from /gbdb/ponAbe2/multiz8way/anno/maf
# real 8m31.092s
cat /cluster/data/ponAbe2/chrom.sizes | \
awk '{if ($2 > 1000000) { print $1 }}' |
while read C
do
echo /gbdb/ponAbe2/multiz8way/anno/maf/$C.maf
done | xargs cat | \
hgLoadMafSummary ponAbe2 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz8waySummary stdin
# Created 1417364 summary blocks from 29928557 components and 7330669
# mafs from stdin
# by loading this into the table multiz8waySummary, it will replace
# the previously loaded table with the unannotated mafs
# remove the multiz8way*.tab files in this /scratch/tmp directory
rm multiz8way*
#############################################################################
## Annotate 8-way multiple alignment with gene annotations
## (DONE - 2007-11-27 - Hiram)
# Gene frames
## survey all genomes to see what type of gene track to use
ssh hgwdev
mkdir /cluster/data/ponAbe2/bed/multiz8way/frames
cd /cluster/data/ponAbe2/bed/multiz8way/frames
# dbs: eriEur1, cavPor2, sorAra1 do not exist, can not look at them
cat << '_EOF_' > showGenes.csh
#!/bin/csh -fe
foreach db (`cat ../species.list`)
echo -n "${db}: "
echo -n "Tables: "
set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
foreach table ($tables)
if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
$table == "knownGene") then
set count = `hgsql $db -N -e "select count(*) from $table"`
echo -n "${table}: ${count}, "
endif
end
set orgName = `hgsql hgcentraltest -N -e \
"select scientificName from dbDb where name='$db'"`
set orgId = `hgsql ponAbe2 -N -e \
"select id from organism where name='$orgName'"`
if ($orgId == "") then
echo "Mrnas: 0"
else
set count = `hgsql ponAbe2 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
echo "Mrnas: ${count}"
endif
end
'_EOF_'
# << happy emacs
chmod +x ./showGenes.csh
# given this output, manually sorted for this display:
# hg18: Tables: ensGene: 43569, knownGene: 56722, mgcGenes: 29037,
# refGene: 25902, Mrnas: 209126
# mm9: Tables: knownGene: 49409, mgcGenes: 22950, refGene: 21146, Mrnas: 242485
# monDom4: Tables: ensGene: 33878, refGene: 163, Mrnas: 398
# ornAna1: Tables: ensGene: 25981, refGene: 3, Mrnas: 141
# panTro2: Tables: ensGene: 32852, refGene: 26179, Mrnas: 1297
# rheMac2: Tables: ensGene: 38561, refGene: 426, Mrnas: 3202
# calJac1: Tables: Mrnas: 952
# ponAbe2: Tables: Mrnas: 2
# from this information, conclusions:
# use knownGene for hg18, mm9
# use ensGene for monDom4, ornAna1, panTro2, rheMac2
# no annotations for calJac1, ponAbe2
mkdir genes
# knownGene
for DB in hg18 mm9
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# ensGene
for DB in monDom4 ornAna1 panTro2 rheMac2
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
ls -og genes
# -rw-rw-r-- 1 2008806 Nov 27 14:37 hg18.gp.gz
# -rw-rw-r-- 1 1965274 Nov 27 14:37 mm9.gp.gz
# -rw-rw-r-- 1 1751726 Nov 27 14:37 monDom4.gp.gz
# -rw-rw-r-- 1 1232719 Nov 27 14:37 ornAna1.gp.gz
# -rw-rw-r-- 1 1980696 Nov 27 14:37 panTro2.gp.gz
# -rw-rw-r-- 1 1935916 Nov 27 14:37 rheMac2.gp.gz
ssh kkstore02
cd /cluster/data/ponAbe2/bed/multiz8way/frames
# leaving out calJac1 ponAbe2
time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames ponAbe2 stdin stdout hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz rheMac2 genes/rheMac2.gp.gz panTro2 genes/panTro2.gp.gz monDom4 genes/monDom4.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz8way.mafFrames.gz) > frames.log 2>&1
# see what it looks like in terms of number of annotations per DB:
zcat multiz8way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
# 219627 hg18
# 221459 panTro2
# 230108 rheMac2
# 238734 mm9
# 412384 ornAna1
# 459521 monDom4
# load the resulting file
ssh hgwdev
cd /cluster/data/ponAbe2/bed/multiz8way/frames
time nice -n +19 hgLoadMafFrames ponAbe2 multiz8wayFrames \
multiz8way.mafFrames.gz
# real 0m45.837s
# enable the trackDb entries:
# frames multiz8wayFrames
# irows on
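# a rough sketch of the corresponding trackDb.ra stanza (only the
# frames and irows settings come from the note above; the other
# settings shown are typical for a wigMaf conservation track and are
# illustrative only):
#	track multiz8way
#	shortLabel Conservation
#	longLabel 8-Way Multiz Alignment & Conservation
#	group compGeno
#	type wigMaf 0.0 1.0
#	wiggle phastCons8way
#	summary multiz8waySummary
#	frames multiz8wayFrames
#	irows on
#	speciesOrder hg18 panTro2 rheMac2 calJac1 mm9 monDom4 ornAna1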
#############################################################################
# phastCons 8-way (DONE - 2007-10-16 - Hiram)
# split 8way mafs into 10M chunks and generate sufficient statistics
# files for # phastCons
ssh kki
mkdir /cluster/data/ponAbe2/bed/multiz8way/msa.split
cd /cluster/data/ponAbe2/bed/multiz8way/msa.split
mkdir -p /san/sanvol1/scratch/ponAbe2/multiz8way/cons/ss
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/ponAbe2/bed/multiz8way/maf
set WINDOWS = /san/sanvol1/scratch/ponAbe2/multiz8way/cons/ss
pushd $WINDOWS
set c = $1
rm -fr $c
mkdir $c
twoBitToFa -seq=$c /scratch/data/ponAbe2/ponAbe2.2bit /scratch/tmp/ponAbe2.$c.fa
# need to truncate odd-ball scaffold/chrom names that include dots
# as phastCons utils can't handle them
set CLEAN_MAF = /scratch/tmp/$c.clean.maf.$$
perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$c.maf > $CLEAN_MAF
/cluster/bin/phast/$MACHTYPE/msa_split $CLEAN_MAF -i MAF \
-M /scratch/tmp/ponAbe2.$c.fa \
-o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
rm -f $CLEAN_MAF /scratch/tmp/ponAbe2.$c.fa
popd
date >> $c.done
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
cat << '_EOF_' > template
#LOOP
doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
'_EOF_'
# << happy emacs
# do the easy ones first to see some immediate results
ls -1S -r ../maf | sed -e "s/.maf//" > maf.list
gensub2 maf.list single template jobList
para create jobList
para try ... check ... etc
# push these with a -maxPush number that would put a single job on
# each iserver, you don't want two jobs at a time on each iserver,
# they consume too much memory and cause swapping to the point where
# they slow to a crawl and take hours to finish.
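# e.g. (the iserver count here is an assumption, adjust to the cluster):
#	para -maxPush=8 push
#	# wait for that batch to finish, then push again until all are done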
# Completed: 55 of 55 jobs
# CPU time in finished jobs: 2679s 44.65m 0.74h 0.03d 0.000 y
# IO & Wait Time: 2980s 49.67m 0.83h 0.03d 0.000 y
# Average job time: 103s 1.71m 0.03h 0.00d
# Longest finished job: 716s 11.93m 0.20h 0.01d
# Submission to last job: 10391s 173.18m 2.89h 0.12d
# XXXX Estimates were attempted, not really very useful, instead, as seen
# below, merely take the cons and noncons trees from the mouse 30-way
# Estimate phastCons parameters
# see also:
# http://compgen.bscb.cornell.edu/~acs/phastCons-HOWTO.html
# Create a list of .ss files over 3,000,000 in length
# this is almost everything
cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/ss
ls -1l chr*/chr*.ss | egrep -v "_hap|chrUn|random" | \
awk '$5 > 3000000 {print $9;}' > ../tuningRun.list
# Set up parasol directory to calculate trees on these 50 regions
ssh kk
mkdir /cluster/data/ponAbe2/bed/multiz8way/treeRun2
cd /cluster/data/ponAbe2/bed/multiz8way/treeRun2
mkdir tree log most
# Tuning this loop should come back to here to recalculate
# Create script that calls phastCons with right arguments
cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set SAN="/san/sanvol1/scratch/ponAbe2/multiz8way/cons"
set SS=$1
set C=$1:h
set F=$1:t
set tmpDir="/scratch/tmp/pA2_$2"
rm -fr $tmpDir
mkdir $tmpDir
mkdir -p log/${C} tree/${C} most/${C}
scp -p hiram@pk:$SAN/ss/$1 $tmpDir/$F
scp -p hiram@pk:$SAN/estimate/starting-tree.mod $tmpDir
pushd $tmpDir
/cluster/bin/phast/$MACHTYPE/phastCons $F starting-tree.mod \
--gc 0.355 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-length 45 --target-coverage 0.3 --most-conserved $F.most \
--quiet --log $F.log --estimate-trees $F.tree
popd
cp -p $tmpDir/$F.log log/$C
cp -p $tmpDir/$F.most most/$C
cp -p $tmpDir/$F.tree.*cons.mod tree/$C
rm -fr $tmpDir
'_EOF_'
# << happy emacs
chmod a+x makeTree.csh
# Create gensub file
cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1) $(num1)
#ENDLOOP
'_EOF_'
# << happy emacs
# Make cluster job and run it
scp -p hiram@pk:/san/sanvol1/scratch/ponAbe2/multiz8way/cons/tuningRun.list .
gensub2 tuningRun.list single template jobList
para create jobList
para try/push/check/etc
# Completed: 50 of 50 jobs
# CPU time in finished jobs: 51439s 857.32m 14.29h 0.60d 0.002 y
# IO & Wait Time: 948s 15.79m 0.26h 0.01d 0.000 y
# Average job time: 1048s 17.46m 0.29h 0.01d
# Longest finished job: 1388s 23.13m 0.39h 0.02d
# Submission to last job: 4376s 72.93m 1.22h 0.05d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ls -1 tree/chr*/*.cons.mod > cons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
--output-average ave.cons.mod > cons_summary.txt
ls -1 tree/chr*/*.noncons.mod > noncons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
--output-average ave.noncons.mod > noncons_summary.txt
sort -k1,1 -k2,2n most/chr*/*.most > mostConserved.bed
wc -l mostConserved.bed
# at --target_coverage .3 and --expected_length 45
# 1159312 most conserved elements
# primates only at --target_coverage .75 and --expected_length 1000
# measuring entropy
# consEntropy <target coverage> <expected lengths>
# ave.cons.mod ave.noncons.mod --NH 9.78
/cluster/bin/phast/$MACHTYPE/consEntropy .3 45 --NH 9.78 \
ave.cons.mod ave.noncons.mod
# ( Solving for new omega: 45.000000 21.095519 13.646969 11.146376
# 10.499722 10.437794 10.437185 )
# Transition parameters: gamma=0.300000, omega=45.000000,
# mu=0.022222, nu=0.009524
# Relative entropy: H=0.415553 bits/site
# Expected min. length: L_min=30.634230 sites
# Expected max. length: L_max=24.942235 sites
# Phylogenetic information threshold: PIT=L_min*H=12.730148 bits
# Recommended expected length: omega=10.437185 sites (for L_min*H=9.780000)
# and checking the most conserved areas, on hgwdev:
# at --target_coverage .3 and --expected_length 45
featureBits -noRandom -noHap ponAbe2 mostConserved.bed
# 221759187 bases of 2723012131 (8.144%) in intersection
featureBits -noRandom -noHap -enrichment ponAbe2 genscan:cds \
mostConserved.bed
# genscan:cds 1.688%, mostConserved.bed 8.144%, both 0.146%,
# cover 8.63%, enrich 1.06x
# primates only at --target_coverage .75 and --expected_length 1000
featureBits -noRandom -noHap ponAbe2 mostConserved.bed
# 524207695 bases of 2723012131 (19.251%) in intersection
featureBits -enrichment -noRandom -noHap ponAbe2 genscan:cds \
mostConserved.bed
# genscan:cds 1.688%, mostConserved.bed 19.251%, both 0.314%,
# cover 18.61%, enrich 0.97x
# Estimates could be made, but more correctly, take the 30-way
# .mod file, and re-use it here.
ssh hgwdev
cd /cluster/data/ponAbe2/bed/multiz8way
cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod .
ssh hgwdev
screen # use screen to manage this long running job
cd /cluster/data/ponAbe2/bed/multiz8way/msa.split
# using one of the sections of chr1 as an example
time nice -n +19 /cluster/bin/phast.2007-05-04/phyloFit -i SS \
/san/sanvol1/scratch/ponAbe2/multiz8way/cons/ss/chr1/chr1.50000209-60000000.ss \
--tree \
"((((((ponAbe2,(hg18,panTro2)),rheMac2),calJac1),mm9),monDom4),ornAna1)" \
--out-root starting-tree
# real 1m35.979s
# create a single ss file from a whole chrom maf file
/cluster/bin/phast/$MACHTYPE/msa_split chr1.maf -i MAF -M chr1.fa \
-o SS -r chr1 -w 300000000,0 -I 1000 -B 5000
# run a phyloFit on that large .ss file to find a starting-tree
time nice -n +19 /cluster/bin/phast.2007-05-04/phyloFit -i SS \
chr1.1-229942017.ss --tree \
"((((((ponAbe2,(hg18,panTro2)),rheMac2),calJac1),mm9),monDom4),ornAna1)" \
--out-root starting-tree
# Reading alignment from chr1.1-229942017.ss ...
# Compacting sufficient statistics ...
# Fitting tree model to chr1.1-229942017.ss using REV ...
# Writing model to starting-tree.mod ...
# Done.
# real 5m30.470s
# add up the C and G:
grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
# 0.355
# This 0.355 is used in the --gc argument below
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
ssh pk
mkdir -p /cluster/data/ponAbe2/bed/multiz8way/cons/run.cons
cd /cluster/data/ponAbe2/bed/multiz8way/cons/run.cons
# there are going to be several different phastCons runs using
# this same script. They trigger off of the current working directory
# $cwd:t which is the "grp" in this script. It is one of:
# all glires placentals
cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.2007-05-04
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set tmp = /scratch/tmp/$f
set cons = /cluster/data/ponAbe2/bed/multiz8way/cons
mkdir -p $tmp
set san = /san/sanvol1/scratch/ponAbe2/multiz8way/cons
if (-s $cons/$grp/$grp.non-inf) then
cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf .
cp -p $san/ss/$c/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
else
cp -p $cons/$grp/$grp.mod .
cp -p $san/ss/$c/$f.ss $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative `cat $grp.non-inf` \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c
sleep 4
touch $san/$grp/pp/$c $san/$grp/bed/$c
rm -f $san/$grp/pp/$c/$f.pp
rm -f $san/$grp/bed/$c/$f.bed
mv $tmp/$f.pp $san/$grp/pp/$c
mv $tmp/$f.bed $san/$grp/bed/$c
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod a+x doPhast.csh
# it may be better to have the result check file be the .pp file, since
# this .bed file is sometimes empty even though there are .pp results
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
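# a sketch of that alternative check line, should it be preferred
# (exists+ only requires a non-empty output file):
#	../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out exists+ /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all/pp/$(root1)/$(file1).pp}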
# Create parasol batch and run it
pushd /san/sanvol1/scratch/ponAbe2/multiz8way/cons
ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
/cluster/data/ponAbe2/bed/multiz8way/cons/ss.list
popd
# run for all species
cd ..
mkdir -p all run.cons/all
cd all
/cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \
--prune-all-but=ponAbe2,hg18,panTro2,rheMac2,calJac1,mm9,monDom4,ornAna1 \
> all.mod
cd ../run.cons/all
# root1 == chrom name, file1 == ss file name without .ss suffix
# it may be better to have the result check file be the .pp file, since
# this .bed file is sometimes empty even though there are .pp results
# Create template file for "all" run
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../ss.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 368 of 368 jobs
# CPU time in finished jobs: 11954s 199.23m 3.32h 0.14d 0.000 y
# IO & Wait Time: 3688s 61.47m 1.02h 0.04d 0.000 y
# Average job time: 43s 0.71m 0.01h 0.00d
# Longest finished job: 52s 0.87m 0.01h 0.00d
# Submission to last job: 577s 9.62m 0.16h 0.01d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all
time nice -n +19 cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/ponAbe2/bed/multiz8way/cons/all
# load into database
ssh hgwdev
cd /cluster/data/ponAbe2/bed/multiz8way/cons/all
time nice -n +19 hgLoadBed ponAbe2 phastConsElements8way mostConserved.bed
# Loaded 1096981 elements of size 5
# Try for 5% overall cov, and 70% CDS cov
# We don't have any gene tracks to compare CDS coverage
# --rho .31 --expected-length 45 --target-coverage .3
featureBits ponAbe2 phastConsElements8way
# 134611760 bases of 3093572278 (4.351%) in intersection
# Create merged posterior probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
mkdir -p phastCons8wayScores
for D in pp/chr*
do
C=${D/pp\/}
out=phastCons8wayScores/${C}.data.gz
echo "${D} > ${C}.data.gz"
ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \
gzip > ${out}
done
'_EOF_'
# << happy emacs
chmod +x gzipAscii.sh
time nice -n +19 ./gzipAscii.sh
# real 38m22.760s
# copy the phastCons8wayScores to:
# /cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/phastConsScores
# for hgdownload downloads
# Create merged posterior probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/all
time nice -n +19 ls phastCons8wayScores/*.data.gz | xargs zcat \
| wigEncode -noOverlap stdin phastCons8way.wig phastCons8way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 23m55.813s
time nice -n +19 cp -p *.wi? /cluster/data/ponAbe2/bed/multiz8way/cons/all
# real 2m2.107s
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/ponAbe2/bed/multiz8way/cons/all
ln -s `pwd`/phastCons8way.wib /gbdb/ponAbe2/multiz8way/phastCons8way.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/ponAbe2/multiz8way ponAbe2 \
phastCons8way phastCons8way.wig
# real 1m27.666s
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/ponAbe2/bed/multiz8way/cons/all
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=ponAbe2 phastCons8way > histogram.data 2>&1
# real 5m0.608s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Orangutan PonAbe2 Histogram phastCons8way track"
set xlabel " phastCons8way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
### Create a phastCons data set for primates-only
# XXX - this was only a test, this data was not used
# setup primates-only run
ssh pk
cd /cluster/data/ponAbe2/bed/multiz8way/cons
mkdir primates run.cons/primates
cd primates
# primates-only: exclude all but these for phastCons tree:
/cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \
--prune-all-but=ponAbe2,hg18,panTro2,rheMac2,calJac1 \
> primates.mod
# and place the removed ones in the non-inf file so phastCons will
# truly ignore them:
echo "monDom4,ornAna1" > primates.non-inf
cd ../run.cons/primates
# root1 == chrom name, file1 == ss file name without .ss suffix
# it may be better to have the result check file be the .pp file, since
# this .bed file is sometimes empty even though there are .pp results
# Create template file for "all" run
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/ponAbe2/multiz8way/cons/primates/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../ss.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Four of these jobs fail to produce any output:
# chr5_h2_hap1/chr5_h2_hap1.1-191932.bed
# chr6_cox_hap1_random/chr6_cox_hap1_random.1-239941.bed
# chr6_qbl_hap2_random/chr6_qbl_hap2_random.1-120011.bed
# chrM/chrM.1-16499.bed
# this chrM missing one is interesting. It should have something ?
# I don't know. They all have data in their pp files.
# Completed: 364 of 368 jobs
# Crashed: 4 jobs
# CPU time in finished jobs: 11969s 199.49m 3.32h 0.14d 0.000 y
# IO & Wait Time: 3757s 62.61m 1.04h 0.04d 0.000 y
# Average job time: 43s 0.72m 0.01h 0.00d
# Longest finished job: 56s 0.93m 0.02h 0.00d
# Submission to last job: 2362s 39.37m 0.66h 0.03d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/primates
cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/ponAbe2/bed/multiz8way/cons/primates
# load into database
ssh hgwdev
cd /cluster/data/ponAbe2/bed/multiz8way/cons/primates
time nice -n +19 hgLoadBed ponAbe2 phastConsElements8wayPrimates \
mostConserved.bed
# Loaded 298598 elements of size 5
# real 0m8.768s
# verify coverage
featureBits ponAbe2 phastConsElements8wayPrimates
# 92315637 bases of 3093572278 (2.984%) in intersection
# Create the downloads .pp files, from which the phastCons wiggle data
# is calculated
    # it currently doesn't matter where this is performed; the san is the
    # same network distance from all machines.
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/primates
mkdir downloads
for D in pp/chr*
do
C=${D/pp\//}
ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \
> downloads/${C}.primates.pp.data.gz
echo $D $C done
done
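    # spot-check one of the gzipped files (assuming the usual fixedStep
    # wiggle output from phastCons) to verify the chunks ended up in
    # coordinate order for wigEncode:
    #   zcat downloads/chr1.primates.pp.data.gz | grep fixedStep | head -3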
    # Create merged posterior probability file and wiggle track data files
cd /san/sanvol1/scratch/ponAbe2/multiz8way/cons/primates
time nice -n +19 ls downloads/chr*.data.gz | xargs zcat \
| wigEncode -noOverlap stdin \
phastCons8wayPrimates.wig phastCons8wayPrimates.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
## load table with wiggle data
ssh hgwdev
cd /cluster/data/ponAbe2/bed/multiz8way/cons/primates
cp -p /san/sanvol1/scratch/ponAbe2/multiz8way/cons/primates/*.wi? .
    ln -s `pwd`/phastCons8wayPrimates.wib \
/gbdb/ponAbe2/multiz8way/phastCons8wayPrimates.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/ponAbe2/multiz8way ponAbe2 \
phastCons8wayPrimates phastCons8wayPrimates.wig
# real 0m44.161s
# Create histogram to get an overview of all the data
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=ponAbe2 phastCons8wayPrimates > histogram.data 2>&1
# real 5m15.581s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Orangutan PonAbe2 Histogram phastCons8wayPrimates track"
set xlabel " phastCons8wayPrimates score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
#############################################################################
## Create downloads for 8-way business (DONE - 2007-12-27 - Hiram)
ssh kkstore02
mkdir -p /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf
cd /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf
cp -p /cluster/data/ponAbe2/bed/multiz8way/anno/maf/*.maf .
time nice -n +19 gzip -v *.maf
# creating upstream files
ssh hgwdev
cd /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf
# creating upstream files from xenoRefGene, bash script:
DB=ponAbe2
GENE=xenoRefGene
NWAY=multiz8way
export DB GENE NWAY
for S in 1000 2000 5000
do
echo "making upstream${S}.maf"
featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
| $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \
stdin stdout \
-orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \
| gzip -c > upstream${S}.maf.gz
echo "done upstream${S}.maf.gz"
done
#
ssh kkstore02
cd /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf
md5sum *.maf.gz > md5sum.txt
cd ..
ln -s ../../ponAbe2.8-way.nh ./8way.nh
ln -s $HOME/kent/src/hg/makeDb/trackDb/orangutan/ponAbe2/multiz8way.html .
# copy README.txt from mm9 30-way downloads and edit to represent
# this directory
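    # e.g., roughly (the mm9 source path is an assumption, adjust as needed):
    #   cp /usr/local/apache/htdocs/goldenPath/mm9/multiz30way/README.txt .
    #   vi README.txt  # describe ponAbe2, the 8 species and the 8-way tree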
cd ..
mkdir -p phastCons8way/phastConsScores
cd phastCons8way/phastConsScores
cp -p \
/san/sanvol1/scratch/ponAbe2/multiz8way/cons/all/phastCons8wayScores/*.data.gz .
md5sum *.data.gz > md5sum.txt
cd ..
ln -s ../../cons/all/all.mod 8way.mod
# copy README.txt from mm9 30way downloads and edit to represent
# this directory
# enable them for hgdownload rsync transfer
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/ponAbe2
mkdir multiz8way phastCons8way
cd multiz8way
ln -s /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/8way.nh .
ln -s /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/*.html .
ln -s /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/R*.txt .
mkdir maf
cd maf
ln -s \
/cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf/*.maf.gz .
ln -s \
        /cluster/data/ponAbe2/bed/multiz8way/downloads/multiz8way/maf/md5sum.txt .
cd ../../phastCons8way
mkdir phastConsScores
cd phastConsScores
ln -s \
/cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/phastConsScores/*.gz .
ln -s \
/cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/phastConsScores/md5sum.txt .
cd ..
ln -s \
/cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/8way.mod .
ln -s \
/cluster/data/ponAbe2/bed/multiz8way/downloads/phastCons8way/README.txt .
###########################################################################
# BLASTZ SWAP Human Hg18 (DONE - 2007-10-05 - Hiram)
ssh kkstore02
screen # use a screen to control this job
# the original blastz
cd /cluster/data/hg18/bed/blastzPonAbe2.2007-10-02
cat fb.hg18.chainPonAbe2Link.txt
# 2676696124 bases of 2881515245 (92.892%) in intersection
# And the swap
mkdir /cluster/data/ponAbe2/bed/blastz.hg18.swap
cd /cluster/data/ponAbe2/bed/blastz.hg18.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/hg18/bed/blastzPonAbe2.2007-10-02/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=pk > swap.log 2>&1 &
# real 123m9.197s
cat fb.ponAbe2.chainHg18Link.txt
# 2824501297 bases of 3093572278 (91.302%) in intersection
# create syntenicNets for 7-way conservation maf use
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/hg18/bed/blastzPonAbe2.2007-10-02/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-continue=syntenicNet -syntenicNet \
-swap -bigClusterHub=pk > synNet.log 2>&1 &
# real 91m18.097s
###########################################################################
# BLASTZ SWAP Chimp PanTro2 (DONE - 2007-10-16 - Hiram)
ssh kkstore04
screen # use screen to control this job
# the original blastz
cd /cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13
cat fb.panTro2.chainPonAbe2Link.txt
# 2648603020 bases of 2909485072 (91.033%) in intersection
# and, for the swap
mkdir /cluster/data/ponAbe2/bed/blastz.panTro2.swap
cd /cluster/data/ponAbe2/bed/blastz.panTro2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=kk > swap.log 2>&1 &
# real 160m51.639s
cat fb.ponAbe2.chainPanTro2Link.txt
# 2766615040 bases of 3093572278 (89.431%) in intersection
# real 2597m24.771s
# create syntenicNets for 7-way conservation maf use
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13/DEF \
-continue=syntenicNet -syntenicNet \
-chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=kk > synNet.log 2>&1 &
# real 131m30.045s
###########################################################################
# BLASTZ SWAP Rhesus rheMac2 (DONE - 2007-10-18 - Hiram)
ssh kkstore01
screen # use screen to control this job
cd /cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16
cat fb.rheMac2.chainPonAbe2Link.txt
# 2333264093 bases of 2646704109 (88.157%) in intersection
# and, for the swap
mkdir /cluster/data/ponAbe2/bed/blastz.rheMac2.swap
cd /cluster/data/ponAbe2/bed/blastz.rheMac2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16/DEF \
-swap -chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=kk > swap.log 2>&1 &
# real 289m9.479s
cat fb.ponAbe2.chainRheMac2Link.txt
# 2556010573 bases of 3093572278 (82.623%) in intersection
# create syntenicNets for 7-way conservation maf use
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/rheMac2/bed/blastzPonAbe2.2007-11-16/DEF \
-continue=syntenicNet -syntenicNet \
-swap -chainMinScore=3000 -chainLinearGap=medium \
-workhorse=hgwdev -bigClusterHub=kk > synNet.log 2>&1 &
# real 62m10.816s
###########################################################################
# BLASTZ SWAP Opossum monDom4 (DONE - 2007-11-26 - Hiram)
ssh kkstore04
screen # use screen to control this job
# the original blastz
cd /cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22
cat fb.monDom4.chainPonAbe2Link.txt
# 395887393 bases of 3501643220 (11.306%) in intersection
# and for the swap
mkdir /cluster/data/ponAbe2/bed/blastz.monDom4.swap
cd /cluster/data/ponAbe2/bed/blastz.monDom4.swap
time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \
/cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22/DEF \
-swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
> swap.log 2>&1 &
# real 223m38.325s
cat fb.ponAbe2.chainMonDom4Link.txt
# 406409435 bases of 3093572278 (13.137%) in intersection
###########################################################################
## PUSHQ entry creation (DONE - 2007-12-12 - Hiram)
ssh hgwdev
mkdir /cluster/data/ponAbe2/pushQ
cd /cluster/data/ponAbe2/pushQ
# had to touch
# goldenPath/ponAbe2/liftOver/ponAbe2ToPonAbe1.over.chain.gz
# to get it to pass that check. (there isn't any ponAbe1 release)
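    # that workaround, roughly (htdocs location as used elsewhere in this doc):
    #   touch /usr/local/apache/htdocs/goldenPath/ponAbe2/liftOver/ponAbe2ToPonAbe1.over.chain.gz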
/cluster/bin/scripts/makePushQSql.pl -noGenbank ponAbe2 \
> ponAbe2.sql 2> stderr.out
## check the stderr.out for anything that needs to be fixed
## copy ponAbe2.sql to hgwbeta:/tmp
scp ponAbe2.sql hgwbeta:/tmp
## then on hgwbeta
ssh hgwbeta
cd /tmp
### XXX remember to set the pushQ id number in the
# last statement in this SQL file
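    # a quick look at the end of the file first, to verify that id (sketch):
    #   tail -3 ponAbe2.sql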
hgsql qapushq < ponAbe2.sql
###########################################################################
# Blastz Chicken galGal3 (DONE - 2007-12-13 - Hiram)
ssh kkstore03
screen # use screen to control this job
mkdir /cluster/data/ponAbe2/bed/blastzGalGal3.2007-12-13
cd /cluster/data/ponAbe2/bed/blastzGalGal3.2007-12-13
cat << '_EOF_' > DEF
# Orangutan vs. chicken
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50
# TARGET: Orangutan PonAbe2
SEQ1_DIR=/scratch/data/ponAbe2/ponAbe2.2bit
SEQ1_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/scratch/hg/galGal3/nib
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/ponAbe2/bed/blastzGalGal3.2007-12-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# testing some changes to the script
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
`pwd`/DEF -chainMinScore=5000 \
-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 &
# real 407m0.026s
# some of the jobs failed due to out of memory on kk nodes
# did a para recover and ran the last set of jobs on pk
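    # that recovery, roughly (standard parasol recover usage):
    #   para recover jobList recoverJobList
    #   # then run recoverJobList on pk with para create/push as usual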
# then, continuing and testing the new memk kluster:
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-continue=cat `pwd`/DEF -chainMinScore=5000 -smallClusterHub=memk \
-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > cat.log 2>&1 &
    # failed at the kolossus step because kkr1u00, which provides some of
    # the filesystems kolossus mounts, was down.  Got that fixed and
    # finished the netChains.csh script manually.
# real 6m2.271s
# Then, continuing:
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-continue=load `pwd`/DEF -chainMinScore=5000 -smallClusterHub=memk \
-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > load.log 2>&1 &
# real 4m21.364s
cat fb.ponAbe2.chainGalGal3Link.txt
# 134448942 bases of 3093572278 (4.346%) in intersection
mkdir /cluster/data/galGal3/bed/blastz.ponAbe2.swap
cd /cluster/data/galGal3/bed/blastz.ponAbe2.swap
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-verbose=2 /cluster/data/ponAbe2/bed/blastzGalGal3.2007-12-13/DEF \
-chainMinScore=5000 -chainLinearGap=loose -smallClusterHub=memk \
-swap -bigClusterHub=kk > swap.log 2>&1 &
cat fb.galGal3.chainPonAbe2Link.txt
# 115578761 bases of 1042591351 (11.086%) in intersection
###########################################################################
# Add quality score track (DONE - 2008-03-10 - Hiram)
ssh kkstore02
cd /cluster/data/ponAbe2/wustl
# use mScore=98 since the "Finished" contigs are not in the qual file
time nice -n +19 zcat contigs.fa.qual.gz \
| qaToQac stdin stdout \
| qacAgpLift -mScore=98 ../ponAbe2.agp stdin ponAbe2.qual.qac
# 3m35.075s
mkdir /cluster/data/ponAbe2/bed/qual
cd /cluster/data/ponAbe2/bed/qual
time nice -n +19 qacToWig -fixed \
/cluster/data/ponAbe2/wustl/ponAbe2.qual.qac stdout \
| wigEncode stdin qual.wig qual.wib
# real 21m33.899s
ssh hgwdev
cd /cluster/data/ponAbe2/bed/qual
rm -f /gbdb/ponAbe2/wib/qual.wib
ln -s /cluster/data/ponAbe2/bed/qual/qual.wib /gbdb/ponAbe2/wib/
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/ponAbe2/wib \
ponAbe2 quality qual.wig
# real 1m30.877s
#############################################################################
# N-SCAN gene predictions (nscanGene) - (2008-03-11 markd)
# obtained N-SCAN predictions from Michael Brent's group
# at WUSTL
cd /cluster/data/ponAbe2/bed/nscan/
wget http://mblab.wustl.edu/predictions/orang_utan/ponAbe2/ponAbe2.gtf
wget http://mblab.wustl.edu/predictions/orang_utan/ponAbe2/ponAbe2.prot.fa
wget http://mblab.wustl.edu/predictions/orang_utan/ponAbe2/readme.html
bzip2 ponAbe2.*
chmod a-w *
# load track
gtfToGenePred -genePredExt ponAbe2.gtf.bz2 stdout | hgLoadGenePred -bin -genePredExt ponAbe2 nscanGene stdin
hgPepPred ponAbe2 generic nscanPep ponAbe2.prot.fa.bz2
rm *.tab
# update trackDb; need a ponAbe2-specific page to describe informants
orangutan/ponAbe2/nscanGene.html (copy from readme.html)
orangutan/ponAbe2/trackDb.ra
# set search regex to
termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9]
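# i.e., the search spec in orangutan/ponAbe2/trackDb.ra looks roughly like
# this (a sketch using the usual trackDb search-spec fields):
#   searchTable nscanGene
#   searchType genePred
#   termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9]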
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
# Create quality download files (DONE - 2008-10-14 - Hiram)
cd /hive/data/genomes/ponAbe2/bed/qual
qacToQa /hive/data/genomes/ponAbe2/wustl/ponAbe2.qual.qac ponAbe2.qa.fa
cat << '_EOF_' > splitByName.pl
#!/usr/bin/env perl
# split a multi-record quality fasta (stdin) into one gzipped file
# per sequence, written as qa/<chrom>.qa.fa.gz
use warnings;
use strict;

my $chrom = "";
while (my $line = <>) {
    if ($line =~ m/^>/) {
        # new header: finish the previous chrom file, start the next one
        close FH if (length($chrom) > 0);
        chomp $line;
        $chrom = $line;
        $chrom =~ s/>//;
        $chrom =~ s/\s+//g;
        open (FH, "| gzip -c > qa/$chrom.qa.fa.gz") or
            die "can not write to qa/$chrom.qa.fa.gz";
        print FH ">$chrom\n";
    } else {
        # quality value lines pass through unchanged
        print FH "$line";
    }
}
close (FH);
'_EOF_'
# << happy emacs
chmod +x ./splitByName.pl
mkdir qa
./splitByName.pl < ponAbe2.qa.fa
cd qa
tar cvzf ../ponAbe2.quality.fa.gz ./*.fa.gz
# check byte counts with these files to make sure they are OK
for F in *.fa.gz; do zcat $F; done | wc
# 172337823 3446754951 10512603041
cd ..
wc ponAbe2.qa.fa
# 172337823 3446754951 10512603041 ponAbe2.qa.fa
# identical counts, OK.
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
+
############################################################################
+# calJac3 Marmoset BLASTZ/CHAIN/NET (DONE - 2010-02-11 - Hiram)
+ screen # use a screen to manage this multi-day job
+ mkdir /hive/data/genomes/ponAbe2/bed/lastzCalJac3.2010-02-11
+ cd /hive/data/genomes/ponAbe2/bed/lastzCalJac3.2010-02-11
+
+ # same kind of parameters as used in human<->marmoset
+ cat << '_EOF_' > DEF
+# Orangutan vs. marmoset
+BLASTZ=lastz
+# maximum M allowed with lastz is only 254
+BLASTZ_M=254
+BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
+# and place those items here
+BLASTZ_O=600
+BLASTZ_E=150
+BLASTZ_K=4500
+BLASTZ_Y=15000
+BLASTZ_T=2
+
+# TARGET: Orangutan PonAbe2
+SEQ1_DIR=/scratch/data/ponAbe2/ponAbe2.2bit
+SEQ1_LEN=/scratch/data/ponAbe2/chrom.sizes
+SEQ1_CHUNK=20000000
+SEQ1_LAP=10000
+SEQ1_LIMIT=5
+
+# QUERY: Marmoset (calJac3)
+SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit
+SEQ2_LEN=/scratch/data/calJac3/chrom.sizes
+SEQ2_LIMIT=50
+SEQ2_CHUNK=10000000
+SEQ2_LAP=0
+
+BASE=/hive/data/genomes/ponAbe2/bed/lastzCalJac3.2010-02-11
+TMPDIR=/scratch/tmp
+'_EOF_'
+ # << this line keeps emacs coloring happy
+
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -syntenicNet \
+ -chainMinScore=5000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ > do.log 2>&1 &
+ # real 329m55.081s
+ cat fb.ponAbe2.chainCalJac3Link.txt
+ # 2086557592 bases of 3093572278 (67.448%) in intersection
+
+ mkdir /hive/data/genomes/calJac3/bed/blastz.ponAbe2.swap
+ cd /hive/data/genomes/calJac3/bed/blastz.ponAbe2.swap
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ /hive/data/genomes/ponAbe2/bed/lastzCalJac3.2010-02-11/DEF \
+ -swap -syntenicNet \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 &
+ # real 146m12.301s
+    cat fb.calJac3.chainPonAbe2Link.txt
+ # 1978857628 bases of 2752505800 (71.893%) in intersection
+############################################################################
+