src/hg/makeDb/doc/monDom4.txt 1.17
1.17 2009/09/20 17:16:45 markd
transmap build
Index: src/hg/makeDb/doc/monDom4.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/monDom4.txt,v
retrieving revision 1.16
retrieving revision 1.17
diff -b -B -U 1000000 -r1.16 -r1.17
--- src/hg/makeDb/doc/monDom4.txt 1 Jul 2008 16:53:02 -0000 1.16
+++ src/hg/makeDb/doc/monDom4.txt 20 Sep 2009 17:16:45 -0000 1.17
@@ -1,3415 +1,3424 @@
# for emacs: -*- mode: sh; -*-
# Creating the assembly for Monodelphis domestica
# South American, Short-tailed Opossum
# http://www.genome.gov/11510687
# http://www.genome.gov/12512285
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-------------+---------------------------------+
# | tableName | type |
# +-------------+---------------------------------+
# | refGene | genePred refPep refMrna |
# | xenoRefGene | genePred xenoRefPep xenoRefMrna |
# | ensGene | genePred ensPep |
# | genscan | genePred genscanPep |
# +-------------+---------------------------------+
#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2005-08-18 - Hiram)
ssh kkstore01
mkdir -p /cluster/store9/monDom4/broad.mit.edu
ln -s /cluster/store9/monDom4 /cluster/data/monDom4
cd /cluster/data/monDom4/broad.mit.edu
time wget --timestamping --force-directories --directory-prefix=. \
--dont-remove-listing --recursive --level=2 --no-parent \
--no-host-directories --cut-dirs=5 \
ftp://ftp.broad.mit.edu/distribution/assemblies/mammals/monodelphis/monDom4/*
# That takes a number of hours, ending up with:
# -rw-rw-r-- 1 7492963 Jan 5 18:10 Monodelphis4.0.agp
# -rw-rw-r-- 1 1160 Jan 5 18:10 MappedAssembly.Table
# -rw-rw-r-- 1 716 Jan 5 18:10 ForDistribution.command
# -rw-rw-r-- 1 1215 Jan 5 18:10 BasicStats.out
# -rw-rw-r-- 1 1512 Jan 5 18:11 ReadUsage.Table
# -rw-rw-r-- 1 474573344 Jan 5 18:11 Monodelphis4.0.agp.chromosome.qual.gz
# -rw-rw-r-- 1 1201437533 Jan 5 18:11 Monodelphis4.0.agp.chromosome.fasta.gz
# -rw-rw-r-- 1 7647098 Jan 5 18:12 assembly.agp
# -rw-rw-r-- 1 14841256 Jan 5 18:13 assembly.markup
# -rw-rw-r-- 1 2838523 Jan 5 18:13 assembly.links
# -rw-rw-r-- 1 474281414 Jan 5 18:13 assembly.agp.qual.gz
# -rw-rw-r-- 1 1201095284 Jan 5 18:13 assembly.agp.fasta.gz
# -rw-rw-r-- 1 176159300 Jan 5 18:14 assembly.unplaced
# -rw-rw-r-- 1 1151905015 Jan 5 18:14 assembly.reads.gz
# -rw-rw-r-- 1 43 Jan 5 18:15 source
# -rw-rw-r-- 1 471180166 Jan 5 18:15 contigs.quals.gz
# -rw-rw-r-- 1 1198813602 Jan 5 18:15 contigs.bases.gz
# -rw-rw-r-- 1 757582328 Jan 5 18:16 unplaced.fasta.gz
# -rw-rw-r-- 1 2083924258 Jan 5 18:18 unplaced.qual.gz
# -rw-rw-r-- 1 1159288222 Jan 5 19:17 softmasked_monDom3.tgz
zcat broad.mit.edu/Monodelphis4.0.agp.chromosome.fasta.gz \
| faSplit -verbose=2 byname stdin broadSplit/
# combine the Broad split files into single chroms
for C in 1 2 3 4 5 6 7 8 X Un
do
mkdir -p ${C}
rm -f ${C}/chr${C}.fa
echo ">chr${C}" > ${C}/chr${C}.fa
echo -n "${C}/chr${C}.fa working ... "
ls broadSplit/${C}.*-*.fa | sort -t"." -k2,2n | while read F
do
grep -v "^>" ${F} >> ${C}/chr${C}.fa
done
echo "done"
done
# create a 2bit file to get a blastz hg18 started
# this will be replaced later after RepeatMasker with masked
# sequence
faToTwoBit ?/chr?.fa Un/chrUn.fa monDom4.2bit
# generate chrom.sizes list in order by size, biggest first
# This listing is going to be used in a variety of procedures
# below
twoBitInfo monDom4.2bit stdout | sort -rn +1 > chrom.sizes
mkdir /san/sanvol1/scratch/monDom4
cp -p monDom4.2bit /san/sanvol1/scratch/monDom4
cp -p chrom.sizes /san/sanvol1/scratch/monDom4
# split into 500K chunks for RepeatMasker
mkdir split500K
for C in 1 2 3 4 5 6 7 8 X Un
do
mkdir split500K/${C}
faSplit -verbose=2 size ${C}/chr${C}.fa 500000 split500K/${C}/chr${C}_ \
-lift=split500K/${C}.lft
done
mkdir lifts
cat split500K/*.lft > lifts/lift500K.lft
###########################################################################
# RepeatMasker run (DONE 2006-02-06 - Hiram)
ssh kk
mkdir /cluster/data/monDom4/RMRun
mkdir /cluster/data/monDom4/RMOut
mkdir /cluster/data/monDom4/scripts
cd /cluster/data/monDom4/split500K
for C in 1 2 3 4 5 6 7 8 X Un
do
ls -1S ${C}/*.fa
done > ../RMRun/split.list
cd /cluster/data/monDom4/RMRun
cat << '_EOF_' > template
#LOOP
../scripts/RMOpossum $(dir1) $(root1) {check out line ../RMOut/$(dir1)/$(root1).out}
#ENDLOOP
'_EOF_'
# happy emacs
cat << '_EOF_' > ../scripts/RMOpossum
#!/bin/csh -fe
/bin/mkdir -p /scratch/tmp/monDom4/$1/$2
/bin/mkdir -p ../RMOut/$1
/bin/cp ../split500K/$1/$2.fa /scratch/tmp/monDom4/$1/$2/$2.fa
pushd /scratch/tmp/monDom4/$1/$2
/cluster/bluearc/RepeatMasker060120/RepeatMasker -s \
-species "Monodelphis domestica" $2.fa
popd
/bin/cp -p /scratch/tmp/monDom4/$1/$2/$2.fa.out $3
/bin/rm -fr /scratch/tmp/monDom4/$1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4/$1/$2
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4/$1
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4
'_EOF_'
# happy emacs
chmod +x ../scripts/RMOpossum
gensub2 split.list single template jobList
para create jobList
para try ... check ... push ... etc ...
# Completed: 7170 of 7170 jobs
# CPU time in finished jobs: 26375191s 439586.52m 7326.44h 305.27d 0.836 y
# IO & Wait Time: 1584329s 26405.48m 440.09h 18.34d 0.050 y
# Average job time: 3900s 64.99m 1.08h 0.05d
# Longest finished job: 12608s 210.13m 3.50h 0.15d
# Submission to last job: 162734s 2712.23m 45.20h 1.88d
# lift results up to chrom coordinates
ssh kkstore01
cd /cluster/data/monDom4/RMOut
for C in 1 2 3 4 5 6 7 8 X Un
do
echo "chr${C} working ... "
(cat ${C}/chr${C}_000.out ${C}/chr${C}_0000.out 2> /dev/null; \
ls ${C}/chr${C}_*.out | egrep -v "_000.out|_0000.out" \
| xargs tail --quiet --lines=+4) | \
liftUp ../${C}/chr${C}.fa.out ../lifts/lift500K.lft error stdin
echo "chr${C} done"
done
# the seemingly duplicate mention of _000.out and _0000.out takes
# the seemingly duplicate mention of _000.out and _0000.out takes
# care of the case where one or the other exists, but both never
# exist. One will not exist and the cat will complain to stderr, hence
# the redirection of cat complaints to dev null.
###########################################################################
# BLASTZ HUMAN hg18 - to quickly investigate monDom4 to see if it has the same
# pile up problem that was in monDom2. Got this started right after the
# monDom4.2bit file was made from the original Broad chrom fasta
ssh pk
mkdir /cluster/store9/monDom4/bed/blastzHg18.2006-02-08
cd /cluster/store9/monDom4/bed/blastzHg18.2006-02-08
cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human (hg18)
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum monDom4
SEQ2_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ2_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/monDom4/bed/blastzHg18.2006-02-08
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
XXXX - running 2006-02-08 10:06
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-continue=chainMerge `pwd`/DEF > chainMerge.out 2>&1 &
###########################################################################
# Initial database setup (DONE - 2006-02-06 - Hiram)
ssh hgwdev
cd /cluster/data/monDom4
mkdir /gbdb/monDom4
ln -s /cluster/data/monDom4/monDom4.2bit /gbdb/monDom4
twoBitInfo monDom4.2bit stdout \
| awk '{printf "%s\t%s\t/gbdb/monDom4/monDom4.2bit\n", $1, $2}' \
> chromInfo.tab
hgsql -e "create database monDom4;" mysql
hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp;" \
monDom4
hgsql monDom4 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql -e 'load data local infile "chromInfo.tab" into table chromInfo;' \
monDom4
hgsql monDom4 -N -e 'select chrom from chromInfo' > chrom.lst
# Enter monDom4 into dbDb and defaultDb so test browser knows about
# it:
hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
VALUES("monDom4", "Jan 2006", "/gbdb/monDom4", "Opossum", \
"chr4:344283621-363415496", 1, 34, "Opossum", \
"Monodelphis domestica", \
"/gbdb/monDom4/html/description.html", 0, 0, \
"Broad Inst. monDom4 Jan06");' \
-h localhost hgcentraltest
# do this update later when it is time to make this be the default
hgsql -e 'update defaultDb set name="monDom4" where genome="Opossum";' \
hgcentraltest
# update default position to a HOX region:
hgsql -e \
'update dbDb set defaultPos="chr8:293325654-293362847" where name="monDom4";' \
hgcentraltest
chr8:293,325,654-293,362,847
# Make trackDb table so browser knows what tracks to expect
mkdir /cluster/data/monDom4/html
ln -s /cluster/data/monDom4/html /gbdb/monDom4/html
mkdir $HOME/kent/src/hg/makeDb/trackDb/opossum/monDom4
cd $HOME/kent/src/hg/makeDb/trackDb/opossum
cvs add monDom4
cd monDom4
cp -p /cluster/data/monDom2/html/description.html ./description.html
# edit that description to update
cvs add description.html
cvs commit
cd $HOME/kent/src/hg/makeDb/trackDb
# add monDom4 to the makefile, then
make DBS=monDom4
# or
make DBS=monDom4 alpha
# to place it on the genome-test browser
#######################################################################
# SIMPLE REPEAT [TRF] TRACK (DONE - 2006-02-08 - Hiram)
# This can be done while the above repeat masker run is working
ssh kkstore01
mkdir /cluster/data/monDom4/bed/simpleRepeat
cd /cluster/data/monDom4
# split into 50M chunks for trfBig
mkdir split50M
for C in 1 2 3 4 5 6 7 8 X Un
do
mkdir split50M/${C}
faSplit -verbose=2 size ${C}/chr${C}.fa 50000000 split50M/${C}/chr${C}_ \
-lift=split50M/${C}.lft
done
cat split50M/*.lft > lifts/lift50M.lft
cd split50M
ls -1S */*.fa > ../bed/simpleRepeat/split.list
# small kluster for these larger jobs
ssh kki
cd /cluster/data/monDom4/bed/simpleRepeat
mkdir trf
cat << '_EOF_' > template
#LOOP
./runTrf.csh $(dir1) $(root1) {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
# happy emacs
cat << '_EOF_' > runTrf.csh
#!/bin/csh -fe
set workDir = /scratch/tmp/monDom4TRF/$1/$2
set inputFN = ../../split50M/$1/$2.fa
set outputFN = $3
mkdir -p $workDir
cp -p $inputFN $workDir
pushd $workDir
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $2.fa /dev/null -bedAt=$2.bed -tempDir=/scratch/tmp
popd
rm -f $outputFN
cp -p $workDir/$2.bed $outputFN
rm -fr $workDir/*
rmdir --ignore-fail-on-non-empty $workDir
rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4TRF/$1
rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4TRF
'_EOF_'
# happy emacs
chmod +x runTrf.csh
gensub2 split.list single template jobList
para create jobList
para try ... check ... push ... etc ...
# Completed: 77 of 77 jobs
# CPU time in finished jobs: 16109s 268.48m 4.47h 0.19d 0.001 y
# IO & Wait Time: 1199s 19.99m 0.33h 0.01d 0.000 y
# Average job time: 225s 3.75m 0.06h 0.00d
# Longest finished job: 1322s 22.03m 0.37h 0.02d
# Submission to last job: 1870s 31.17m 0.52h 0.02d
# lift to chrom coordinates
ssh kkstore01
cd /cluster/data/monDom4/bed/simpleRepeat
find ./trf -type f | xargs cat | \
liftUp simpleRepeat.bed \
/cluster/data/monDom4/lifts/lift50M.lft \
error stdin
# Load into the database:
ssh hgwdev
cd /cluster/data/monDom4/bed/simpleRepeat
hgLoadBed -strict monDom4 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Loaded 710087 elements of size 16
# Compare with previous assembly
time featureBits monDom4 simpleRepeat
# 68337838 bases of 3501643220 (1.952%) in intersection
# real 0m42.137s
time featureBits monDom2 simpleRepeat
# 69239555 bases of 3496445653 (1.980%) in intersection
# real 1m5.716s
time featureBits monDom1 simpleRepeat
# 55000238 bases of 3492108230 (1.575%) in intersection
# real 1m49.240s
#######################################################################
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2006-02-10 - Hiram)
# After the simpleRepeats track has been built, make a filtered
# version
# of the trf output: keep trf's with period <= 12:
# need these mask files on a per-chrom basis
ssh kkstore01
cd /cluster/data/monDom4/bed/simpleRepeat
mkdir chromMask
for C in 1 2 3 4 5 6 7 8 X Un
do
echo "chr${C} working ..."
cat trf/chr${C}_*.bed | liftUp chromMask/chr${C}.bed \
/cluster/data/monDom4/lifts/lift50M.lft \
error stdin
awk '{if ($5 <= 12) print;}' chromMask/chr${C}.bed \
> chromMask/chr${C}_Mask.bed
echo "chr${C} done"
done
# should be same result as if we had done the whole thing as a
# single piece:
cat chromMask/*_Mask.bed | wc -l
# 485364
awk '{if ($5 <= 12) print;}' simpleRepeat.bed | wc -l
# 485364
#######################################################################
# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE - 2006-02-10 - Hiram)
# Additional masking was added later to use Damian Keefe's repeat
# libraries.
# the new style binRange was in development at this time, so using
# binaries built today with that new functionality. Next time
# these will be the standard commands
ssh kkstore01
cd /cluster/data/monDom4
# verify sequence before and after masking to ensure it doesn't
# get broken
time faCount ?/chr?.fa Un/chrUn.fa
#seq len A C G T N cpg
# chr1 748055161 227831941 138649610 138661137 228270299 14642174 3225777
# chr2 541556283 163736285 100133171 100127159 163985111 13574557 2505214
# chr3 526135210 161221343 95632761 95680478 160815407 12785221 2208210
# chr4 430141050 130190442 78748372 78684824 130210194 12307218 1877031
# chr5 308900514 94866760 56126597 56061671 94769331 7076155 1311015
# chr6 244784211 73686567 45271636 45267221 73515529 7043258 1216115
# chr7 262624689 81219469 47052213 47076531 81079019 6197457 1127925
# chr8 308469712 93087343 56578139 56466687 93245129 9092414 1427329
# chrX 60718501 16286269 11251258 11264518 16304629 5611827 371051
# chrUn 174229318 46556476 32785341 32891615 46354738 15641148 1620662
# total 3605614649 1088682895 662229098 662181841 1088549386 103971429 16890329
# real 1m51.555s
cat << '_EOF_' > scripts/maskChroms.sh
#!/bin/sh
for C in 1 2 3 4 5 6 7 8 X Un
do
echo "chr${C} working ..."
$HOME/bin/x86_64/maskOutFa -soft ${C}/chr${C}.fa \
bed/simpleRepeat/chromMask/chr${C}_Mask.bed ${C}/chr${C}_masked.fa
$HOME/bin/x86_64/maskOutFa -softAdd ${C}/chr${C}_masked.fa \
${C}/chr${C}.fa.out ${C}/chr${C}_masked.fa
echo "done"
done
'_EOF_'
# happy emacs
chmod +x scripts/maskChroms.sh
time ./scripts/maskChroms.sh
# should have the same sequence still
time faCount ?/chr?_masked.fa Un/chrUn_masked.fa
#seq len A C G T N cpg
chr1 748055161 227831941 138649610 138661137 228270299 14642174 3225777
chr2 541556283 163736285 100133171 100127159 163985111 13574557 2505214
chr3 526135210 161221343 95632761 95680478 160815407 12785221 2208210
chr4 430141050 130190442 78748372 78684824 130210194 12307218 1877031
chr5 308900514 94866760 56126597 56061671 94769331 7076155 1311015
chr6 244784211 73686567 45271636 45267221 73515529 7043258 1216115
chr7 262624689 81219469 47052213 47076531 81079019 6197457 1127925
chr8 308469712 93087343 56578139 56466687 93245129 9092414 1427329
chrX 60718501 16286269 11251258 11264518 16304629 5611827 371051
chrUn 174229318 46556476 32785341 32891615 46354738 15641148 1620662
total 3605614649 1088682895 662229098 662181841 1088549386 103971429 16890329
# after verifying they are the same, remove the old unmasked sequence
for C in 1 2 3 4 5 6 7 8 X Un
do
rm ${C}/chr${C}.fa
mv ${C}/chr${C}_masked.fa ${C}/chr${C}.fa
done
# Create new 2bit file with this masked sequence
ls -og *.2bit
# -rw-rw-r-- 1 901986358 Feb 8 09:29 monDom4.2bit
# browser will not function while this 2bit file does not exist
rm monDom4.2bit
faToTwoBit ?/chr?.fa Un/chrUn.fa monDom4.2bit
# the file becomes larger:
ls -og *.2bit
# -rw-rw-r-- 1 950060406 Feb 10 13:53 monDom4.2bit
# and load the rmsk tables
ssh hgwdev
cd /cluster/data/monDom4
time $HOME/bin/i386/hgLoadOut -verbose=2 monDom4 \
?/chr?.fa.out Un/chrUn.fa.out
# hgLoadOut: connected to database: monDom4
# Processing 1/chr1.fa.out
# Strange perc. field -460.2 line 561938 of 1/chr1.fa.out
# Strange perc. field -395.6 line 561938 of 1/chr1.fa.out
# Loading up table chr1_rmsk
# Processing 2/chr2.fa.out
# Strange perc. field -160.2 line 234960 of 2/chr2.fa.out
# Strange perc. field -238.7 line 234960 of 2/chr2.fa.out
# bad rep range [1723, 73] line 451681 of 2/chr2.fa.out
# Strange perc. field -6.0 line 1104797 of 2/chr2.fa.out
# Loading up table chr2_rmsk
# Processing 3/chr3.fa.out
# bad rep range [2344, 191] line 648571 of 3/chr3.fa.out
# Loading up table chr3_rmsk
# Processing 4/chr4.fa.out
# Strange perc. field -7.0 line 7733 of 4/chr4.fa.out
# Loading up table chr4_rmsk
# Processing 5/chr5.fa.out
# Strange perc. field -6.9 line 34415 of 5/chr5.fa.out
# Strange perc. field -1.1 line 44716 of 5/chr5.fa.out
# Strange perc. field -1.1 line 44720 of 5/chr5.fa.out
# Strange perc. field -1.1 line 44722 of 5/chr5.fa.out
# Strange perc. field -1.1 line 44724 of 5/chr5.fa.out
# Strange perc. field -38.3 line 146729 of 5/chr5.fa.out
# Strange perc. field -38.3 line 146732 of 5/chr5.fa.out
# Strange perc. field -38.3 line 146734 of 5/chr5.fa.out
# Strange perc. field -38.3 line 146737 of 5/chr5.fa.out
# Strange perc. field -42.5 line 239189 of 5/chr5.fa.out
# Loading up table chr5_rmsk
# Processing 6/chr6.fa.out
# Strange perc. field -0.2 line 87360 of 6/chr6.fa.out
# Strange perc. field -0.1 line 87360 of 6/chr6.fa.out
# Strange perc. field -3960.1 line 507698 of 6/chr6.fa.out
# Strange perc. field -1856.3 line 507698 of 6/chr6.fa.out
# Loading up table chr6_rmsk
# Processing 7/chr7.fa.out
# Loading up table chr7_rmsk
# Processing 8/chr8.fa.out
# Loading up table chr8_rmsk
# Processing X/chrX.fa.out
# Loading up table chrX_rmsk
# Processing Un/chrUn.fa.out
# Strange perc. field -27.6 line 129404 of Un/chrUn.fa.out
# Strange perc. field -1.8 line 129404 of Un/chrUn.fa.out
# Strange perc. field -27.6 line 129407 of Un/chrUn.fa.out
# Strange perc. field -1.8 line 129407 of Un/chrUn.fa.out
# Strange perc. field -27.6 line 129409 of Un/chrUn.fa.out
# Strange perc. field -1.8 line 129409 of Un/chrUn.fa.out
# Strange perc. field -27.6 line 129411 of Un/chrUn.fa.out
# Strange perc. field -1.8 line 129411 of Un/chrUn.fa.out
# Strange perc. field -27.6 line 129413 of Un/chrUn.fa.out
# Strange perc. field -1.8 line 129413 of Un/chrUn.fa.out
# Loading up table chrUn_rmsk
# note: 2 records dropped due to repStart > repEnd
# Note the interesting coincidence of the exact same % for monDom4
# and monDom2
time featureBits monDom4 rmsk
# 1814099592 bases of 3501643220 (51.807%) in intersection
# real 3m57.193s
# time featureBits -countGaps monDom4 rmsk
# 1814099592 bases of 3605614649 (50.313%) in intersection
time featureBits monDom2 rmsk
# 1811406819 bases of 3496445653 (51.807%) in intersection
# real 5m31.388s
time featureBits monDom1 rmsk
# 1775646140 bases of 3492108230 (50.847%) in intersection
#######################################################################
# GC5BASE (DONE - 2006-02-08 - Hiram)
ssh kkstore01
mkdir /cluster/data/monDom4/bed/gc5Base
cd /cluster/data/monDom4/bed/gc5Base
time hgGcPercent -verbose=2 -wigOut -doGaps -file=stdout -win=5 monDom4 \
/cluster/data/monDom4 | wigEncode stdin gc5Base.wig gc5Base.wib
ssh hgwdev
cd /cluster/data/monDom4/bed/gc5Base
mkdir /gbdb/monDom4/wib
ln -s `pwd`/gc5Base.wib /gbdb/monDom4/wib
hgLoadWiggle -verbose=2 monDom4 gc5Base gc5Base.wig
#######################################################################
# LOAD GAP & GOLD TABLES FROM AGP (DONE - 2006-02-10 - Hiram)
ssh hgwdev
cd /cluster/data/monDom4
hgGoldGapGl -noGl monDom4 broad.mit.edu/Monodelphis4.0.agp
# Verify sanity of indexes
hgsql -e "show index from gold;" monDom4
hgsql -e "show index from gap;" monDom4
# Look at the Cardinality column, it should have some healthy
# numbers for all indices
# For some reason, the indices do not get built correctly --
# "show index from gap/gold" shows NULL cardinalities for chrom.
# Rebuild indices with "analyze table".
hgsql -e "analyze table gold; analyze table gap;" monDom4
#######################################################################
# LINEAGE SPECIFIC REPEATS (DONE - 2006-02-10 - Hiram)
# Let's see if DateRepeats can do anything with this Opossum
# business. This was tried, but it doesn't make anything special
# for any of the organisms and a -comp of opossum isn't allowed at
# this time so we couldn't do this on the other organisms either to
# obtain any specific repeats there.
ssh kki
mkdir /cluster/data/monDom4/linSpecRep
cd /cluster/data/monDom4/linSpecRep
for C in 1 2 3 4 5 6 7 8 X Un
do
ln -s ../${C}/chr${C}.fa.out ./chr${C}.rmsk.out
done
# This takes a long time to run as a script, to speed it up it has
# been converted to a mini cluster run
cat << '_EOF_' > template
#LOOP
/cluster/bluearc/RepeatMasker060120/DateRepeats $(path1) -query opossum -comp human -comp mouse -comp rat -comp dog -comp dog
#ENDLOOP
'_EOF_'
ls *.out > out.list
gensub2 out.list single template jobList
ssh kki
cd /cluster/bluearc/scratch/hg/gs.18/build35/rmsk.spec
./runArian.sh > jobList
para create jobList
para try, ... check ... push ... etc ...
# To check if any difference is noted in the various comparison
# organisms:
awk '{if(NF>10){a=NF-4;b=NF-3;c=NF-2;d=NF-1; print $a,$b,$c,$d,$NF}}' \
chr2.rmsk.out_homo-sapiens_mus-musculus_rattus_canis-familiaris_canis-familiaris \
| sort | uniq -c
# 187651 - - - - -
# 113100 0 0 0 0 0
# 210909 ? ? ? ? ?
# 602832 X X X X X
# always the same mark for each comp organism
#######################################################################
# Distribute files to filesystems for kluster runs
ssh kkr1u00
mkdir /iscratch/i/monDom4
cd /iscratch/i/monDom4
cp -p /cluster/data/monDom4/monDom4.2bit .
cp -p /cluster/data/monDom4/chrom.sizes .
for R in 2 3 4 5 6 7 8
do
rsync -a --progress /iscratch/i/monDom4/ kkr${R}u00:/iscratch/i/monDom4/
done
ssh pk
mkdir /san/sanvol1/scratch/monDom4
cd /san/sanvol1/scratch/monDom4
cp -p /cluster/data/monDom4/monDom4.2bit .
cp -p /cluster/data/monDom4/chrom.sizes .
#######################################################################
# HUMAN BLASTZ - this time with masked sequence
# (DONE - 2006-02-10 - 2006-02-15 - Hiram)
ssh pk
mkdir /cluster/data/monDom4/bed/blastzHg18.2006-02-10
cd /cluster/data/monDom4/bed/blastzHg18.2006-02-10
cat << '_EOF_' > DEF
# opossum vs. human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.v7.x86_64
# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Opossum monDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Human (hg18)
SEQ2_DIR=/scratch/hg/hg18/nib
SEQ2_LEN=/scratch/hg/hg18/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/monDom4/bed/blastzHg18.2006-02-10
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-stop=net `pwd`/DEF > blastz.out 2>&1 &
# running 2006-02-10
#######################################################################
# Additional RepeatMasking - this sequence needs to be
# additionally masked with Damian Keefe's libraries
# (DONE - 2006-02-11 - Hiram)
ssh kkr1u00
mkdir /cluster/data/monDom4/RMExtra
cd /cluster/data/monDom4/RMExtra
wget --timestamping --force-directories --directory-prefix=. \
--dont-remove-listing --recursive --level=2 --no-parent \
--no-host-directories --cut-dirs=6 \
ftp://ftp.ebi.ac.uk/pub/databases/ensembl/encode/repeat_libraries/monodelphis/*
# picks up:
# -rw-r--r-- 1 679203 May 9 2005 supplemental.lib
# -rw-r--r-- 1 737274 May 9 2005 ab_initio.lib
# -rw-r--r-- 1 452 May 9 2005 README
cp -p supplemental.lib /iscratch/i/monDom4
cd /iscratch/i/monDom4
for R in 2 3 4 5 6 7 8
do
rsync -a --progress /iscratch/i/monDom4/ kkr${R}u00:/iscratch/i/monDom4/
done
ssh kk
cd /cluster/data/monDom4/RMExtra
cat << '_EOF_' > RMExtra.csh
#!/bin/csh -fe
/bin/mkdir -p /scratch/tmp/monDom4/$1/$2
/bin/mkdir -p ../RMOutExtra/$1
/bin/cp ../split500K/$1/$2.fa /scratch/tmp/monDom4/$1/$2/$2.fa
pushd /scratch/tmp/monDom4/$1/$2
/cluster/bluearc/RepeatMasker060120/RepeatMasker -s \
-lib /iscratch/i/monDom4/supplemental.lib $2.fa
popd
/bin/cp -p /scratch/tmp/monDom4/$1/$2/$2.fa.out $3
/bin/rm -fr /scratch/tmp/monDom4/$1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4/$1/$2
/bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/monDom4/$1
'_EOF_'
# happy emacs
chmod +x RMExtra.csh
cat << '_EOF_' > template
#LOOP
./RMExtra.csh $(dir1) $(root1) {check out line ../RMOutExtra/$(dir1)/$(root1).out}
#ENDLOOP
'_EOF_'
# happy emacs
mkdir ../RMOutExtra
gensub2 ../RMRun/split.list single template jobList
para create jobList
para try ... check ... push ... etc...
para time
# Completed: 7170 of 7170 jobs
# CPU time in finished jobs: 8540127s 142335.45m 2372.26h 98.84d 0.271 y
# IO & Wait Time: 103372s 1722.87m 28.71h 1.20d 0.003 y
# Average job time: 1206s 20.09m 0.33h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 2326s 38.77m 0.65h 0.03d
# Submission to last job: 22083s 368.05m 6.13h 0.26d
cd /cluster/data/monDom4/RMOutExtra
for C in 1 2 3 4 5 6 7 8 X Un
do
echo "chr${C} working ... "
(cat ${C}/chr${C}_000.out ${C}/chr${C}_0000.out 2> /dev/null; \
ls ${C}/chr${C}_*.out | egrep -v "_000.out|_0000.out" \
| xargs tail --quiet --lines=+4) | \
liftUp ./chr${C}.fa.out ../lifts/lift500K.lft error stdin
echo "chr${C} done"
done
# Add the two results together
cd /cluster/data/monDom4/RMOut
cp -p /cluster/data/monDom2/scripts/uniqueRMOut.csh ../scripts
for C in 1 2 3 4 5 6 7 8 X Un
do
echo "${C} working ..."
cp -p ../${C}/chr${C}.fa.out ./chr${C}.tmp.out
tail +4 ../RMOutExtra/chr${C}.fa.out >> ./chr${C}.tmp.out
../scripts/uniqueRMOut.csh ./chr${C}.tmp.out > chr${C}.fa.out
rm -f ./chr${C}.tmp.out &
echo "done"
done
# and then add this mask to the existing masked sequence
for C in 1 2 3 4 5 6 7 8 X Un
do
echo "chr${C} working ..."
rm -f ./chr${C}.added.fa
maskOutFa -softAdd ../${C}/chr${C}.fa ./chr${C}.fa.out ./chr${C}.added.fa
echo "done"
done
# verify they are not broken compared to what was previous
# note the additional masked sequence in the lower case counts
ssh kkstore01
cd /cluster/data/monDom4/RMOut
faSize chr?.added.fa chrUn.added.fa
# 3605614649 bases (103971429 N's 3501643220 real 1551686450 upper
# 1949956770 lower) in 10 sequences in 10 files
faSize ../?/chr?.fa ../Un/chrUn.fa
# 3605614649 bases (103971429 N's 3501643220 real 1687813668 upper
# 1813829552 lower) in 10 sequences in 10 files
# If that is OK, replace the current sequence
for C in 1 2 3 4 5 6 7 8 X Un
do
rm ../${C}/chr${C}.fa
mv chr${C}.added.fa ../${C}/chr${C}.fa
done
# And rebuild the 2bit file:
faToTwoBit ?/chr?.fa Un/chrUn.fa monDom4RMExtra.2bit
# and switch it out from under the browser
rm monDom4.2bit; mv monDom4RMExtra.2bit monDom4.2bit
# distribute to /iscratch/i/monDom4 ...
############################################################################
# BLATSERVERS ENTRY (DONE - 2006-02-16 - Hiram)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("monDom4", "blat17", "17786", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("monDom4", "blat17", "17787", "0", "1");' \
hgcentraltest
# test it with some sequence
#########################################################################
# CPGISLANDS (DONE - 2006-02-16 - Hiram)
ssh hgwdev
mkdir /cluster/data/monDom4/bed/cpgIsland
cd /cluster/data/monDom4/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
# gcc readseq.c cpg_lh.c -o cpglh.exe
cd ../..
ln -s hg3rdParty/cpgIslands/cpglh.exe .
ssh kkstore01
cd /cluster/data/monDom4
mkdir tmp
# cpglh.exe requires hard-masked (N) .fa's.
time for CHR in ?/chr?.fa ??/chr??.fa
do
echo "maskOutFa ${CHR} hard ${CHR}.masked"
maskOutFa ${CHR} hard ${CHR}.masked
done > tmp/hardMask.out 2>&1
# about 3 minutes 20 seconds
# There may be warnings about "bad character" for IUPAC ambiguous
# characters like R, S, etc. Ignore the warnings.
cd /cluster/data/monDom4/bed/cpgIsland
for F in ../../*/chr*.fa.masked
do
FA=${F/*\/}
C=${FA/.fa.masked/}
echo "./cpglh.exe ${FA} > ${C}.cpg"
./cpglh.exe ${F} > ${C}.cpg
done > cpglh.out 2>&1 &
# about 4 minutes, there were no warnings
# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# happy emacs
awk -f filter.awk chr*.cpg | sort -k1,1 -k2,2n > cpgIsland.bed
ssh hgwdev
cd /cluster/data/monDom4/bed/cpgIsland
hgLoadBed -strict monDom4 cpgIslandExt -tab -noBin \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
# Reading cpgIsland.bed
# Loaded 22409 elements of size 10
featureBits monDom4 cpgIslandExt
# 14526908 bases of 3501643220 (0.415%) in intersection
#########################################################################
# ANDY LAW CPGISSLANDS (DONE - 2006-02-16 - Hiram)
# See notes in makeGalGal2.doc and makeCanFam2.doc
ssh kkstore01
mkdir /cluster/data/monDom4/bed/cpgIslandGgfAndy
cd /cluster/data/monDom4/bed/cpgIslandGgfAndy
# Build the preProcGgfAndy program in
# kent/src/oneShot/preProcGgfAndy into your ~/bin/$MACHTYPE
# Use masked sequence since this is a mammal...
for F in ../../*/chr*.fa.masked
do
FA=${F/*\/}
C=${FA/.fa.masked/}
echo preproc and run on masked "${C} ${F}" 1>/dev/stderr
~/bin/$MACHTYPE/preProcGgfAndy ${F} \
| /cluster/home/angie/ggf-andy-cpg-island.pl \
| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g1,$oE) = split("\t"); $s--;
$gc=$c+$g1; $pCpG=(100.0 * 2 * $cpg / $n);
$pGc=(100.0 * $gc / $n);
$_="'${C}'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" .
"$pCpG\t$pGc\t$oE\n";'
done | sort -k1,1 -k2,2n > cpgIslandGgfAndyMasked.bed
# about 3 minutes
# load into database:
ssh hgwdev
cd /cluster/data/monDom4/bed/cpgIslandGgfAndy
sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
$HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
hgLoadBed -strict monDom4 cpgIslandGgfAndyMasked -tab -noBin \
-sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
# Loaded 67504 elements of size 10
featureBits monDom4 cpgIslandExt
# 14526908 bases of 3501643220 (0.415%) in intersection
featureBits monDom4 cpgIslandGgfAndyMasked
# 41042229 bases of 3501643220 (1.172%) in intersection
wc -l ../cpgIsland/cpgIsland.bed *bed
# 22409 ../cpgIsland/cpgIsland.bed
# 67504 cpgIslandGgfAndyMasked.bed
#######################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2006-02-24 - Hiram)
ssh kkstore01
cd /cluster/data/monDom4
# use repMatch of 1228 as this genome is ~ %20 larger than human
# 1024 + (1024 * 0.2) = 1228
time blat monDom4.2bit \
/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=1228
# Wrote 43992 overused 11-mers to 11.ooc
# real 3m24.793s
# copy to bluearc filesystem for kluster use
cp -p 11.ooc /cluster/bluearc/scratch/hg/monDom4
#########################################################################
# GENBANK auto update (DONE - 2006-02-24 - Hiram)
# align with revised genbank process. drop xeno ESTs.
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvs update -d -P etc
#edit etc/genbank.conf to add monDom4, it is a copy of monDom2 with changes:
# monDom4 (M. domestica) 8 chroms plus X and Un
monDom4.serverGenome = /cluster/data/monDom4/monDom4.2bit
monDom4.clusterGenome = /cluster/bluearc/scratch/hg/monDom4/monDom4.2bit
monDom4.ooc = /cluster/data/monDom4/11.ooc
monDom4.lift = no
monDom4.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
monDom4.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
monDom4.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
monDom4.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
monDom4.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
monDom4.downloadDir = monDom4
monDom4.refseq.mrna.xeno.load = yes
monDom4.refseq.mrna.xeno.loadDesc = yes
monDom4.mgcTables.default = full
monDom4.mgcTables.mgc = all
# check that into CVS, then
# update /cluster/data/genbank/
make etc-update
ssh kkstore02
cd /cluster/data/genbank
nice -n 19 bin/gbAlignStep -initial monDom4 &
# var/build/logs/2006.02.24-22:38:50.monDom4.initalign.log
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
nice -n 19 ./bin/gbDbLoadStep -drop -initialLoad monDom4 &
# var/dbload/hgwdev/logs/2006.02.26-09:36:15.dbload.log
XXXX - running 2006-02-26 09:36
##########################################################################
# BLASTZ SELF (DONE - 2006-03-02 - 2006-04-19 - Hiram)
ssh kk
mkdir /cluster/data/monDom4/bed/blastzSelf.2006-03-02
cd /cluster/data/monDom4/bed
ln -s blastzSelf.2006-03-02 blastz.monDom4
cd blastzSelf.2006-03-02
cat << '_EOF_' > DEF
# monDom4 vs monDom4
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
BLASTZ=blastz.v7
BLASTZ_M=100
# TARGET: Opossum monDom4
SEQ1_DIR=/iscratch/i/monDom4/monDom4.2bit
SEQ1_LEN=/iscratch/i/monDom4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: MonDom2
SEQ2_DIR=/iscratch/i/monDom4/monDom4.2bit
SEQ2_LEN=/iscratch/i/monDom4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/monDom4/bed/blastzSelf.2006-03-02
TMPDIR=/scratch/tmp
'_EOF_'
# happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=10000 -chainLinearGap=medium \
`pwd`/DEF > blastz.out 2>&1 &
# This was a difficult run, needed to do the last job on Kolossus:
# Completed: 133955 of 133956 jobs
# CPU time in finished jobs: 78602533s 1310042.22m 21834.04h 909.75d 2.492 y
# IO & Wait Time: 47419875s 790331.25m 13172.19h 548.84d 1.504 y
# Average job time: 941s 15.68m 0.26h 0.01d
# Longest finished job: 111202s 1853.37m 30.89h 1.29d
# Submission to last job: 521430s 8690.50m 144.84h 6.04d
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=10000 -chainLinearGap=medium \
-continue=cat `pwd`/DEF > cat.out 2>&1 &
# broke during chaining, the processes are too large. Running
# them individually on hgwdev64 and kolossus, they take many days
# each, perhaps a week, still trying to get them done after two
# weeks of messing with them.
# last one took:
# real 7343m55.064s
# user 2270m9.968s
# sys 5072m16.807s
# that's 5 days and 2 hours, about 5 weeks for all:
# -rw-rw-r-- 1 138817724 Mar 10 17:48 monDom4.2bit:chrX:.chain
# -rw-rw-r-- 1 500083934 Mar 11 15:06 monDom4.2bit:chr7:.chain
# -rw-rw-r-- 1 681900033 Mar 11 20:27 monDom4.2bit:chr5:.chain
# -rw-rw-r-- 1 495478058 Mar 11 20:53 monDom4.2bit:chr6:.chain
# -rw-rw-r-- 1 948724278 Mar 12 12:55 monDom4.2bit:chrUn:.chain
# -rw-rw-r-- 1 668565678 Mar 12 18:03 monDom4.2bit:chr8:.chain
# -rw-rw-r-- 1 1214431534 Mar 23 21:39 monDom4.2bit:chr2:.chain
# -rw-rw-r-- 1 1755465687 Apr 9 19:31 monDom4.2bit:chr4:.chain
# -rw-rw-r-- 1 1739564535 Apr 10 15:55 monDom4.2bit:chr1:.chain
# -rw-rw-r-- 1 2041409562 Apr 16 13:58 monDom4.2bit:chr3:.chain
# loading the tables on kolossus
ssh kolossus
cd /cluster/data/monDom4/bed/blastzSelf.2006-03-02/axtChai
cat << '_EOF_' > loadUp.csh
#!/bin/csh -fe
cd /cluster/data/monDom4/bed/blastzSelf.2006-03-02/axtChain/chain
foreach c (`awk '{print $1;}' /cluster/bluearc/scratch/hg/monDom4/chrom.sizes`)
set f = $c.chain
if (! -e $f) then
echo no chains for $c
set f = /dev/null
endif
hgLoadChain monDom4 ${c}_chainSelf $f
end
'_EOF_'
# << emacs
chmod +x loadUp.csh
time ./loadUp.csh
# real 211m16.852s
# request push of *chainSelf* tables from kolossus to hgwdev
# almost 32 Gb of data.
ssh kolossus
cd /cluster/data/monDom4/bed/blastzSelf.2006-03-02
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=10000 -chainLinearGap=medium \
-continue=net `pwd`/DEF > net.out 2>&1 &
############################################################################
## BLASTZ swap from mm8 alignments (DONE - 2006-02-23 - Hiram)
ssh pk
cd /cluster/data/mm8/bed/blastzMonDom4.2006-02-23
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits monDom4 chainMm8Link
# 210933035 bases of 3501643220 (6.024%) in intersection
############################################################################
## BLASTZ CHICKEN galGal2 (DONE - 2006-03-06 - 2006-03-08 - Hiram)
ssh pk
mkdir /cluster/data/monDom4/bed/blastzGalGal2.2006-03-06
cd /cluster/data/monDom4/bed
ln -s blastzGalGal2.2006-03-06 blastz.galGal2
cd blastzGalGal2.2006-03-06
cat << '_EOF_' > DEF
# Opossum vs Chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Opossum monDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Chicken galGal2 - single chunk big enough for whole chroms at once
SEQ2_DIR=/scratch/hg/galGal2/nib
SEQ2_LEN=/scratch/hg/galGal2/chrom.sizes
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/monDom4/bed/blastzGalGal2.2006-03-06
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
# real 1399m47.238s
mkdir /cluster/data/galGal2/bed/blastz.monDom4.swap
cd /cluster/data/galGal2/bed
ln -s blastz.monDom4.swap blastz.monDom4
cd blastz.monDom4.swap
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap /cluster/data/monDom4/bed/blastzGalGal2.2006-03-06/DEF \
> swap.out 2>&1 &
featureBits monDom4 chainGalGal2Link > fb.monDom4.chainGalGal2Link 2>&1
cat fb.monDom4.chainGalGal2Link
# 106807090 bases of 3501643220 (3.050%) in intersection
featureBits galGal2 chainMonDom4Link > fb.galGal2.chainMonDom4Link 2>&1
cat fb.galGal2.chainMonDom4Link
# 95852676 bases of 1054197620 (9.092%) in intersection
############################################################################
## BLASTZ ZEBRAFISH danRer3 (DONE - 2006-03-06 - 2003-03-08 - Hiram)
ssh pk
mkdir /cluster/data/monDom4/bed/blastzDanRer3.2006-03-06
cd /cluster/data/monDom4/bed
ln -s blastzDanRer3.2006-03-06 blastz.danRer3
cd blastzDanRer3.2006-03-06
# doing only the chroms on zebrafish, leaving out the chrUn and chrNA
cat << '_EOF_' > DEF
# Opossum vs Zebrafish
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Opossum monDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Zebrafish (danRer3)
# large enough chunk to do complete chroms at once
SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib
SEQ2_LEN=/san/sanvol1/scratch/danRer3/chromNib.sizes
SEQ2_CHUNK=100000000
SEQ2_LAP=0
BASE=/cluster/data/monDom4/bed/blastzDanRer3.2006-03-06
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
# common failure for unknown reason:
# startStep: 0, at step 5 net to stopStep 9
# netChains: looks like previous stage was not successful
# (can't find [monDom4.danRer3.]all.chain[.gz]).
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=net `pwd`/DEF > net.out 2>&1 &
mkdir /cluster/data/danRer3/bed/blastz.monDom4.swap
cd /cluster/data/danRer3/bed
ln -s blastz.monDom4.swap blastz.monDom4
cd blastz.monDom4.swap
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap /cluster/data/monDom4/bed/blastzDanRer3.2006-03-06/DEF \
> swap.out 2>&1 &
featureBits monDom4 chainDanRer3Link > fb.monDom4.chainDanRer3Link 2>&1
cat fb.monDom4.chainDanRer3Link
# 73187496 bases of 3501643220 (2.090%) in intersection
featureBits danRer3 chainMonDom4Link > fb.danRer3.chainMonDom4Link 2>&1
cat fb.danRer3.chainMonDom4Link
# 65225617 bases of 1630323462 (4.001%) in intersection
##########################################################################
# MAKE Human Proteins track (WORKING - 2006-03-24 - Hiram)
# we need the split500K sequences to do this work, set them up on
# the Iservers
ssh kkstore01
cd /cluster/data/monDom4/split500K
rsync -a --progress `pwd`/ kkr1u00:/iscratch/i/monDom4/split500K/
ssh kkr1u00
cd /iscratch/i/monDom4/split500K
# create individual blast db's for each sequence
for C in 1 2 3 4 5 6 7 8 Un X
do
echo ${C} working ...
cd ${C}
for i in *.fa
do
/cluster/bluearc/blast229/formatdb -p F -i $i
done
cd ..
echo ${C} done
done
find /iscratch/i/monDom4/split500K -name "*.nsq" \
| sed "s/\.nsq//" > target.lst
# now sync everything to the other Iserver nodes
for R in 2 3 4 5 6 7 8
do
rsync -a --progress `pwd`/ kkr${R}u00:/iscratch/i/monDom4/split500K/
done
ssh kkstore01
mkdir -p /cluster/data/monDom4/bed/tblastn.hg18KG
cd /cluster/data/monDom4/bed/tblastn.hg18KG
mkdir kgfa
# calculate number of protein sequence to give a job count of about
# 500,000
echo `cat /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | wc -l` \
`cat /iscratch/i/monDom4/split500K/target.lst | wc -l` \
| awk '{printf "%d/(500000/%d) = %d\n", $1, $2, $1/(500000/$2)}'
# 36727/(500000/7170) = 526
# split the proteins by that number
split -l 526 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg
cd kgfa
for i in kg??; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S kgfa/*.fa > kg.lst
# Want output to go to bluearc to preserve sanity of file server
rm -rf /cluster/bluearc/monDom4/bed/tblastn.hg18KG/blastOut
mkdir -p /cluster/bluearc/monDom4/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/monDom4/bed/tblastn.hg18KG/blastOut .
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
ssh kk
cd /cluster/data/monDom4/bed/tblastn.hg18KG
cat /iscratch/i/monDom4/split500K/*.lft > subChr.lft
cat << '_EOF_' > template
#LOOP
blastSome $(path1) $(path2) {check out exists blastOut/$(root2)/q.$(root1).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
#### !NOTE: Next time run these jobs with a better separation
# of I/O spaces, they seem to hangup bluearc while they work.
# Plus, they are using /tmp instead of /scratch/tmp/
# And they are leaving lots of garbage, the $f.3 file.
# Have added $f.3 to the listing below to remove it
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -pslQ -nohead $f.3 /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2
liftUp -nosort -type=".psl" -nohead $3.tmp /cluster/data/monDom4/bed/tblastn.hg18KG/subChr.lft carry $f.3
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3
exit 0
fi
fi
rm -f $f.1 $f.2 $f.3 $3.tmp $f.8
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
cp -p /iscratch/i/monDom4/split500K/target.lst .
gensub2 target.lst kg.lst template jobList
# You want to try 10 jobs and see if the following chain
# pipeline will work in a reasonable time. If it doesn't the
# jobs are too big. There were some hippos in the simpleChain
# step.
para create jobList
para try
para push
# Completed: 504218 of 504218 jobs
# CPU time in finished jobs: 28863259s 481054.31m 8018.57h 334.07d 0.915 y
# IO & Wait Time: 4255045s 70918.42m 1181.96h 49.25d 0.135 y
# Average job time: 66s 1.09m 0.02h 0.00d
# Longest finished job: 443s 7.38m 0.12h 0.01d
# Submission to last job: 166648s 2777.47m 46.29h 1.93d
# This simpleChain step has jobs that are too large. They could
# be broken up even more as was done for the hippos that remained,
# see below.
ssh kk
cd /cluster/data/monDom4/bed/tblastn.hg18KG
mkdir chainEm
cd /cluster/data/monDom4/bed/tblastn.hg18KG/blastOut
find . -name "*.psl" \
> /cluster/data/monDom4/bed/tblastn.hg18KG/chainEm/chain.lst
cd chainEm
mkdir psl
cat << '_EOF_' > template
#LOOP
./chainSome.csh $(root1) $(file2) {check out line psl/c.$(file2).$(root1).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
# you will have to build this simpleChain binary in
# hg/mouseStuff/simpleChain/
# it will not run compiled as an x86_64 binary, only i386
cat << '_EOF_' > chainSome.csh
#!/bin/csh -fe
cat ../blastOut/${1}/q.${2}_*.psl | \
$HOME/bin/i386/simpleChain \
-prot -outPsl -maxGap=250000 stdin psl/c.${2}.${1}.psl
'_EOF_'
# << happy emacs
chmod +x chainSome.csh
ls -1dS ../blastOut/kg?? > chain.dirs
gensub2 chain.dirs ../../../chrom.lst template jobList
para create jobList
para push
# Completed: 574 of 699 jobs
# CPU time in finished jobs: 17708635s 295143.92m 4919.07h 204.96d 0.562 y
# IO & Wait Time: 1589482s 26491.37m 441.52h 18.40d 0.050 y
# Average job time: 33620s 560.34m 9.34h 0.39d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 222754s 3712.57m 61.88h 2.58d
# Submission to last job: 673020s 11217.00m 186.95h 7.79d
# Some of these ended up as hippos after a few days.
# Rebuild a new batch for just the hippos
ssh kk
cd /cluster/store9/monDom4/bed/tblastn.hg18KG/chainEm
mkdir runHippos
mkdir runHippos/psl
# from the running hippos, extract the commands:
para running | grep command: > splitHippos.sh
# edit splitHippos.sh to make those lines read:
# ./mkHippoList.csh kgaa chr1
# where mkHippoList.csh is:
# #!/bin/csh -fe
# ls ../blastOut/${1}/q.${2}_*.psl
# run that:
./splitHippos.sh > hippo.list
# split by chrom
grep chr1_ hippo.list > chr1.list
grep chr2_ hippo.list > chr2.list
grep chr3_ hippo.list > chr3.list
grep chr4_ hippo.list > chr4.list
# these were the only chroms in the list, then
# The sequence below does not produce the correct result.
# Instead, break up all protein results for each chrom to
# individual files.
ssh hgwdev
cd /cluster/store9/monDom4/bed/tblastn.hg18KG/chainEm
# Do these one at a time since they create huge amounts of data in
# /scratch/tmp - after each one has been sent to kkr1u00, the
# data here on hgwdev can be deleted
cat chr1.list | xargs cat > /scratch/tmp/chr1.psl
cd /scratch/tmp
cat << '_EOF_' > splitProteins.pl
#!/usr/bin/env perl
use strict;
use warnings;
sub usage() {
print "usage: sort -k10 chrN.psl | ./splitProteins.pl <chrN>\n";
print " where chrN is : chr1, chr2, chr3, etc...\n";
print " one at a time please.\n";
exit 255;
}
my $argc = scalar(@ARGV);
if ($argc != 1) { usage; }
my $chr = shift;
print "mkdir -p $chr", "\n";
print `mkdir -p $chr`, "\n";
my $curProtein="";
my $linesRead = 0;
while (my $line=<STDIN>) {
++$linesRead;
my ($col1, $col2, $col3, $col4, $col5, $col6, $col7, $col8, $col9, $col10, $rest) = split('\t',$line);
if ($curProtein ne $col10) {
if (length($curProtein) > 0) {
close(FH);
}
open(FH,">$chr/$col10.psl") or die "can not open $chr/$col10.psl";
$curProtein = $col10;
}
print FH $line;
if (0 == ($linesRead % 1000)) {print ".";}
if (0 == ($linesRead % 70000)) {print " $linesRead \n";}
}
close(FH);
'_EOF_'
# << happy emacs
chmod +x ./splitProteins.pl
sort -k10 chr1.psl | ./splitProteins.pl chr1
rsync -a --progress ./chr1/ kkr1u00:/iscratch/i/monDom4/simpleChain/chr1/
# After the four chroms are there, sync to all I servers
ssh kkr1u00
cd /iscratch/i/monDom4/simpleChain
for R in 2 3 4 5 6 7 8
do
rsync -a --progress --stats `pwd`/ \
kkr${R}u00:/iscratch/i/monDom4/simpleChain/
done
#
# while we are here, make up the jobList
cat << '_EOF_' > mkJobList
#!/bin/sh
for C in 1 2 3 4
do
M=0
N="00"
find ./chr${C} -type f | while read F
do
BN=`basename ${F}`
echo "./oneChained.csh chr${C} ${BN} ${N} (check line+ psl/chr${C}/${N}/${BN})"
M=`echo $M | awk '{printf "%d", $1+1}'`
if [ "${M}" -gt 999 ]; then
N=`echo ${N} | awk '{printf "%02d", $1+1}'`
M=0
fi
done
done
'_EOF_'
# << happy emacs
chmod +x mkJobList
./mkJobList > jobList
mkdir /cluster/data/monDom4/bed/tblastn.hg18KG/hippoRun
cp jobList /cluster/data/monDom4/bed/tblastn.hg18KG/hippoRun
ssh kk
cd /cluster/data/monDom4/bed/tblastn.hg18KG/hippoRun
mkdir psl
cat << '_EOF_' > oneChained.csh
#!/bin/csh -fe
mkdir -p psl/${1}/${3}
cat /iscratch/i/monDom4/simpleChain/${1}/${2} | \
$HOME/bin/i386/simpleChain \
-prot -outPsl -maxGap=250000 stdin psl/${1}/${3}/${2}
'_EOF_'
# << happy emacs
# XXXX this sequence did not work right, it breaks up individual
# XXXX protein results
# these were the only chroms in the list, then
mkdir chr1 chr2 chr3 chr4
cd chr1
split -a 3 -l 100 ../chr1.list chr1
cd ../chr2
split -a 3 -l 100 ../chr2.list chr2
cd ../chr3
split -a 3 -l 100 ../chr3.list chr3
cd ../chr4
split -a 3 -l 100 ../chr4.list chr4
cd ../runHippos
cat << '_EOF_' > mkJobList.sh
#!/bin/sh
for C in 1 2 3 4
do
for F in `(cd ../chr${C}; ls chr${C}*)`
do
echo \
"./chainHippos.csh ${F} chr${C} {check out line psl/c.chr${C}.${F}.psl}"
done
done
'_EOF_'
# << happy emacs
chmod +x ./mkJobList.sh
./mkJobList.sh > jobList
cat << '_EOF_' > chainHippos.csh
#!/bin/csh -fe
sed -e "s/^/..\//" ../${2}/${1} | xargs cat | \
$HOME/bin/i386/simpleChain \
-prot -outPsl -maxGap=250000 stdin psl/c.${2}.${1}.psl
'_EOF_'
# << happy emacs
chmod +x chainHippos.sh
para create jobList
para try ... check ... etc ....
###### Continuing with the concatenation of all the results
ssh kkstore04
# running out of disk space on store9, create a parallel hierarchy
# on a different disk still on this server:
mkdir -p /cluster/store8/monDom4/bed/tblastn.hg18KG/sortingResults
cd /cluster/data/monDom4/bed/tblastn.hg18KG
ln -s /cluster/store8/monDom4/bed/tblastn.hg18KG/sortingResults \
sortingResults
# first set of results was in blastOut
cd blastOut
time for i in kg??
do
find ./$i -name "q.*.psl" | xargs cat \
| awk '($13 - $12)/$11 > 0.6 {print}' \
> /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.$i.psl
sort -rn \
/cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.$i.psl \
| pslUniq stdin \
/cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/u.$i.psl
awk '(($1 / $11) ) > 0.60 {print}' \
/cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.$i.psl \
> /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/m60.$i.psl
echo $i
done
# real 199m44.441s
# And then, adding the hippo results to those:
cd ../hippoRun/psl
time for C in 1 2 3 4
do
cd chr${C}
for D in *
do
find ./${D} -name "*.psl" | xargs cat \
| awk '($13 - $12)/$11 > 0.6 {print}' \
> /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.kg0${C}_${D}.psl
sort -rn \
/cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.kg0${C}_${D}.psl \
| pslUniq stdin \
/cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/u.kg0${C}_${D}.psl
awk '(($1 / $11) ) > 0.60 {print}' \
/cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/c60.kg0${C}_${D}.psl \
> /cluster/data/monDom4/bed/tblastn.hg18KG/sortingResults/m60.kg0${C}_${D}.psl
echo chr${C}/${D}
done
cd ..
done
# real 14m49.721s
cd ../../sortingResults
sort -u -k 14,14 -k 16,16n -k 18,18n u.*.psl m60* \
> /cluster/data/monDom4/bed/tblastn.hg18KG/blastHg18KG.psl
cd ..
wc -l blastHg18KG.psl
# 63264 blastHg18KG.psl
# what is this for ?
cat kgfa/*fa | grep ">" | wc
# 309494 309494 4810327
ssh hgwdev
cd /cluster/data/monDom4/bed/tblastn.hg18KG
hgLoadPsl monDom4 blastHg18KG.psl
time nice -n +19 featureBits monDom4 blastHg18KG
# 19424941 bases of 3501643220 (0.555%) in intersection
# back to kksilo
rm -rf blastOut
# End tblastn
############################################################################
# SWAP CHAINS/NET RN4 (DONE 3/28/06 angie)
ssh kkstore01
mkdir /cluster/data/monDom4/bed/blastz.rn4.swap
cd /cluster/data/monDom4/bed/blastz.rn4.swap
doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.monDom4/DEF \
-workhorse kkr8u00 >& do.log & tail -f do.log
ln -s blastz.rn4.swap /cluster/data/monDom4/bed/blastz.rn4
nice featureBits monDom4 chainRn4Link
# 198094619 bases of 3501643220 (5.657%) in intersection
#######################################################################
# CHIMP BLASTZ - (DONE - 2006-03-29 - 2006-04-10 - Hiram)
ssh pk
mkdir /cluster/data/monDom4/bed/blastzPanTro2.2006-03-29
cd /cluster/data/monDom4/bed
ln -s blastzPanTro2.2006-03-29 blastz.panTro2
cd blastzPanTro2.2006-03-29
cat << '_EOF_' > DEF
# opossum vs. chimp
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.v7.x86_64
# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Opossum monDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Chimp PanTro2
SEQ2_DIR=/scratch/hg/panTro2/nib
SEQ2_LEN=/scratch/hg/panTro2/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/monDom4/bed/blastzPanTro2.2006-03-29
TMPDIR=/scratch/tmp
'_EOF_'
# << emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
# XXXX running 2006-03-29
# 2 chroms crashed out-of-mem, so run manually on kolossus
ssh kolossus
cd /cluster/data/monDom4/bed/blastz.panTro2/axtChain/run
(chain.csh monDom4.2bit:chr4: chain/monDom4.2bit:chr4:.chain; \
chain.csh monDom4.2bit:chr3: chain/monDom4.2bit:chr3:.chain) > chain.log &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=chainMerge \
`pwd`/DEF > blastz.chainMerge.out 2>&1 &
# failed during load due to mysql restart
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-continue=load \
`pwd`/DEF > blastz.load.out 2>&1 &
# SWAP ALIGNMENTS
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -swap \
-workhorse=kolossus \
`pwd`/DEF > swap.log 2>&1 &
# failed on file perm change on liftOver file owned by Hiram
rm /cluster/data/monDom4/bed/liftOver/monDom4ToPanTro2.over.chain.gz
rm /cluster/data/panTro2/bed/blastz.monDom4.swap/axtChain/noClass.net
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -swap \
-workhorse=kolossus \
-continue=net \
`pwd`/DEF > swap.net.log 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF > swap.out 2>&1 &
# XXXX running 2006-04-04
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=net -swap `pwd`/DEF > swap.net.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=load -swap `pwd`/DEF > swap.load.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=cleanup -swap `pwd`/DEF > swap.cleanup.out 2>&1 &
time nice -n +19 featureBits panTro2 chainMonDom4Link \
> fb.panTro2.chainMonDom4Link 2>&1 &
# BLASTZ/CHAIN/NET XENTRO2 (DONE 4/22/06 angie)
ssh kkstore04
mkdir /cluster/data/monDom4/bed/blastz.xenTro2.2006-04-20
cd /cluster/data/monDom4/bed/blastz.xenTro2.2006-04-20
cat << '_EOF_' > DEF
# opossum vs. frog
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64
# Use same params as used for mammal-xenTro1 (see makeXenTro1.doc)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Opossum monDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Frog xenTro2 - single chunk big enough to run two of the
# largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/cluster/data/monDom4/bed/blastz.xenTro2.2006-04-20
'_EOF_'
# << emacs
doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/monDom4XenTro2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \
>& do.log & tail -f do.log
ln -s blastz.xenTro2.2006-04-20 /cluster/data/monDom4/bed/blastz.xenTro2
ssh kolossus
time nice -n +19 featureBits monDom4 chainXenTro2Link \
> fb.monDom4.chainXenTro2Link 2>&1 &
cat fb.monDom4.chainXenTro2Link
# 8160051 bases of 3501643220 (2.232%) in intersection
###########################################################################
# CREATE OPOSSUM AND OTHER SPECIES LINEAGE-SPECIFIC REPEATS DIRECTORY
# AND NIB DIRECTORY ON THE SAN FOR BLASTZ CLUSTER RUNS
# (DONE, 2006-04-26, hartera)
# If there are no lineage-specific repeats for opossum and other species
# then use all repeats.
ssh pk
mkdir -p /san/sanvol1/scratch/monDom4/linSpecRep.notInOthers
foreach f (/cluster/data/monDom4/*/chr*.fa.out)
cp -p $f \
/san/sanvol1/scratch/monDom4/linSpecRep.notInOthers/$f:t:r:r.out.spec
end
# need nib files when running Blastz with lineage-specific repeats
# so create these and move to the san.
ssh kkstore04
mkdir -p /cluster/data/monDom4/nib
cd /cluster/data/monDom4
foreach f (*/chr*.fa)
echo "Processing $f ..."
nice faToNib -softMask $f nib/$f:t:r.nib
end
# move nibs to the san as taking up a lot of space on /cluster/data/
ssh pk
mv /cluster/data/monDom4/nib /san/sanvol1/scratch/monDom4/
##########################################################################
# QUALITY (DONE - 2006-04-27 - 2006-04-28 - Hiram)
ssh kkstore04
# this is actually a symlink to store8 to distribute the data on
# the full filesystems on kkstore04
mkdir /cluster/data/monDom4/bed/quality
cd /cluster/data/monDom4/bed/quality
cp -p /cluster/data/monDom2/bed/quality/qaToWigEncode.pl .
# using perl script from previous build
# Update the file name in this file:
my $AGP_FILE="/cluster/data/monDom4/broad.mit.edu/Monodelphis4.0.agp";
# convert the qual.fa to wiggle single column input and then wigEncode:
time nice -n +19 \
zcat /cluster/data/monDom4/broad.mit.edu/Monodelphis4.0.agp.chromosome.qual.gz \
| ./qaToWigEncode.pl /dev/stdin \
| $HOME/bin/$MACHTYPE/wigEncode -noOverlap \
stdin quality.wig quality.wib
# Converted stdin, upper limit 63.00, lower limit 0.00
# real 96m57.628s
# Resulting files:
ls -og quality.wi?
# -rw-rw-r-- 1 3501643220 Apr 27 18:16 quality.wib
# -rw-rw-r-- 1 322734978 Apr 27 18:16 quality.wig
# Note how the byte size of the .wib file is exactly equal
# to the number of bases in a featureBits run without -countGaps
# from the rmsk featureBits above:
# 1814099592 bases of 3501643220 (51.807%) in intersection
# This means we have a valid quality score for every base that is
# not in a gap, which is what we want. If this doesn't work out,
# you can turn this wiggle track into a bed file for each
# contiguous run of data, and then intersect those bed files with
# the gap track to see where they overlap. This happened this
# time when a new type of gap "clone" was found to be defined in
# the agp file. Previously there was only "fragment". The perl
# script had to be altered to recognize this new type of gap.
# For example, this could be done after the table was loaded:
ssh kolossus
cd /tmp
awk '{print $1}' /cluster/data/monDom4/chrom.sizes | while read C
do
hgWiggle -doBed -db=monDom4 -chr=${C} quality > ${C}.qual.bed
done
# then to intersect:
for C in 1 2 3 4 5 6 7 8 X Un
do
featureBits -chrom=chr${C} monDom4 gap chr${C}.qual.bed
done
# should all be zero
ssh hgwdev
cd /cluster/data/monDom4/bed/quality
ln -s `pwd`/quality.wib /gbdb/monDom4/wib
time nice -n +19 hgLoadWiggle monDom4 quality quality.wig
# real 3m29.459s
# verify index:
hgsql -e "show index from quality;" monDom4
# check that Cardinality is a number and is not NULL
####################################################################
### swap Hg18 blastz (DONE - 2006-04-28 - Hiram)
###
ssh pk
mkdir /cluster/data/monDom4/bed/blastz.hg18.swap
cd /cluster/data/monDom4/bed
ln -s blastz.hg18.swap blastz.hg18
cd blastz.hg18.swap
# The original DEF file had iscratch directories for the monDom4
# sequence, for this one we need san directories. This is a copy
# of the original one with the SEQ2_DIR and _LEN changed:
cat << '_EOF_' > DEF
# human vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin
BLASTZ=blastz.v7
# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Human (hg18)
SEQ1_DIR=/scratch/hg/hg18/nib
SEQ1_LEN=/scratch/hg/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum monDom4
SEQ2_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ2_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzMonDom4.2006-02-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
cd /cluster/data/monDom4/bed/blastz.hg18.swap
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap DEF > swap.out 2>&1 &
# real 118m44.093s
ssh kolossus
cd /cluster/data/monDom4/bed/blastz.hg18.swap
time nice -n +19 featureBits monDom4 chainHg18Link \
> fb.monDom4.chainHg18Link 2>&1
# 351100848 bases of 3501643220 (10.027%) in intersection
#######################################################################
### 7-Way conservation (DONE - 2006-04-28 - 2006-05-02 - Hiram)
ssh hgwdev
mkdir /cluster/data/monDom4/bed/multiz7way
cd /cluster/data/monDom4/bed/multiz7way
cp /cluster/data/mm8/bed/multiz17way/17way.nh .
/cluster/bin/phast/tree_doctor --prune \
chimp_panTro2,macaque_rheMac2,rabbit_oryCun1,cow_bosTau2,dog_canFam2,armadillo_dasNov1,elephant_loxAfr1,tenrec_echTel1,tetraodon_tetNig1,fugu_fr1 \
17way.nh > 7way.nh
# this leaves the tree we are working with here:
(((((human_hg18:0.054922,
(rat_rn4:0.081728,mouse_mm8:0.077017):0.335773):0.285925,
monodelphis_monDom4:0.371073):0.189124,
chicken_galGal2:0.454691):0.123297,
xenopus_xenTro2:0.782453):0.156067,
zebrafish_danRer3:0.938628);
/cluster/bin/phast/draw_tree 7way.nh > 7way.ps
/cluster/bin/phast/all_dists 7way.nh > 7way.distances.txt
grep -y monDom4 7way.distances.txt | sort -k3,3n
# Print out that file for reference, and use the calculated
# distances in the table below to order the organisms and check
# the button order on the browser. Zebrafish ends up before
# tetraodon and fugu on the browser despite its distance.
# And if you can fill in the table below entirely, you have
# succeeded in finishing all the alignments required.
#
# featureBits chainLink measures
# chainMonDom4Link chain linearGap
# distance on MonDom4 on other minScore
# 1 0.711920 - human hg18 (% 10.027) (% 12.385) 5000 loose
# 2 1.014888 - chicken galGal2 (% 03.050) (% 09.092) 5000 loose
# 3 1.069788 - mouse mm8 (% 06.024) (% 08.245) 5000 loose
# 4 1.074499 - rat rn4 (% 05.657) (% 07.784) 5000 loose
# 5 1.465947 - frog xenTro2 (% 02.232) (no swap ) 5000 loose
# 6 1.778189 - zebrafish danRer3 (% 02.090) (% 04.001) 5000 loose
cd /cluster/data/monDom4/bed/multiz7way
# bash shell syntax here ...
export H=/cluster/data/monDom4/bed
mkdir mafLinks
for G in hg18 galGal2 mm8 rn4 xenTro2 danRer3
do
mkdir mafLinks/$G
if [ ! -d ${H}/blastz.${G}/mafNet ]; then
echo "missing directory blastz.${G}/mafNet"
fi
ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G
done
# Usually these MAFs are copied to the san for a pk kluster run.
# in this case they are small enough that there is no need to copy
# them anywhere. The alignment can be done right here on the
# kkstore04 filesystem.
# Copy MAFs to some appropriate NFS server for kluster run
# ssh kkstore04
# mkdir /san/sanvol1/scratch/monDom4/multiz7way
# cd /san/sanvol1/scratch/monDom4/multiz7way
# time rsync -a --copy-links --progress \
# /cluster/data/monDom4/bed/multiz7way/mafLinks/ .
# We have about 734 Mb of data here, takes ~ 30 seconds to copy
# the autoMultiz cluster run
ssh kk
mkdir /cluster/data/monDom4/bed/multiz7way/autoMultiz
cd /cluster/data/monDom4/bed/multiz7way/autoMultiz
mkdir penn
# cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
# cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
# cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
cp -p /cluster/bin/penn/multiz.v11.i386/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.i386/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.i386/multiz-tba/autoMZ penn
# create species list and stripped down tree for autoMZ
sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
../7way.nh > tmp.nh
echo `cat tmp.nh` > tree-commas.nh
echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
mkdir -p maf run
cd run
# NOTE: you need to set the db properly in this script
# And the binDir and pairs dir
cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = monDom4
set c = $1
set maf = $2
set binDir = /cluster/data/monDom4/bed/multiz7way/autoMultiz/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /cluster/data/monDom4/bed/multiz7way/mafLinks
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz
cat << '_EOF_' > template
#LOOP
autoMultiz $(root1) {check out line+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
awk '{print $1}' /cluster/data/monDom4/chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
# 10 jobs
para try ... check ... push ... etc ...
# Completed: 10 of 10 jobs
# CPU time in finished jobs: 38783s 646.38m 10.77h 0.45d 0.001 y
# IO & Wait Time: 1184s 19.73m 0.33h 0.01d 0.000 y
# Average job time: 3997s 66.61m 1.11h 0.05d
# Longest finished job: 8706s 145.10m 2.42h 0.10d
# Submission to last job: 8706s 145.10m 2.42h 0.10d
# combine results into a single file for loading and gbdb reference
ssh kkstore04
cd /cluster/data/monDom4/bed/multiz7way
# There used to be a mafFilter here with a minScore of 500, but it
# turns out that the scores in these maf files are pretty much
# useless. They range from very large negatives to very large
# positives.
time catDir maf > multiz7way.maf
# real 1m18.849s
# makes a 2 Gb file:
# -rw-rw-r-- 1 2119554905 May 1 14:23 multiz7way.maf
# Create per-chrom individual maf files for downloads
ssh kkstore04
cd /cluster/data/monDom4/bed/multiz7way
mkdir mafDownloads
time for M in maf/chr*.maf
do
B=`basename $M`
cp -p ${M} mafDownloads/${B}
gzip mafDownloads/${B}
echo ${B} done
done
# real 7m21.713s
# deliver to downloads
ssh hgwdev
ln -s /cluster/data/monDom4/bed/multiz7way/mafDownloads \
/usr/local/apache/htdocs/goldenPath/monDom4/multiz7way
# Load into database
ssh hgwdev
cd /cluster/data/monDom4/bed/multiz7way
mkdir /gbdb/monDom4/multiz7way
ln -s /cluster/data/monDom4/bed/multiz7way/multiz7way.maf \
/gbdb/monDom4/multiz7way
time nice -n +19 hgLoadMaf monDom4 multiz7way
# Loaded 2180653 mafs in 1 files from /gbdb/monDom4/multiz7way
# real 3m35.280s
time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
-maxSize=50000 monDom4 multiz7waySummary multiz7way.maf
# Created 1423996 summary blocks from 5463968 components and
# 2180653 mafs from multiz7way.maf
# real 5m40.531s
# create tree image:
cat << '_EOF_' > species.nh
(((((human,(rat,mouse)),opossum),chicken),frog),zebrafish)
'_EOF_'
/cluster/bin/phast/draw_tree -b -s species.nh > species7.ps
# photoshop to enhance, reduce the amount of whitespace to make it
# smaller, then save as jpg
ln -s /cluster/data/monDom4/bed/multiz7way/species7.jpg \
/usr/local/apache/htdocs/images/phylo/monDom4_7way.jpg
# the chopping of whitespace can be done with Imagemagick
# 'display' - use the "transform -> chop" menu, select horizontal
# or vertical chop, highlight an area to cut
############################################################################
# CREATE CONSERVATION WIGGLE WITH PHASTCONS
# (DONE - 2006-05-01 - 2006-05-02 - Hiram)
# Estimate phastCons parameters
ssh kkstore04
mkdir /cluster/data/monDom4/bed/multiz7way/cons
cd /cluster/data/monDom4/bed/multiz7way/cons
# Create a starting-tree.mod based on chr1 (the largest one)
time /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr1.maf \
--refseq ../../../1/chr1.fa --in-format MAF \
--windows 100000000,1000 --out-format SS \
--between-blocks 5000 --out-root s1
# real 5m50.239s
/cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
--tree "(((((hg18,(rn4,mm8)),monDom4),galGal2),xenTro2),danRer3)" \
--out-root starting-tree
# real 1m11.568s
rm s1.*.ss
# add up the C and G:
grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
# 0.398
# This 0.398 is used in the --gc argument below
# Create big bad bloated SS files on san filesystem (takes ~ 2h 20m)
# Increasing their size this time from 1,000,000 to 10,000,000 to
# slow down the phastCons pk jobs
ssh kkstore04
mkdir -p /san/sanvol1/scratch/monDom4/cons/ss
cd /san/sanvol1/scratch/monDom4/cons/ss
time for C in `awk '{print $1}' /cluster/data/monDom4/chrom.sizes`
do
if [ -s /cluster/data/monDom4/bed/multiz7way/maf/${C}.maf ]; then
mkdir ${C}
echo msa_split $C
chrN=${C/chr/}
chrN=${chrN/_random/}
/cluster/bin/phast/$MACHTYPE/msa_split \
/cluster/data/monDom4/bed/multiz7way/maf/${C}.maf \
--refseq /cluster/data/monDom4/${chrN}/${C}.fa \
--in-format MAF --windows 10000000,0 --between-blocks 5000 \
--out-format SS --out-root ${C}/${C}
fi
done &
# real 28m44.827s
# Create a random list of 50 1 mb regions (do not use the _randoms)
cd /san/sanvol1/scratch/monDom4/cons/ss
ls -1l chr*/chr*.ss | grep -v random | \
awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list
# Set up parasol directory to calculate trees on these 50 regions
ssh kk
mkdir /san/sanvol1/scratch/monDom4/cons/treeRun1
cd /san/sanvol1/scratch/monDom4/cons/treeRun1
mkdir tree log
# Tuning this loop should come back to here to recalculate
# Create little script that calls phastCons with right arguments
# --target-coverage of 0.20 is about right for mouse, will be
# tuned exactly below
cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set C=$1:h
mkdir -p log/${C} tree/${C}
/cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \
/cluster/data/monDom4/bed/multiz7way/cons/starting-tree.mod \
--gc 0.398 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-lengths 12 --target-coverage 0.17 \
--quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
# << happy emacs
chmod a+x makeTree.csh
# Create gensub file
cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1)
#ENDLOOP
'_EOF_'
# << happy emacs
# Make cluster job and run it
gensub2 ../randomSs.list single template jobList
para create jobList
para try/push/check/etc
# Completed: 50 of 50 jobs
# CPU time in finished jobs: 38161s 636.02m 10.60h 0.44d 0.001 y
# IO & Wait Time: 385s 6.41m 0.11h 0.00d 0.000 y
# Average job time: 771s 12.85m 0.21h 0.01d
# Longest finished job: 1672s 27.87m 0.46h 0.02d
# Submission to last job: 1693s 28.22m 0.47h 0.02d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ssh kkstore04
cd /san/sanvol1/scratch/monDom4/cons/treeRun1
ls -1 tree/chr*/*.cons.mod > cons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
--output-average ../ave.cons.mod > cons_summary.txt
ls -1 tree/chr*/*.noncons.mod > noncons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
--output-average ../ave.noncons.mod > noncons_summary.txt
cd ..
cp -p ave.*.mod /cluster/data/monDom4/bed/multiz7way/cons
# measuring entropy
# consEntopy <target coverage> <expected lengths>
# ave.cons.mod ave.noncons.mod --NH 9.78
# never stops with the --NH argument
/cluster/bin/phast/$MACHTYPE/consEntropy .17 12 --NH 9.78 \
ave.cons.mod ave.noncons.mod
# ( Solving for new omega: 12.000000 9.952747 9.669713 9.663148 )
# Trans parameters: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
# Relative entropy: H=1.090778 bits/site
# Expected min. length: L_min=9.400348 sites
# Expected max. length: L_max=6.597224 sites
# Phylogenetic information threshold: PIT=L_min*H=10.253692 bits
# Recommended expected length: omega=9.663148 sites (for L_min*H=9.780000)
XXXX - working 2006-05-01 17:00
### !!! *** This one with .17 and 12 is the one that was finally used
# consEntropy .17 12 ave.cons.mod.4 ave.noncons.mod.4
# Transition params: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
# Relative entropy: H=1.478838 bits/site
# Required length: N=6.753382 sites
# Total entropy: NH=9.987159 bits
# SKIP to here passing by the tuning numbers
ssh kk
# Create cluster dir to do main phastCons run
mkdir /san/sanvol1/scratch/monDom4/cons/consRun1
cd /san/sanvol1/scratch/monDom4/cons
cp /san/sanvol1/scratch/mm8/cons/elliotsEncode.mod .
# This file looks like:
ALPHABET: A C G T
ORDER: 0
SUBST_MOD: REV
TRAINING_LNL: -988246.132962
BACKGROUND: 0.295 0.205 0.205 0.295
RATE_MAT:
-1.165221 0.315494 0.589884 0.259843
0.189778 -0.878194 0.208718 0.479698
0.444622 0.261535 -0.885604 0.179447
0.234867 0.720815 0.215191 -1.170872
TREE: (((((((((((((hg18:0.006690,panTro2:0.007571):0.024272,(colobus_monkey:0.015404,(baboon:0.008258,rheMac2:0.028617):0.008519):0.022120):0.023960,(dusky_titi:0.025662,(owl_monkey:0.012151,marmoset:0.029549):0.008236):0.027158):0.066101,(mouse_lemur:0.059024,galago:0.121375):0.032386):0.017073,((rn4:0.081728,monDom4:0.077017):0.229273,oryCun1:0.206767):0.023340):0.023026,(((bosTau2:0.159182,canFam2:0.147731):0.004946,rfbat:0.138877):0.010150,(hedgehog:0.193396,shrew:0.261724):0.054246):0.024354):0.028505,dasNov1:0.149862):0.015994,(loxAfr1:0.104891,echTel1:0.259797):0.040371):0.218400,monDom4:0.371073):0.065268,platypus:0.468116):0.123856,galGal2:0.454691):0.123297,xenTro2:0.782453):0.156067,((tetNig1:0.199381,fr1:0.239894):0.492961,danRer3:0.782561):0.156067);
cd /san/sanvol1/scratch/monDom4/cons/consRun1
mkdir ppRaw bed
# Create script to run phastCons with right parameters
# These parameters:
# --rho 0.28 --expected-length 14 --target-coverage 0.008 --quiet \
# were taken from Kate's 17-way in Hg17, including the
# -not-informative panTro2
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
cat > doPhast << '_EOF_'
#!/bin/csh -fe
mkdir /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ../elliotsEncode.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss elliotsEncode.mod \
--expected-lengths 12 --target-coverage 0.17 --quiet \
--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score \
--require-informative 0 > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/elliotsEncode.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
# << happy emacs
chmod a+x doPhast
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
cat > template << '_EOF_'
#LOOP
doPhast $(root1) $(file1)
#ENDLOOP
'_EOF_'
# << happy emacs
# Create parasol batch and run it
ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list
gensub2 in.list single template jobList
para create jobList
para try/check/push/etc.
# These jobs are very fast and very I/O intensive, even on the san
# they will hang it up as they work at full tilt.
# Completed: 366 of 366 jobs
# CPU time in finished jobs: 29300s 488.33m 8.14h 0.34d 0.001 y
# IO & Wait Time: 19643s 327.39m 5.46h 0.23d 0.001 y
# Average job time: 134s 2.23m 0.04h 0.00d
# Longest finished job: 247s 4.12m 0.07h 0.00d
# Submission to last job: 1880s 31.33m 0.52h 0.02d
# combine predictions and transform scores to be in 0-1000 interval
# it uses a lot of memory, so on kolossus:
ssh kolossus
cd /san/sanvol1/scratch/monDom4/cons/consRun1
# The sed's and the sort get the file names in chrom,start order
find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
| /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/monDom4/bed/multiz7way
# Figure out how much is actually covered by the bed files as so:
# The 2583393846 comes from the non-n genome size,
# from faSize on all chroms:
ssh kkstore04
cd /cluster/data/monDom4
faSize ?{,?}/chr*.fa
# 3605614649 bases (103971429 N's 3501643220 real 1551686450 upper
# 1949956770 lower) in 10 sequences in 10 files
cd /san/sanvol1/scratch/monDom4/cons/consRun1
awk '
{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/3501643220\n",100.0*sum/3501643220,sum}' \
mostConserved.bed
# --expected-lengths 12 --target-coverage 0.17 --quiet \
# % 4.80 = 100.0*168149380/3501643220
ssh kolossus
cd /san/sanvol1/scratch/monDom4/cons/consRun1
# Aiming for %70 coverage in
# the following featureBits measurement on CDS:
# Beware of negative scores when too high. The logToBedScore
# will output an error on any negative scores.
# I don't think this will work for opossum because we don't have
# enough refGene:cds coverage:
time nice -n +19 featureBits monDom4 refGene:cds
# 34857 bases of 3501643220 (0.001%) in intersection
HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits monDom4 \
-enrichment refGene:cds mostConserved.bed
# --expected-lengths 12 --target-coverage 0.17 --quiet \
# refGene:cds 0.001%, mostConserved.bed 4.802%, both 0.001%, cover
# 75.13%, enrich 15.65x
# Load most conserved track into database
ssh hgwdev
cd /cluster/data/monDom4/bed/multiz7way
time nice -n +19 hgLoadBed -strict monDom4 \
phastConsElements7way mostConserved.bed
# real 2m31.673s
# Loaded 1882640 elements of size 5
# should measure the same as above
time nice -n +19 featureBits monDom4 -enrichment refGene:cds \
phastConsElements7way
# refGene:cds 0.001%, phastConsElements7way 4.802%, both 0.001%,
# cover 75.13%, enrich 15.65x
# Create merged posterier probability file and wiggle track data files
ssh kkstore04
cd /san/sanvol1/scratch/monDom4/cons/consRun1
# the sed business gets the names sorted by chromName, chromStart
# so that everything goes in numerical order into wigEncode
# You will want to test this to make sure it is sorting correctly,
# run this command and see if everything is in order by chrom and
# position:
find ./ppRaw -type f \
| sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | less
time nice -n +19 find ./ppRaw -type f \
| sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| $HOME/bin/$MACHTYPE/wigEncode -noOverlap stdin \
phastCons7.wig phastCons7.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 4m4.694s
# -rw-rw-r-- 1 465425363 May 2 12:42 phastCons7.wib
# -rw-rw-r-- 1 79633105 May 2 12:42 phastCons7.wig
cp -p phastCons7.wi? /cluster/data/monDom4/bed/multiz7way/
# prepare compressed copy of ascii data values for downloads
ssh kkstore04
cd /san/sanvol1/scratch/monDom4/cons/consRun1
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
mkdir -p phastCons7Scores
for D in ppRaw/chr*
do
C=${D/ppRaw\/}
out=phastCons7Scores/${C}.data.gz
echo "========================== ${C} ${D}"
find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
gzip > ${out}
done
'_EOF_'
# << happy emacs
chmod +x gzipAscii.sh
time nice -n +19 ./gzipAscii.sh
# real 8m10.959s
# copy them for downloads
ssh kkstore04
mkdir /cluster/data/monDom4/bed/multiz7way/phastCons7Scores
cd /cluster/data/monDom4/bed/multiz7way/phastCons7Scores
cp -p /san/sanvol1/scratch/monDom4/cons/consRun1/phastCons7Scores/* .
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/monDom4
ln -s /cluster/data/monDom4/bed/multiz7way/phastCons7Scores .
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/monDom4/bed/multiz7way
ln -s `pwd`/phastCons7.wib /gbdb/monDom4/wib/phastCons7.wib
time nice -n +19 hgLoadWiggle monDom4 phastCons7 phastCons7.wig
# real 0m52.109s
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/monDom4/bed/multiz7way
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=monDom4 phastCons7 > histogram.data 2>&1
# real 9m8.075s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Opossum MonDom4 Histogram phastCons7 track"
set xlabel " phastCons7 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
#########################################################################
# Build maf annotation for multiz7way (DONE - 2006-07-31 - Hiram)
ssh kolossus
cd /cluster/data/monDom4/bed/multiz7way
mkdir anno
cd anno
mkdir maf run
cd run
rm sizes nBeds
for i in `cat /cluster/data/monDom4/bed/multiz7way/species.lst`
do
ln -s /cluster/data/$i/chrom.sizes $i.len
ln -s /cluster/data/$i/$i.N.bed $i.bed
echo $i.bed >> nBeds
echo $i.len >> sizes
done
echo "#!/bin/csh -fe" > jobs.csh
echo date >> jobs.csh
# do smaller jobs first, the 'ls -rS' lists in reverse order by size
for i in `ls -rS ../../maf/*.maf`
do
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/monDom4/monDom4.2bit ../maf/`basename $i` >> jobs.csh
echo "echo $i" >> jobs.csh
done
echo date >> jobs.csh
chmod +x jobs.csh
# establish a screen for this long running job
screen
cd /cluster/data/monDom4/bed/multiz7way
time ./jobs.csh > jobs.log 2>&1
# real 195m54.071s
# user 57m51.373s
# sys 128m25.386s
# ctrl-a ctrl-d to release
# to return to the screen:
screen -r -d
cd /cluster/data/monDom4/bed/multiz7way/anno/maf
mkdir -p /gbdb/monDom4/multiz7way/anno/maf
ln -s /cluster/data/monDom4/bed/multiz7way/anno/maf/*.maf \
/gbdb/monDom4/multiz7way/anno/maf
time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/monDom4/multiz7way/anno/maf \
monDom4 multiz7way
# Loaded 2781002 mafs in 10 files from /gbdb/monDom4/multiz7way/anno/maf
# real 1m49.236s
time cat *.maf \
| nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
-maxSize=50000 monDom4 multiz7waySummary stdin
# Created 1423996 summary blocks from 5463968 components and 2781002 mafs
# from stdin
# real 2m30.875s
#######################################################################
# MULTIZ7WAY MAF FRAMES (DONE - 2006-08-01 - Hiram)
ssh hgwdev
mkdir /cluster/data/monDom4/bed/multiz7way/frames
cd /cluster/data/monDom4/bed/multiz7way/frames
# The following is adapted from the sequence in mm8.txt
#------------------------------------------------------------------------
# get the genes for all genomes
# mRNAs with CDS. single select to get cds+psl, then split that up and
# create genePred
# using mrna table as genes:
mkdir genes
for qDB in danRer3
do
tmpExt=`mktemp temp.XXXXXX`
tmpMrnaCds=${qDB}.mrna-cds.${tmpExt}
tmpMrna=${qDB}.mrna.${tmpExt}
tmpCds=${qDB}.cds.${tmpExt}
echo $qDB
hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
from all_mrna,gbCdnaInfo,cds \
where (all_mrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
${qDB} > ${tmpMrnaCds}
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
stdout \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$qDB.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
rm -f $tmpExt
done
# using knownGene for rn4 mm8 hg18
# using refGene for galGal2
# using mgcGenes for xenTro2
# using ensGene for monDom4
# genePreds; (must keep only the first 10 columns for knownGene)
for qDB in rn4 mm8 hg18 galGal2 xenTro2 monDom4
do
if [ $qDB = "xenTro2" ]; then
geneTbl=mgcGenes
elif [ $qDB = "galGal2" ]; then
geneTbl=refGene
elif [ $qDB = "monDom4" ]; then
geneTbl=ensGene
else
geneTbl=knownGene
fi
echo hgsql -N -e 'select * from '"$geneTbl ${qDB}"
hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$qDB.tmp.gz
mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
rm -f $tmpExt
done
#------------------------------------------------------------------------
# create frames
ssh kkstore04
cd /cluster/data/monDom4/bed/multiz7way/frames
rm mkMafFrames.sh
for C in `awk '{print $1;}' /cluster/data/monDom4/chrom.sizes`
do
echo "echo mafFrames/${C}.maf"
echo genePredToMafFrames monDom4 ../mafDownloads/${C}.maf.gz \
mafFrames/${C}.maf \
monDom4 genes/monDom4.gp.gz \
hg18 genes/hg18.gp.gz \
mm8 genes/mm8.gp.gz \
rn4 genes/rn4.gp.gz \
galGal2 genes/galGal2.gp.gz \
danRer3 genes/danRer3.gp.gz \
xenTro2 genes/xenTro2.gp.gz
done > mkMafFrames.sh
chmod +x mkMafFrames.sh
time nice -n +19 ./mkMafFrames.sh
# real 1m10.711s
gzip mafFrames/*.maf
#------------------------------------------------------------------------
# load the database
ssh hgwdev
cd /cluster/data/monDom4/bed/multiz7way/frames
time nice -n +19 hgLoadMafFrames monDom4 multiz7wayFrames \
mafFrames/*.maf.gz
# real 0m17.864s
# The use of this table requires an entry in the trackDb.ra for this:
# frames multiz7wayFrames
# irows on
###########################################################################
# BLASTZ SWAP OF ZEBRAFISH (danRer4) CHAIN AND NET ALIGNMENTS OVER TO
# monDom4 AND CREATE AXNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS
# (DONE, 2006-04-29, hartera)
# see also makeDanRer4.doc
# Remove test tables and directories and start again:
ssh hgwdev
cd /cluster/data/monDom4
foreach c (`cat chrom.lst`)
hgsql -e "drop table chr${c}_chainDanRer4;" monDom4
hgsql -e "drop table chr${c}_chainDanRer4Link;" monDom4
hgsql -e "drop table chr${c}_chainDanRer4NoDyMsk;" monDom4
hgsql -e "drop table chr${c}_chainDanRer4NoDyMskLink;" monDom4
end
hgsql -e "drop table netDanRer4;" monDom4
hgsql -e "drop table netDanRer4NoDyMsk;" monDom4
# remove downloads
rm -r /usr/local/apache/htdocs/goldenPath/monDom4/vsDanRer4
rm \
/usr/local/apache/htdocs/goldenPath/monDom4/liftOver/monDom4ToDanRer4.over.chain.gz
rm /cluster/data/monDom4/bed/liftOver/monDom4ToDanRer4.over.chain.gz
# remove old Blastz swap
rm -r /cluster/data/monDom4/bed/blastz.danRer4.swap
# remove link to old blastz directory
rm -r /cluster/data/monDom4/bed/blastz.danRer4
# Used the following BLASTZ parameters:
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=6000
# BLASTZ_K=2200
# BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
# Used all repeats as lineage-specific repeats.
# Swap directory is: /cluster/data/monDom4/bed/blastz.danRer4.swap
ssh pk
cd /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28
nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF >& doSwap.log &
featureBits monDom4 chainDanRer4Link
# 62249730 bases of 3501643220 (1.778%) in intersection
featureBits monDom4 chainDanRer3Link
# 73187496 bases of 3501643220 (2.090%) in intersection
featureBits monDom4 netDanRer4
# 745122163 bases of 3501643220 (21.279%) in intersection
featureBits monDom4 netDanRer3
# 672446285 bases of 3501643220 (19.204%) in intersection
featureBits -chrom=chr1 monDom4 refGene:cds chainDanRer4Link -enrichment
# refGene:cds 0.001%, chainDanRer4Link 1.807%, both 0.000%,
# cover 43.85%, enrich 24.27x
featureBits -chrom=chr1 monDom4 refGene:cds chainDanRer3Link -enrichment
# refGene:cds 0.001%, chainDanRer3Link 2.044%, both 0.000%,
# cover 49.24%, enrich 24.09x
# Only 36 RefSeqs in monDom4 refGene so try xenoRefGene (78707 sequences)
# and mrna (174 sequences):
featureBits -chrom=chr1 monDom4 xenoRefGene:cds chainDanRer4Link -enrichment
# xenoRefGene:cds 0.820%, chainDanRer4Link 1.807%, both 0.661%,
# cover 80.63%, enrich 44.63x
featureBits -chrom=chr1 monDom4 xenoRefGene:cds chainDanRer3Link -enrichment
# xenoRefGene:cds 0.820%, chainDanRer3Link 2.044%, both 0.577%,
# cover 70.45%, enrich 34.47x
featureBits -chrom=chr1 monDom4 mrna chainDanRer4Link -enrichment
# mrna 0.004%, chainDanRer4Link 1.807%, both 0.002%,
# cover 52.67%, enrich 29.15x
featureBits -chrom=chr1 monDom4 mrna chainDanRer3Link -enrichment
# mrna 0.004%, chainDanRer3Link 2.044%, both 0.002%,
# cover 54.50%, enrich 26.66x
featureBits -chrom=chr1 monDom4 xenoRefGene:cds netDanRer4 -enrichment
# xenoRefGene:cds 0.820%, netDanRer4 24.539%, both 0.720%,
# cover 87.79%, enrich 3.58x
featureBits -chrom=chr1 monDom4 xenoRefGene:cds netDanRer3 -enrichment
# xenoRefGene:cds 0.820%, netDanRer3 22.272%, both 0.640%,
# cover 78.12%, enrich 3.51x
featureBits -chrom=chr1 monDom4 mrna netDanRer4 -enrichment
# mrna 0.004%, netDanRer4 24.539%, both 0.002%, cover 56.90%, enrich 2.32x
featureBits -chrom=chr1 monDom4 mrna netDanRer3 -enrichment
# mrna 0.004%, netDanRer3 22.272%, both 0.002%, cover 62.62%, enrich 2.81x
##########################################################################
# PRODUCING GENSCAN PREDICTIONS (WORKING - 2006-05-02 - Hiram)
# Want to make up a chrom and contig 2bit file to get proper
# names on the genscan predictions.
ssh kkstore04
cd /cluster/data/monDom4/Un
grep chrUn ../broad.mit.edu/Monodelphis4.0.agp > chrUn.broad.agp
cat chrUn.broad.agp | /cluster/bin/scripts/agpToLift > chrUn_random.lft
$HOME/kent/src/hg/utils/lft2BitToFa.pl ../monDom4.2bit chrUn_random.lft \
> chrUn_random.ctg.fa
# Create a 2bit file with the full chrom sequences and the
# random contigs, all hard masked
cat ?/chr?.fa Un/chrUn_random.ctg.fa \
| maskOutFa stdin hard stdout \
| faToTwoBit stdin monDom4Chroms_RandomContigs.hard.2bit
# make sure it still has all the unmasked sequence in it:
twoBitToFa monDom4Chroms_RandomContigs.hard.2bit stdout \
| faSize stdin
# 3589973501 bases (2038287051 N's 1551686450 real 1551686450
# upper 0 lower) in 9949 sequences in 1 files
twoBitToFa monDom4.2bit stdout | faSize stdin
# 3605614649 bases (103971429 N's 3501643220 real 1551686450 upper
# 1949956770 lower) in 10 sequences in 1 files
# note the 'real' bases are the same, the lowers have become N's
# 1089350685 + 97171400 = 1186522085
# 1949956770 + 103971429 = 2053928199
# 2053928199 - 2038287051 = 15641148 == N's in gaps between contigs
# There are a lot of contigs that became completely N's with no
# sequence left. genscan doesn't like that. We need to get them
# out of the analysis. Pick out the ones with over 18 real bases
# left:
twoBitToFa monDom4Chroms_RandomContigs.hard.2bit stdout \
| faCount stdin > chroms_randoms.faCount
egrep -v "^#|^total" chroms_randoms.faCount \
| awk '{if ($2-$7 > 17) {print $1}}' > over18.lst
# creating 4,000,000 sized chunks
mkdir hardChunks
twoBitToFa monDom4Chroms_RandomContigs.hard.2bit stdout \
| faSomeRecords /dev/stdin over18.lst stdout \
| faSplit about stdin 4000000 hardChunks/c_
rsync -a --progress hardChunks/ /cluster/bluearc/monDom4/hardChunks/
ssh hgwdev
mkdir /cluster/data/monDom4/bed/genscan
cd /cluster/data/monDom4/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Run on pk cluster instead of kki since these jobs are going to
# take quite a while and we don't want to tie up the kki kluster
# entirely.
ssh pk
cd /cluster/data/monDom4/bed/genscan
# Make 3 subdirectories for genscan to place output files
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
# Since we split on gaps, we have no chunks like that. You can
# verify with faCount on the chunks.
ls -1S /cluster/bluearc/monDom4/hardChunks/c_*.fa > genome.list
# Create template file, for gensub2. For example (3-line file):
cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan - par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << emacs
gensub2 genome.list single template jobList
para create jobList
para try, check, push, check, ...
XXXX - running 2006-05-04 14:13
# Completed: 906 of 906 jobs
# CPU time in finished jobs: 215764s 3596.07m 59.93h 2.50d 0.007 y
# IO & Wait Time: 2798s 46.63m 0.78h 0.03d 0.000 y
# Average job time: 241s 4.02m 0.07h 0.00d
# Longest finished job: 57335s 955.58m 15.93h 0.66d
# Submission to last job: 63345s 1055.75m 17.60h 0.73d
cat gtf/c_*.gtf | liftUp -type=.gtf t.gtf \
/cluster/data/monDom4/Un/chrUn_random.lft carry stdin
# Concatenate and lift to chrom results:
ssh kkstore04
cd /cluster/data/monDom4/bed/genscan
for C in `awk '{print $1}' ../../chrom.sizes | sed -e "s/chr//"`
do
mkdir -p ${C}
catDir gtf | liftUp ${C}/chr${C}.gtf \
/cluster/data/monDom4/hardChunks.lft error stdin
catDir subopt | liftUp ${C}/chr${C}.bed \
/cluster/data/monDom4/hardChunks.lft error stdin
catDir pep > ${C}/chr${C}.pep
echo "done: chr${C}"
done
# Load into the database as so:
ssh hgwdev
cd /cluster/data/monDom4/bed/genscan
ldHgGene -bin -gtf monDom4 genscan 1/chr1.gtf
# 35103 gene predictions
hgPepPred monDom4 generic genscanPep 1/chr1.pep
hgLoadBed monDom4 genscanSubopt 1/chr1.bed
# Loaded 446953 elements of size 6
# Compare with other assemblies:
featureBits monDom4 genscan
# 48628209 bases of 3496445653 (1.391%) in intersection
featureBits monDom1 genscan
# 47170795 bases of 3492108230 (1.351%) in intersection
featureBits monDom4 genscanSubopt
# 52714123 bases of 3496445653 (1.508%) in intersection
featureBits monDom1 genscanSubopt
# 51805835 bases of 3492108230 (1.484%) in intersection
featureBits hg17 genscan
# 55323340 bases of 2866216770 (1.930%) in intersection
featureBits hg17 genscanSubopt
# 55986178 bases of 2866216770 (1.953%) in intersection
# Clean up
cd /san/sanvol1/scratch/monDom4
rm -fr hardMasked200K
# This kolossus clean up was actually left until much later.
# I found these copies here to be useful later on.
ssh kolossus
cd /scratch/monDom4
rm -f *
cd ..
rmdir monDom4
#########################################################################
# creating bigZips downloads (DONE - 2006-05-15 - Hiram)
ssh hgwdev
mkdir /cluster/data/monDom4/downloads
mkdir /cluster/data/monDom4/downloads/bigZips
mkdir /cluster/data/monDom4/downloads/chromosomes
cd /cluster/data/monDom4/downloads/chromosomes
# There wasn't a README for a previous assembly, these things are
# pretty generic, the mm8 one can be used with only slight edits.
# Next time use the previous Opossum assembly README.
cp /usr/local/apache/htdocs/goldenPath/mm8/chromosomes/README.txt .
# edit that readme to provide correct references and details
ln -s `pwd` /usr/local/apache/htdocs/goldenPath/monDom4/chromosomes
ssh kkstore04
cd /cluster/data/monDom4/downloads/chromosomes
for C in 1 2 3 4 5 6 7 8 X Un
do
echo "gzip -c ../../${C}/chr${C}.fa > chr${C}.fa.gz"
gzip -c ../../${C}/chr${C}.fa > chr${C}.fa.gz
done
md5sum *.fa.gz R*.txt > md5sum.txt
cd ../bigZips
gzip -c ../../broad.mit.edu/Monodelphis4.0.agp > Monodelphis4.0.agp.gz
cd ../..
tar cvzf downloads/bigZips/chromFa.tar.gz ?/chr*.fa Un/chrUn.fa
# 16 minutes
tar cvzf downloads/bigZips/chromFaMasked.tar.gz ?/chr*.fa.masked \
Un/chrUn.fa.masked
# 8 minutes
tar cvzf downloads/bigZips/chromOut.tar.gz ?/chr*.fa.out Un/chrUn.fa.out
cd bed/simpleRepeat
mkdir trfMask
cd trfMask
for C in 1 2 3 4 5 6 7 8 X Un
do
ln -s ../chromMask/chr${C}_Mask.bed chr${C}.bed
done
tar --dereference cvzf ../../downloads/bigZips/chromTrf.tar.gz ./trfMask
ssh hgwdev
cd /cluster/data/monDom4/downloads/bigZips
cp /usr/local/apache/htdocs/goldenPath/mm8/bigZips/README.txt .
ln -s /cluster/data/monDom4/downloads/bigZips \
/usr/local/apache/htdocs/goldenPath/monDom4/bigZips
cd /usr/local/apache/htdocs/goldenPath/monDom4/bigZips
ln -s /cluster/data/monDom4/monDom4.2bit .
ssh kkstore04
cd /cluster/data/monDom4/downloads/bigZips
md5sum *.gz *.2bit R*.txt > md5sum.txt
#########################################################################
# BLASTZ CHICKEN galGal3 (DONE 5/26/06 angie)
ssh pk
mkdir /cluster/data/monDom4/bed/blastzGalGal3.2006-05-25
cd /cluster/data/monDom4/bed/blastzGalGal3.2006-05-25
cat << '_EOF_' > DEF
# opossum vs chicken
BLASTZ=blastz.v7.x86_64
# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Opossum monDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/nib
SEQ1_SMSK=/san/sanvol1/scratch/monDom4/linSpecRep.notInOthers
SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/san/sanvol1/galGal3/nib
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/monDom4/bed/blastzGalGal3.2006-05-25
'_EOF_'
# << emacs
doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/gg3vsmonDom4 \
-bigClusterHub=pk -smallClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=loose \
>& do.log & tail -f do.log
ln -s blastzGalGal3.2006-05-25 /cluster/data/monDom4/bed/blastz.galGal3
#####################################################################
#### LOAD ENSEMBL GENES (DONE - 2006-06-21 - Hiram)
mkdir /cluster/data/monDom4/bed/ensGene
cd /cluster/data/monDom4/bed/ensGene
Get the Ensembl BioMart at http://www.ensembl.org/Multi/martview
Choose Ensembl 39 and Monodelphis domestica, click next
It displays status in a window on the right, indicating how many
entries are here, currently: 20,907
The next page is the "filter" step, we do not want any filters,
nothing is changed on this page, click next
Now we are on the "output" tab, the filter in the window on the right
indicates that 20,907 passed the filter. (there is no filter)
Now, on this output page, change the pull-down menu item from
its default of "features" to read "structures"
All the check-boxes now change.
Mark the check box GTF under output format
Under Gene Ensemble Attributes,
Unselect Biotype
Select
Ensembl Gene ID
Ensembl Transcript ID
External Gene ID
gzip compression
and give it a filename: ensGeneMonDom4
it will add the .gff.gz suffix
press "export"
# They have a chrMT - we don't have the chrM sequence,
# Add "chr" to front of each line
# in the gene data gtf file to make
# it compatible with ldHgGene and convert remove chrMT name
zcat ensGeneMonDom4.gff.gz \
| sed -e "s/^\([0-9XYU][0-9n]*\)/chr\1/; /^MT/d" > ensGene.gtf
# Verify names converted OK:
awk '{print $1}' ensGene.gtf | sort | uniq -c
# 153247 chr1
# 126767 chr2
# 82637 chr3
# 90199 chr4
# 47661 chr5
# 50469 chr6
# 34230 chr7
# 55841 chr8
# 41868 chrUn
# 9965 chrX
ldHgGene -gtf monDom4 ensGene ensGene.gtf
# Read 33878 transcripts in 692884 lines in 1 files
# 33878 groups 10 seqs 1 sources 4 feature types
# 33878 gene predictions
featureBits monDom4 ensGene
# 32770559 bases of 3501643220 (0.936%) in intersection
# Let's see if the peptides created from the genome sequence are
# different than the peptides that Ensemble says they are making:
# fetch chrX peptides from the ensGene table, sort them by their
# name into single line instances:
getRnaPred -peptides monDom4 ensGene chrX stdout \
| faToTab -type=protein stdin stdout | sort > ensGenePep.chrX.fa
# And chrX peptides downloaded from ensmart:
faToTab -type=protein ensPepMonDom4chrX.fasta.gz stdout \
| sort > ensPepchrX.fa
# A diff of those two show they are different. But this is not
# correct yet because they don't even have the same names in them
# need to try some other options here to get the same sequence
# It turns out it is necessary to have the ensGtp table to enable a link
# to Ensemble from a gene's details page
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
# Save file as monDom4.ensGtp.txt.gz
gunzip ensGtp.txt.gz
hgsql monDom4 < ~/kent/src/hg/lib/ensGtp.sql
# remove header line from ensGtp.txt
zcat monDom4.ensGtp.txt.gz \
| hgsql monDom4 \
-e "load data local infile '/dev/stdin' into table ensGtp ignore 1 lines"
# Obtaining protein sequence from EnsMart
# Select "sequences" from the pull-down on the output page
# check Peptide in the "Sequences" selection area
# and "Ensembl Transcript ID (versioned) in the Transcript
# Attributes area
# Uncheck all items in the Gene Attributes section.
# Text,Fasta output, gzip, file name: monDom4.ensPep
# becomes monDom4.ensPep.fasta.gz
# The result should be just an ID and a peptide, e.g.:
# >ENSMODT00000024417.2
# MSEFWLCFNCCIAEQPQPKRRRRIDRSMIGEPTNFVHTAHVGSGDLFSGMNSVSSIQSQMQSKGGYGGGMPSNGQMQLVETKAG
#
# monDom4.ensPep.fa.gz
zcat monDom4.ensPep.fa.gz | \
$HOME/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \
> ensGene.fa.tab
# There appear to be more peptides than there were genes before.
# I wonder if Ensembl has updated this list since the gene data was
# fetched before:
awk '{print $1}' genePred.tab | sort -u > geneNameList
awk '{print $1}' ensGene.fa.tab | sort -u > pepNameList
wc geneNameList pepNameList
# 33878 33878 711438 geneNameList
# 33915 33914 712193 pepNameList
# 67793 67792 1423631 total
comm -12 geneNameList pepNameList | wc
# 33878 33878 711438
# So create an exclude list to get rid of the extras:
comm -13 geneNameList pepNameList > excludeList
# and leave them out
zcat monDom4.ensPep.fa.gz | \
$HOME/kent/src/utils/faToTab/faToTab.pl excludeList /dev/stdin \
> ensGene.fa.tab
hgPepPred monDom4 tab ensPep ensGene.fa.tab
##########################################################################
# N-SCAN gene predictions (nscanGene, nscanPep) - (2006-09-13 markd)
cd /cluster/data/monDom4/bed/nscan/
# obtained NSCAN predictions from michael brent's group
# at WUSTL
wget -nv -r -l 1 -np --accept=gtf http://ardor.wustl.edu/predictions/possum/monDom4/chr_gtf
wget -nv -r -l 1 -np --accept=fa http://ardor.wustl.edu/predictions/possum/monDom4/chr_ptx
# clean up downloaded directorys and compress files.
mv ardor.wustl.edu/predictions/opossum/monDom4/* .
rm -rf ardor.wustl.edu
gzip chr_*/*
chmod a-w chr_*/*.gz
# load tracks. Note that these have *utr features, rather than
# exon features. currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -bin -gtf -genePredExt monDom4 nscanGene chr_gtf/chr*.gtf.gz
# load protein, add .a suffix to match transcript id
hgPepPred -suffix=.a monDom4 generic nscanPep chr_ptx/chr*.fa.gz
rm *.tab
# update trackDb; need a monDom4-specific page to describe informants
opossum/monDom4/nscanGene.html (copy from rn4 and edit)
opossum/monDom4/trackDb.ra
# verify that the search spec matches the naming convention, sometimes
# transcripts end with .a othertimes .0
OK!
#########################################################################
# BLASTZ PLATYPUS ornAna1 (DONE 4/3/07 angie)
ssh pk
mkdir /cluster/data/monDom4/bed/blastz.ornAna1.2007-04-02
cd /cluster/data/monDom4/bed/blastz.ornAna1.2007-04-02
cat << '_EOF_' > DEF
# opossum vs platypus
BLASTZ=blastz.v7.x86_64
# Use "mammal-fish" params even though this is mammal-mammal...
# pretty distant, hopefully not too many shared undermasked repeats,
# and if we get overwhelmed we can back off:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Opossum monDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/nib
SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Platypus ornAna1
SEQ2_DIR=/san/sanvol1/scratch/ornAna1/ornAna1.2bit
SEQ2_LEN=/san/sanvol1/scratch/ornAna1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/cluster/data/monDom4/bed/blastz.ornAna1.2007-04-02
'_EOF_'
# << emacs
doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/monDom4ornAna1 \
-bigClusterHub=pk -smallClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=loose \
>& do.log & tail -f do.log
ln -s blastz.ornAna1.2007-04-02 /cluster/data/monDom4/bed/blastz.ornAna1
#########################################################################
# BLASTZ Rhesus rheMac2 (DONE 2007-05-27 braney)
ssh kolossus
mkdir /cluster/data/monDom4/bed/blastz.rheMac2
cd /cluster/data/monDom4/bed/blastz.rheMac2
cat << '_EOF_' > DEF
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.v7.x86_64
# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Opossum monDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ1_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Rhesus (rheMac2)
SEQ2_DIR=/san/sanvol1/scratch/rheMac2/nib
SEQ2_LEN=/san/sanvol1/scratch/rheMac2/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/monDom4/bed/blastz.rheMac2
TMPDIR=/scratch/tmp
# << emacs
doBlastzChainNet.pl DEF \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
>& blastz.out 2>&1 &
############################################################################
## BLASTZ swap from canFam2 alignments (2007-11-10 - markd)
# /cluster/data/canFam2/bed/blastz.monDom4/DEF1 has incorrect BASE, so
# create DEF.fixed
ssh hgwdev
mkdir /cluster/data/monDom4/bed/blastz.canFam2.swap
cd /cluster/data/monDom4/bed/blastz.canFam2.swap
ln -s blastz.canFam2.swap ../blastz.canFam2
(time /cluster/bin/scripts/doBlastzChainNet.pl \
-swap /cluster/data/canFam2/bed/blastz.monDom4/DEF.fixed) >& swap.out&
nice -n +19 featureBits monDom4 chainCanFam2Link
# 325604035 bases of 3501643220 (9.299%) in intersection
##############################################################################
# Blastz Orangutan ponAbe2 (DONE - 2007-11-26 - Hiram)
ssh kkstore04
# managing disk space on full filesystem
mkdir /cluster/store8/monDom4/bed/blastzPonAbe2.2007-11-22
ln -s /cluster/store8/monDom4/bed/blastzPonAbe2.2007-11-22 \
/cluster/data/monDom4/bed
cd /cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22
screen # use screen to control this job
mkdir /cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13
cd /cluster/data/ponAbe2/bed/blastzOrnAna1.2007-11-13
cat << '_EOF_' > DEF
# Opossum vs. Orangutan
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50
# TARGET: Opossum MonDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ1_LEN=/cluster/data/monDom4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=100
# QUERY: Orangutan PonAbe2
SEQ2_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
SEQ2_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ1_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# XXX - can run Opossum only on pk since the chroms are so large
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-stop=cat -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
> cat.log 2>&1 &
# real 1600m18.602s
# Completed: 134688 of 134688 jobs
# CPU time in finished jobs: 12106044s 201767.41m 3362.79h 140.12d 0.384 y
# IO & Wait Time: 851946s 14199.09m 236.65h 9.86d 0.027 y
# Average job time: 96s 1.60m 0.03h 0.00d
# Longest finished job: 11205s 186.75m 3.11h 0.13d
# Submission to last job: 95155s 1585.92m 26.43h 1.10d
time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
-continue=chainRun -stop=download -chainLinearGap=loose \
-bigClusterHub=pk -verbose=2 \
> chainRun.log 2>&1 &
# real 2724m19.530s
cat fb.monDom4.chainPonAbe2Link.txt
# 395887393 bases of 3501643220 (11.306%) in intersection
# and for the swap
mkdir /cluster/data/ponAbe2/bed/blastz.monDom4.swap
cd /cluster/data/ponAbe2/bed/blastz.monDom4.swap
time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \
/cluster/data/monDom4/bed/blastzPonAbe2.2007-11-22/DEF \
-swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
> swap.log 2>&1 &
# real 223m38.325s
cat fb.ponAbe2.chainMonDom4Link.txt
# 406409435 bases of 3093572278 (13.137%) in intersection
##############################################################################
# Blastz Marmoset calJac1 (WORKING - 2007-11-24 - Hiram)
ssh kkstore04
screen # use screen to control this job
# managing disk space on full filesystem
mkdir /cluster/store8/monDom4/bed/blastzCalJac1.2007-11-24
ln -s /cluster/store8/monDom4/bed/blastzCalJac1.2007-11-24 \
/cluster/data/monDom4/bed
cd /cluster/data/monDom4/bed/blastzCalJac1.2007-11-24
# this chunking makes 651,480 jobs
cat << '_EOF_' > DEF
# Opossum vs. Marmoset
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50
# TARGET: Opossum MonDom4
SEQ1_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ1_LEN=/cluster/data/monDom4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=100
# QUERY: Marmoset calJac1, largest contig 2,551,648, 49,724 contigs
SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
SEQ2_CHUNK=2000000
SEQ2_LIMIT=200
SEQ2_LAP=0
BASE=/cluster/data/monDom4/bed/blastzCalJac1.2007-11-24
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
# XXX - can run Opossum only on pk since the chroms are so large
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=5000 \
-stop=cat -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
> cat.log 2>&1 &
# real 2581m49.853s
# Completed: 651480 of 651480 jobs
# CPU time in finished jobs: 25871528s 431192.14m 7186.54h 299.44d 0.820 y
# IO & Wait Time: 2695828s 44930.46m 748.84h 31.20d 0.085 y
# Average job time: 44s 0.73m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1804s 30.07m 0.50h 0.02d
# Submission to last job: 146641s 2444.02m 40.73h 1.70d
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=5000 \
-continue=chainRun -stop=download \
-chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
> chainRun.log 2>&1 &
# real 1810m38.601s
# the 10 chaining jobs on kki take about 35 hours for the longest job
# Completed: 10 of 10 jobs
# CPU time in finished jobs: 369079s 6151.32m 102.52h 4.27d 0.012 y
# IO & Wait Time: 385s 6.41m 0.11h 0.00d 0.000 y
# Average job time: 36946s 615.77m 10.26h 0.43d
# Longest finished job: 103069s 1717.82m 28.63h 1.19d
# Submission to last job: 103069s 1717.82m 28.63h 1.19d
cat fb.monDom4.chainCalJac1Link.txt
# 386915334 bases of 3501643220 (11.050%) in intersection
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=5000 \
-continue=cleanup \
-chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
> cleanup.log 2>&1 &
mkdir /cluster/data/calJac1/bed/blastz.monDom4.swap
cd /cluster/data/calJac1/bed/blastz.monDom4.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/monDom4/bed/blastzCalJac1.2007-11-24/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-swap -bigClusterHub=pk > swap.log 2>&1 &
# real 498m24.344s
# this failed during the load because the chainLink table became
# too large. These were loaded manually with an sql statement to
# start the table definition, then a load data local infile using
# the .tab files left over from the failed load. Note the extra
# definitions on the chainMonDom4Link table
CREATE TABLE chainMonDom4 (
bin smallint(5) unsigned NOT NULL default '0',
score double not null, # score of chain
tName varchar(255) not null, # Target sequence name
tSize int unsigned not null, # Target sequence size
tStart int unsigned not null, # Alignment start position in target
tEnd int unsigned not null, # Alignment end position in target
qName varchar(255) not null, # Query sequence name
qSize int unsigned not null, # Query sequence size
qStrand char(1) not null, # Query strand
qStart int unsigned not null, # Alignment start position in query
qEnd int unsigned not null, # Alignment end position in query
id int unsigned not null, # chain id
#Indices
KEY tName (tName(16),bin),
KEY id (id)
) TYPE=MyISAM;
time nice -n +19 hgsql -e \
"load data local infile \"chain.tab\" into table chainMonDom4;" calJac1
# real 8m10.273s
CREATE TABLE chainMonDom4Link (
bin smallint(5) unsigned NOT NULL default 0,
tName varchar(255) NOT NULL default '',
tStart int(10) unsigned NOT NULL default 0,
tEnd int(10) unsigned NOT NULL default 0,
qStart int(10) unsigned NOT NULL default 0,
chainId int(10) unsigned NOT NULL default 0,
KEY tName (tName(16),bin),
KEY chainId (chainId)
) ENGINE=MyISAM max_rows=160000000 avg_row_length=55 pack_keys=1 CHARSET=latin1;
time nice -n +19 hgsql -e \
"load data local infile \"link.tab\" into table chainMonDom4Link;" calJac1
# this one took a number of hours
# finish the nets and load
time nice -n +19 netClass -verbose=0 -noAr noClass.net \
calJac1 monDom4 calJac1.monDom4.net
# real 4m18.898s
time nice -n +19 netFilter -minGap=10 calJac1.monDom4.net \
| hgLoadNet -verbose=0 calJac1 netMonDom4 stdin
# real 0m54.559s
cd /cluster/data/calJac1/bed/blastz.monDom4.swap
time nice -n +19 featureBits calJac1 chainMonDom4Link \
> fb.calJac1.chainMonDom4Link.txt 2>&1
# real 25m7.373s
cat fb.calJac1.chainMonDom4Link.txt
# 391248654 bases of 2929139385 (13.357%) in intersection
# continuing
cd /cluster/data/calJac1/bed/blastz.monDom4.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/monDom4/bed/blastzCalJac1.2007-11-24/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-continue=download -swap -bigClusterHub=pk > download.log 2>&1 &
###########################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
+############################################################################
+# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
+
+vertebrate-wide transMap alignments were built Tracks are created and loaded
+by a single Makefile. This is available from:
+ svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
+
+see doc/builds.txt for specific details.
+############################################################################