src/hg/makeDb/doc/bosTau4.txt 1.23
1.23 2010/04/01 17:33:10 hiram
never got susScr1 to work finished with susScr2 lastz
Index: src/hg/makeDb/doc/bosTau4.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/bosTau4.txt,v
retrieving revision 1.22
retrieving revision 1.23
diff -b -B -U 1000000 -r1.22 -r1.23
--- src/hg/makeDb/doc/bosTau4.txt 25 Jan 2010 19:16:14 -0000 1.22
+++ src/hg/makeDb/doc/bosTau4.txt 1 Apr 2010 17:33:10 -0000 1.23
@@ -1,1891 +1,1944 @@
# for emacs: -*- mode: sh; -*-
# Bos Taurus -- Baylor Release 4.0 (Oct 4 2007)
# "$Id$"
#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2008-03-07 - Hiram)
ssh kkstore06
mkdir /cluster/store3/bosTau4
ln -s /cluster/store3/bosTau4 /cluster/data/bosTau4
mkdir /cluster/data/bosTau4/baylor
cd /cluster/data/bosTau4/baylor
for F in README.Btau20070913.txt Contigs/*
do
wget --timestamping \
"ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20070913-freeze/${F}"
done
mkdir chroms
cd chroms
wget --timestamping \
"ftp://ftp.hgsc.bcm.tmc.edu/pub/data/Btaurus/fasta/Btau20070913-freeze/LinearScaffolds/*"
# Looking up chrM in Entrez nucleotide search,
# from bosTau3 note: NC_006853 GI:60101824
# search for Bos taurus mitochondrion complete genome
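# to eyeball that record from the command line (a sketch only, not part of
# the build -- makeGenomeDb.pl fetches the mitoAcc itself), NCBI efetch works:
wget -O NC_006853.fa \
"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NC_006853&rettype=fasta&retmode=text"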
# Note, their Contigs files are from their 2006 release.
# Their chrom files are new to this release.
# fixup the Chr case names in the agp file:
sed -e "s/^Chr/chr/" Btau20070913.All.ac.agp > bosTau4.agp
# fixup the contig names in the contigs.fa and qual files, utilize
# the following perl scripts:
cat << '_EOF_' > fixContigNames.pl
#!/usr/bin/env perl
use warnings;
use strict;
my $argc = scalar(@ARGV);
if (1 != $argc) {
print "usage ./fixContigNames.pl Btau20060815.contigs.fa.gz \\\n",
" | gzip > bosTau4.contigs.fa.gz\n";
exit 255;
}
my $fName = shift;
open (FH,"zcat $fName|") or die "can not read $fName $!";
while (my $line = <FH>) {
if ($line =~ m/^>/) {
my @a = split('\|', $line);
$a[3] =~ s/\.[0-9]+//;
print ">$a[3]\n";
} else {
print "$line";
}
}
close (FH)
'_EOF_'
# << happy emacs
cat << '_EOF_' > fixQuals.pl
#!/usr/bin/env perl
use warnings;
use strict;
my $argc = scalar(@ARGV);
if (1 != $argc) {
print "usage ./fixQuals.pl Btau20060815.contigs.fa.qual.gz \\\n",
" | gzip > bosTau4.qual.gz\n";
exit 255;
}
my $fName = shift;
open (FH,"zcat $fName|") or die "can not read $fName $!";
while (my $line = <FH>) {
if ($line =~ m/^>/) {
$line =~ s/\.[0-9]+.*//;
}
print "$line";
}
close (FH)
'_EOF_'
# << happy emacs
# There are extra records in the qual and contigs fa files.
# Only the sequences in the agp file should be used:
grep "^chr" bosTau4.agp | egrep -v "clone|fragment" | cut -f6 \
| sort > agp.names
# run those scripts, extract only the used records, and lift the
# qual scores via the AGP file to a qac file.
chmod +x fixContigNames.pl fixQuals.pl
./fixContigNames.pl Btau20060815.contigs.fa.gz \
| faSomeRecords stdin agp.names stdout | gzip > bosTau4.contigs.fa.gz
./fixQuals.pl Btau20060815.contigs.fa.qual.gz \
| faSomeRecords stdin agp.names stdout \
| qaToQac stdin stdout \
| qacAgpLift bosTau4.agp stdin bosTau4.qual.qac
#############################################################################
# running makeGenomeDb.pl (DONE - 2008-03-07 - Hiram)
ssh kkstore06
cd /cluster/data/bosTau4
cat << '_EOF_' > bosTau4.config.ra
db bosTau4
clade mammal
scientificName Bos Taurus
assemblyDate Oct. 2007
assemblyLabel Baylor Release 4.0
orderKey 236
dbDbSpeciesDir cow
mitoAcc 60101824
agpFiles /cluster/data/bosTau4/baylor/bosTau4.agp
fastaFiles /cluster/data/bosTau4/baylor/bosTau4.contigs.fa.gz
qualFiles /cluster/data/bosTau4/baylor/bosTau4.qual.qac
commonName Cow
'_EOF_'
# << happy emacs
makeGenomeDb.pl bosTau4.config.ra > makeGenomeDb.log 2>&1
#########################################################################
# REPEATMASKER (DONE - 2008-03-07 - Hiram)
ssh kkstore06
screen # use a screen to manage this job
mkdir /cluster/data/bosTau4/bed/repeatMasker
cd /cluster/data/bosTau4/bed/repeatMasker
#
# This was going too slow on memk, on the order of 8 days.
# chilled that memk batch, then switched it to finish on pk
doRepeatMasker.pl -buildDir=/cluster/data/bosTau4/bed/repeatMasker \
-bigClusterHub=memk bosTau4 > do.log 2>&1 &
# continuing
doRepeatMasker.pl -buildDir=/cluster/data/bosTau4/bed/repeatMasker \
-continue=cat -bigClusterHub=pk bosTau4 > cat.log 2>&1 &
time nice -n +19 featureBits bosTau4 rmsk > fb.bosTau4.rmsk.txt 2>&1 &
# 1280927549 bases of 2731830700 (46.889%) in intersection
# RepeatMasker and lib version from do.log:
# RepeatMasker,v 1.20 2008/01/16 18:15:45 angie Exp $
# Jan 11 2008 (open-3-1-9) version of RepeatMasker
# CC RELEASE 20071204;
# Compare coverage to previous assembly:
featureBits bosTau3 rmsk
# 1200525422 bases of 2731807384 (43.946%) in intersection
featureBits bosTau2 rmsk
# 1291561904 bases of 2812203870 (45.927%) in intersection
#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-03-07 - Hiram)
ssh kkstore06
screen # use a screen to manage this job
mkdir /cluster/data/bosTau4/bed/simpleRepeat
cd /cluster/data/bosTau4/bed/simpleRepeat
#
doSimpleRepeat.pl -buildDir=/cluster/data/bosTau4/bed/simpleRepeat \
bosTau4 > do.log 2>&1 &
# after RM run is done, add this mask:
cd /cluster/data/bosTau4
twoBitMask bosTau4.rmsk.2bit -add bed/simpleRepeat/trfMask.bed bosTau4.2bit
twoBitToFa bosTau4.2bit stdout | faSize stdin
# 2917974530 bases (186161294 N's 2731813236 real
# 1450855409 upper 1280957827 lower) in 11900 sequences in 1 files
# %43.90 masked total, %46.89 masked real
twoBitToFa bosTau4.rmsk.2bit stdout | faSize stdin
# 2917974530 bases (186161294 N's 2731813236 real
# 1451369861 upper 1280443375 lower) in 11900 sequences in 1 files
# %43.88 masked total, %46.87 masked real
#########################################################################
# Create OOC file for genbank runs (DONE - 2008-03-10 - Hiram)
# use same repMatch value as bosTau2
ssh kkstore06
cd /cluster/data/bosTau4
blat bosTau4.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=bosTau4.11.ooc -repMatch=1005
ssh kkr1u00
mkdir /iscratch/i/bosTau4
cd /iscratch/i/bosTau4
cp -p /cluster/data/bosTau4/bosTau4.2bit .
for R in 2 3 4 5 6 7 8
do
rsync -a --progress ./ kkr${R}u00:/iscratch/i/bosTau4/
done
#########################################################################
# Starting Genbank
ssh hgwdev
cd $HOME/kent/src/hg/makeDb/genbank
# edit etc/genbank.conf to add the following entry:
# bosTau4 (B. taurus) 31 chromosomes plus 11869 scaffolds
bosTau4.serverGenome = /cluster/data/bosTau4/bosTau4.2bit
bosTau4.clusterGenome = /iscratch/i/bosTau4/bosTau4.2bit
bosTau4.ooc = /cluster/data/bosTau4/bosTau4.11.ooc
bosTau4.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter}
bosTau4.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
bosTau4.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
bosTau4.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
bosTau4.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter}
bosTau4.lift = no
bosTau4.downloadDir = bosTau4
bosTau4.perChromTables = no
bosTau4.mgc = yes
cvs ci -m "Added bosTau4." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
ssh genbank
screen # control this business with a screen since it takes a while
cd /cluster/data/genbank
# This is a call to a script that will push our jobs out to the cluster
# since it's a big job.
time nice -n +19 bin/gbAlignStep -initial bosTau4 &
# logFile: var/build/logs/2008.03.10-14:14:43.bosTau4.initalign.log
# real 567m7.431s
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad bosTau4
# logFile: var/dbload/hgwdev/logs/2008.03.11-08:48:17.dbload.log
# real 34m55.565s
# enable daily alignment and update of hgwdev (DONE - 2008-03-11 - Hiram)
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add bosTau4 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added bosTau4." etc/align.dbs etc/hgwdev.dbs
make etc-update
############################################################################
# DONE - 2008-03-11 - Hiram
# Reset default position to an area of chr3 with a number of genes
hgsql -e \
'update dbDb set defaultPos="chr3:21083267-21232422" where name="bosTau4";' \
hgcentraltest
# And there was a mistake in the description date entry
hgsql -e \
'update dbDb set description="Oct. 2007" where name="bosTau4";' \
hgcentraltest
#########################################################################
## genscan run (DONE - 2008-03-11 - Hiram)
## create hard masked sequence
ssh kkstore06
cd /cluster/data/bosTau4
mkdir hardMasked
for C in `cut -f1 chrom.sizes`
do
echo "hardMasked/${C}.hard.fa"
twoBitToFa -seq=${C} bosTau4.2bit stdout \
| maskOutFa stdin hard hardMasked/${C}.hard.fa
ls -ld "hardMasked/${C}.hard.fa"
done
# And, make sure there aren't any sequences in this lot that have
# become all N's with no sequence left in them. This drives genscan nuts
echo hardMasked/*.hard.fa | xargs faCount > faCount.hard.txt
# the lowest three are:
egrep -v "^#|^total" faCount.hard.txt \
| awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3
# chrUn.004.9957 0
# chrUn.004.9961 0
# chrUn.004.9996 0
# There are a whole bunch of these, and many with just a few bases.
# Actually, before removing these for genscan, run the cpgIsland
# business first since it can work on them all.
# So, remove any with less than 100 bases of sequence
egrep -v "^#|^total" faCount.hard.txt \
| awk '{size=$2-$7; if (size < 100){printf "hardMasked/%s.hard.fa\n", $1}}' \
| xargs rm
# now get them over to a kluster location
mkdir /san/sanvol1/scratch/bosTau4/hardChunks
cd /san/sanvol1/scratch/bosTau4/hardChunks
# creating 4,000,000 sized chunks, the chroms stay together as
# single pieces. The contigs get grouped together into 4,000,000
# sized fasta files. You don't want to break these things up
# because genscan will be doing its own internal 2.4 million
# window on these pieces, and the gene names are going to be
# constructed from the sequence name in these fasta files.
echo /cluster/data/bosTau4/hardMasked/*.hard.fa | xargs cat \
| faSplit about stdin 4000000 c_
ssh hgwdev
mkdir /cluster/data/bosTau4/bed/genscan
cd /cluster/data/bosTau4/bed/genscan
# Check out hg3rdParty/genscanlinux to get latest genscan:
cvs co hg3rdParty/genscanlinux
# Run on small cluster (more mem than big cluster).
ssh memk
cd /cluster/data/bosTau4/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
# Since we split on gaps, we have no chunks like that. You can
# verify with faCount on the chunks.
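# a sketch of that faCount check: any chunk sequence with zero real bases
# (length in column 2 minus the N count in column 7) would print here
faCount /san/sanvol1/scratch/bosTau4/hardChunks/c_*.fa \
    | egrep -v "^#|^total" | awk '$2 - $7 == 0'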
ls -1Sr /san/sanvol1/scratch/bosTau4/hardChunks/c_*.fa > genome.list
# Create run-time script to operate gsBig in a cluster safe manner
cat << '_EOF_' > runGsBig
#!/bin/csh -fe
set runDir = `pwd`
set srcDir = $1
set inFile = $2
set fileRoot = $inFile:r
mkdir /scratch/tmp/$fileRoot
cp -p $srcDir/$inFile /scratch/tmp/$fileRoot
pushd /scratch/tmp/$fileRoot
/cluster/bin/x86_64/gsBig $inFile $fileRoot.gtf -trans=$fileRoot.pep -subopt=$fileRoot.bed -exe=$runDir/hg3rdParty/genscanlinux/genscan -par=$runDir/hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000
popd
cp -p /scratch/tmp/$fileRoot/$fileRoot.gtf gtf
cp -p /scratch/tmp/$fileRoot/$fileRoot.pep pep
cp -p /scratch/tmp/$fileRoot/$fileRoot.bed subopt
rm -fr /scratch/tmp/$fileRoot
'_EOF_'
# << happy emacs
chmod +x runGsBig
cat << '_EOF_' > template
#LOOP
runGsBig /san/sanvol1/scratch/bosTau4/hardChunks $(file1) {check out line gtf/$(root1).gtf} {check out line pep/$(root1).pep} {check out line subopt/$(root1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 genome.list single template jobList
para create jobList
para try, check, push, check, ...
# Completed: 97 of 97 jobs
# CPU time in finished jobs: 93952s 1565.87m 26.10h 1.09d 0.003 y
# IO & Wait Time: 372s 6.19m 0.10h 0.00d 0.000 y
# Average job time: 972s 16.21m 0.27h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 4870s 81.17m 1.35h 0.06d
# Submission to last job: 7902s 131.70m 2.19h 0.09d
# cat and lift the results into single files
ssh kkstore06
cd /cluster/data/bosTau4/bed/genscan
sort -k1,1 -k4.4n gtf/c_*.gtf > genscan.gtf
sort -k1,1 -k2,2n subopt/c_*.bed > genscanSubopt.bed
cat pep/c_*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/bosTau4/bed/genscan
ldHgGene bosTau4 -gtf genscan genscan.gtf
# Read 49598 transcripts in 346887 lines in 1 files
# 49598 groups 5059 seqs 1 sources 1 feature types
# 49598 gene predictions
hgPepPred bosTau4 generic genscanPep genscan.pep
hgLoadBed bosTau4 genscanSubopt genscanSubopt.bed
# Loaded 528092 elements of size 6
# let's check the numbers
time nice -n +19 featureBits bosTau4 genscan
# 58224594 bases of 2731830700 (2.131%) in intersection
time nice -n +19 featureBits bosTau3 genscan
# 59251085 bases of 2731807384 (2.169%) in intersection
#########################################################################
# CPGISLANDS (DONE - 2008-03-11 - Hiram)
ssh hgwdev
mkdir /cluster/data/bosTau4/bed/cpgIsland
cd /cluster/data/bosTau4/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
# There was a problem in here, in both cpg.c and cpg_lh.c:
# warning: conflicting types for built-in function 'malloc'
# comment out the lines where malloc is declared to get this to build
# gcc readseq.c cpg_lh.c -o cpglh.exe
cd ../..
ln -s hg3rdParty/cpgIslands/cpglh.exe .
# There may be warnings about "bad character" for IUPAC ambiguous
# characters like R, S, etc. Ignore the warnings.
mkdir results
echo ../../hardMasked/*.hard.fa | sed -e "s/ /\n/g" | while read F
do
FA=${F/*\/}
C=${FA/.hard.fa/}
echo "./cpglh.exe ${FA} > results/${C}.cpg"
nice -n +19 ./cpglh.exe ${F} > results/${C}.cpg
done > cpglh.out 2>&1 &
# about 5 minutes
# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# << happy emacs
catDir results \
| awk -f filter.awk | sort -k1,1 -k2,2n > cpgIsland.bed
ssh hgwdev
cd /cluster/data/bosTau4/bed/cpgIsland
hgLoadBed bosTau4 cpgIslandExt -tab \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
# Reading cpgIsland.bed
# Loaded 37595 elements of size 10
featureBits bosTau4 cpgIslandExt
# 24202824 bases of 2731830700 (0.886%) in intersection
featureBits bosTau3 cpgIslandExt
# 24374280 bases of 2731807384 (0.892%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET BOSTAU4 (DONE - 2008-03-11 - Hiram)
ssh kkstore02
screen # use a screen to manage this multi-day job
mkdir /cluster/data/hg18/bed/blastzBosTau4.2008-03-11
cd /cluster/data/hg18/bed/
ln -s blastzBosTau4.2008-03-11 blastz.bosTau4
cd blastzBosTau4.2008-03-11
cat << '_EOF_' > DEF
BLASTZ_M=50
# TARGET: Human Hg18
SEQ1_DIR=/cluster/bluearc/scratch/data/hg18/nib
SEQ1_LEN=/cluster/data/hg18/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau4
SEQ2_DIR=/san/sanvol1/scratch/bosTau4/bosTau4.2bit
SEQ2_LEN=/cluster/data/bosTau4/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=300
SEQ2_CHUNK=20000000
SEQ2_LAP=0
BASE=/cluster/data/hg18/bed/blastzBosTau4.2008-03-11
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-syntenicNet > do.log 2>&1
# real 611m17.901s
cat fb.hg18.chainBosTau4Link.txt
# 1345348636 bases of 2881515245 (46.689%) in intersection
mkdir /cluster/data/bosTau4/bed/blastz.hg18.swap
cd /cluster/data/bosTau4/bed/blastz.hg18.swap
time nice -n +19 doBlastzChainNet.pl \
/cluster/data/hg18/bed/blastzBosTau4.2008-03-11/DEF \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -syntenicNet > do.log 2>&1
# broken down during netSynteny.csh due to too many open files on
# a chainSplit
# To get them split, set up a job on memk to do the split
# via chainFilter: chainFilter -t=${T} bigFile.chain > ${T}.chain
# Where the targets T are from the list:
# grep "^chain " bigFile.chain | awk '{print $3}' | sort -u
# see full example below in Platypus Blastz
###########################################################################
# Blastz Platypus ornAna1 (DONE - 2008-03-11,14 - Hiram)
ssh kkstore06
screen # use screen to control this job
mkdir /cluster/data/bosTau4/bed/blastzOrnAna1.2008-03-11
cd /cluster/data/bosTau4/bed/blastzOrnAna1.2008-03-11
cat << '_EOF_' > DEF
# Cow vs. platypus
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50
# TARGET: Cow bosTau4
SEQ1_DIR=/san/sanvol1/scratch/bosTau4/bosTau4.2bit
SEQ1_LEN=/cluster/data/bosTau4/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=300
SEQ1_CHUNK=20000000
SEQ1_LAP=0
# QUERY: Platypus ornAna1
SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit
SEQ2_LEN=/cluster/data/ornAna1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0
BASE=/cluster/data/bosTau4/bed/blastzOrnAna1.2008-03-11
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
-chainMinScore=5000 -verbose=2 \
-chainLinearGap=loose -bigClusterHub=kk > do.log 2>&1 &
# real 1233m16.397s
# the pk kluster became free as this job had a long time to go.
# Unsure how long it had to go because the batch became confused
# and couldn't calculate an ETA (it showed waiting=0 jobs).  So, chill
# out the batch, allow the running jobs to complete,
# then go to pk and finish the remaining jobs there.
# There was a difficulty with the para.results file. It lost track
# of 1158 jobs, but they were completed and results exist.
# then continuing:
time nice -n +19 doBlastzChainNet.pl `pwd`/DEF \
-chainMinScore=5000 -verbose=2 -smallClusterHub=memk \
-continue=cat -chainLinearGap=loose -bigClusterHub=pk > cat.log 2>&1 &
# real 66m36.465s
cat fb.bosTau4.chainOrnAna1Link.txt
# 201799073 bases of 2731830700 (7.387%) in intersection
mkdir /cluster/data/ornAna1/bed/blastz.bosTau4.swap
cd /cluster/data/ornAna1/bed/blastz.bosTau4.swap
time nice -n +19 doBlastzChainNet.pl \
/cluster/data/bosTau4/bed/blastzOrnAna1.2008-03-11/DEF \
-chainMinScore=5000 -verbose=2 -smallClusterHub=memk \
-swap -chainLinearGap=loose -bigClusterHub=pk > do.log 2>&1 &
# real 106m17.385s
cat fb.ornAna1.chainBosTau4Link.txt
# 187868681 bases of 1842236818 (10.198%) in intersection
# need to split up the chain file to run the reciprocal net
# there are too many chroms to work with chain split
# do it with chainFilter as a kluster job on memk
ssh memk
mkdir /scratch/tmp/ornAna1
cd /scratch/tmp/ornAna1
for R in 0 1 2 3 4 5 6 7
do
echo $R
ssh mkr0u${R} mkdir /scratch/tmp/ornAna1
ssh mkr0u${R} cp -p /cluster/data/bosTau4/bed/blastz.ornAna1/axtChain/bosTau4.ornAna1.all.chain.gz /scratch/tmp/ornAna1
ssh mkr0u${R} gunzip /scratch/tmp/ornAna1/bosTau4.ornAna1.all.chain.gz
done
zcat /cluster/data/bosTau4/bed/blastz.ornAna1/axtChain/bosTau4.ornAna1.all.chain.gz | grep "^chain " | awk '{print $3}' | sort -u > chain.list
cat << '_EOF_' > splitOne.csh
#!/bin/csh -fe
set tmpDir = /scratch/tmp/ornAna1
set T=$1
chainFilter -t=${T} $tmpDir/bosTau4.ornAna1.all.chain > $tmpDir/${T}.chain
'_EOF_'
# << happy emacs
cat << '_EOF_' > template
#LOOP
./splitOne.csh $(file1)
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 chain.list single template jobList
para create jobList
para try ... check ... push ... time ...
# fetch the results from tmp directories
for R in 0 1 2 3 4 5 6 7
do
ssh mkr0u${R} "(cd /scratch/tmp/ornAna1; cp -p -u chr*.chain /cluster/data/bosTau4/bed/blastz.ornAna1/axtChain/chain)"
done
############################################################################
# BLATSERVERS ENTRY (DONE - 2008-03-12 - Hiram)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("bosTau4", "blat2", "17778", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("bosTau4", "blat2", "17779", "0", "1");' \
hgcentraltest
# test it with some sequence
#########################################################################
## 5-Way Multiz (DONE - 2008-03-18 - Hiram)
##
# the all.chain.gz files were split up via kluster jobs on memk
# in order to get mafSynNet files. Example above in ornAna1 blastz
ssh hgwdev
mkdir /cluster/data/bosTau4/bed/multiz5way
cd /cluster/data/bosTau4/bed/multiz5way
# take the 30-way tree from mm9 and eliminate genomes not in
# this alignment
# rearrange to get bosTau4 on the top of the graph
# paste this tree into the on-line phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to create the image for the tree diagram
# select the 5 organisms from the 30-way recently done on mouse mm9
/cluster/bin/phast/tree_doctor \
--prune-all-but Human_hg18,Mouse_mm9,Dog_canFam2,Platypus_ornAna1,Cow_bosTau3 \
/cluster/data/mm9/bed/multiz30way/mm9OnTop.fullNames.nh \
| sed -e "s/bosTau3/bosTau4/g" > 5-way.fullNames.nh
# looks something like this:
(((Mouse_mm9:0.325818,Human_hg18:0.126901):0.019763,
(Dog_canFam2:0.156637,Cow_bosTau4:0.163945):0.031326):0.332197,
Platypus_ornAna1:0.488110);
# rearrange to get Cow at the top:
# this leaves us with:
cat << '_EOF_' > bosTau4.5-way.nh
(((Cow_bosTau4:0.163945,Dog_canFam2:0.156637):0.031326,
(Human_hg18:0.126901,Mouse_mm9:0.325818):0.019763):0.332197,
Platypus_ornAna1:0.488110);
'_EOF_'
# << happy emacs
# create a species list from that file:
sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' bosTau4.5-way.nh \
| sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
| sed -e "s/.*_//; s/:.*//" | sort > species.list
# verify that has 5 db names in it
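# quick check, should print 5 and list bosTau4 canFam2 hg18 mm9 ornAna1:
wc -l species.list
cat species.list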
# create a stripped down nh file for use in autoMZ run
echo \
`sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' bosTau4.5-way.nh \
| sed -e "s/ / /g"` > tree.5.nh
# that looks like, as a single line:
# (((bosTau4 canFam2) (hg18 mm9)) ornAna1)
# verify all blastz's exists
cat << '_EOF_' > listMafs.csh
#!/bin/csh -fe
cd /cluster/data/bosTau4/bed/multiz5way
foreach db (`grep -v bosTau4 species.list`)
set bdir = /cluster/data/bosTau4/bed/blastz.$db
if (-e $bdir/mafRBestNet/chr1.maf.gz) then
echo "$db mafRBestNet"
else if (-e $bdir/mafSynNet/chr1.maf.gz) then
echo "$db mafSynNet"
else if (-e $bdir/mafNet/chr1.maf.gz) then
echo "$db mafNet"
else
echo "$db mafs not found"
endif
end
'_EOF_'
# << happy emacs
chmod +x ./listMafs.csh
# see what it says:
./listMafs.csh
# canFam2 mafSynNet
# hg18 mafSynNet
# mm9 mafSynNet
# ornAna1 mafSynNet
/cluster/bin/phast/all_dists bosTau4.5-way.nh > 5way.distances.txt
grep -i bostau 5way.distances.txt | sort -k3,3n
# Cow_bosTau4 Dog_canFam2 0.320582
# Cow_bosTau4 Human_hg18 0.341935
# Cow_bosTau4 Mouse_mm9 0.540852
# Cow_bosTau4 Platypus_ornAna1 1.015578
# use the calculated
# distances in the table below to order the organisms and check
# the button order on the browser.
# And if you can fill in the table below entirely, you have
# succeeded in finishing all the alignments required.
#
# featureBits chainLink measures
# chainCalJac1Link chain linearGap
#                       chainBosTau4Link   chain    linearGap
#       distance        on bosTau4      on other        minScore
# 2 0.341935 Human_hg18 (% 74.810) (% 77.648) 3000 medium
# 3 0.540852 Mouse_mm9 (% 76.925) (% 74.694) 3000 medium
# 4 1.015578 Platypus_ornAna1 (% 77.296) (% 76.308) 3000 medium
# create a coherent set of all the mafs involved in this run
mkdir mafLinks
cd mafLinks
ln -s ../../blastz.canFam2/mafSynNet ./canFam2
ln -s ../../blastz.hg18/mafSynNet ./hg18
ln -s ../../blastz.mm9/mafSynNet ./mm9
ln -s ../../blastz.ornAna1/mafSynNet ./ornAna1
# check data size:
du -hscL *
# 982M canFam2
# 940M hg18
# 483M mm9
# 79M ornAna1
# 2.5G total
# copy net mafs to cluster-friendly storage
ssh kkstore06
cd /cluster/data/bosTau4/bed/multiz5way/mafLinks
mkdir -p /san/sanvol1/scratch/bosTau4/multiz5way/mafs
rsync -a --progress --copy-links ./ \
/san/sanvol1/scratch/bosTau4/multiz5way/mafs/
# create a run-time list of contigs to operate on, not all contigs
# exist in all alignments, but we want all contig names used in any
# alignment:
cd /san/sanvol1/scratch/bosTau4/multiz5way/mafs
for D in *
do
cd "${D}"
find . -type f
cd ..
done | sed -e "s#^./##; s#.maf##" | sort -u > /tmp/5-way.contig.list
wc -l /tmp/5-way.contig.list
# 2321 /tmp/5-way.contig.list
# ready for the multiz run
ssh pk
mkdir /cluster/data/bosTau4/bed/multiz5way/splitRun
cd /cluster/data/bosTau4/bed/multiz5way/splitRun
scp -p kkstore06:/tmp/5-way.contig.list .
mkdir -p maf run
cd run
mkdir penn
# use latest penn utilities
P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
cp -p $P/{autoMZ,multiz,maf_project} penn
# set the db and pairs directories here
cat << '_EOF_' > autoMultiz.csh
#!/bin/csh -ef
set db = bosTau4
set c = $1
set result = $2
set resultDir = $result:h
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz5way/mafs
rm -fr $tmp
mkdir -p $tmp
mkdir -p $resultDir
cp ../../tree.5.nh ../../species.list $tmp
pushd $tmp
foreach s (`grep -v $db species.list`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.5.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $result
rm -fr $tmp
rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
'_EOF_'
# << happy emacs
chmod +x autoMultiz.csh
cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/bosTau4/bed/multiz5way/splitRun/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << emacs
gensub2 ../5-way.contig.list single template jobList
para create jobList
para try ... check ... push ... etc
# Completed: 2321 of 2321 jobs
# CPU time in finished jobs: 35680s 594.67m 9.91h 0.41d 0.001 y
# IO & Wait Time: 9398s 156.63m 2.61h 0.11d 0.000 y
# Average job time: 19s 0.32m 0.01h 0.00d
# Longest finished job: 1982s 33.03m 0.55h 0.02d
# Submission to last job: 3983s 66.38m 1.11h 0.05d
# put the split maf results back together into a single maf file
# eliminate duplicate comments
ssh kkstore06
cd /cluster/data/bosTau4/bed/multiz5way
grep "^##maf version" splitRun/maf/chr1.maf \
| sort -u > bosTau4.5way.maf
for F in `find ./splitRun/maf -type f -depth`
do
grep -h "^#" "${F}" | egrep -v "maf version=1|eof maf" \
| sed -e "s#/_MZ_[^ ]* # #g; s#__[0-9]##g"
done | sort -u >> bosTau4.5way.maf
for F in `find ./splitRun/maf -type f -depth`
do
grep -v -h "^#" "${F}"
done >> bosTau4.5way.maf
grep "^##eof maf" splitRun/maf/chr1.maf \
| sort -u >> bosTau4.5way.maf
# load tables for a look
ssh hgwdev
mkdir -p /gbdb/bosTau4/multiz5way/maf
ln -s /cluster/data/bosTau4/bed/multiz5way/*.maf \
/gbdb/bosTau4/multiz5way/maf/multiz5way.maf
# this generates an immense multiz5way.tab file in the directory
# where it is running. Best to run this over in scratch.
cd /scratch/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/bosTau4/multiz5way/maf bosTau4 multiz5way
# real 3m31.593s
# Loaded 5392296 mafs in 1 files from /gbdb/bosTau4/multiz5way/maf
# load summary table
time nice -n +19 cat /gbdb/bosTau4/multiz5way/maf/*.maf \
| hgLoadMafSummary bosTau4 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz5waySummary stdin
# real 3m41.663s
# Created 779088 summary blocks from 10697864 components and 5164426 mafs from stdin
# Gap Annotation
# prepare bed files with gap info
ssh kkstore06
mkdir /cluster/data/bosTau4/bed/multiz5way/anno
cd /cluster/data/bosTau4/bed/multiz5way/anno
mkdir maf run
# these actually already all exist from previous multiple alignments
# remove the echo in front of the twoBitInfo to actually make it work
for DB in `cat ../species.list`
do
CDIR="/cluster/data/${DB}"
if [ ! -f ${CDIR}/${DB}.N.bed ]; then
echo "creating ${DB}.N.bed"
echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
else
ls -og ${CDIR}/${DB}.N.bed
fi
done
cd run
rm -f nBeds sizes
for DB in `grep -v bosTau4 ../../species.list`
do
echo "${DB} "
ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
echo ${DB}.len >> sizes
done
ssh memk
# temporarily copy the bosTau4.5way.maf file onto the memk
# nodes /scratch/data/bosTau4/maf/ directory
for R in 0 1 2 3 4 5 6 7
do
ssh mkr0u${R} rsync -a --progress \
/cluster/data/bosTau4/bed/multiz5way/bosTau4.5way.maf \
/scratch/data/bosTau4/maf/
done
mkdir /cluster/data/bosTau4/bed/multiz5way/anno/splitMaf
# need to split up the single maf file into individual
# per-scaffold maf files to run annotation on
cd /cluster/data/bosTau4/bed/multiz5way/anno/splitMaf
# create bed files to list approximately 1553 scaffolds in
# a single list, approximately 8 lists
cat << '_EOF_' > mkBedLists.pl
#!/usr/bin/env perl
use strict;
use warnings;
my $bedCount = 0;
my $i = 0;
my $bedFile = sprintf("file_%d.bed", $bedCount);
open (BF,">$bedFile") or die "can not write to $bedFile $!";
open (FH,"</cluster/data/bosTau4/chrom.sizes") or
die "can not read /cluster/data/bosTau4/chrom.sizes $!";
while (my $line = <FH>) {
chomp $line;
if ( (($i + 1) % 1553) == 0 ) {
printf "%s\n", $line;
close (BF);
++$bedCount;
$bedFile = sprintf("file_%d.bed", $bedCount);
open (BF,">$bedFile") or die "can not write to $bedFile $!";
}
++$i;
my ($chr, $size) = split('\s+',$line);
printf BF "%s\t0\t%d\t%s\n", $chr, $size, $chr;
}
close (FH);
close (BF);
'_EOF_'
# << happy emacs
chmod +x mkBedLists.pl
./mkBedLists.pl
# now, run a mafsInRegion on each one of those lists
cat << '_EOF_' > runOne
#!/bin/csh -fe
set runDir = "/cluster/data/bosTau4/bed/multiz5way/anno/splitMaf"
set resultDir = $1
set bedFile = $resultDir.bed
mkdir -p $resultDir
mkdir -p /scratch/tmp/bosTau4/$resultDir
pushd /scratch/tmp/bosTau4/$resultDir
mafsInRegion $runDir/$bedFile -outDir . \
/scratch/data/bosTau4/maf/bosTau4.5way.maf
popd
rsync -q -a /scratch/tmp/bosTau4/$resultDir/ ./$resultDir/
rm -fr /scratch/tmp/bosTau4/$resultDir
rmdir --ignore-fail-on-non-empty /scratch/tmp/bosTau4
'_EOF_'
# << happy emacs
chmod +x runOne
cat << '_EOF_' > template
#LOOP
./runOne $(root1)
#ENDLOOP
'_EOF_'
# << happy emacs
ls file*.bed > runList
gensub2 runList single template jobList
para create jobList
para try ... check ... push ... etc
# Completed: 8 of 8 jobs
# CPU time in finished jobs: 1137s 18.96m 0.32h 0.01d 0.000 y
# IO & Wait Time: 552s 9.19m 0.15h 0.01d 0.000 y
# Average job time: 211s 3.52m 0.06h 0.00d
# Longest finished job: 599s 9.98m 0.17h 0.01d
# Submission to last job: 599s 9.98m 0.17h 0.01d
cd /cluster/data/bosTau4/bed/multiz5way/anno/run
cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set outDir = ../maf/$2
set result = $3
set input = $1
mkdir -p $outDir
cat $input | \
nice mafAddIRows -nBeds=nBeds stdin /scratch/data/bosTau4/bosTau4.2bit $result
'_EOF_'
# << happy emacs
chmod +x doAnno.csh
cat << '_EOF_' > template
#LOOP
./doAnno.csh $(path1) $(lastDir1) {check out line+ ../maf/$(lastDir1)/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
find ../splitMaf -type f -name "*.maf" > maf.list
gensub2 maf.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 2320 of 2320 jobs
# CPU time in finished jobs: 8405s 140.09m 2.33h 0.10d 0.000 y
# IO & Wait Time: 6934s 115.56m 1.93h 0.08d 0.000 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest finished job: 369s 6.15m 0.10h 0.00d
# Submission to last job: 834s 13.90m 0.23h 0.01d
# put the results back together into a single file
ssh kkstore06
cd /cluster/data/bosTau4/bed/multiz5way/anno
grep "^##maf version" maf/file_0/chr1.maf \
| sort -u > bosTau4.anno.5way.maf
find ./maf -type f -depth -name "*.maf" | while read F
do
grep -v -h "^#" "${F}"
done >> bosTau4.anno.5way.maf
echo "##eof maf" >> bosTau4.anno.5way.maf
ssh hgwdev
cd /cluster/data/bosTau4/bed/multiz5way/anno
mkdir -p /gbdb/bosTau4/multiz5way/anno
ln -s `pwd`/bosTau4.anno.5way.maf \
/gbdb/bosTau4/multiz5way/anno/multiz5way.maf
# by loading this into the table multiz5way, it will replace the
# previously loaded table with the unannotated mafs
# huge temp files are made, do them on local disk
cd /scratch/tmp
time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/bosTau4/multiz5way/anno \
bosTau4 multiz5way
# Loaded 6711605 mafs in 1 files from /gbdb/bosTau4/multiz5way/anno
# real 4m54.025s
# normally filter this for chrom size > 1,000,000 and only load
# those chroms. But this is a scaffold assembly, load everything:
time nice -n +19 hgLoadMafSummary bosTau4 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz5waySummary \
/gbdb/bosTau4/multiz5way/anno/multiz5way.maf
# Created 779088 summary blocks from 10697864 components
# and 6412983 mafs from /gbdb/bosTau4/multiz5way/anno/multiz5way.maf
# real 4m6.782s
# by loading this into the table multiz5waySummary, it will replace
# the previously loaded table with the unannotated mafs
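# for reference, a sketch of the chrom-size filter mentioned above (not
# applied here): keep only sequences over 1,000,000 bases and feed just
# their mafs to hgLoadMafSummary
awk '$2 > 1000000 {print $1}' /cluster/data/bosTau4/chrom.sizes \
    > /tmp/chroms.over1M.list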
# remove the multiz5way*.tab files in this /scratch/tmp directory
rm multiz5way*.tab
# And, you can remove the previously loaded non-annotated maf file link:
rm /gbdb/bosTau4/multiz5way/maf/multiz5way.maf
rmdir /gbdb/bosTau4/multiz5way/maf
###########################################################################
## Annotate 5-way multiple alignment with gene annotations
## (DONE - 2008-03-18 - Hiram)
# Gene frames
## given previous survey done for 9-way alignment on Marmoset
## and the appearance of new ensGene tables on everything
# use knownGene for hg18, mm9
# use ensGene for canFam2, ornAna1
# and refGene for bosTau4
ssh hgwdev
mkdir /cluster/data/bosTau4/bed/multiz5way/frames
cd /cluster/data/bosTau4/bed/multiz5way/frames
mkdir genes
# knownGene
for DB in hg18 mm9
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# ensGene
for DB in canFam2 ornAna1
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# refGene
for DB in bosTau4
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from refGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
ls -og genes
# -rw-rw-r-- 1 861210 Mar 18 15:40 bosTau4.gp.gz
# -rw-rw-r-- 1 1865308 Mar 18 15:40 canFam2.gp.gz
# -rw-rw-r-- 1 2008806 Mar 18 15:39 hg18.gp.gz
# -rw-rw-r-- 1 1965274 Mar 18 15:39 mm9.gp.gz
# -rw-rw-r-- 1 1347532 Mar 18 15:40 ornAna1.gp.gz
ssh kkstore06
cd /cluster/data/bosTau4/bed/multiz5way/frames
# anything to annotate is in a pair, e.g.: bosTau4 genes/bosTau4.gp.gz
time (cat ../anno/bosTau4.anno.5way.maf | nice -n +19 genePredToMafFrames bosTau4 stdin stdout bosTau4 genes/bosTau4.gp.gz hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz canFam2 genes/canFam2.gp.gz ornAna1 genes/ornAna1.gp.gz | gzip > multiz5way.mafFrames.gz) > frames.log 2>&1
# see what it looks like in terms of number of annotations per DB:
zcat multiz5way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
# 77526 bosTau4
# 110768 ornAna1
# 221524 mm9
# 230010 hg18
# 243396 canFam2
# load the resulting file
ssh hgwdev
cd /cluster/data/bosTau4/bed/multiz5way/frames
time nice -n +19 hgLoadMafFrames bosTau4 multiz5wayFrames \
multiz5way.mafFrames.gz
# real 0m21.968s
# enable the trackDb entries:
# frames multiz5wayFrames
# irows on
#############################################################################
# phastCons 5-way (DONE - 2008-03-19 - Hiram)
# split 5way mafs into 10M chunks and generate sufficient statistics
# files for # phastCons
ssh memk
mkdir /cluster/data/bosTau4/bed/multiz5way/msa.split
cd /cluster/data/bosTau4/bed/multiz5way/msa.split
mkdir -p /san/sanvol1/scratch/bosTau4/multiz5way/cons/ss
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/bosTau4/bed/multiz5way/anno/maf
set WINDOWS = /san/sanvol1/scratch/bosTau4/multiz5way/cons/ss
pushd $WINDOWS
set resultDir = $1
set c = $2
rm -fr $resultDir/$c
mkdir -p $resultDir
twoBitToFa -seq=$c /scratch/data/bosTau4/bosTau4.2bit /scratch/tmp/bosTau4.$c.fa
# need to truncate odd-ball scaffold/chrom names that include dots
# as phastCons utils can't handle them
set TMP = /scratch/tmp/$c.clean.maf.$$
perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$resultDir/$c.maf > $TMP
/cluster/bin/phast/$MACHTYPE/msa_split $TMP -i MAF \
-M /scratch/tmp/bosTau4.$c.fa \
-o SS -r $resultDir/$c -w 10000000,0 -I 1000 -B 5000
rm -f /scratch/tmp/bosTau4.$c.fa
rm -f $TMP
popd
mkdir -p $resultDir
date > $resultDir/$c.out
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
cat << '_EOF_' > template
#LOOP
doSplit.csh $(dir1) $(root1) {check out line+ $(dir1)/$(root1).out}
#ENDLOOP
'_EOF_'
# << happy emacs
# create list of maf files:
(cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list
gensub2 maf.list single template jobList
para create jobList
para try ... check ... etc
# Completed: 2320 of 2320 jobs
# CPU time in finished jobs: 1710s 28.50m 0.47h 0.02d 0.000 y
# IO & Wait Time: 6951s 115.85m 1.93h 0.08d 0.000 y
# Average job time: 4s 0.06m 0.00h 0.00d
# Longest finished job: 128s 2.13m 0.04h 0.00d
# Submission to last job: 1048s 17.47m 0.29h 0.01d
# take the cons and noncons trees from the mouse 30-way
# Estimates are not easy to make, probably more correctly,
# take the 30-way .mod file, and re-use it here.
ssh hgwdev
cd /cluster/data/bosTau4/bed/multiz5way
cp -p /cluster/data/mm9/bed/multiz30way/mm9.30way.mod .
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
ssh memk
mkdir -p /cluster/data/bosTau4/bed/multiz5way/cons/run.cons
cd /cluster/data/bosTau4/bed/multiz5way/cons/run.cons
# there are going to be several different phastCons runs using
# this same script. They trigger off of the current working directory
# $cwd:t which is the "grp" in this script. It is one of:
# all gliers placentals
# Well, that's what it was when used in the Mm9 30-way,
# in this instance, there is only the directory "all"
cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast/bin
set subDir = $1
set f = $2
set c = $2:r
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set tmp = /scratch/tmp/$f
set cons = /cluster/data/bosTau4/bed/multiz5way/cons
mkdir -p $tmp
set san = /san/sanvol1/scratch/bosTau4/multiz5way/cons
if (-s $cons/$grp/$grp.non-inf) then
cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
else
cp -p $cons/$grp/$grp.mod $tmp
cp -p $san/ss/$subDir/$f.ss $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative `cat $grp.non-inf` \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p $san/$grp/pp/$subDir $san/$grp/bed/$subDir
sleep 4
touch $san/$grp/pp/$subDir $san/$grp/bed/$subDir
rm -f $san/$grp/pp/$subDir/$f.pp
rm -f $san/$grp/bed/$subDir/$f.bed
mv $tmp/$f.pp $san/$grp/pp/$subDir
mv $tmp/$f.bed $san/$grp/bed/$subDir
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod a+x doPhast.csh
# Create parasol batch and run it
pushd /san/sanvol1/scratch/bosTau4/multiz5way/cons
find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" \
> /cluster/data/bosTau4/bed/multiz5way/cons/ss.list
popd
# run for all species
cd ..
mkdir -p all run.cons/all
cd all
/cluster/bin/phast.cz/tree_doctor ../../mm9.30way.mod \
--prune-all-but=bosTau3,hg18,mm9,canFam2,ornAna1 \
| sed -e "s/bosTau3/bosTau4/" > all.mod
cd ../run.cons/all
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create template file for "all" run
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(lastDir1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/bosTau4/multiz5way/cons/all/pp/$(lastDir1)/$(file1).pp}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../../ss.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 2569 of 2569 jobs
# CPU time in finished jobs: 8636s 143.93m 2.40h 0.10d 0.000 y
# IO & Wait Time: 17371s 289.52m 4.83h 0.20d 0.001 y
# Average job time: 10s 0.17m 0.00h 0.00d
# Longest finished job: 44s 0.73m 0.01h 0.00d
# Submission to last job: 1008s 16.80m 0.28h 0.01d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/bosTau4/multiz5way/cons/all
find ./bed -type f -name "chr*.bed" | xargs cat \
| sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 3 minutes
cp -p mostConserved.bed /cluster/data/bosTau4/bed/multiz5way/cons/all
# load into database
ssh hgwdev
cd /cluster/data/bosTau4/bed/multiz5way/cons/all
time nice -n +19 hgLoadBed bosTau4 phastConsElements5way mostConserved.bed
# Loaded 1005876 elements of size 5
# Try for 5% overall cov, and 70% CDS cov
# We don't have any gene tracks to compare CDS coverage
# --rho .31 --expected-length 45 --target-coverage .3
featureBits bosTau4 phastConsElements5way
# 132010504 bases of 2731830700 (4.832%) in intersection
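# the 70% CDS coverage check mentioned above would look like this once a
# gene track (e.g. refGene from the genbank run) is loaded (a sketch):
featureBits bosTau4 refGene:cds phastConsElements5way -enrichment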
# Create merged posterior probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /san/sanvol1/scratch/bosTau4/multiz5way/cons/all
mkdir -p phastCons5wayScores
for D in `ls -1d pp/file* | sort -t_ -k2n`
do
TOP=`pwd`
F=${D/pp\/}
out=${TOP}/phastCons5wayScores/${F}.data.gz
echo "${D} > ${F}.data.gz"
cd ${D}
find . -name "*.pp" -type f \
| sed -e "s#^./##; s/chrUn.004./chrUn_004_/; s/-/.-./" \
| sort -t '.' -k1,1 -k3.3n \
| sed -e "s/.-./-/; s/chrUn_004_/chrUn.004./" | xargs cat \
| gzip > ${out}
cd "${TOP}"
done
# copy those files to the downloads area:
# /cluster/data/bosTau4/bed/multiz5way/downloads/phastCons5way/phastConsScores
# for hgdownload downloads
cd /san/sanvol1/scratch/bosTau4/multiz5way/cons/all
ls -1 phastCons5wayScores/*.data.gz | sort -t_ -k2n | xargs zcat \
| wigEncode -noOverlap stdin phastCons5way.wig phastCons5way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
time nice -n +19 cp -p *.wi? /cluster/data/bosTau4/bed/multiz5way/cons/all
# real 0m40.875s
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/bosTau4/bed/multiz5way/cons/all
ln -s `pwd`/phastCons5way.wib /gbdb/bosTau4/multiz5way/phastCons5way.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/bosTau4/multiz5way bosTau4 \
phastCons5way phastCons5way.wig
# real 1m5.667s
# remove garbage
rm wiggle.tab
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/bosTau4/bed/multiz5way/cons/all
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=bosTau4 phastCons5way > histogram.data 2>&1
# real 3m37.316s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Cow BosTau4 Histogram phastCons5way track"
set xlabel " phastCons5way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
# These trackDb entries turn on the wiggle phastCons data track:
# type wigMaf 0.0 1.0
# maxHeightPixels 100:40:11
# wiggle phastCons5way
# spanList 1
# autoScale Off
# windowingFunction mean
# pairwiseHeight 12
# yLineOnOff Off
#############################################################################
# Downloads (DONE - 2008-01-11 - Hiram)
# Let's see if the downloads will work
ssh hgwdev
cd /cluster/data/bosTau4
# expecting to find repeat masker .out file here:
ln -s bed/repeatMasker/bosTau4.fa.out .
time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \
-workhorse=hgwdev bosTau4 > jkStuff/downloads.log 2>&1
# real 24m3.210s
# failed making upstream sequences:
# featureBits bosTau4 mgcGenes:upstream:1000 -fa=stdout
# setpriority: Permission denied.
# the 'nice' from my bash shell causes trouble inside the csh
# script which uses nice. Finish off the install step manually
# with the mgcGenes upstreams ...
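# a sketch of finishing that install step by hand, assuming the standard
# makeDownloads.pl layout under goldenPath/bosTau4/bigZips:
cd /usr/local/apache/htdocs/goldenPath/bosTau4/bigZips
for S in 1000 2000 5000
do
    featureBits bosTau4 mgcGenes:upstream:${S} -fa=stdout \
        | gzip -c > upstream${S}.fa.gz
done
# then refresh the upstream* entries in md5sum.txt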
#############################################################################
# PushQ entries (DONE - 2008-01-11 - Hiram)
ssh hgwdev
cd /cluster/data/bosTau4
/cluster/bin/scripts/makePushQSql.pl bosTau4 > jkStuff/pushQ.sql
# output warnings:
# bosTau4 does not have seq
# bosTau4 does not have gbMiscDiff
# Could not tell (from trackDb, all.joiner and hardcoded lists of supporting
# and genbank tables) which tracks to assign these tables to:
# genscanPep
#############################################################################
# create download files (DONE - 2008-03-19 - Hiram)
ssh hgwdev
cd /cluster/data/bosTau4
ln -s /cluster/data/bosTau4/bed/repeatMasker/bosTau4.fa.out .
makeDownloads.pl bosTau4 > makeDownloads.log 2>&1
# *EDIT* the README files and ensure they are correct
#############################################################################
# PushQ entries (DONE - 2008-03-19 - Hiram)
ssh hgwdev
cd /cluster/data/bosTau4
/cluster/bin/scripts/makePushQSql.pl bosTau4 > jkStuff/pushQ.sql
# output warnings:
# hgwdev does not have /usr/local/apache/htdocs/goldenPath/bosTau4/liftOver/bosTau4ToBosTau*
# bosTau4 does not have seq
# Could not tell (from trackDb, all.joiner and hardcoded lists of supporting
# and genbank tables) which tracks to assign these tables to:
# genscanPep
# looks like there should be a bosTau3 to bosTau4 liftOver run
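# a sketch of that liftOver run (options assumed, check the script usage):
ssh hgwdev
cd /cluster/data/bosTau3
time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
    -bigClusterHub=pk bosTau3 bosTau4 > doLiftOverToBosTau4.log 2>&1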
###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-03-28)
ssh kkstore06
# bash if not using bash shell already
mkdir /cluster/data/bosTau4/blastDb
cd /cluster/data/bosTau4
grep -v chrUn chrom.sizes | awk '{print $1}' > chr.lst
for i in `cat chr.lst`;
do twoBitToFa bosTau4.unmasked.2bit -seq=$i stdout;
done > temp.fa
faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
grep chrUn chrom.sizes | awk '{print $1}' > chr.lst
for i in `cat chr.lst`;
do twoBitToFa bosTau4.unmasked.2bit -seq=$i stdout;
done > temp.fa
faSplit sequence temp.fa 150 blastDb/y
rm temp.fa chr.lst
cd blastDb
for i in *.fa
do
/cluster/bluearc/blast229/formatdb -i $i -p F
done
rm *.fa
ls *.nsq | wc -l
# 3440
mkdir -p /san/sanvol1/scratch/bosTau4/blastDb
cd /cluster/data/bosTau4/blastDb
for i in nhr nin nsq;
do
echo $i
cp *.$i /san/sanvol1/scratch/bosTau4/blastDb
done
mkdir -p /cluster/data/bosTau4/bed/tblastn.hg18KG
cd /cluster/data/bosTau4/bed/tblastn.hg18KG
echo /san/sanvol1/scratch/bosTau4/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 3440 query.lst
# we want around 350000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(350000/`wc query.lst | awk '{print $1}'`\)
# 36727/(350000/3440) = 360.973943
mkdir -p /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/kgfa
split -l 361 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/kgfa/kg
ln -s /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/kgfa kgfa
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/bosTau4/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/bosTau4/blastDb.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
exit
ssh pk
cd /cluster/data/bosTau4/bed/tblastn.hg18KG
para create blastSpec
# para try, check, push, check etc.
para time
Completed: 350880 of 350880 jobs
CPU time in finished jobs: 27082816s 451380.27m 7523.00h 313.46d 0.859 y
IO & Wait Time: 2334990s 38916.50m 648.61h 27.03d 0.074 y
Average job time: 84s 1.40m 0.02h 0.00d
Longest finished job: 578s 9.63m 0.16h 0.01d
Submission to last job: 96125s 1602.08m 26.70h 1.11d
ssh kkstore06
cd /cluster/data/bosTau4/bed/tblastn.hg18KG
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
ssh pk
cd /cluster/data/bosTau4/bed/tblastn.hg18KG/chainRun
para create chainSpec
para maxNode 30
para try, check, push, check etc.
# Completed: 99 of 102 jobs
# Crashed: 3 jobs
# CPU time in finished jobs: 113248s 1887.47m 31.46h 1.31d 0.004 y
# IO & Wait Time: 86043s 1434.04m 23.90h 1.00d 0.003 y
# Average job time: 2013s 33.55m 0.56h 0.02d
# Longest finished job: 6139s 102.32m 1.71h 0.07d
# Submission to last job: 10416s 173.60m 2.89h 0.12d
# ran three crashed jobs on kolossus
ssh kkstore06
cd /cluster/data/bosTau4/bed/tblastn.hg18KG/blastOut
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/bosTau4/bed/tblastn.hg18KG/blastHg18KG.psl
cd ..
pslCheck blastHg18KG.psl
# load table
ssh hgwdev
cd /cluster/data/bosTau4/bed/tblastn.hg18KG
hgLoadPsl bosTau4 blastHg18KG.psl
# check coverage
featureBits bosTau4 blastHg18KG
# 40254923 bases of 2731830700 (1.474%) in intersection
featureBits bosTau4 refGene:cds blastHg18KG -enrichment
# refGene:cds 0.429%, blastHg18KG 1.474%, both 0.379%, cover 88.39%, enrich 59.98x
ssh kkstore06
rm -rf /cluster/data/bosTau4/bed/tblastn.hg18KG/blastOut
rm -rf /cluster/bluearc/bosTau4/bed/tblastn.hg18KG/blastOut
#end tblastn
##########################################################################
# Create 5-way downloads (DONE - 2008-03-28 - Hiram)
ssh hgwdev
mkdir -p /cluster/data/bosTau4/bed/multiz5way/downloads/phastCons5way
cd /cluster/data/bosTau4/bed/multiz5way/downloads/phastCons5way
cp -p \
/san/sanvol1/scratch/bosTau4/multiz5way/cons/all/phastCons5wayScores/* .
ln -s ../../cons/all/all.mod ./5way.mod
cp /cluster/data/calJac1/bed/multiz9way/downloads/phastCons9way/README.txt .
# edit that README.txt to be correct for this 5-way alignment
cd ..
mkdir multiz5way
cd multiz5way
cp -p /cluster/data/calJac1/bed/multiz9way/downloads/multiz9way/README.txt .
# edit that README.txt to be correct for this 5-way alignment
ssh kkstore06
cd /cluster/data/bosTau4/bed/multiz5way/downloads/multiz5way
ln -s ../../bosTau4.5-way.nh 5way.nh
time gzip -c ../../anno/bosTau4.anno.5way.maf > bosTau4.5way.maf.gz
# real 34m59.295s
ssh hgwdev
cd /cluster/data/bosTau4/bed/multiz5way/downloads/multiz5way
# creating upstream files from refGene, bash script:
cat << '_EOF_' > mkUpstream.sh
#!/bin/bash
DB=bosTau4
GENE=refGene
NWAY=multiz5way
export DB GENE
for S in 1000 2000 5000
do
echo "making upstream${S}.maf"
featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
| $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \
stdin stdout \
-orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \
| gzip -c > upstream${S}.maf.gz
echo "done upstream${S}.maf.gz"
done
'_EOF_'
# << happy emacs
chmod +x ./mkUpstream.sh
time nice -n +19 ./mkUpstream.sh
-rw-rw-r-- 1 9883443 Mar 28 13:02 upstream1000.maf.gz
-rw-rw-r-- 1 17938570 Mar 28 13:06 upstream2000.maf.gz
-rw-rw-r-- 1 40384656 Mar 28 13:10 upstream5000.maf.gz
#
# check the names in these upstream files to ensure sanity:
zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \
| sort | uniq -c | sort -rn | less
# should be a list of the other 4 species with a high count,
# then refGene names, e.g.:
# 8806 ornAna1
# 8806 mm9
# 8806 hg18
# 8806 canFam2
# 7 NM_001077006
# 3 NM_001113231
# 3 NM_001105381
# 3 NM_001102527
# 3 NM_001102322
# ...
ssh kkstore06
cd /cluster/data/bosTau4/bed/multiz5way/downloads/multiz5way
md5sum *.maf.gz > md5sum.txt
cd ../phastCons5way
md5sum *.data.gz *.mod > md5sum.txt
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/bosTau4/multiz5way
mkdir /usr/local/apache/htdocs/goldenPath/bosTau4/phastCons5way
cd /cluster/data/bosTau4/bed/multiz5way/downloads/multiz5way
ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/bosTau4/multiz5way
cd ../phastCons5way
ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/bosTau4/phastCons5way
# if your ln -s `pwd`/* made extra links to files you don't want there,
# check the goldenPath locations and remove those extra links
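# for example, the mkUpstream.sh helper script from this directory gets
# linked along with the data files; drop links like that:
rm -f /usr/local/apache/htdocs/goldenPath/bosTau4/multiz5way/mkUpstream.sh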
#############################################################################
# N-SCAN gene predictions (nscanGene) - (2008-04-21 markd)
# obtained NSCAN predictions from michael brent's group
# at WUSTL
cd /cluster/data/bosTau4/bed/nscan/
wget http://mblab.wustl.edu/predictions/Cow/bosTau4/bosTau4.gtf
wget http://mblab.wustl.edu/predictions/Cow/bosTau4/bosTau4.prot.fa
wget http://mblab.wustl.edu/predictions/Cow/bosTau4/readme.html
bzip2 bosTau4.*
chmod a-w *
# load track
gtfToGenePred -genePredExt bosTau4.gtf.bz2 stdout | hgLoadGenePred -bin -genePredExt bosTau4 nscanGene stdin
hgPepPred bosTau4 generic nscanPep bosTau4.prot.fa.bz2
rm *.tab
# update trackDb; need a bosTau4-specific page to describe informants
cow/bosTau4/nscanGene.html (copy from readme.html)
cow/bosTau4/trackDb.ra
# set search regex to
termRegex chr[0-9a-zA-Z_]+\.[0-9]+\.[0-9]+(\.[0-9]+)*
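# a sketch of the sort of trackDb.ra stanza and search spec this needs
# (labels, color and grouping here are placeholders):
track nscanGene
shortLabel N-SCAN
longLabel N-SCAN Gene Predictions
group genes
visibility pack
color 34,139,34
type genePred nscanPep

searchTable nscanGene
searchType genePred
termRegex chr[0-9a-zA-Z_]+\.[0-9]+\.[0-9]+(\.[0-9]+)*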
#############################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
# SELF BLASTZ (DONE - 2008-06-27 - Hiram)
# do not want the noise from the contigs
ssh kkstore06
screen # use a screen to manage this multi-day job
mkdir /cluster/data/bosTau4/noContigs
cd /cluster/data/bosTau4/noContigs
cut -f1 ../chrom.sizes | grep -v chrUn | while read C
do
echo "twoBitToFa -seq=${C} ../bosTau4.2bit ${C}.fa"
twoBitToFa -seq=${C} ../bosTau4.2bit ${C}.fa
done
faToTwoBit chr*.fa bosTau4.noContigs.2bit
twoBitToFa bosTau4.noContigs.2bit stdout | faSize stdin
# 2634429662 bases (167456864 N's 2466972798 real
# 1326108891 upper 1140863907 lower) in 31 sequences in 1 files
# %43.31 masked total, %46.25 masked real
grep -v chrUn ../chrom.sizes | ave -col=2 stdin
# count 31
# total 2634429662.000000
twoBitInfo bosTau4.noContigs.2bit stdout | sort -k2nr \
> bosTau4.noContigs.chrom.sizes
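# extra check: the noContigs set should be exactly the non-chrUn chroms
# from the full assembly; no output expected from this comparison
comm -3 <(cut -f1 bosTau4.noContigs.chrom.sizes | sort) \
    <(grep -v chrUn ../chrom.sizes | cut -f1 | sort)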
cp -p bosTau4.noContigs.2bit /cluster/bluearc/scratch/data/bosTau4
cp -p bosTau4.noContigs.chrom.sizes /cluster/bluearc/scratch/data/bosTau4
mkdir /cluster/data/bosTau4/bed/blastzSelf.2008-08-27
cd /cluster/data/bosTau4/bed
ln -s blastzSelf.2008-08-27 blastzSelf
cd blastzSelf.2008-08-27
cat << '_EOF_' > DEF
BLASTZ_M=400
# TARGET: Cow bosTau4
SEQ1_DIR=/cluster/bluearc/scratch/data/bosTau4/bosTau4.noContigs.2bit
SEQ1_LEN=/cluster/bluearc/scratch/data/bosTau4/bosTau4.noContigs.chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Cow bosTau4
SEQ2_DIR=/cluster/bluearc/scratch/data/bosTau4/bosTau4.noContigs.2bit
SEQ2_LEN=/cluster/bluearc/scratch/data/bosTau4/bosTau4.noContigs.chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/bosTau4/bed/blastzSelf.2008-08-27
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
cd /cluster/data/bosTau4/bed/blastzSelf.2008-08-27
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
`pwd`/DEF > blastz.out 2>&1 &
# real 3537m49.719s
# had to fix a slight problem in the make downloads step, then:
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
-continue=cleanup `pwd`/DEF > cleanup.out 2>&1 &
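# doBlastzChainNet.pl normally records coverage in
# fb.bosTau4.chainSelfLink.txt during the load step; if that file is
# missing it can be regenerated by hand (the chainSelfLink table name is
# the usual convention for a self run, assumed here):
featureBits bosTau4 chainSelfLink > fb.bosTau4.chainSelfLink.txt 2>&1
cat fb.bosTau4.chainSelfLink.txt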
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
Vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
bosTau4.upstreamGeneTbl = mgcGenes
bosTau4.upstreamMaf = multiz5way /hive/data/genomes/bosTau4/bed/multiz5way/species.list
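# quick check that the referenced species list is actually in place
# before the next genbank run picks up this setting:
cat /hive/data/genomes/bosTau4/bed/multiz5way/species.list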
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
Vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
Vertebrate-wide transMap alignments were built. Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
############################################################################
#############################################################################
# LIFTOVER TO bosTauMd3 (DONE - 2009-12-16 - galt)
ssh hgwdev
cd /hive/data/genomes/bosTau4
ln -s bosTau4.11.ooc 11.ooc
time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
-bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
bosTau4 bosTauMd3 >& doLiftOverToBosTauMd3.log
pushd /usr/local/apache/htdocs/goldenPath/bosTau4/liftOver/
md5sum bosTau4ToBosTauMd3.over.chain.gz >> md5sum.txt
popd
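# quick sanity check of the new chain with a throwaway interval
# (the coordinates and /tmp file names here are for illustration only):
echo -e "chr1\t1000000\t1001000\ttest1" > /tmp/liftTest.bed
liftOver /tmp/liftTest.bed \
    /usr/local/apache/htdocs/goldenPath/bosTau4/liftOver/bosTau4ToBosTauMd3.over.chain.gz \
    /tmp/liftTest.bosTauMd3.bed /tmp/liftTest.unMapped
cat /tmp/liftTest.bosTauMd3.bed /tmp/liftTest.unMapped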
#########################################################################
-# susScr1 Pig BLASTZ/CHAIN/NET (WORKING - 2010-01-21 - Hiram)
+# susScr1 Pig BLASTZ/CHAIN/NET (DONE - 2010-01-25 - Hiram)
screen # use a screen to manage this multi-day job
- mkdir /hive/data/genomes/bosTau4/bed/lastzSusScr1.2010-01-21
- cd /hive/data/genomes/bosTau4/bed/lastzSusScr1.2010-01-21
+ mkdir /hive/data/genomes/bosTau4/bed/lastzSusScr1.2010-01-25
+ cd /hive/data/genomes/bosTau4/bed/lastzSusScr1.2010-01-25
cat << '_EOF_' > DEF
# Pig vs. Cow
BLASTZ_M=50
# TARGET: Cow BosTau4
SEQ1_DIR=/scratch/data/bosTau4/bosTau4.2bit
SEQ1_LEN=/scratch/data/bosTau4/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=100
# QUERY: Pig SusScr1
SEQ2_DIR=/scratch/data/susScr1/susScr1.2bit
SEQ2_LEN=/scratch/data/susScr1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
-BASE=/hive/data/genomes/bosTau4/bed/lastzSusScr1.2010-01-21
+BASE=/hive/data/genomes/bosTau4/bed/lastzSusScr1.2010-01-25
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -syntenicNet \
- -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
+ # real 2377m21.473s
+ # never did get this to work; the run with susScr2 as target and
+ # bosTau4 as query did finish, but with immense chain pileups
+ # this run failed the second time too
+ # XXX - re-running as 01-25 (was 01-21) Mon Jan 25 11:29:32 PST 2010
# real 3811m31.428s
# failed
-XXX - running Thu Jan 21 14:49:43 PST 2010
+ # XXX - running Thu Jan 25 14:49:43 PST 2010
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
`pwd`/DEF -verbose=2 \
-bigClusterHub=memk -chainMinScore=3000 -chainLinearGap=medium \
-syntenicNet > do.log 2>&1
#########################################################################
# lastz swap Rat Rn4 (DONE - 2010-01-22 - Hiram)
# original run
cd /cluster/data/rn4/bed/blastzBosTau4.2010-01-19
cat fb.rn4.chainBosTau4Link.txt
# 649931321 bases of 2571531505 (25.274%) in intersection
mkdir /cluster/data/bosTau4/bed/blastz.rn4.swap
cd /cluster/data/bosTau4/bed/blastz.rn4.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/rn4/bed/blastzBosTau4.2010-01-19/DEF \
-noLoadChainSplit -workhorse=hgwdev -smallClusterHub=memk \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-swap -syntenicNet > swap.log 2>&1 &
# real 77m18.645s
cat fb.bosTau4.chainRn4Link.txt
# 664253901 bases of 2731830700 (24.315%) in intersection
#########################################################################
+# lastz swap Pig SusScr2 (DONE - 2010-03-31 - Hiram)
+ # the primary lastz run on susScr2
+ cd /hive/data/genomes/susScr2/bed/lastzBosTau4.2010-03-27
+ cat fb.susScr2.chainBosTau4Link.txt
+ # 1478903080 bases of 2231298548 (66.280%) in intersection
+
+ # and the swap
+ mkdir /hive/data/genomes/bosTau4/bed/blastz.susScr2.swap
+ cd /hive/data/genomes/bosTau4/bed/blastz.susScr2.swap
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ /hive/data/genomes/susScr2/bed/lastzBosTau4.2010-03-27/DEF \
+ -swap -noLoadChainSplit -syntenicNet \
+ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
+ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
+ # interestingly, this failed on the first step, chainSwap, but the
+ # failure did not stop the script; it continued and produced zero
+ # results all the way to the end.  Running the chainSwap manually:
+ cd /hive/data/genomes/bosTau4/bed/blastz.susScr2.swap/axtChain
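+# ulimit -d and -v take kilobyte units here: 188743680 KB = 180 GB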
+export sizeG=188743680
+ulimit -d $sizeG
+ulimit -v $sizeG
+
+chainSwap /hive/data/genomes/susScr2/bed/lastzBosTau4.2010-03-27/axtChain/susScr2.bosTau4.all.chain.gz stdout \
+| nice chainSort stdin stdout | nice gzip -c > bosTau4.susScr2.all.chain.gz
+
+ # it also runs out of memory during liftOver file creation:
+ export sizeG=188743680
+ ulimit -d $sizeG
+ ulimit -v $sizeG
+
+ netChainSubset -verbose=0 noClass.net bosTau4.susScr2.all.chain.gz stdout \
+ | chainStitchId stdin stdout | gzip -c > bosTau4.susScr2.over.chain.gz
+
+ # and netChains.csh is finished manually with this added:
+# memory limit 160 Gb
+limit datasize 163840m
+limit vmemoryuse 163840m
+
+ # manually run the loadUp.csh with this added:
+# memory limit 160 Gb
+limit datasize 163840m
+limit vmemoryuse 163840m
+ # real 498m5.861s
+
+ cat fb.bosTau4.chainSusScr2Link.txt
+ # 1383557633 bases of 2731830700 (50.646%) in intersection
+
+#########################################################################