src/hg/makeDb/doc/panTro2.txt 1.18
1.18 2010/02/12 23:51:36 hiram
last runs to Marmoset calJac3
Index: src/hg/makeDb/doc/panTro2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/panTro2.txt,v
retrieving revision 1.17
retrieving revision 1.18
diff -b -B -U 1000000 -r1.17 -r1.18
--- src/hg/makeDb/doc/panTro2.txt 20 Sep 2009 17:16:46 -0000 1.17
+++ src/hg/makeDb/doc/panTro2.txt 12 Feb 2010 23:51:36 -0000 1.18
@@ -1,3260 +1,3323 @@
# for emacs: -*- mode: sh; -*-
# This file describes browser build for the panTro2 chimp genome: Feb 2006
#
# "$Id$"
#
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-------------+-------------------------+
# | tableName | type |
# +-------------+-------------------------+
# | refGene | genePred refPep refMrna |
# | xenoRefGene | genePred |
# | nscanGene | genePred nscanPep |
# | genscan | genePred genscanPep |
# +-------------+-------------------------+
#######################################################################
# DOWNLOAD 6X CHIMP SEQUENCE FROM LaDeana HIllier, WUSTL (2006-02-27 - kate)
#
# panTro2
# 6X Chimp from LaDeana Hillier, WUSLT
# NCBI accession for the project is: AADA01000000 (02-SEP-2005)
# NOTE: chrom numbering follows hunan orthology, a change from panTro1.
# Human chrom 2 is orthologus to chimp chroms 2a and 2b.
# SETUP BUILD AREA AND DOWNLOAD ASSEMBLY (DONE 2006-02-27 kate)
ssh kkstore01
mkdir /cluster/store8/panTro2
ln -s /cluster/store8/panTro2 /cluster/data
cd /cluster/data/panTro2
# annotations dir
mkdir bed
# temp dir for lift files and
# scripts copy-pasted from this doc
mkdir jkStuff
# make download dir
wget ftp://genome.wustl.edu/pub/user/lhillier/panTro2.tar.gz
tar xvfz *.gz
mv panTro2 wustl
# for finished chr21, there is chr21.agp, chr21.clones.agp,
# and chr21.fa (complete chrom fasta file)
# for partially finished chrY, there is:
# .agp, .clones.agp, and .fa for chrY and chrY_random
# for chr7, that includes a 5M finished region, there
# are: chr7.agp, chr7.contigs.fa, and chr7.clones.fa
# (also chr7_random.{agp,contigs.fa}
# For all other chroms there are AGP files plus contig fasta:
# chr*.agp, chr*.contigs.fa files
grep '>' wustl/*.contigs.fa | wc -l
# 246371
# NOTE: Release notes list 265882 contigs
# (due to replacement of contigs by clone sequence in finished chroms?)
grep '>' wustl/*.clones.fa | wc -l
# 505
# BUILD CHROM FASTA FILES (2006-02-28 kate)
# Most chroms are built from contigs, a few have
# clones in the AGP, chr7 has both!
cut -f1 wustl/*.agp | uniq | grep -v '^$' | grep -v random | \
sed 's/chr//' > chrom.lst
wc -l chrom.lst
# 28
# 1,2a,2b,3-22,X,Y,M,Un, and chr6_hla_hap1
cat > jkStuff/makeChroms.csh << 'EOF'
date
foreach d (`cat chrom.lst`)
set c = chr$d
echo $c
mkdir $d
cp wustl/$c.{agp,*.fa} $d
if (-e $d/$c.clones.fa) then
set fa = clones.fa
# chr7 has both contigs and clones and clone file
# doesn't have genbank sequence header -- merge them
if ($c == "chr7") then
cat $d/$c.contigs.fa >> $d/$c.clones.fa
rm $d/$c.contigs.fa
else
# extract clone id from sequence name line to please agpToFa
awk -F\| '{if (/^>/) printf(">%s\n", $4); else print}' \
wustl/$c.clones.fa > $d/$c.clones.fa
endif
else
set fa = contigs.fa
endif
agpToFa -simpleMultiMixed $d/$c.agp $c $d/$c.fa $d/$c.$fa
if (-e wustl/${c}_random.agp) then
set r = ${c}_random
echo $r
cp wustl/$r.{agp,*.fa} $d
if (-e $d/$r.clones.fa) then
set fa = clones.fa
awk -F\| '{if (/^>/) printf(">%s\n", $4); else print}' \
wustl/$r.clones.fa > $d/$r.clones.fa
else
set fa = contigs.fa
endif
agpToFa -simpleMultiMixed $d/$r.agp $r $d/$r.fa $d/$r.$fa
endif
end
date
'EOF'
# << happy emacs
csh jkStuff/makeChroms.csh >&! jkStuff/makeChroms.log &
# 5 minutes
cat > jkStuff/checkChroms.csh << 'EOF'
date
foreach d (`cat chrom.lst`)
set c = chr$d
echo $c
faSize $d/$c.fa
checkAgpAndFa $d/$c.agp $d/$c.fa | tail -1
if (-e $d/${c}_random.fa) then
set r = ${c}_random
echo $r
faSize $d/$r.fa
checkAgpAndFa $d/$r.agp $d/$r.fa | tail -1
endif
end
date
'EOF'
# << happy emacs
csh jkStuff/checkChroms.csh >&! jkStuff/checkChroms.log &
# 3 minutes
# Replace chrM and chrM_random from this assembly with
# chrM from NCBI.
# so retrieving it from NCBI, via method described in canFam2 make doc.
# (search "pan troglodytes mitochondrion genome" finds NC_001643
rm -fr M
mkdir M
cd M
wget -O chrM.fa 'http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?db=nucleotide&val=5835121&dopt=FASTA'
# Edit chrM.fa: make sure the long fancy header line says it's the
# Canis familiaris mitochondrion complete genome, and then replace the
# header line with just ">chrM".
faSize chrM.fa
# 16554 bases
# cleanup chrom dirs
cd ..
foreach d (`cat chrom.lst`)
set c = chr$d
mkdir $d/parts
mv $d/*.clones.fa $d/*.contigs.fa $d/parts
end
# retrieve fixed chr9.agp from LaDeana (2006-03-02)
mkdir wustl.chr9
cd wustl.chr9
wget ftp://genome.wustl.edu/pub/user/lhillier/pub/chr9.agp.gz
gunzip *.gz
cp -p chr9.agp ../9
mv wustl/chr9.agp wustl/chr9.agp.old
cp -p chr9.agp wustl/chr9.agp
cd ../9
agpToFa -simpleMultiMixed chr9.agp chr9 chr9.fa parts/chr9.contigs.fa
checkAgpAndFa chr9.agp chr9.fa | tail -1
########################
# QUALITY SCORES (2006-03-15 kate)
# From Joanne Nelson, WUSTL
# Redo chr7 with finished clone scores newly available (2006-07-13 kate)
# Finished chr21 scores are being submitted to Genbank by Riken,
# but are not yet available
ssh kkstore04
cd /cluster/data/panTro2
mkdir bed/quality
cd bed/quality
wget -r ftp://genome.wustl.edu/private/963082470306293/upload_for_kate
mkdir wustl
mv genome.wustl.edu/private/*/upload*/*.qvl wustl
rm -fr genome.wustl.edu
grep '>' wustl/chr*.qvl | wc -l
# 246371
# same as #contigs in sequence files, above
gzip wustl/*.qvl
cat > makeQuals.csh << 'EOF'
date
mkdir -p qac
set b = /cluster/data/panTro2
foreach f (`ls wustl/*.qvl.gz | grep -v chrM | grep -v ContigF`)
set c = $f:t:r:r:r
echo $c
set d = `echo $c | sed 's/chr//;s/_random//'`
# contig names in quality files are slightly different from AGP's
sed 's/Contig/bld2_Cont/' $b/$d/$c.agp > agp.tmp
nice zcat wustl/$c.contigs.qvl | \
nice qaToQac stdin stdout | \
nice qacAgpLift agp.tmp stdin qac/$c.qac
end
date
'EOF'
# << happy emacs
csh makeQuals.csh >&! makeQuals.log &
#chr7
#AC161123.3 not found
# need special handling for chr7, that has both contigs
# (with quality scores), and clones (w/o)
# For chimpHiQualDiffs, construct a chr7 in one piece, with
# missing data filled in with score=98 (manually assigned "low quality"),
# although these are clones, and should have high quality.
# By request of Daryl Thomas
sed 's/Contig/bld2_Cont/' /cluster/data/panTro2/7/chr7.agp > agp.tmp
nice zcat wustl/chr7.contigs.qvl.gz | \
nice qaToQac stdin stdout | \
nice qacAgpLift -mScore=98 agp.tmp stdin qac/chr7.qac
# Redo chr7 with finished clone scores newly available (2006-07-13)
wget -nd -r ftp://genome.wustl.edu/private/723378195268200/chimpqual
# oops -- no longer at this site -- inquiring with Joanne Nelson
# Also, create dummy chrY, chrY_random, chr21, all of which
# are finished sequence from clones
cat > makeFakeQuals.csh << 'EOF'
date
foreach c (chr21 chrY chrY_random)
echo $c
set d = `echo $c | sed 's/chr//;s/_random//'`
# pick a small set of quality scores, needed so utils don't complain
nice zcat wustl/chr22_random.contigs.qvl.gz | \
nice qaToQac stdin stdout | \
nice qacAgpLift -mScore=98 /cluster/data/panTro2/$d/$c.agp \
stdin qac/$c.qac
end
date
'EOF'
# And a dummy chrM
echo 'chrM 1 16554 1 W Unknown 1 16554 +' \
> agp.tmp
nice zcat wustl/chr7.contigs.qvl.gz | \
nice qaToQac stdin stdout | \
nice qacAgpLift -mScore=98 agp.tmp stdin qac/chrM.qac
# Load track table, excluding "dummy" chroms
cat > toWig.csh << 'EOF'
foreach f (`ls qac/chr*.qac | egrep -v 'chrY|chr21|chrM'`)
qacToWig -fixed $f stdout
end
'EOF'
# << happy emacs
date
csh toWig.csh | nice wigEncode stdin quality.wig quality.wib
date
ssh hgwdev
cd /cluster/data/panTro2/bed/quality
ln -s /cluster/data/panTro2/bed/quality/quality.wib /gbdb/panTro2/wib
hgLoadWiggle panTro2 quality quality.wig
# create downloads for quality scores
ssh kkstore04
cd /cluster/data/panTro2/bed/quality
mkdir qa
cat > makeDownload.csh >> 'EOF'
foreach f (qac/chr*.qac)
set c = $f:t:r
echo $c
qacToQa $f stdout | gzip > qa/$c.qa.gz
end
cd qa
tar cvfz chromQuals.tar.gz chr*.qa.gz
'EOF'
rm -fr qa
ssh hgwdev
ln -s /cluster/data/panTro2/bed/quality/chromQuals.tar.gz \
/usr/local/apache/htdocs/goldenPath/panTro2/bigZips
# edit README.txt and update md5sum.txt
########################
# CREATE DATABASE AND GRP TABLE (DONE 2006-02-28 kate)
ssh hgwdev
hgsql '' -e 'create database panTro2'
# Use df to make sure there is at least 75G free on hgwdev:/var/lib/mysql
df -h /var/lib/mysql
# /dev/sdc1 1.8T 1.6T 71G 96% /var/lib/mysql
# oops -- not enough ?
hgsql panTro2 -e \
"create table grp (PRIMARY KEY(NAME)) select * from mm8.grp"
# remove encode track groups
hgsql panTro2 -e "delete from grp where name like 'encode%'"
########################
# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) 2BIT (DONE 2006-02-28 kate)
# Redo to fix chr9 (2006-03-02 kate)
# Make .2bit, unmasked until RepeatMasker and TRF steps are done.
# Do this now so we can load up RepeatMasker and run featureBits;
# can also load up other tables that don't depend on masking.
ssh kkstore01
cd /cluster/data/panTro2
nice faToTwoBit ?{,?}/chr*.fa 6_hla_hap1/chr*.fa panTro2.2bit
mkdir bed/chromInfo
twoBitInfo panTro2.2bit stdout \
| awk '{print $1 "\t" $2 "\t/gbdb/panTro2/panTro2.2bit";}' \
> bed/chromInfo/chromInfo.tab
# Make symbolic links from /gbdb/panTro2/ to the real .2bit.
ssh hgwdev
mkdir /gbdb/panTro2
ln -s /cluster/data/panTro2/panTro2.2bit /gbdb/panTro2/
# Load /gbdb/panTro2/panTro2.2bit paths into database and save size info.
cd /cluster/data/panTro2
hgsql panTro2 < $HOME/kent/src/hg/lib/chromInfo.sql
hgsql panTro2 -e 'load data local infile \
"/cluster/data/panTro2/bed/chromInfo/chromInfo.tab" \
into table chromInfo;'
echo "select chrom,size from chromInfo" | hgsql -N panTro2 > chrom.sizes
wc chrom.sizes
# 52
# GOLD AND GAP TRACKS (DONE 2006-02-28 kate)
# Redo to fix chr9 (2006-03-02 kate)
# Change chrY centromere to unbridged (2006-07-11 kate)
ssh hgwdev
cd /cluster/data/panTro2
cp -p wustl/chrY.agp wustl/chrY.agp.old
sed '/centromere/s/yes/no/' wustl/chrY.agp.old > wustl/chrY.agp
cp wustl/chrY.agp Y
cat wustl/*.agp | grep -v chrM | hgGoldGapGl -noGl panTro2 stdin
[kate@hgwdev panTro2]$ nice featureBits panTro2 -countGaps -noRandom gap
423043673 bases of 3175632892 (13.322%) in intersection
[kate@hgwdev panTro2]$ nice featureBits panTro1 -countGaps -noRandom gap
671583599 bases of 3083993401 (21.776%) in intersection
[kate@hgwdev panTro2]$ nice featureBits hg15 -countGaps -noRandom gap
238329157 bases of 3070537687 (7.762%) in intersection
[kate@hgwdev panTro2]$ nice featureBits hg18 -countGaps -noRandom gap
222401287 bases of 3091592211 (7.194%) in intersection
chr1 12771775 bases of 229974691 (5.554%) in intersection
chr10 9298149 bases of 135001995 (6.887%) in intersection
chr11 10601065 bases of 134204764 (7.899%) in intersection
chr12 5495847 bases of 135371336 (4.060%) in intersection
chr13 28069601 bases of 115868456 (24.225%) in intersection
chr14 21093350 bases of 107349158 (19.649%) in intersection
chr15 23087195 bases of 100063422 (23.073%) in intersection
chr16 16171896 bases of 90682376 (17.834%) in intersection
chr17 9948345 bases of 83384210 (11.931%) in intersection
chr18 3076876 bases of 77261746 (3.982%) in intersection
chr19 12470245 bases of 64473437 (19.342%) in intersection
chr20 4187763 bases of 62293572 (6.723%) in intersection
chr21 13764311 bases of 46489110 (29.608%) in intersection
chr22 17821597 bases of 50165558 (35.526%) in intersection
chr2a 8581091 bases of 114460064 (7.497%) in intersection
chr2b 120728260 bases of 248603653 (48.563%) in intersection
chr3 8990241 bases of 203962478 (4.408%) in intersection
chr4 7932593 bases of 194897272 (4.070%) in intersection
chr5 8761164 bases of 183994906 (4.762%) in intersection
chr6 9202606 bases of 173908612 (5.292%) in intersection
chr6_hla_hap1 0 bases of 34169 (0.000%) in intersection
chr7 9183564 bases of 160261443 (5.730%) in intersection
chr8 6927299 bases of 145085868 (4.775%) in intersection
#chr9 115285361 bases of 224587821 (51.332%) in intersection
# Seems too big -- notified LaDeana of huge gap (80MB)
# She fixed the AGP, and here's the result
chr9 29207531 bases of 138509991 (21.087%) in intersection
chrM 0 bases of 16554 (0.000%) in intersection
chrUn 8188633 bases of 58616431 (13.970%) in intersection
chrX 24409881 bases of 155361357 (15.712%) in intersection
chrY 1261428 bases of 23952694 (5.266%) in intersection
# HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 2006-03-03 kate)
ssh hgwdev
cd $HOME/kent/src/hg/makeDb/trackDb
cvs up -d -P
# Edit that makefile to add in all the right places and do
make update
cvs commit makefile
mkdir -p chimp/panTro2
cvs add chimp/panTro2
cvs ci -m "trackDb dir for second chimp genome(s)" chimp/panTro2
# Do this in a clean (up-to-date, no edits) tree:
make alpha DBS=panTro2
# Add dbDb entry (not a new organism so defaultDb and genomeClade already
# have entries):
hgsql -h genome-testdb hgcentraltest \
-e 'insert into dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
values("panTro2", "Jan. 2006", \
"/gbdb/panTro2", "Chimp", "chr7:115705331-115981791", 1, \
15, "Chimp", "Pan troglodytes", \
"/gbdb/panTro2/html/description.html", 0, 0, \
"Washington University Build 2 Version 1");'
# SPLIT SEQUENCE FOR REPEATMASKER (2006-03-01 kate)
# Split into 500K chunks, at gaps if possible
ssh kkstore01
cd /cluster/data/panTro2
mkdir split500K
foreach d (`cat chrom.lst`)
set c = chr$d
mkdir split500K/$c
faSplit gap -minGapSize=100 -verbose=2 $d/$c.fa 500000 \
split500K/$c/${c}_ -lift=split500K/$c.lft
set r = ${c}_random
if (-e $d/$r.fa) then
mkdir split500K/$r
faSplit gap -minGapSize=100 -verbose=2 $d/$r.fa 500000 \
split500K/$r/${r}_ -lift=split500K/$r.lft
endif
end
mkdir lifts
cat split500K/*.lft > lifts/split500K.lft
# Redo chr9 (2006-03-02 kate)
rm -fr split500K/chr9
mkdir split500K/chr9
faSplit gap -minGapSize=100 -verbose=2 9/chr9.fa 500000 \
split500K/chr9/chr9_ -lift=split500K/chr9.lft
# forgot to redo this -- doing it now (2006-03-21 kate)
cat split500K/*.lft > lifts/split500K.lft
# REPEATMASKER RUN (2006-03-01 kate)
# Using -species pan option to get chimp repeats
# Redone 3/10 with additional chimp repeats from
# Evan Eichler (left out of -pan lib)
ssh pk
cd /cluster/data/panTro2
mkdir RMRun
cd RMRun
# Record RM version used
ls -l /cluster/bluearc/RepeatMasker
#lrwxrwxrwx 1 hiram protein 18 Jan 20 13:13 /cluster/bluearc/RepeatMasker -> RepeatMasker060120/
cat /cluster/bluearc/RepeatMasker/Libraries/version > RM.version
#RM database version 20060120
# Run RepeatMasker on a dummy input, just to make it initialize its chimp
# libraries once before the cluster run
/cluster/bluearc/RepeatMasker/RepeatMasker -spec pan /dev/null
# Building species libraries in: /cluster/bluearc/RepeatMasker060120/Libraries/20060120/pan
pushd /cluster/data/panTro2/split500K
ls -1S */*.fa | sort > ../RMRun/split.lst
popd
cat << 'EOF' > template
#LOOP
./RMChimp.csh $(dir1) $(root1) {check out line out/$(dir1)/$(root1).out}
#ENDLOOP
'EOF'
# << for emacs
cat > RMChimp.csh << 'EOF'
#!/bin/csh -ef
set d = /cluster/data/panTro2
set tmp = /scratch/tmp/panTro2/$2
mkdir -p $tmp
mkdir -p out/$1
cp $d/split500K/$1/$2.fa $tmp
pushd $tmp
/cluster/bluearc/RepeatMasker060120/RepeatMasker -ali -s -species pan $2.fa
popd
cp -p $tmp/$2.fa.out $3
if (-e $tmp/$2.fa.align) cp $tmp/$2.fa.align out/$1
if (-e $tmp/$2.fa.tbl) cp $tmp/$2.fa.tbl out/$1
if (-e $tmp/$2.fa.cat) cp $tmp/$2.fa.cat out/$1
rm -fr $tmp/*
rmdir --ignore-fail-on-non-empty $tmp
rmdir --ignore-fail-on-non-empty /scratch/tmp/panTro2
'EOF'
# << for emacs
chmod +x RMChimp.csh
gensub2 split.lst single template jobList
para create jobList
# 6538 jobs
para try
# CPU time in finished jobs: 32493557s 541559.29m 9025.99h 376.08d 1.030 y
# IO & Wait Time: 40999s 683.31m 11.39h 0.47d 0.001 y
# Average job time: 4976s 82.94m 1.38h 0.06d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 7607s 126.78m 2.11h 0.09d
# Submission to last job: 121890s 2031.50m 33.86h 1.41d
# 34 hours
# Lift up the 500KB .out's to chrom level
ssh kkstore01
cd /cluster/data/panTro2/RMRun
cat > lift.csh << 'EOF'
foreach d (`cat ../chrom.lst`)
set c = chr$d
echo $c
liftUp ../$d/$c.fa.out ../lifts/split500K.lft error out/$c/*.out
set r = ${c}_random
if (-e out/$r) then
echo $r
liftUp ../$d/$r.fa.out ../lifts/split500K.lft error out/$r/*.out
endif
end
'EOF'
# << for emacs
csh lift.csh >&! lift.log &
# Redo chr9 after remaking split500K.lft (2006-03-24 kate)
liftUp ../9/chr9.fa.out ../lifts/split500K.lft error out/chr9/*.out
# Load .outs into database
# Redone after relift (2006-03-24 kate)
ssh hgwdev
cd /cluster/data/panTro2
ls */*.fa.out | wc -l
# 52 (one per chrom)... load them up
hgLoadOut panTro2 */*.fa.out
# Several messages like: Strange perc. field -3.1 line 327855 of 1/chr1.fa.out
# And at end: 953 records dropped due to repStart > repEnd
# These are noted in rn4 and other make docs.
# TODO: run it with -verbose=2 and send output to Robert Hubley/Arian Smit
# VERIFY REPEATMASKER RESULTS (2006-03-03 kate)
# TODO: Eyeball some repeat annotations in the browser, compare to lib seqs.
# Run featureBits on previous genome build, and compare:
#nice featureBits panTro2 rmsk
# w/o 2 Eichler repeats:
# 1396630330 bases of 2909512873 (48.002%) in intersection
# with 2 Eichler repeats:
# 1401134418 bases of 2909512873 (48.157%) in intersection
nice featureBits panTro1 rmsk
# 1311281819 bases of 2733948177 (47.963%) in intersection
# after chr9 .out reload...
# 1311281819 bases of 2733948177 (47.963%) in intersection
# looks good -- similar but just a bit higher
ssh hgwdev
cd /cluster/data/panTro2/RMRun
# take a look at
hgsql panTro2 -e "select distinct(repFamily) from chr1_rmsk" > repFamilies.panTro2
hgsql panTro2 -e "select distinct(repName) from chr1_rmsk where repFamily <> 'Simple_repeat' and repFamily <> 'Low_complexity' order by repName" > repNames.panTro2
wc -l repNames.panTro2
hgsql panTro1 -e "select distinct(repFamily) from chr1_rmsk" > repFamilies.panTro1
hgsql panTro1 -e "select distinct(repName) from chr1_rmsk" > repNames.panTro1
hgsql panTro1 -e "select distinct(repName) from chr1_rmsk where repFamily <> 'Simple_repeat' and repFamily <> 'Low_complexity' order by repName" > repNames.panTro1
wc -l *Names*
# 864 repNames.panTro1
# 906 repNames.panTro2
# 40 new repeats (+ 2 added from Eichler)
wc -l *Families*
# 38 repFamilies.panTro1
# 38 repFamilies.panTro2
# no new repeat families
# RUN TRF TO IDENTIFY SIMPLE REPEATS (2006-03-01 kate)
ssh kkstore01
cd /cluster/data/panTro2
# First break up sequence into 5MB chunks at contigs/gaps
# NOTE: Probably unnecessary -- Hiram has run on whole chroms
# using small cluster
mkdir split5M
foreach d (`cat chrom.lst`)
set c = chr$d
mkdir split5M/$c
faSplit gap -minGapSize=100 -verbose=2 $d/$c.fa 5000000 \
split5M/$c/${c}_ -lift=split5M/$c.lft
set r = ${c}_random
if (-e $d/$r.fa) then
mkdir split5M/$r
faSplit gap -minGapSize=100 -verbose=2 $d/$r.fa 5000000 \
split5M/$r/${r}_ -lift=split5M/$r.lft
endif
end
mkdir lifts
cat split5M/*.lft > lifts/split5M.lft
# Run TRF
mkdir -p bed/simpleRepeat/trf
cd bed/simpleRepeat
cat > runTrf.csh << 'EOF'
date
foreach d (`find /cluster/data/panTro2/split5M/* -type d`)
cd $d
foreach f (*.fa)
set b = $f:r
echo $f
trfBig -trf=/cluster/bin/i386/trf $f /dev/null -tempDir=/tmp \
-bedAt=/cluster/data/panTro2/bed/simpleRepeat/trf/$b.bed
end
end
date
'EOF'
csh runTrf.csh >&! runTrf.log &
# ~6 hours run-time
# Redo chr9 (2006-03-02 kate)
cd /cluster/data/panTro2
rm -fr split5M/chr9
mkdir split5M/chr9
faSplit gap -minGapSize=100 -verbose=2 9/chr9.fa 5000000 \
split5M/chr9/chr9_ -lift=split5M/chr9.lft
cd bed/simpleRepeat
cp runTrf.csh runTrf.chr9.csh
# edit for just chr9
csh runTrf.chr9.csh >&! runTrf.chr9.log &
# check for EOF in output files to give some assurance
endsInLf trf/*
# trf/chrM_0.bed zero length
# lift to chromosome coordinates
liftUp simpleRepeat.bed /cluster/data/panTro2/lifts/split5M.lft \
warn trf/*.bed
# forgot to include the new lift in the all split (2006-03-21 kate)
# so do it now and reload table
cd /cluster/data/panTro2
cat split5M/*.lft > lifts/split5M.lft
cd bed/simpleRepeat
liftUp simpleRepeat.bed /cluster/data/panTro2/lifts/split5M.lft \
warn trf/*.bed
# load into database
ssh hgwdev
cd /cluster/data/panTro2/bed/simpleRepeat
hgLoadBed panTro2 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
nice featureBits panTro2 simpleRepeat
# 82897828 bases of 2909512873 (2.849%) in intersection
nice featureBits panTro1 simpleRepeat
# 53732632 bases of 2733948177 (1.965%) in intersection
# seems quite a bit higher -- compare it to hg17 chrom1
# nice featureBits panTro2 simpleRepeat -chrom=chr1
# 3068104 bases of 217202916 (1.413%) in intersection
nice featureBits hg17 simpleRepeat -chrom=chr1
# 3438627 bases of 222827847 (1.543%) in intersection
# darn similar
# PROCESS SIMPLE REPEATS INTO MASK
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh kkstore01
cd /cluster/data/panTro2/bed/simpleRepeat
mkdir -p trfMask trfMaskChrom
foreach f (trf/chr*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# lift filtered simple repeats to chrom coordinates, by chrom
cat > liftChroms.csh << 'EOF'
foreach d (`cat /cluster/data/panTro2/chrom.lst`)
set c = chr$d
echo $c
liftUp trfMaskChrom/$c.bed /cluster/data/panTro2/split5M/$c.lft \
warn trfMask/${c}_?{,?}.bed
set r = ${c}_random
if (-e /cluster/data/panTro2/$d/$r.fa) then
liftUp trfMaskChrom/$r.bed /cluster/data/panTro2/split5M/$r.lft \
warn trfMask/${r}_?{,?}.bed
endif
end
'EOF'
# << for emacs
csh liftChroms.csh >&! liftChroms.log &
# check coverage of filtered TRF
hgwdev
cd /cluster/data/panTro2/bed/simpleRepeat
cat trfMaskChrom/*.bed > /tmp/filtTrf.bed
featureBits panTro2 /tmp/filtTrf.bed
# 28800863 bases of 2909512873 (0.990%) in intersection
# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (2006-03-03)
# soft-mask (lower-case) the chrom fasta files
# then make hard-masked versions from the soft-masked
# note: next time simplify this (as below for chr9)
ssh kkstore01
cd /cluster/data/panTro2
set trfChr=bed/simpleRepeat/trfMaskChrom
foreach f (?{,?}/chr*.fa 6_hla_hap1/chr*.fa)
cp $f $f.unmasked
echo "repeat- and trf-masking $f"
maskOutFa -soft $f $f.out $f
cp $f $f.rmsk
set c = $f:t:r
maskOutFa -softAdd $f $trfChr/$c.bed $f
echo "hard-masking $f"
maskOutFa $f hard $f.masked
end
# remask chr9 after relifting .out's (2006-03-24)
set f = 9/chr9.fa
set c = chr9
maskOutFa -soft $f.unmasked $f.out $f.rmsk
maskOutFa -softAdd $f.rmsk $trfChr/$c.bed $f
maskOutFa $f hard $f.masked
# a few messages (e.g. 10 per chrom) like:
# WARNING: negative rEnd: -531 chr1:14090705-14090763 L1MEc
# after checking, remove .unmasked and .rmsk
# make 2bit for browser and blat
# redone for chr9 correction (2006-03-24 kate)
faToTwoBit ?{,?}/chr*.fa 6_hla_hap1/chr*.fa panTro2.2bit
mkdir /cluster/bluearc/scratch/hg/panTro2
ssh kkr1u00
cp -p /cluster/data/panTro2/panTro2.2bit /iscratch/i/panTro2/panTro2.2bit
~kent/bin/iSync
# after chr9 remask:
cp -p /cluster/data/panTro2/panTro2.2bit /iscratch/i/panTro2
cp -p /cluster/data/panTro2/nib/chr9.nib /iscratch/i/panTro2/nib
~kent/bin/iSync
# make nibs for blastz w/linSpecRep
mkdir nib
foreach f (?{,?}/chr*.fa 6_hla_hap1/chr*.fa)
echo $f:t:r
faToNib -softMask $f nib/$f:t:r.nib
end
# remake chr9 (2006-03-24 kate)
faToNib -softMask 9/chr9.fa nib/chr9.nib
# MAKE 11.OOC
# #Remade after chr9 remask (2006-03-24 kate)
ssh kolossus
cd /cluster/data/panTro2
mkdir /cluster/bluearc/panTro2
blat panTro2.2bit /dev/null /dev/null \
-tileSize=11 -makeOoc=/cluster/bluearc/panTro2/11.ooc -repMatch=1024
cp -p /cluster/bluearc/panTro2/11.ooc .
# COPY SEQUENCE TO CLUSTER NODES FOR CLUSTER RUNS
set scratch = /cluster/bluearc/scratch/hg/panTro2
mkdir $scratch
cp -p panTro2.2bit $scratch
cp -rp nib $scratch
cp -rp chrom.sizes $scratch
cp -p 11.ooc $scratch
# request cluster-admin sync /scratch/hg/panTro2 to cluster nodes
# update 2bit and chr9 nib after updating chr9 masking
cp -p panTro2.2bit $scratch
cp -p nib/chr9.nib $scratch
cp -p 11.ooc $scratch
# request cluster-admin sync /scratch/hg/panTro2 to cluster nodes
# GENBANK ALIGNMENTS (DONE 2006-03-25 kate/markd)
# restarted 3/13 4pm
# Make a lift file that identifies gap locations for Genbank
# This is a pseudo "liftAll" made from the AGP files, however negative
# strand contigs are renamed to indicate they need to be
# reversed.
ssh kkstore01
cd /cluster/data/panTro2
cat ?{,?}/*.agp 6_hla_hap1/*.agp | agpToLift -revStrand > \
lifts/genbank.lft
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add panTro2
# the hack in the genbank code for panTro1 as changed for
# panTro2. panTro2 treats both chimp and human cDNAs as
# native, and all else as xeno. Don't do xeno ESTs any more.
panTro2.serverGenome = /cluster/data/panTro2/panTro2.2bit
panTro2.clusterGenome = /scratch/hg/panTro2/panTro2.2bit
panTro2.ooc = /cluster/bluearc/panTro2/11.ooc
panTro2.align.unplacedChroms = chrUn,chr*_random
panTro2.lift = /cluster/data/panTro2/lifts/genbank.lft
panTro2.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
panTro2.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
panTro2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
panTro2.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
panTro2.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
panTro2.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter}
panTro2.downloadDir = panTro2
panTro2.genbank.est.xeno.load = no
panTro2.refseq.mrna.native.load = yes
panTro2.refseq.mrna.xeno.load = yes
panTro2.refseq.mrna.xeno.loadDesc = yes
cvs ci etc/genbank.conf
make etc-update
ssh kkstore02
cd /cluster/data/genbank
nice bin/gbAlignStep -initial panTro2 &
# various problems with ssh, bluearc, etc....
nice bin/gbAlignStep -initial -continue=copy panTro2 &
ssh hgwdev
nice bin/gbDbLoadStop -initialLoad -drop panTro2 &
# MAKE LINEAGE-SPECIFIC REPEATS FOR BLASTZ (2006-03-15 kate)
ssh kkstore01
cd /cluster/data/panTro2
mkdir linSpecRep
cd linSpecRep
mkdir dates
cd dates
# run Arian's script to annotate .out files with species-specificity
# include all currently supported comparison species
cat > outRepeats.csh << 'EOF'
date
set d = /cluster/data/panTro2
foreach f ($d/?{,?}/chr*.fa.out $d/6_hla_hap1/chr*.fa.out)
set c = $f:t:r:r
echo $c
cp $f .
/cluster/bluearc/RepeatMasker060120/DateRepeats $c.fa.out -query human \
-comp mouse -comp rat -comp dog -comp cow -comp rabbit
mv $c.fa.*cuniculus $c.fa.dates.out
end
date
'EOF'
# << for emacs
csh outRepeats.csh >&! outRepeats.log &
# 25 minutes
# Redo for fixed chr9 (2006-03-24 kate)
cp /cluster/data/panTro2/9/chr9.fa.out .
set c = chr9
/cluster/bluearc/RepeatMasker060120/DateRepeats $c.fa.out -query human \
-comp mouse -comp rat -comp dog -comp cow -comp rabbit
mv $c.fa.*cuniculus $c.fa.dates.out
# run our script to extract lineage-specific by column
# from the annotated .outs
# just run on chr1 to assess which variants produce any
# differences.
mkdir testChr1
cd testChr1
cat > testChr1.csh << 'EOF'
set script = /cluster/bin/scripts/extractRepeats
set f = ../chr1.fa.dates.out
echo mouse
$script 1 $f > notInMouse.out.spec
echo rat
$script 2 $f > notInRat.out.spec
echo dog
$script 3 $f > notInDog.out.spec
echo cow
$script 4 $f > notInCow.out.spec
echo rabbit
$script 5 $f > notInRabbit.out.spec
'EOF'
csh testChr1.csh >&! testChr1.log &
ls -l *.spec
-rw-rw-r-- 1 kate protein 17678799 Mar 15 09:51 notInCow.out.spec
-rw-rw-r-- 1 kate protein 17678799 Mar 15 09:51 notInDog.out.spec
-rw-rw-r-- 1 kate protein 17773939 Mar 15 09:48 notInHuman.out.spec
-rw-rw-r-- 1 kate protein 17773939 Mar 15 09:50 notInMouse.out.spec
-rw-rw-r-- 1 kate protein 17773939 Mar 15 09:51 notInRabbit.out.spec
-rw-rw-r-- 1 kate protein 17773939 Mar 15 09:50 notInRat.out.spec
# this indicates there are 2 unique LSR files:
# 2) notInRodent (includes Mouse, Rat, Rabbit)
# 3) notInOther (includes Cow & Dog)
# Extract mouse (column 1) and dog (column 3)
cd /cluster/data/panTro2/linSpecRep
mkdir notInRodent notInOthers
cat > makeLSR.csh << 'EOF'
foreach f (dates/*.fa.dates.out)
set c = $f:t:r:r:r
echo $c
set out = $c.out.spec
/cluster/bin/scripts/extractRepeats 1 $f > notInRodent/$out
/cluster/bin/scripts/extractRepeats 3 $f > notInOthers/$out
end
'EOF'
# << happy emacs
csh makeLSR.csh >&! makeLSR.log &
# copy to bluearc for blastz runs
set d = /cluster/bluearc/panTro2/linSpecRep
mkdir -p $d
cp -rp notInRodent $d
cp -rp notInOthers $d
# update for chr9 (2006-03-24 kate)
set c = chr9
extractRepeats 1 dates/$c.fa.dates.out > \
notInRodent/$c.out.spec
extractRepeats 3 dates/$c.fa.dates.out > \
notInOthers/$c.out.spec
cp -rp notInRodent $d
cp -rp notInOthers $d
# CREATE LINEAGE-SPECIFIC REPEATS FOR BLASTZ WITH ZEBRAFISH (2006-04-07 kate)
# Treat all repeats as lineage-specific
ssh kkstore04
cd /cluster/data/panTro2
set d = /cluster/bluearc/panTro2/linSpecRep/notInZebrafish
mkdir $d
foreach f ({{?{,?}},6_hla_hap1}/chr*.fa.out)
set c = $f:t:r:r
echo $c
cp -p $f $d/$c.out.spec
end
# SET UP BLAT SERVER (2006-03-06 kate)
# Request blat server from cluster-admin, then enter ports in database
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("panTro2", "blat12", "17790", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("panTro2", "blat12", "17791", "0", "1");' \
hgcentraltest
# test it with some sequence
# GC 5 BASE TRACK (2006-03-06 kate)
ssh kkstore01
cd /cluster/data/panTro2/bed
mkdir gc5base
cd gc5base
time hgGcPercent -wigOut -doGaps -file=stdout -win=5 panTro2 \
/cluster/data/panTro2 | wigEncode stdin gc5Base.wig gc5Base.wib
sh hgwdev
cd /cluster/data/panTro2/bed/gc5base
mkdir -p /gbdb/panTro2/wib
ln -s `pwd`/*.wib /gbdb/panTro2/wib
hgLoadWiggle panTro2 gc5Base gc5Base.wig
# CPGISLANDS (2006-03-06 - kate)
# Redo 3/13/06 after second Repeatmasker run
# Redone for chr9 remasking (2006-03-24 kate)
ssh hgwdev
mkdir /cluster/data/panTro2/bed/cpgIsland
cd /cluster/data/panTro2/bed/cpgIsland
# Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
cvs co hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
make
# gcc readseq.c cpg_lh.c -o cpglh.exe
cd ../..
ln -s hg3rdParty/cpgIslands/cpglh.exe .
# cpglh.exe requires hard-masked (N) .fa's.
# There may be warnings about "bad character" for IUPAC ambiguous
# characters like R, S, etc. Ignore the warnings.
ssh kkstore01
cd /cluster/data/panTro2/bed/cpgIsland
cat > doCpg.csh << 'EOF'
foreach f (../../?{,?}/chr*.fa.masked ../../6_hla_hap1/chr*.fa.masked)
set c = $f:t:r:r
echo $c
./cpglh.exe $f > $c.cpg
end
'EOF'
# << happy emacs
csh doCpg.csh >&! doCpg.log &
# Several chroms have 0 results:
# -rw-rw-r-- 1 0 Feb 16 15:19 chr10_random.cpg
# -rw-rw-r-- 1 0 Feb 16 15:20 chr15_random.cpg
# -rw-rw-r-- 1 0 Feb 16 15:22 chr8_random.cpg
# -rw-rw-r-- 1 0 Feb 16 15:22 chr9_random.cpg
# -rw-rw-r-- 1 0 Feb 16 15:22 chrM.cpg
# -rw-rw-r-- 1 0 Feb 16 15:22 chrX_random.cpg
# -rw-rw-r-- 1 0 Feb 16 15:22 chrY.cpg
# Transform cpglh output to bed +
cat << '_EOF_' > filter.awk
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
$1, $2, $3, $5,$6, width,
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
# happy emacs
awk -f filter.awk chr*.cpg | sort -k1,1 -k2,2n > cpgIsland.bed
ssh hgwdev
cd /cluster/data/panTro2/bed/cpgIsland
hgLoadBed -strict panTro2 cpgIslandExt -tab -noBin \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
featureBits panTro2 cpgIslandExt
# 19412837 bases of 2909512873 (0.667%) in intersection
# BAC Ends (2006-03-08 IN PROGRESS kate)
# download BAC ends from NCBI.
ssh kkstore
cd /cluster/data/ncbi/bacends/chimp
mkdir bacends.chimp.2
cd bacends.chimp.2
# wget ftp.ncbi.nih.gov/genomes/CLONEEND/pan_troglodytes/*
# trouble with wget on files in this dir -- they're hanging,
# so I used ftp mget w/o prompt
# creates files: 9598_clone_end???.mfa and 95998_clone_info???.txt
# 10 files of each type
# NOTE: the *.info files contain a superset of the data in the
# previous cl_acc_gi_len file -- extract fields 1-5 and 8 to
# use as input to Terry's script that generaes the .pair file
# for compatibility with downstream tools
ftp ftp.ncbi.nih.gov
cd /genomes/CLONEEND/pan_troglodytes
prompt
mget *
gunzip *.gz
# setup for track build area
cd /cluster/data/panTro2
mkdir -p bed/bacends
cd bed/bacends
ln -s /cluster/data/ncbi/bacends/chimp/bacends.chimp.2 ncbi
# generate pairs info file by/for processing by Terry Furey's scripts
grep -h -v '^#' ncbi/*.txt | \
awk '{printf ("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, $4, $5, $8)}' \
> cl_acc_gi_len
wc -l cl_acc_gi_len
# 175757 cl_acc_gi_len
# previous load (2004): 158588 entries
faSize ncbi/*.mfa
# 175757 sequences in 10 files
# Info and sequence catch match -- good!
/cluster/bin/scripts/convertBacEndPairInfo cl_acc_gi_len
# 85911 pairs and 3935 singles
# previous: 78846 pairs and 896 singles
# Note large increase in singles -- either lower quality in
# new data set, or script could use some updating.
# Just note this for now.
wc -l bacEndPairs.txt bacEndSingles.txt
# 85911 bacEndPairs.txt
# 3935 bacEndSingles.txt
# create sequence file
cp /cluster/data/ncbi/bacends/chimp/bacends.chimp.1/convert.pl .
cat ncbi/*.mfa | ./convert.pl > BACends.fa
faSize BACends.fa
# 175757 sequences
# make accessible to track
ssh hgwdev
mkdir -p /gbdb/panTro2/bacends
ln -s /cluster/data/panTro2/bed/bacends/BACends.fa /gbdb/panTro1/bacends
# split for aligning on cluster
ssh kkstore01
cd /cluster/data/panTro2/bed/bacends
set tmp = /san/sanvol1/scratch/panTro2/bacends/
mkdir -p $tmp
faSplit BACends.fa sequence 10 $tmp
# blat vs. unmasked sequence in 5M chunks
# 652 chunks * 10 bacends files = 6520 jobs
ssh pk
cd /cluster/data/panTro2/bed/bacends
mkdir run
cd run
mkdir ../out
# list chrom chunks and bacends chunks
set dir = /cluster/data/panTro2/split5M
ls $dir/*/*.fa | sed "s^$dir/^^" > chromSplit.lst
set dir = /san/sanvol1/scratch/panTro2/bacends
ls $dir/*.fa | sed "s^$dir/^^" > bacends.lst
cat > align.csh << 'EOF'
#!/bin/csh -ef
set d = $1:h
set f = $1:t
set e = $2
mkdir -p ../out/$d
set tmp = /scratch/tmp/panTro2/align.$$
mkdir -p $tmp
cp /cluster/data/panTro2/split5M/$d/$f $tmp
cp /san/sanvol1/scratch/panTro2/bacends/$e $tmp
blat $tmp/$f $tmp/$e -ooc=/cluster/bluearc/panTro2/11.ooc ../out/$d/$f:r.$e:r.psl
rm -fr $tmp
'EOF'
# << happy emacs
chmod +x align.csh
cat > gsub << 'EOF'
#LOOP
./align.csh $(path1) $(path2) {check out line+ ../out/$(dir1)/$(root1).$(root2).psl}
#ENDLOOP
'EOF'
# << happy emacs
gensub2 chromSplit.lst bacends.lst gsub jobList
para create jobList
# 6520 jobs
para try
# NOTE: these run quickly
########################
# MAKE DOWNLOADABLE SEQUENCE FILES (2006-04-02 kate)
ssh kkstore04
cd /cluster/data/panTro2
mkdir -p downloads/bigZips downloads/chromosomes
cd downloads
cat > makeChroms.csh << 'EOF'
date
foreach f (../?{,?}/*.fa ../6_hla_hap1/*.fa)
echo $f:t:r
nice gzip -c $f > chromosomes/$f:t.gz
end
date
'EOF'
csh makeChroms.csh >&! makeChroms.log &
# 14 minutes
cat > bigZips.csh << 'EOF'
cd /cluster/data/panTro2
tar cvzf downloads/bigZips/chromAgp.tar.gz \
?{,?}/chr*.agp 6_hla_hap1/*.agp
tar cvzf downloads/bigZips/chromFa.tar.gz \
?{,?}/chr*.fa 6_hla_hap1/chr*.fa
tar cvzf downloads/bigZips/chromFaMasked.tar.gz \
?{,?}/chr*.fa.masked 6_hla_hap1/chr*.fa.masked
tar cvzf downloads/bigZips/chromOut.tar.gz \
?{,?}/chr*.fa.out 6_hla_hap1/chr*.fa.out
cd /cluster/data/panTro2/bed/simpleRepeat
tar cvzf ../../downloads/bigZips/chromTrf.tar.gz ./trfMaskChrom
'EOF'
csh bigZips.csh >&! bigZips.log &
# get GenBank native mRNAs and refGene
ssh hgwdev
cd /cluster/data/genbank
time ./bin/i386/gbGetSeqs -db=panTro2 -native GenBank mrna \
/cluster/data/panTro2/downloads/bigZips/mrna.fa
cd /cluster/data/panTro2/downloads/bigZips
cd /cluster/data/panTro2/downloads/bigZips
foreach I (1000 2000 5000)
echo "upstream${I} working ... "
featureBits panTro2 refGene:upstream:${I} -fa=stdout \
| gzip -c > upstream${I}.fa.gz
echo "upstream${I} done"
end
# real 11m25.493s
ssh kkstore04
cd /cluster/data/panTro2/downloads/chromosomes
# copy previous release README.txt
scp hgwdev:/usr/local/apache/htdocs/goldenPath/panTro1/chromosomes/README.txt .
# edit it to bring it up to date
md5sum *.gz > md5sum.txt
cd /cluster/data/panTro2/downloads/bigZips
gzip mrna.fa
cp -p ../../panTro2.2bit .
# copy previous release README.txt
scp hgwdev:/usr/local/apache/htdocs/goldenPath/panTro1/bigZips/README.txt .
# edit README.txt to indicate proper version of sequence and
# RepeatMasker
md5sum *.gz *.2bit README.txt > md5sum.txt
ssh hgwdev
set d = /usr/local/apache/htdocs/goldenPath/panTro2
mkdir $d
ln -s /cluster/data/panTro2/downloads/bigZips $d
ln -s /cluster/data/panTro2/downloads/chromosomes $d
########################
# BLASTZ HUMAN hg18 (2006-05-23 kate)
# Do not use lineage-specific repeats, on recommendation of Arian Smit
# Redo with human/chimp scoring matrix
ssh pk
cd /cluster/data/panTro2/bed
mkdir blastz.hg18.2006-05-23
rm blastz.hg18
ln -s blastz.hg18.2006-05-23 blastz.hg18
cd blastz.hg18
cat << '_EOF_' > DEF
# chimp vs human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Chimmp panTro2
SEQ1_DIR=/scratch/hg/panTro2/panTro2.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/bluearc/panTro2/chrom.sizes
# QUERY: Human hg18 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/hg18/hg18.2bit
SEQ2_LEN=/scratch/hg/hg18/chrom.sizes
SEQ2_CHUNK=3000000000
SEQ2_LAP=0
BASE=/cluster/data/panTro2/bed/blastz.hg18.2006-05-23
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.hg18 \
`pwd`/DEF >&! blastz.out &
########################
# BLASTZ HUMAN hg18 (2006-03-13 kate)
# Do not use lineage-specific repeats, on recommendation of Arian Smit
ssh pk
cd /cluster/data/panTro2/bed
mkdir blastz.hg18.2006-03-13
ln -s blastz.hg18.2006-03-13 blastz.hg18
cd blastz.hg18
cat << '_EOF_' > DEF
# chimp vs human
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Chimmp panTro2
SEQ1_DIR=/scratch/hg/panTro2/panTro2.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/bluearc/panTro2/chrom.sizes
# QUERY: Human hg18 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/hg18/hg18.2bit
SEQ2_LEN=/scratch/hg/hg18/chrom.sizes
SEQ2_CHUNK=3000000000
SEQ2_LAP=0
BASE=/cluster/data/panTro2/bed/blastz.hg18.2006-03-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.hg18 \
`pwd`/DEF >&! blastz.out &
# failed during chaining step, because /scratch/hg/panTro2
# doesn't exist on minicluster nodes
# for now, just symlink /iscratch/i/panTro2 there
# so that the 2bit is available to this pipeline -- later
# we'll make sure the nodes are completely synced up.
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.hg18 \
-continue=chainRun \
`pwd`/DEF >&! blastz.2.out &
# chr19 failed with out-of-mem, so try it on kolossus
# moved to hgwdev by request....
ssh kolossus
cd /cluster/data/panTro2/bed/blastz.hg18/axtChain/run
csh chain.csh panTro2.2bit:chr19: chain/panTro2.2bit:chr19:.chain >&! \
chainChr19.log &
# Create run.time file and remove liftedChain so we can continue
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \
-continue=chainMerge \
`pwd`/DEF >&! blastz.3.out &
# SWAP ALIGNMENTS TO PANTRO2 (2006-03-19 kate)
/cluster/bin/scripts/doBlastzChainNet.pl -swap \
-workhorse=hgwdev64 \
/cluster/data/panTro2/bed/blastz.hg18/DEF >&! swap.log &
# Redo of chr9 (2006-03-28 kate)
# Create blastz run dir for just chr9, and run manually on pk.
ssh pk
cd /cluster/data/panTro2/bed/blastz.hg18
mkdir run.blastz.chr9
cd run.blastz.chr9
cp ../run.blastz/*.csh .
grep chr9 ../run.blastz/jobList | grep -v random > jobList
ln -s ../run.blastz/qParts .
para create jobList
# 28 jobs
para push
para time > run.time
# merge chr9 psl's on small cluster ("cat" step in automated script)
ssh kki
cd /cluster/data/panTro2/bed/blastz.hg18
mkdir run.cat.chr9
cd run.cat.chr9
cp ../run.cat/cat.csh .
grep chr9 ../run.cat/jobList > jobList
para create jobList
# 14 jobs
para push
para time > run.time
cd ../axtChain
mkdir run.chr9
cd run.chr9
cp ../run/chain.csh .
grep chr9 ../run/jobList > jobList
mkdir chain
para create jobList
# 1 job
para push
para time > run.time
# replace chr9 chain in all.chain
ssh hgwdev64 # kolossus busy
cd /cluster/data/panTro2/bed/blastz.hg18/axtChain/run
mkdir chain
set all = panTro2.hg18.all.chain.gz
chainSplit chain/ ../$all
rm chain/chr9.chain
cp ../run.chr9/chain/*.chain chain
chainMergeSort chain/*.chain | nice gzip -c > ../$all
rm -fr chain/*
cd ..
mkdir chain
chainSplit chain/ $all
# automate the rest
ssh pk
cd /cluster/data/panTro2/bed/blastz.hg18
rm -fr axtNet mafNet
ssh hgwdev "rm -fr /usr/local/apache/htdocs/goldenPath/panTro2/vsHg18"
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -workhorse=hgwdev64 \
-continue=net \
`pwd`/DEF >&! blastz.net.chr9.out &
# done
# swap after chr9 fix (2006-03-30 kate)
rm -fr /cluster/data/hg18/bed/blastz.panTro2.swap
/cluster/bin/scripts/doBlastzChainNet.pl -swap \
-workhorse=kkr4u00 \
/cluster/data/panTro2/bed/blastz.hg18/DEF >&! swap.log &
# downloads failed -- need to remove the old dir
ssh hgwdev "rm -fr /usr/local/apache/htdocs/goldenPath/hg18/vsPanTro2"
/cluster/bin/scripts/doBlastzChainNet.pl -swap -continue=download\
/cluster/data/panTro2/bed/blastz.hg18/DEF >&! swap.download.log &
# Reciprocal best (DONE 2006-04-02 kate)
# These are best alignments in both directions
# Start with the best chains from the net (these are in the liftOver chain).
# Swap to human-reference, and net them
# Finally extract the best chains from this net, and make axt's
# REDO with improved blastz params (2006-07-27 kate)
ssh kolossus
cd /cluster/data/hg18/bed
ln -s blastz.panTro2.swap blastz.panTro2
cd blastz.panTro2/axtChain
mkdir rBestNet
cat > rbest.csh << 'EOF'
set swapChain = panTro2ToHg18.swap.chain
echo "merge and swap $swapChain"
zcat /cluster/data/panTro2/bed/liftOver/panTro2ToHg18.over.chain.gz | \
chainSwap stdin stdout | \
chainMergeSort stdin > $swapChain
echo "split chain"
chainSplit swapChain/ $swapChain
echo "run netter"
chainPreNet $swapChain /cluster/bluearc/hg18/chrom.sizes \
/cluster/bluearc/panTro2/chrom.sizes stdout | \
chainNet stdin -minSpace=1 /cluster/bluearc/hg18/chrom.sizes \
/cluster/bluearc/panTro2/chrom.sizes \
stdout /dev/null | \
netSyntenic stdin rbest.noClass.net
echo "split net"
netSplit rbest.noClass.net rBestNet
echo "extract chains from net"
netChainSubset -verbose=0 rbest.noClass.net $swapChain stdout | \
chainMergeSort stdin | \
gzip -c > hg18.panTro2.rbest.chain.gz
echo "split extracted chain"
chainSplit rBestChain/ hg18.panTro2.rbest.chain.gz
cd ..
mkdir axtRBestNet
echo "extract axts"
foreach f (axtChain/rBestNet/*.net)
echo $f:t:r
netToAxt $f axtChain/swapChain/$f:t:r.chain \
/san/sanvol1/scratch/hg18/hg18.2bit \
/cluster/bluearc/panTro2/panTro2.2bit stdout | \
axtSort stdin stdout | \
gzip -c > axtRBestNet/$f:t:r.hg18.panTro2.net.axt.gz
end
'EOF'
csh rbest.csh >&! rbest.log &
# load chains for viewing on genome-test
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.panTro2/axtChain
set chain = hg18.panTro2.rbest.chain
# skip for now -- load by request
cd rBestChain
cat > loadRBestChain.csh << 'EOF'
date
foreach c (`awk '{print $1;}' /cluster/bluearc/hg18/chrom.sizes`)
echo $c
set f = $c.chain
if (! -e $f) then
echo no chains for $c
set f = /dev/null
endif
hgLoadChain hg18 ${c}_chainRBestPanTro2 $f
end
date
'EOF'
cd ..
#end skip for now
# compare to previous set (other blastz run) and also full set
featureBits hg18 chainRBestOldPanTro2Link -chrom=chr7
# 146029338 bases of 154952424 (94.241%) in intersection
featureBits hg18 chainRBestPanTro2Link -chrom=chr7
# 145403551 bases of 154952424 (93.838%) in intersection
featureBits hg18 chainPanTro2Link -chrom=chr7
# 148298785 bases of 154952424 (95.706%) in intersection
# some additional processing of nets
# add classification info
netClass -noAr rbest.noClass.net hg18 panTro2 hg18.panTro2.rbest.net
gzip hg18.panTro2.rbest.net
# link to downloads
md5sum *.gz > md5sum.txt
cd ..
md5sum axtNet/*.gz >> axtChain/md5sum.txt
md5sum axtRBestNet/*.gz >> axtChain/md5sum.txt
cd /usr/local/apache/htdocs/goldenPath/hg18/vsPanTro2
set d = /cluster/data/hg18/bed/blastz.panTro2
ln -s $d/axtChain/*.gz .
mkdir axtRBestNet
ln -s /cluster/data/hg18/bed/blastz.panTro2/axtRBestNet/*.axt.gz axtRBestNet
# edit README.txt
# cleanup: TODO after release
cd /cluster/data/hg18/bed/blastz.panTro2/axtChain
rm -f rbest.noClass.net panTro2ToHg18.swap.chain
rm -fr swapChain rBestChain rBestNet
########################
# BLASTZ MACAQUE rheMac2 (2006-03-14 kate)
ssh pk
cd /cluster/data/panTro2/bed
mkdir blastz.rheMac2.2006-03-14
ln -s blastz.rheMac2.2006-03-14 blastz.rheMac2
cd blastz.rheMac2
cat << '_EOF_' > DEF
# chimp vs monkey
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Chimp panTro2
SEQ1_DIR=/scratch/hg/panTro2/panTro2.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/bluearc/panTro2/chrom.sizes
# QUERY: Macaque rheMac2 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/rheMac2/rheMac2.2bit
SEQ2_LEN=/scratch/hg/rheMac2/chrom.sizes
SEQ2_CHUNK=3000000000
SEQ2_LAP=0
BASE=/cluster/data/panTro2/bed/blastz.rheMac2.2006-03-14
TMPDIR=/scratch/tmp
'_EOF_'
# NOTE: probably should have used chainMinScore=3000
# << happy emacs
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.rheMac2 \
`pwd`/DEF >&! blastz.out &
# Started Mar 14 18:40
# Done Mar 15 18:50
# Copy MAF files to san for use in multiple alignment
set dir = /san/sanvol1/scratch/panTro2/mafNet
mkdir -p $dir
cp -rp mafNet $dir/rheMac2
ssh hgwdev featureBits panTro2 chainRheMac2Link
# crashes out-of-mem on hgwdev (try on kolossus)
# SWAP ALIGNMENTS TO PANTRO2 (2006-03-15 kate)
/cluster/bin/scripts/doBlastzChainNet.pl -swap \
/cluster/data/panTro2/bed/blastz.rheMac2/DEF >&! swap.log &
# failed due to missing file hgwdev:/scratch/hg/rheMac2/chrom.sizes
# so I copied the file and ran the "loadUp" script
ssh hgwdev
cd /cluster/data/rheMac2/bed/blastz.panTro2.swap/axtChain
nice loadUp.csh >&! loadUp.log &
# Redo of chr9 (2006-03-28 kate)
# Create blastz run dir for just chr9, and run manually on pk.
ssh pk
cd /cluster/data/panTro2/bed/blastz.rheMac2
mkdir run.blastz.chr9
cd run.blastz.chr9
cp ../run.blastz/*.csh .
grep chr9 ../run.blastz/jobList | grep -v random > jobList
ln -s ../run.blastz/qParts .
para create jobList
# 28 jobs
para try
para push
para time > run.time
# merge chr9 psl's on small cluster ("cat" step in automated script)
ssh kki
cd /cluster/data/panTro2/bed/blastz.rheMac2
mkdir run.cat.chr9
cd run.cat.chr9
cp ../run.cat/cat.csh .
grep chr9 ../run.cat/jobList > jobList
para create jobList
# 14 jobs
para push
para time > run.time
cd ../axtChain
mkdir run.chr9
cd run.chr9
cp ../run/chain.csh .
grep chr9 ../run/jobList > jobList
mkdir chain
para create jobList
# 1 job
para push
para time > run.time
# replace chr9 chain in all.chain
ssh hgwdev64 # kolossus busy
cd /cluster/data/panTro2/bed/blastz.rheMac2/axtChain/run
mkdir chain
set all = panTro2.rheMac2.all.chain.gz
chainSplit chain/ ../$all
rm chain/chr9.chain
cp ../run.chr9/chain/*.chain chain
chainMergeSort chain/*.chain | nice gzip -c > ../$all
rm -fr chain/*
cd ..
mkdir chain
chainSplit chain/ $all
# GOT HERE
# automate the rest
ssh pk
cd /cluster/data/panTro2/bed/blastz.rheMac2
rm -fr axtNet mafNet
ssh hgwdev "rm -fr /usr/local/apache/htdocs/goldenPath/panTro2/vsRheMac2"
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -workhorse=hgwdev64 \
-continue=net \
`pwd`/DEF >&! blastz.net.chr9.out &
# gotta remove another file for the automation
rm axtChain/panTro2.rheMac2.net.gz
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -workhorse=kkr1u00 \
-continue=load \
`pwd`/DEF >&! blastz.load.chr9.out &
# missing noClass.net, work around it
ssh hgwdev
cd /cluster/data/panTro2/bed/blastz.rheMac2/axtChain
cat net/*.net > noClass.net
# create a script from loadUp.csh, containing only netClass and hgLoadNet
csh loadUp2.csh >&! ../blastz.load2.chr9.out &
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -workhorse=kkr1u00 \
-continue=download \
`pwd`/DEF >&! blastz.download.chr9.out &
featureBits panTro2 chainRheMac2Link >&! chainRheMac2Link.fb
# 2452377221 bases of 2909512873 (84.288%) in intersection
# NOTE: had to run this on kolossus as hgwdev ran out of mem.
# (needed to push chromInfo and chr*_gap tables over first).
# swap alignments after chr9 fix
ssh pk
rm -fr /cluster/data/rheMac2/bed/blastz.panTro2.swap
cd /cluster/data/panTro2/bed/blastz.rheMac2
/cluster/bin/scripts/doBlastzChainNet.pl -swap \
-workhorse=kkr1u00 \
`pwd`/DEF >&! swap.log &
########################
# BLASTZ DOG canFam2 (2006-03-15 kate)
ssh pk
cd /cluster/data/panTro2/bed
mkdir blastz.canFam2.2006-03-15
ln -s blastz.canFam2.2006-03-15 blastz.canFam2
cd blastz.canFam2
cat << '_EOF_' > DEF
# chimp vs dog
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
# Specific settings for dog (per Webb email to Brian Raney)
BLASTZ_ABRIDGE_REPEATS=1
# NOTE: must use nibs with repeat abridging
# TARGET: Chimp panTro2
SEQ1_DIR=/scratch/hg/panTro2/nib
SEQ1_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/panTro2/chrom.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Dog CanFam2 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_LEN=/cluster/bluearc/canFam2/chrom.sizes
SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInHuman
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/panTro2/bed/blastz.canFam2.2006-03-15
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
-blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.canFam2 \
`pwd`/DEF >&! blastz.out &
# Started 2006-03-15 15:40
# Failed due to missing chimp nibs on small cluster /scratch/hg
# copied them over manually, then continued, running from
# fileserver, as pk is down
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
-continue=chainRun \
-blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.canFam2 \
`pwd`/DEF >&! blastz.2.out &
# Started 2006-03-16 17:50
# Failed due to missing chrom.sizes on hgwdev:/scratch/hg/panTro2
# Copied manually then continued
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -continue=load \
`pwd`/DEF >&! blastz.3.out &
# Redo of chr9 (2006-03-28 kate)
# Create blastz run dir for just chr9, and run manually on pk.
ssh pk
cd /cluster/data/panTro2/bed/blastz.canFam2
mkdir run.blastz.chr9
cd run.blastz.chr9
cp ../run.blastz/*.csh .
grep panTro2/nib/chr9.nib ../run.blastz/jobList > jobList
para create jobList
# 14 jobs
para try
para push
para time > run.time
# merge chr9 psl's on small cluster ("cat" step in automated script)
ssh kki
cd /cluster/data/panTro2/bed/blastz.canFam2
mkdir run.cat.chr9
cd run.cat.chr9
cp ../run.cat/cat.csh .
grep chr9 ../run.cat/jobList > jobList
para create jobList
# 15 jobs
para push
para time > run.time
cd ../axtChain
mkdir run.chr9
cd run.chr9
cp ../run/chain.csh .
grep chr9 ../run/jobList | grep -v random > jobList
mkdir chain
para create jobList
# 1 job
para push
para time > run.time
# replace chr9 chain in all.chain
ssh hgwdev64 # kolossus busy
cd /cluster/data/panTro2/bed/blastz.canFam2/axtChain/run
mkdir chain
set all = panTro2.canFam2.all.chain.gz
chainSplit chain/ ../$all
rm chain/chr9.chain
cp ../run.chr9/chain/*.chain chain
chainMergeSort chain/*.chain | nice gzip -c > ../$all
rm chain/*
chainSplit chain/ ../$all
mv chain ..
# automate the rest
ssh pk
cd /cluster/data/panTro2/bed/blastz.canFam2
rm -fr axtNet mafNet
rm axtChain/{noClass.net,panTro2.canFam2.net.gz}
ssh hgwdev "rm -fr /usr/local/apache/htdocs/goldenPath/panTro2/vsCanFam2"
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -workhorse=kkr8u00 \
-continue=net \
`pwd`/DEF >&! blastz.net.chr9.out &
# done
featureBits panTro2 chainCanFam2Link > chainCanFam2Link.fb
# 1516576565 bases of 2909512873 (52.125%) in intersection
#########################################################################
# BLASTZ galGal2 (2006-03-18 kate)
ssh pk
cd /cluster/data/panTro2/bed/
mkdir blastz.galGal2.2006-03-18
ln -s blastz.galGal2.2006-03-18 blastz.galGal2
cd blastz.galGal2
cat << '_EOF_' > DEF
# chimp vs chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7
BLASTZ_ABRIDGE_REPEATS=1
# TARGET: Chimp panTro2
SEQ1_DIR=/scratch/hg/panTro2/nib
SEQ1_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInOthers
SEQ1_LEN=/scratch/hg/panTro2/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: Chicken galGal2 - single chunk big enough for whole chroms at once
SEQ2_DIR=/scratch/hg/galGal2/nib
SEQ2_LEN=/scratch/hg/galGal2/chrom.sizes
SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=200000000
SEQ2_LAP=0
BASE=/cluster/data/panTro2/bed/blastz.galGal2.2006-03-18
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.galGal2 \
`pwd`/DEF >&! blastz.out &
# error in net step -- can't find all.chain for some reason
# try restarting
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=net \
`pwd`/DEF >&! blastz.2.out &
# Redo of chr9 (2006-03-28 kate)
# Create blastz run dir for just chr9, and run manually on pk.
ssh pk
cd /cluster/data/panTro2/bed/blastz.galGal2
mkdir run.blastz.chr9
cd run.blastz.chr9
cp ../run.blastz/*.csh .
grep panTro2/nib/chr9.nib ../run.blastz/jobList > jobList
# 162 jobs
para create jobList
para push
para time > run.time
# merge chr9 psl's on small cluster ("cat" step in automated script)
ssh kki
cd /cluster/data/panTro2/bed/blastz.galGal2
mkdir run.cat.chr9
cd run.cat.chr9
cp ../run.cat/cat.csh .
grep chr9 ../run.cat/jobList > jobList
para create jobList
# 15 jobs
para push
para time > run.time
cd ../axtChain
mkdir run.chr9
cd run.chr9
cp ../run/chain.csh .
grep chr9 ../run/jobList | grep -v random > jobList
mkdir chain
para create jobList
# 1 job
para push
# GOT HERE
para time > run.time
# replace chr9 chain in all.chain
ssh kkstore01 # kolossus busy
cd /cluster/data/panTro2/bed/blastz.galGal2/axtChain/run
mkdir chain
set all = panTro2.galGal2.all.chain.gz
nice chainSplit chain/ ../$all
rm chain/chr9.chain
cp ../run.chr9/chain/*.chain chain
nice chainMergeSort chain/*.chain | nice gzip -c > ../$all
rm chain/*
cd ..
mkdir chain
nice chainSplit chain/ $all
# automate the rest
ssh pk
cd /cluster/data/panTro2/bed/blastz.galGal2
rm -fr axtNet mafNet
rm axtChain/{noClass.net,panTro2.galGal2.net.gz}
ssh hgwdev "rm -fr /usr/local/apache/htdocs/goldenPath/panTro2/vsGalGal2"
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -workhorse=kkr7u00 \
-continue=net \
`pwd`/DEF >&! blastz.net.chr9.out &
featureBits panTro2 chainGalGal2Link > chainGalGal2Link.fb
# 79103883 bases of 2909512873 (2.719%) in intersection
# swap alignments to other browser
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap `pwd`/DEF >&! swap.out &
############################################################################
# GENSCAN PREDICTIONS (2006-03-18 kate)
ssh kkstore04
cd /cluster/data/panTro2
mkdir bed/genscan
# setup to break hardmasked chrom sequence into 5MB chunks at
# unbridged contigs/gaps
set sdir = /san/sanvol1/scratch/panTro2/splitHard
foreach d (`cat chrom.lst | grep -v M`)
mkdir -p $sdir/$d
foreach agp ($d/chr*.agp)
set c = $agp:t:r
echo $c
cp $agp $sdir/$d
cp $d/$c.fa.masked $sdir/$d
end
end
ssh pk
cd /cluster/data/panTro2
cd bed/genscan
# Make 3 subdirectories for genscan output files
mkdir gtf pep subopt
# finish split
cat > bed/genscan/doSplit.csh << 'EOF'
set sdir = /san/sanvol1/scratch/panTro2/splitHard
cd /san/sanvol1/scratch/panTro2/splitHard
foreach agp (*/chr*.agp)
set d = $agp:h
set c = $agp:t:r
set fa = $d/$c.fa.masked
echo splitting $agp $fa
nice splitFaIntoContigs $agp $fa . -nSize=5000000
end
'EOF'
csh bed/genscan/doSplit.csh >&! bed/genscan/doSplit.log &
# add chrM
# create list of chunks for cluster run
cd bed/genscan
rm -f genome.list
set sdir = /san/sanvol1/scratch/panTro2/splitHard
# slip in chrM
mkdir -p $sdir/M/chrM_1
cp ../../M/chrM.fa.masked $sdir/M/chrM_1/chrM_1.fa
mkdir $sdir/M/lift
awk '/chrM/ {printf("%d\t%s\t%d\t%s\t%d\n", 0, "M/chrM", $2, "chrM", $2)}' \
/cluster/data/panTro2/chrom.sizes > $sdir/M/lift/ordered.lft
cd $sdir
foreach f (*/chr*_*/*.fa)
egrep '[ACGT]' $f > /dev/null
if ($status == 0) echo $sdir/$f >> \
/cluster/data/panTro2/bed/genscan/genome.list
end
# retrieve executable
ssh hgwdev
cd /cluster/data/panTro2/bed/genscan
cvs co hg3rdParty/genscanlinux
ssh kki
cd /cluster/data/panTro2/bed/genscan
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.list single gsub jobList
para create jobList
# 646 jobs written to batch
para try
para check
para push
# CPU time in finished jobs: 444979s 7416.32m 123.61h 5.15d 0.014 y
# IO & Wait Time: 1898s 31.63m 0.53h 0.02d 0.000 y
# Time in running jobs: 150791s 2513.18m 41.89h 1.75d 0.005 y
# Average job time: 693s 11.55m 0.19h 0.01d
# Longest running job: 150791s 2513.18m 41.89h 1.75d
# Longest finished job: 86693s 1444.88m 24.08h 1.00d
# Submission to last job: 87402s 1456.70m 24.28h 1.01d
# Excessively long on chunks with centromere not at start or end of chunk
# Next time, try to break these.
# Convert these to chromosome level files
ssh kkstore04
cd /cluster/data/panTro2/bed/genscan
set sdir = /san/sanvol1/scratch/panTro2/splitHard
cat $sdir/*/lift/*.lft > $sdir/liftAll.lft
liftUp genscan.gtf $sdir/liftAll.lft warn gtf/*.gtf
liftUp genscanSubopt.bed $sdir/liftAll.lft warn subopt/*.bed
cat pep/*.pep > genscan.pep
# Load into the database
ssh hgwdev
cd /cluster/data/panTro2/bed/genscan
ldHgGene panTro2 genscan genscan.gtf
# 42844 gene predictions
hgsql panTro1 -Ne "select count(*) from genscan"
# 41471
hgPepPred panTro2 generic genscanPep genscan.pep
hgLoadBed -strict panTro2 genscanSubopt genscanSubopt.bed
featureBits panTro2 genscan
# 53758249 bases of 2909485072 (1.848%) in intersection
featureBits panTro1 genscan
# 53757565 bases of 2909485072 (1.848%) in intersection
QA NOTE:
Ran mytouch for genscanPep table and changed the Update_time to 2006-07-19
18:00:00, because runJoiner was complaining.
#######################################################################
# OPOSSUM BLASTZ - (WORKING - 2006-03-20 - Hiram)
ssh pk
mkdir /cluster/data/panTro2/bed/blastzMonDom4.2006-03-20
cd /cluster/data/panTro2/bed
ln -s blastzMonDom4.2006-03-20 blastz.monDom4
cd /cluster/data/panTro2/bed/blastzMonDom4.2006-03-20
cat << '_EOF_' > DEF
# chimp vs. opossum
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
BLASTZ=blastz.v7.x86_64
# settings for more distant organism alignments
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Chimp (panTro2)
SEQ1_DIR=/scratch/hg/panTro2/nib
SEQ1_LEN=/scratch/hg/panTro2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Opossum monDom4
SEQ2_DIR=/san/sanvol1/scratch/monDom4/monDom4.2bit
SEQ2_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/panTro2/bed/blastzMonDom4.2006-03-20
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > blastz.out 2>&1 &
# running 2006-03-20 11:30
# SWAP CHAINS/NET RN4 (DONE 3/27/06 angie)
# This run used the updated chr9.nib.
ssh kkstore01
mkdir /cluster/data/panTro2/bed/blastz.rn4.swap
cd /cluster/data/panTro2/bed/blastz.rn4.swap
doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.panTro2/DEF \
-workhorse kkr8u00 >& do.log & tail -f do.log
ln -s blastz.rn4.swap /cluster/data/panTro2/bed/blastz.rn4
############################################################################
### swap of Mm8 alignments - (DONE - 2006-03-30 - Hiram)
ssh pk
mv /cluster/data/panTro2/bed/blastz.mm8.swap \
/cluster/data/panTro2/bed/blastz.mm8.swap.2006-03-21
mkdir /cluster/data/panTro2/bed/blastz.mm8.swap
cd /cluster/data/panTro2/bed/blastz.mm8.swap
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
/cluster/data/mm8/bed/blastzPanTro2.2006-03-28/DEF \
> blastz.out 2>&1 &
# XXX started 2006-03-30
time nice -n +19 featureBits panTro2 chainMm8Link \
> fb.panTro2.chainMm8Link
# XXXX first time was:
# 986978326 bases of 2909512873 (33.922%) in intersection
############################################################################
# SPLIT SEQUENCE FOR LIFTOVER CHAINS FROM PANTRO1 (2006-03-13 kate)
ssh kkr1u00
cd /cluster/data/panTro2/bed
mkdir liftOver
cd liftOver
makeLoChain-split panTro2 /cluster/data/panTro2/nib >&! split.log &
########################
# BLASTZ HUMAN HG17 (2006-06-17 kate)
# Do not use lineage-specific repeats, on recommendation of Arian Smit
# Do use latest human-chimp scoring matrix and gap penalties from Webb Miller
# via rico (Richard Burhans) at PSU
# NOTE: these alignments have been extensively reviewed by Kateryna
# Makerova, Richard Burnhams and Webb Miller at Penn state to
# verify parameters are suitable. The same parameters were
# used for the HG18 alignments, below.
cat /cluster/data/penn/human_chimp.v2.q
# A C G T
# 90 -330 -236 -356
# -330 100 -318 -236
# -236 -318 100 -330
# -356 -236 -330 90
# O = 600 E = 150
ssh pk
cd /cluster/data/panTro2/bed
mkdir blastz.hg17.2006-06-17-05
ln -s blastz.hg17.2006-06-17-05 blastz.hg17
cd blastz.hg17
ssh kkr1u00
cp -p /cluster/data/hg17/hg17.2bit /scratch/hg/hg17
~kent/bin/iSync
# failed for some reason, so I copied the files manually to
# kkr*u00
cat << '_EOF_' > DEF
# chimp vs human
# Specially tuned blastz parameters from Webb Miller
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=0
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_Y=15000
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/cluster/data/blastz/human_chimp.v2.q
# TARGET: Chimmp panTro2
SEQ1_DIR=/scratch/hg/panTro2/panTro2.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/bluearc/panTro2/chrom.sizes
# QUERY: Human hg17 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/hg17/hg17.2bit
SEQ2_LEN=/cluster/bluearc/hg17/chrom.sizes
SEQ2_CHUNK=3000000000
SEQ2_LAP=0
BASE=/cluster/data/panTro2/bed/blastz.hg17.2006-06-17-05
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.hg17 \
`pwd`/DEF >&! blastz.out &
# failed due to badly formatted scoring matrix file, which
# fortuitously pointed out the need for a trailing line
# containing gap penalties in the scoring matrix file.
# If it's missing (as in all of our current .q files),
# axtChain uses defaults, not the penalties used for the
# blastz run (although it could now get these from the axt files).
# I'm adding this to the matrix file, also will change library
# (axt.c) to add the gap values used by axtChain to the header comment
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.hg17 \
-continue=chainRun \
`pwd`/DEF >&! blastz.2.out &
# failed due to ssh key failure on hgwdev
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.hg17 \
-continue=load \
`pwd`/DEF >&! blastz.3.out &
nice featureBits panTro2 chainHg17Link
# 2759308860 bases of 2909512873 (94.837%) in intersection
nice featureBits panTro2 chainHg17OldLink
# from alignments with default matrix
#2782705676 bases of 2909512873 (95.642%) in intersection
hgsql panTro2 -e "select count(*) from chr7_chainHg17"
# 75376
hgsql panTro2 -e "select count(*) from chr7_chainHg17Old"
# 241200
# swap alignments to hg17 reference
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk \
`pwd`/DEF >&! swap.out &
nice featureBits hg17 chainPanTro2Link
# 2719566113 bases of 2866216770 (94.883%) in intersection
nice featureBits hg17 chainPanTro2LinkOld
# 2739413164 bases of 2866216770 (95.576%) in intersection
hgsql panTro2 -e "select count(*) from chr7_chainPanTro2"
# 105398
hgsql panTro2 -e "select count(*) from chr7_chainPanTro2Old"
# 333069
nice featureBits panTro2 chainHg17Link | tee fb.panTro2.chainHg17Link
# 2759308860 bases of 2823407242 (97.730%) in intersection
########################
# BLASTZ HUMAN HG18 (2006-07-9 kate)
# Do not use lineage-specific repeats, on recommendation of Arian Smit
# Do use latest human-chimp scoring matrix and gap penalties from Webb Miller
# via rico (Richard Burhans) at PSU
cat /cluster/data/penn/human_chimp.v2.q
# A C G T
# 90 -330 -236 -356
# -330 100 -318 -236
# -236 -318 100 -330
# -356 -236 -330 90
# O = 600 E = 150
ssh pk
cd /cluster/data/panTro2/bed
mkdir blastz.hg18.2006-07-09
ln -s blastz.hg18.2006-07-09 blastz.hg18
cd blastz.hg18
cat << '_EOF_' > DEF
# chimp vs human
# Specially tuned blastz parameters from Webb Miller
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=0
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_Y=15000
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/cluster/data/blastz/human_chimp.v2.q
# TARGET: Chimmp panTro2
SEQ1_DIR=/scratch/hg/panTro2/panTro2.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/bluearc/panTro2/chrom.sizes
# QUERY: Human hg18 - single chunk big enough to run entire genome
SEQ2_DIR=/scratch/hg/hg18/hg18.2bit
SEQ2_LEN=/cluster/bluearc/hg18/chrom.sizes
SEQ2_CHUNK=3000000000
SEQ2_LAP=0
BASE=/cluster/data/panTro2/bed/blastz.hg18.2006-07-09
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.hg18 \
`pwd`/DEF >&! blastz.out &
# Mini-cluster nodes were dead, so restart required after admins fixed.
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-chainMinScore=5000 -chainLinearGap=medium -bigClusterHub=pk \
-blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.hg18 \
-continue=cat \
`pwd`/DEF >&! blastz.2.out &
hgsql panTro2 -Nse "select count(*) from chr7_chainHg18"
# 76122
hgsql panTro2 -Nse "select count(*) from chr7_chainHg17"
# 75376
featureBits panTro2 -chrom=chr7 chr7_chainHg18Link
# 146031865 bases of 151077879 (96.660%) in intersection
featureBits panTro2 -chrom=chr7 chr7_chainHg17Link
# 145990849 bases of 151077879 (96.633%) in intersection
nice featureBits panTro2 chainHg18Link >& fb.panTro2.chainHg18Link &
cat fb.panTro2.chainHg18Link
# 2760856313 bases of 2823407242 (97.785%) in intersection
cat ../blastz.hg17/fb.panTro2.chainHg17Link
# 2759308860 bases of 2823407242 (97.730%) in intersection
# SWAP ALIGNMENTS (2006-07-11 kate)
ssh kkstore02
cd /cluster/data/hg18/bed
mkdir blastz.panTro2.swap
cd blastz.panTro2
/cluster/bin/scripts/doBlastzChainNet.pl -swap \
/cluster/data/panTro2/bed/blastz.hg18/DEF >&! swap.log &
# failed during liftover chain copy (netChains.csh)
/cluster/bin/scripts/doBlastzChainNet.pl -swap -continue=download\
/cluster/data/panTro2/bed/blastz.hg18/DEF >&! swap.download.log &
#########################################################################
# BLASTZ danRer3 (2006-04-07 kate)
# Includes both randoms
ssh pk
mkdir /cluster/data/panTro2/bed/blastz.danRer3.2006-04-07
cd /cluster/data/panTro2/bed
ln -s blastz.danRer3.2006-04-07 blastz.danRer3
cd blastz.danRer3
cat << 'EOF' > DEF
# human target, zebrafish query
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# use parameters suggested for human-fish evolutionary distance
# recommended in doBlastzChainNet.pl help
# (previously used for hg16-fr1, danrer1-mm5)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
# TARGET: Human panTro2
SEQ1_DIR=/scratch/hg/panTro2/nib
SEQ1_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInZebrafish
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/scratch/hg/panTro2/chrom.sizes
# QUERY: zebrafish danRer3
# Use all chroms, including both randoms (chrUn and chrNA)
SEQ2_DIR=/san/sanvol1/scratch/danRer3/nib
SEQ2_SMSK=/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers
SEQ2_LEN=/cluster/bluearc/danRer3/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=1000
BASE=/cluster/data/panTro2/bed/blastz.danRer3.2006-04-07
TMPDIR=/scratch/tmp
'EOF'
# << happy emacs
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.danRer3 \
`pwd`/DEF >& blastz.out &
# failed on 'cat.run' step, for no good reason.
# continuing manually with 'para shove -retries=20'...
ssh kki
cd /cluster/data/panTro2/bed/blastz.danRer3/run.cat
para time > run.time
ssh pk
cd /cluster/data/panTro2/bed/blastz.danRer3
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue=chainMerge \
`pwd`/DEF >& blastz.chainMerge.out &
# failed due to bad NFS mount on one of the mini-cluster nodes.
# finished manually, and created run,time
#########################################################################
# BLASTZ danRer4 (2006-07-10 kate)
# Includes both randoms
ssh pk
mkdir /cluster/data/panTro2/bed/blastz.danRer4.2006-07-10
cd /cluster/data/panTro2/bed
ln -s blastz.danRer4.2006-07-10 blastz.danRer4
cd blastz.danRer4
cat << 'EOF' > DEF
# human target, zebrafish query
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# use parameters suggested for human-fish evolutionary distance
# recommended in doBlastzChainNet.pl help
# (previously used for hg16-fr1, danrer1-mm5)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
# TARGET: Human panTro2
SEQ1_DIR=/scratch/hg/panTro2/nib
SEQ1_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInZebrafish
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/scratch/hg/panTro2/chrom.sizes
# QUERY: zebrafish danRer4
# Use all chroms, including both randoms (chrUn and chrNA)
SEQ2_DIR=/san/sanvol1/scratch/danRer4/nib
SEQ2_SMSK=/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers
SEQ2_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=1000
BASE=/cluster/data/panTro2/bed/blastz.danRer4.2006-07-10
TMPDIR=/scratch/tmp
'EOF'
# << happy emacs
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.danRer4 \
`pwd`/DEF >& blastz.out &
# 2 jobs crashed on kki, pk was rebooted
# finish jobs with para push on kki, then continue
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-blastzOutRoot /san/sanvol1/scratch/panTro2/blastz.danRer4 \
-continue=chainRun \
`pwd`/DEF >& blastz.2.out &
# swap alignemnts to other assembly
/cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap `pwd`/DEF >& swap.out &
#########################################################################
# LifUunder to panTro1 (2005-07-11 kate)
ssh pk
cd /cluster/data/panTro2
doSameSpeciesLiftOver.pl panTro2 panTro1 >& liftUnder.out &
ln -s panTro1.2005-07-14 blat.panTro1
# stalled out looking for workhorse, restarting
doSameSpeciesLiftOver.pl panTro2 panTro1 \
-buildDir=/cluster/data/panTro2/bed/blat.panTro1 \
-continue=net -workhorse=kolossus >& liftOver.2.out &
#############################################################################
# 8-WAY MULTIZ ALIGNMENTS (2006-07-15 kate)
# hg18
# rheMac2
# mm8
# canFam2
# monDom4
# galGal2
# danRer4
# copy net mafs to cluster-friendly storage for multiz run
ssh kkstore04
cd /cluster/data/panTro2/bed
set d = /san/sanvol1/scratch/panTro2/mafNet
mkdir -p $d
foreach s (hg18 rheMac2 mm8 canFam2 monDom4 galGal2 danRer4)
echo $s
cp -Rp blastz.$s/mafNet $d/$s
end
# make output dir and run dir
ssh pk
cd /cluster/data/panTro2/bed
mkdir -p multiz8way.2006-07-15
ln -s multiz8way.2006-07-15 multiz8way
cd multiz8way
cp /san/sanvol1/scratch/mm7/cons/elliotsEncode.mod .
cat elliotsEncode.mod
ALPHABET: A C G T
ORDER: 0
SUBST_MOD: REV
TRAINING_LNL: -988246.132962
BACKGROUND: 0.295 0.205 0.205 0.295
RATE_MAT:
-1.165221 0.315494 0.589884 0.259843
0.189778 -0.878194 0.208718 0.479698
0.444622 0.261535 -0.885604 0.179447
0.234867 0.720815 0.215191 -1.170872
TREE: (((((((((((((hg17:0.006690,panTro1:0.007571):0.024272,(colobus_monkey:0.015404,(baboon:0.008258,rheMac2:0.028617):0.008519):0.022120):0.023960,(dusky_titi:0.025662,(owl_monkey:0.012151,marmoset:0.029549):0.008236):0.027158):0.066101,(mouse_lemur:0.059024,galago:0.121375):0.032386):0.017073,((rn3:0.081728,mm7:0.077017):0.229273,oryCun1:0.206767):0.023340):0.023026,(((bosTau2:0.159182,canFam2:0.147731):0.004946,rfbat:0.138877):0.010150,(hedgehog:0.193396,shrew:0.261724):0.054246):0.024354):0.028505,dasNov1:0.149862):0.015994,(loxAfr1:0.104891,echTel1:0.259797):0.040371):0.218400,monDom2:0.371073):0.065268,platypus:0.468116):0.123856,galGal2:0.454691):0.123297,xenTro1:0.782453):0.156067,((tetNig1:0.199381,fr1:0.239894):0.492961,danRer3:0.782561):0.156067);
# create species list and stripped down tree for autoMZ
sed -n '/TREE: /s///p' elliotsEncode.mod > elliotsEncode.nh
/cluster/bin/phast/tree_doctor elliotsEncode.nh \
--prune-all-but=hg17,panTro1,rheMac1,mm7,canFam2,monDom2,galGal2,danRer3 \
--rename="hg17->hg18;panTro1->panTro2;rheMac1->rheMac2;mm7->mm8;monDom2->monDom4;danRer3->danRer4" > 8way.nh
sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' 8way.nh > tmp.nh
echo `cat tmp.nh` > tree-commas.nh
echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
/cluster/bin/phast/all_dists 8way.nh > 8way.distances.txt
grep panTro2 8way.distances.txt | sort -k3,3n | \
awk '{printf ("%.4f\t%s\t%s\n", $3, $2, $1)}' > distances.txt
cat distances.txt
# 0.0143 chimp_panTro2 human_hg18
# 0.0910 macaque_rheMac2 chimp_panTro2
# 0.2660 dog_canFam2 chimp_panTro2
# 0.4686 mouse_mm8 chimp_panTro2
# 0.7128 monodelphis_monDom4 chimp_panTro2
# 0.9855 chicken_galGal2 chimp_panTro2
# 1.7488 zebrafish_danRer4 chimp_panTro2
mkdir -p maf run
cd run
# stash binaries
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
set db = panTro2
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/mafNet
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == panTro2) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
# << happy emacs
chmod +x autoMultiz.csh
cat << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/panTro2/bed/multiz8way.2006-07-15/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << happy emacs
awk '{print $1}' /cluster/data/panTro2/chrom.sizes > chrom.lst
gensub2 chrom.lst single spec jobList
para create jobList
# 52 files
para try
para check
para push
para time > run.time
# CPU time inn finished jobs: 99662s 1661.03m 27.68h 1.15d 0.003 y
# IO & Wait Time: 2009s 33.49m 0.56h 0.02d 0.000 y
# Average job time: 1955s 32.59m 0.54h 0.02d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 8172s 136.20m 2.27h 0.09d
# Submission to last job: 8172s 136.20m 2.27h 0.09d
# annotate mafs
ssh kolossus
cd /cluster/data/panTro2/bed/multiz8way
mkdir anno
cd anno
mkdir maf run
cd run
rm -f sizes nBeds
foreach i (`cat /cluster/data/panTro2/bed/multiz8way/species.lst`)
ln -s /cluster/data/$i/chrom.sizes $i.len
ln -s /cluster/data/$i/$i.N.bed $i.bed
echo $i.bed >> nBeds
echo $i.len >> sizes
end
echo date > jobs.csh
foreach i (../../maf/*.maf)
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/panTro2/panTro2.2bit ../maf/`basename $i` >> jobs.csh
echo "echo $i" >> jobs.csh
end
echo date >> jobs.csh
# do smaller jobs first
tac jobs.csh > jobsRev.csh
mv jobsRev.csh jobs.csh
csh jobs.csh > jobs.log &
# Load into database
ssh hgwdev
cd /cluster/data/panTro2/bed/multiz8way/anno/maf
mkdir -p /gbdb/panTro2/multiz8way/anno/maf
ln -s /cluster/data/panTro2/bed/multiz8way/anno/maf/*.maf \
/gbdb/panTro2/multiz8way/anno/maf
cat > loadMaf.csh << 'EOF'
nice hgLoadMaf -pathPrefix=/gbdb/panTro2/multiz8way/anno/maf panTro2 multiz8way
cat *.maf | \
nice hgLoadMafSummary panTro2 -minSize=30000 -mergeGap=1500 -maxSize=200000 multiz8waySummary stdin
'EOF'
# 16414516 mafs, 1281405 summary blocks
#<< happy emacs
csh loadMaf.csh >&! loadMaf.log &
# generate .gif of tree
phyloGif
----------------------------------------------------------------------------
2009-08-20 markd with guidance from braney
###
# mafs not correctly displayed due to incorrectly using -sizes=sizes
# with mafAddIRows
###
# annotate mafs
ssh kolossus
cd /cluster/data/panTro2/bed/multiz8way/anno
rm -f maf/*.maf run/*.maf
cd run
# edit jobs.csh to remove -sizes=sizes
csh jobs.csh >& jobs.log &
>>>>>>>>>>>>>>>>>
# Load into database
ssh hgwdev
cd /cluster/data/panTro2/bed/multiz8way/anno/maf
mkdir -p /gbdb/panTro2/multiz8way/anno/maf
csh loadMaf.csh >& loadMaf.log &
#############################################################################
# PHASTCONS CONSERVATION (2006-07-18 kate)
# create cluster-friendly copies of chrom fasta files
ssh kkstore04
cd /cluster/data/panTro2
set sdir = /san/sanvol1/scratch/panTro2/chrom
mkdir -p $sdir
foreach d (`cat chrom.lst`)
echo $d
cp -p $d/chr*.fa $sdir
end
# split mafs
ssh pk
cd /cluster/data/panTro2/bed/multiz8way
mkdir cons
cd cons
mkdir run.split
cd run.split
set WINDOWS = /san/sanvol1/scratch/panTro2/multiz8way/cons/ss
rm -fr $WINDOWS
mkdir -p $WINDOWS
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/panTro2/bed/multiz8way/maf
set WINDOWS = /san/sanvol1/scratch/panTro2/multiz8way/cons/ss
set sdir = /san/sanvol1/scratch/panTro2/chrom
cd $WINDOWS
set c = $1
echo $c
rm -fr $c
mkdir $c
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
-M $sdir/$c.fa -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
echo "Done" >> $c.done
'EOF'
# << happy emacs
chmod +x doSplit.csh
rm -f jobList
foreach f (../../maf/*.maf)
set c = $f:t:r
echo "doSplit.csh $c {check out line+ $WINDOWS/$c.done}" >> jobList
end
para create jobList
# 52 jobs
para try
para check
para push
#CPU time in finished jobs: 2426s 40.44m 0.67h 0.03d 0.000 y
#IO & Wait Time: 988s 16.46m 0.27h 0.01d 0.000 y
#Average job time: 66s 1.09m 0.02h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 234s 3.90m 0.07h 0.00d
#Submission to last job: 346s 5.77m 0.10h 0.00d
# run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
cd ..
# create tree model file from ENCODE model file, with tree
# pruned and renamed for this set of alignments
sed '/TREE: /d' ../elliotsEncode.mod > 8way.mod
echo "TREE: `cat ../8way.nh`" >> 8way.mod
cat 8way.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#TRAINING_LNL: -988246.132962
#BACKGROUND: 0.295 0.205 0.205 0.295
#RATE_MAT:
# -1.165221 0.315494 0.589884 0.259843
# 0.189778 -0.878194 0.208718 0.479698
# 0.444622 0.261535 -0.885604 0.179447
# 0.234867 0.720815 0.215191 -1.170872
#TREE:
#(((((((hg18:0.006690,panTro2:0.007571):0.024272,rheMac2:0.059256):0.107134,mm8:0.329630):0.023026,canFam2:0.187181):0.262899,monDom4:0.371073):0.189124,galGal2:0.454691):0.279364,danRer4:0.938628);
# calculate GC in model file
awk '$1 == "BACKGROUND:" {print $3 + $4;}' 8way.mod
#0.41
mkdir run.cons
cd run.cons
cat > doPhast.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set tmp = /scratch/tmp/$f
mkdir -p $tmp
set san = /san/sanvol1/scratch/panTro2/multiz8way/cons
cp -p $san/ss/$c/$f.ss ../8way.mod $tmp
pushd $tmp > /dev/null
/cluster/bin/phast/$MACHTYPE/phastCons $f.ss 8way.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative hg18,rheMac2 \
--seqname $c --idpref $c --viterbi $f.bed --score > $f.pp
popd > /dev/null
mkdir -p $san/pp/$c $san/bed/$c
sleep 1
mv $tmp/$f.pp $san/pp/$c
mv $tmp/$f.bed $san/bed/$c
rm -fr $tmp
'EOF'
# emacs happy
chmod a+x doPhast.csh
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
# start with parameters from previous human 8way
cat > template << 'EOF'
#LOOP
doPhast.csh $(root1) $(file1) 13 .01 .23
#ENDLOOP
'EOF'
# happy emacs
# Create parasol batch and run it
pushd /san/sanvol1/scratch/panTro2/multiz8way/cons
ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
/cluster/data/panTro2/bed/multiz8way/cons/run.cons/in.list
popd
gensub2 in.list single template jobList
grep chr7 jobList > jobList.chr7
para make jobList.chr7
# 346 jobs
# using para make because jobs are so small they crash
# Note: 1/3 of jobs crashed, but succeeded on retry
# These jobs are probably too small/too fast.
# Consider increasing the size of the msa_split even further
# in future runs.
para time
#CPU time in finished jobs: 10586s 176.44m 2.94h 0.12d 0.000 y
#IO & Wait Time: 5465s 91.08m 1.52h 0.06d 0.000 y
#Average job time: 46s 0.77m 0.01h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 66s 1.10m 0.02h 0.00d
#Submission to last job: 581s 9.68m 0.16h 0.01d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/panTro2/multiz8way/cons
# The sed's and the sort get the file names in chrom,start order
# (Hiram tricks -- split into columns on [.-/] with
# identifying x,y,z, to allow column sorting and
# restoring the filename. Warning: the sort column
# will depend on how deep you are in the dir
find ./bed -name "chr*.bed" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/panTro2/bed/multiz8way/cons
# load into database
ssh hgwdev
cd /cluster/data/panTro2/bed/multiz8way/cons
hgLoadBed -strict panTro2 phastConsElements8way mostConserved.bed
# compare with previous tracks
hgsql panTro2 -e "select count(*) from phastConsElements8way"
# 1596760 elements
hgsql hg18 -e "select count(*) from phastConsElements17way"
# 1601903
# Try for 5% overall cov, and 70% CDS cov
featureBits panTro2 -enrichment refGene:cds phastConsElements8way
# refGene:cds 1.020%, phastConsElements8way 4.999%, both 0.731%, cover 71.68%, enrich 14.34x
# experiments
featureBits hg18 -chrom=chr7 -enrichment refGene:cds phastConsElements17way
# refGene:cds 0.883%, phastConsElements17way 4.838%, both 0.621%, cover 70.31%, enrich 14.53x
featureBits panTro2 -chrom=chr7 -enrichment refGene:cds phastConsElements8way
# 13 .01 .23
# refGene:cds 0.861%, phastConsElements8way 4.914%, both 0.626%, cover 72.71%, enrich 14.80x
# 13 .007 .23
# refGene:cds 0.861%, phastConsElements8way 4.780%, both 0.626%, cover 72.69%, enrich 15.21x
# 12 .007 .23
# refGene:cds 0.861%, phastConsElements8way 4.599%, both 0.623%, cover 72.34%, enrich 15.73x
# 12 .006 .23
# refGene:cds 0.861%, phastConsElements8way 4.553%, both 0.623%, cover 72.35%, enrich 15.89x
# 12 .006 .24
# refGene:cds 0.861%, phastConsElements8way 4.657%, both 0.630%, cover 73.11%, enrich 15.70x
# 12 .006 .25
# refGene:cds 0.861%, phastConsElements8way 4.765%, both 0.636%, cover 73.90%, enrich 15.51x
# 12 .004 .27
# refGene:cds 0.861%, phastConsElements8way 4.845%, both 0.647%, cover 75.14%, enrich 15.51x
# 12 .006 .27
# refGene:cds 0.861%, phastConsElements8way 4.962%, both 0.647%, cover 75.17%, enrich 15.15x
# 13 .007 .27
# refGene:cds 0.861%, phastConsElements8way 5.241%, both 0.651%, cover 75.57%, enrich 14.42x
# 12 .008 .28
# refGene:cds 0.861%, phastConsElements8way 5.152%, both 0.653%, cover 75.82%, enrich 14.72x
# 12 .007 .27
# phastConsElements8way 5.011%, both 0.648%, cover 75.19%, enrich 15.01x
# 11 .005 .27
# refGene:cds 0.861%, phastConsElements8way 4.673%, both 0.644%, cover 74.78%, enrich 16.00x
# 14 .008 .28
# refGene:cds 0.861%, phastConsElements8way 5.629%, both 0.659%, cover 76.57%, enrich 13.60x
# 10, .11
# refGene:cds 0.861%, phastConsElements8way 7.936%, both 0.671%, cover 77.92%, enrich 9.82x
# 12, .20
# refGene:cds 0.861%, phastConsElements8way 8.316%, both 0.668%, cover 77.56%, enrich 9.33x
# 12, .11
# refGene:cds 0.861%, phastConsElements8way 7.304%, both 0.666%, cover 77.34%, enrich 10.59x
# 14, .11
# refGene:cds 0.861%, phastConsElements8way 7.936%, both 0.671%, cover 77.92%, enrich 9.82x
# check conservation using consEntropy, at Adam's recommendation
# first, generate second model file
set rho = .23
tree_doctor --scale $rho
# create wiggle files
ssh pk
cd /san/sanvol1/scratch/panTro2/multiz8way/cons/
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
find ./pp -name "chr*.pp" | \
sed -e "s/\// x /g" -e "s/\./ y /g" -e "s/-/ z /g" | \
sort -k7,7 -k9,9n | \
sed -e "s/ x /\//g" -e "s/ y /\./g" -e "s/ z /-/g" | \
xargs cat | \
nice wigEncode stdin phastCons8way.wig phastCons8way.wib
# half an hour or so
mv phastCons8way.* /cluster/data/panTro2/bed/multiz8way/cons
ssh hgwdev
cd /cluster/data/panTro2/bed/multiz8way/cons
ln -s /cluster/data/panTro2/bed/multiz8way/cons/phastCons8way.wib \
/gbdb/panTro2/multiz8way/phastCons8way.wib
hgLoadWiggle -pathPrefix=/gbdb/panTro2/multiz8way panTro2 \
phastCons8way phastCons8way.wig
# add Mark D's codon framing
ssh kkstore04
cd /cluster/data/panTro2/bed/multiz8way
mkdir frames
cd frames
cp /cluster/data/hg17/bed/multiz17way/frames/mkMafFrames .
cp /cluster/data/hg17/bed/multiz17way/frames/Makefile.genes .
#edit Makefile to correct species names and set and sanDir, prune out all
# but getGenes target, as the remainder is below.
ssh hgwdev
cd /cluster/data/panTro2/bed/multiz8way/frames
make getGenes >&! getGenes.log &
ssh kkstore04
cd /cluster/data/panTro2/bed/multiz8way/frames
nice tcsh # easy way to get process niced
(cat ../maf/*.maf | genePredToMafFrames panTro2 stdin stdout panTro2 genes/panTro2.gp.gz rheMac2 genes/rheMac2.gp.gz canFam2 genes/canFam2.gp.gz danRer4 genes/danRer4.gp.gz galGal2 genes/galGal2.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz | gzip > multiz8way.mafFrames.gz)>&log&
ssh hgwdev
cd /cluster/data/panTro2/bed/multiz8way/frames
hgLoadMafFrames panTro2 multiz8wayFrames multiz8way.mafFrames.gz >&log&
##########################################################################
# NSCAN track - (2006-07-28 markd)
cd /cluster/data/panTro2/bed/nscan/
# obtained NSCAN predictions from michael brent's group
# at WUSTL
wget -nv -r -np http://ardor.wustl.edu/jeltje/panTro2/chr_gtf
wget -nv -r -np http://ardor.wustl.edu/jeltje/panTro2/chr_ptx
# clean up and rename downloaded directorys:
mv ardor.wustl.edu/jeltje/panTro2/chr_gtf .
mv ardor.wustl.edu/jeltje/panTro2/chr_ptx .
rm -rf ardor.wustl.edu
rm chr_*/index.html*
gzip chr_*/*
chmod a-w chr_**.gz
# load tracks. Note that these have *utr features, rather than
# exon features. currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -bin -gtf -genePredExt panTro2 nscanGene chr_gtf/chr*.gtf.gz
# add .a suffix to match transcript id
# add .a suffix to match transcript id
hgPepPred -suffix=.a panTro2 generic nscanPep chr_ptx/chr*.fa.gz
rm *.tab
# update trackDb; need a panTro2-specific page to describe informants
chimp/panTro2/nscanGene.html (from CanFam1_hg17_nscan_10_18_2005/description.html )
chimp/panTro2/trackDb.ra
##########################################################################
# AUGUSTUS ab initio track (DONE, mario, 1/30/2007)
ssh hgwdev
mkdir /cluster/data/panTro2/bed/augustus
cd /cluster/data/panTro2/bed/augustus
# get the program AUGUSTUS
wget http://augustus.gobics.de/binaries/augustus.2.0.1.src.tar.gz
tar xzf augustus.2.0.src.tar.gz
# compile the binary if necessary
cd augustus/src
make augustus
# create output directory
cd /cluster/data/panTro2/bed/augustus
mkdir out
# create file with sequences and their sizes by modifying chrom.sizes
cat ../../chrom.sizes | perl -e 'while(<>){s/chr([0-9a-zA-Z]+)/\/cluster\/data\/panTro2\/$1\/chr$1.fa.masked/; print;}' > seq.lst
# create the job list
augustus/scripts/createAugustusJoblist.pl --sequences seq.lst --chunksize 5300000 --overlap 300000 --command "/cluster/data/panTro2/bed/augustus/augustus/src/augustus --AUGUSTUS_CONFIG_PATH=/cluster/data/panTro2/bed/augustus/augustus/config --species=human --sample=100 --/augustus/verbosity=0" --outputdir /cluster/data/panTro2/bed/augustus/out/ --joblist job.lst
para try
para check
para push
# CPU time in finished jobs: 3785702s 63095.03m 1051.58h 43.82d 0.120 y
# IO & Wait Time: 38307s 638.45m 10.64h 0.44d 0.001 y
# Average job time: 5785s 96.42m 1.61h 0.07d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 8767s 146.12m 2.44h 0.10d
# Submission to last job: 20341s 339.02m 5.65h 0.24d
# check the error files
cat out/*.err
# In this case no error was reported.
rm out/*.err
cat out/*.gff | augustus/scripts/join_aug_pred.pl > augustus.pep.gff
augustus/scripts/getAnnoFasta.pl augustus.pep.gff
cat augustus.pep.gff | egrep "CDS|codon"> augustus.gff
# load into database
ssh hgwdev
cd /cluster/data/panTro2/bed/augustus/
ldHgGene -bin panTro2 augustus augustus.gff
# 33218 gene predictions
hgPepPred panTro2 generic augustusPep augustus.pep.aa
featureBits panTro2 augustus
# 32820465 bases of 2909485072 (1.128%) in intersection
#######################################################################
# BLASTZ/CHAIN/NET HORSE AND CHIMP (DONE 2/21/07 Fan)
ssh kkstore05
mkdir /cluster/data/equCab1/bed/blastz.panTro2.2007-02-19
cd /cluster/data/equCab1/bed/blastz.panTro2.2007-02-19
cat << '_EOF_' > DEF
# Horse vs. Chimp
BLASTZ_M=50
# TARGET: Horse equCab1
SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ1_LIMIT=500
SEQ1_CHUNK=30000000
SEQ1_LAP=10000
# QUERY: Chimp panTro2
SEQ2_DIR=/scratch/hg/panTro2/panTro2.2bit
SEQ2_LEN=/cluster/data/panTro2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/equCab1/bed/blastz.panTro2.2007-02-19
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub pk \
-blastzOutRoot /cluster/bluearc/equCab1PanTro2 >& do.log &
tail -f do.log
ln -s blastz.panTro2.2007-02-19 /cluster/data/equCab1/bed/blastz.panTro2
nice featureBits equCab1 -chrom=chr1 chainPanTro2Link
# 121391547 bases of 177498097 (68.390%) in intersection
bash
time nice -n 19 featureBits equCab1 chainPanTro2Link \
> fb.equCab1.chainPanTro2Link.txt 2>&1
# 1593914101 bases of 2421923695 (65.812%) in intersection
ssh kkstore04
mkdir /cluster/data/panTro2/bed/blastz.equCab1.swap
cd /cluster/data/panTro2/bed/blastz.equCab1.swap
bash
time doBlastzChainNet.pl \
/cluster/data/equCab1/bed/blastz.panTro2.2007-02-19/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-verbose=2 -swap -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/equCab1PanTro2 > swap.log 2>&1 &
# The -blastzOutRoot /cluster/bluearc/equCab1PanTro2 option above
# should not be there. It caused an error message during cleaning up.
# Don't specify it for -swap next time.
ssh hgwdev
cd /cluster/data/panTro2/bed/blastz.equCab1.swap
bash
time nice -n 19 featureBits panTro2 chainEquCab1Link \
> fb.panTro2.chainEquCab1Link.txt 2>&1
# 1636782924 bases of 2909485072 (56.257%) in intersection
#########################################################################
# Blastz Marmoset calJac1 (DONE - 2007-11-15 - Hiram)
ssh kkstore04
screen # use screen to control this job
mkdir /cluster/data/panTro2/bed/blastzCalJac1.2007-11-13
cd /cluster/data/panTro2/bed/blastzCalJac1.2007-11-13
cat << '_EOF_' > DEF
# Chimp vs marmoset
BLASTZ_M=50
# TARGET: Chimp PanTro2
SEQ1_DIR=/scratch/hg/panTro2/panTro2.2bit
SEQ1_LEN=/cluster/data/panTro2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Marmoset calJac1
SEQ2_DIR=/cluster/bluearc/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/panTro2/bed/blastzCalJac1.2007-11-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=pk > blastz.log 2>&1 &
# real 1039m19.171s
cat fb.panTro2.chainCalJac1Link.txt
# 2220169777 bases of 2909485072 (76.308%) in intersection
mkdir /cluster/data/calJac1/bed/blastz.panTro2.swap
cd /cluster/data/calJac1/bed/blastz.panTro2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/panTro2/bed/blastzCalJac1.2007-11-13/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=pk > swap.log 2>&1 &
# real 320m14.293s
cat fb.calJac1.chainPanTro2Link.txt
# 2264115411 bases of 2929139385 (77.296%) in intersection
###########################################################################
# Blastz Orangutan ponAbe2 (DONE - 2007-11-16 - Hiram)
ssh kkstore04
screen # use screen to control this job
mkdir /cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13
cd /cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13
cat << '_EOF_' > DEF
# Chimp vs marmoset
BLASTZ_M=50
# TARGET: Chimp PanTro2
SEQ1_DIR=/scratch/hg/panTro2/panTro2.2bit
SEQ1_LEN=/cluster/data/panTro2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY: Orangutan ponAbe2
SEQ2_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
SEQ2_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-bigClusterHub=kk > blastz.log 2>&1 &
# real 388m20.443s
# Completed: 129160 of 129168 jobs
# Crashed: 8 jobs
# CPU time in finished jobs: 23694314s 394905.23m 6581.75h 274.24d 0.751 y
# IO & Wait Time: 2536376s 42272.93m 704.55h 29.36d 0.080 y
# Average job time: 203s 3.38m 0.06h 0.00d
# Longest finished job: 5665s 94.42m 1.57h 0.07d
# Submission to last job: 170709s 2845.15m 47.42h 1.98d
# some jobs failed due to memory limits on kk nodes
# finished them manually on kolossus and hgwdev, then continuing
time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-continue=cat -bigClusterHub=kk > cat.log 2>&1 &
# real 754m45.014s
cat fb.panTro2.chainPonAbe2Link.txt
# 2648603020 bases of 2909485072 (91.033%) in intersection
mkdir /cluster/data/ponAbe2/bed/blastz.panTro2.swap
cd /cluster/data/ponAbe2/bed/blastz.panTro2.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/panTro2/bed/blastzPonAbe2.2007-11-13/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-swap -bigClusterHub=kk > swap.log 2>&1 &
# real 160m51.639s
cat fb.ponAbe2.chainPanTro2Link.txt
# 2766615040 bases of 3093572278 (89.431%) in intersection
# real 2597m24.771s
###########################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
panTro2.upstreamGeneTbl = refGene
############################################################################
# SWAP HG19 Chain/Net (DONE - 2009-04-25 - Hiram)
# running the swap
ssh swarm
mkdir /hive/data/genomes/panTro2/bed/blastz.hg19.swap
cd /hive/data/genomes/panTro2/bed/blastz.hg19.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-swap /hive/data/genomes/hg19/bed/lastzPanTro2.2009-03-19/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=swarm -bigClusterHub=swarm \
> swap.log 2>&1 &
# real 723m41.377s
cat fb.panTro2.chainHg19Link.txt
# 2761343871 bases of 2909485072 (94.908%) in intersection
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
+
############################################################################
+# calJac3 Marmoset BLASTZ/CHAIN/NET (DONE - 2010-02-11 - Hiram)
+ screen # use a screen to manage this multi-day job
+ mkdir /hive/data/genomes/panTro2/bed/lastzCalJac3.2010-02-11
+ cd /hive/data/genomes/panTro2/bed/lastzCalJac3.2010-02-11
+
+ # same kind of parameters as used in human<->marmoset
+ cat << '_EOF_' > DEF
+# chimp vs. marmoset
+BLASTZ=lastz
+# maximum M allowed with lastz is only 254
+BLASTZ_M=254
+BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q
+# and place those items here
+BLASTZ_O=600
+BLASTZ_E=150
+# other parameters from panTro2 vs hg18 lastz on advice from Webb
+BLASTZ_K=4500
+BLASTZ_Y=15000
+BLASTZ_T=2
+
+# TARGET: Chimp PanTro2
+SEQ1_DIR=/scratch/data/panTro2/panTro2.2bit
+SEQ1_LEN=/scratch/data/panTro2/chrom.sizes
+SEQ1_CHUNK=20000000
+SEQ1_LAP=10000
+SEQ1_LIMIT=5
+
+# QUERY: Marmoset (calJac3)
+SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit
+SEQ2_LEN=/scratch/data/calJac3/chrom.sizes
+SEQ2_LIMIT=50
+SEQ2_CHUNK=10000000
+SEQ2_LAP=0
+
+BASE=/hive/data/genomes/panTro2/bed/lastzCalJac3.2010-02-11
+TMPDIR=/scratch/tmp
+'_EOF_'
+ # << this line keeps emacs coloring happy
+
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -stop=partition -syntenicNet \
+ -chainMinScore=5000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ > do.log 2>&1 &
+ # real 311m31.090s
+ cat fb.panTro2.chainCalJac3Link.txt
+ # 2016331285 bases of 2909485072 (69.302%) in intersection
+
+ mkdir /hive/data/genomes/calJac3/bed/blastz.panTro2.swap
+ cd /hive/data/genomes/calJac3/bed/blastz.panTro2.swap
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ /hive/data/genomes/panTro2/bed/lastzCalJac3.2010-02-11/DEF \
+ -swap -noLoadChainSplit -syntenicNet \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ -chainMinScore=5000 -chainLinearGap=medium > swap.log 2>&1 &
+ # real 118m42.203s
+ cat fb.calJac3.chainHg19Link.txt
+ # 1990168262 bases of 2752505800 (72.304%) in intersection
+
+############################################################################
+