src/hg/makeDb/doc/tetNig1.txt 1.21
1.21 2010/01/16 00:39:10 hiram
liftOver to tetNig2 done
Index: src/hg/makeDb/doc/tetNig1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/tetNig1.txt,v
retrieving revision 1.20
retrieving revision 1.21
diff -b -B -U 1000000 -r1.20 -r1.21
--- src/hg/makeDb/doc/tetNig1.txt 20 Sep 2009 17:16:47 -0000 1.20
+++ src/hg/makeDb/doc/tetNig1.txt 16 Jan 2010 00:39:10 -0000 1.21
@@ -1,3164 +1,3176 @@
# for emacs: -*- mode: sh; -*-
# Tetraodon Nigirividis from Genoscope, version v6 (released 5/6/02)
# Project website:
# http://www.genoscope.cns.fr/externe/tetraodon/sequencing.html
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +------------+--------------------+
# | tableName | type |
# +------------+--------------------+
# | gaze | genePred |
# | geneid | genePred geneidPep |
# | genscan | genePred |
# | hoxGenes | genePred |
# | cytokines | genePred |
# | ecoresHg17 | genePred |
# +------------+--------------------+
# DOWNLOAD SEQUENCE
ssh kksilo
mkdir /cluster/store7/tetNig1
ln -s /cluster/store7/tetNig1 /cluster/data
cd /cluster/data/tetNig1
wget http://www.genoscope.cns.fr/externe/tetraodon/Data/Reads/tetraodon6.gz
# DOWNLOAD SEQUENCE BY CHROMOSOME, GFF AND AGP FILES (DONE, 2004-08-13, hartera)
# Genoscope provided the url, username and password - this is V7
# download agp, gff and accompanying FASTA files by chromosome
ssh kksilo
cd /cluster/data/tetNig1
wget --timestamp --http-user= --http-passwd= \
http://www.genoscope.cns.fr/nda/tetraodon/GFF/tetraodon.all.gff.*
wget --timestamp --http-user= --http-passwd= \
http://www.genoscope.cns.fr/nda/tetraodon/GP/chr.agp
wget --timestamp --http-user= --http-passwd= \
http://www.genoscope.cns.fr/nda/tetraodon/GP/chr.agp.info
wget --timestamp -r -l1 --http-user= --http-passwd= \
http://www.genoscope.cns.fr/nda/tetraodon/GP/unmasked/
mv ./www.genoscope.cns.fr/secure-nda/tetraodon/GP/unmasked/* .
rm -r www.genoscope.cns.fr
rm index*
# unzip FASTA files
bzip2 -d chr*.bz2
bzip2 -d tetraodon.all.gff.bz2
# no complete mitochondrial sequence in Genbank
# Create list of chromosomes (DONE, 2004-08-13, hartera)
ssh hgwdev
cd /cluster/data/tetNig1
foreach f (*.fa)
set chr = `echo $f:h | sed -e 's/^chr//' | sed -e 's/\.fa$//'`
echo $chr >> chrom
end
sort -n chrom > chrom.lst
rm chrom
# SPLIT AGP FILES BY CHROMOSOME (DONE, 2004-08-16, hartera)
ssh kksilo
cd /cluster/data/tetNig1
foreach c (`cat chrom.lst`)
mkdir $c
mv chr${c}.fa ./$c
perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
./chr.agp > $c/chr$c.agp
end
# agp has either gaps (N) or scaffolds (S), change S to W for WGS
# agp files are not standard formats so need to reformat for
# processing programs to run - missing field 4 which is just line number
# and need to reformat gap line to:
# chr chr_start chr_end line type length gap type linkage
foreach c (`cat chrom.lst`)
cp $c/chr${c}.agp $c/chr${c}.agp.bak
echo "Processing chr${c}.agp ..."
awk 'BEGIN {OFS="\t"} \
{ if ($4 == "S") print $1, $2, $3, NR, "W", $5, $6, $7, $8, $9; \
if ($4 == "N" && $5 == "GAP_IS") \
print $1, $2, $3, NR, $4, $9, "fragment", "yes"; \
if ($4 == "N" && $5 == "GAP_UC") \
print $1, $2, $3, NR, $4, $9, "contig", "yes"; \
if ($4 == "N" && $5 == "GAP_UN") \
print $1, $2, $3, NR, $4, $9, "contig", "no"; \
if ($4 == "N" && $5 == "CENT") \
print $1, $2, $3, NR, $4, $9, "centromere", "no" }' \
$c/chr${c}.agp >> $c/chr${c}.new.agp
mv $c/chr${c}.new.agp $c/chr{$c}.agp
end
# check agp files and then remove backup files
foreach c (`cat chrom.lst`)
rm $c/chr${c}.agp.bak
end
# CHECK FASTA AND AGP FILES (DONE, 2004-08-16, hartera)
ssh kksilo
cd /cluster/data/tetNig1
foreach c (`cat chrom.lst`)
foreach f ( $c/chr$c.agp)
set agpLen = `tail -1 $f | awk '{print $3;}'`
set h = $f:r
set g = $h:r
echo "Getting size of $g.fa"
set faLen = `faSize $g.fa | awk '{print $1;}'`
if ($agpLen == $faLen) then
echo " OK: $f length = $g length = $faLen"
else
echo "ERROR: $f length = $agpLen, but $g length = $faLen"
endif
end
end
# all are OK so FASTA files sizes are in agreement with agp files
# BREAK UP SEQUENCE INTO 5MB CHUNKS AT CONTIGS/GAPS FOR CLUSTER RUNS
# (DONE, 2004-08-16, hartera)
ssh kksilo
cd /cluster/data/tetNig1
foreach c (`cat chrom.lst`)
foreach agp ($c/chr$c.agp)
if (-e $agp) then
set fa = $c/chr$c.fa
echo splitting $agp and $fa
cp -p $agp $agp.bak
cp -p $fa $fa.bak
splitFaIntoContigs $agp $fa . -nSize=5000000
endif
end
end
# MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2004-08-16, hartera)
# This used to hold scripts -- better to keep them inline here
# Now it should just hold lift file(s) and
# temporary scripts made by copy-paste from this file.
ssh hgwdev
mkdir /cluster/data/tetNig1/jkStuff
# This is where most tracks will be built:
mkdir /cluster/data/tetNig1/bed
# CREATING DATABASE (DONE, 2004-08-16, hartera)
# Create the database.
# next machine
ssh hgwdev
echo 'create database tetNig1' | hgsql ''
# if you need to delete that database: !!! WILL DELETE EVERYTHING !!!
echo 'drop database tetNig1' | hgsql tetNig1
# Use df to make sure there is at least 5 gig free on
df -h /var/lib/mysql
# Before loading data:
# Filesystem Size Used Avail Use% Mounted on
# /dev/sdc1 1.8T 525G 1.2T 32% /var/lib/mysql
# CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2004-08-16, hartera)
# next machine
ssh hgwdev
# the following command copies all the data from the table
# grp in the database galGal2 to our new database ce2
echo "create table grp (PRIMARY KEY(NAME)) select * from galGal2.grp" \
| hgsql tetNig1
# if you need to delete that table: !!! WILL DELETE ALL grp data !!!
echo 'drop table grp;' | hgsql tetNig1
# REPEAT MASKING - Run RepeatMasker on chroms (DONE, 2004-08-17, hartera)
#- Split contigs into 500kb chunks, at gaps if possible:
ssh kksilo
cd /cluster/data/tetNig1
foreach c (`cat chrom.lst`)
foreach d ($c/chr${c}*_?{,?})
cd $d
echo "splitting $d"
set contig = $d:t
~/bin/i386/faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
-minGapSize=100
cd ../..
end
end
#- Make the run directory and job list:
cd /cluster/data/tetNig1
cat << '_EOF_' > jkStuff/RMTetraodon
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/tetNig1/$2
/bin/cp $2 /tmp/tetNig1/$2/
cd /tmp/tetNig1/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s $2
popd
/bin/cp /tmp/tetNig1/$2/$2.out ./
if (-e /tmp/tetNig1/$2/$2.align) /bin/cp /tmp/tetNig1/$2/$2.align ./
if (-e /tmp/tetNig1/$2/$2.tbl) /bin/cp /tmp/tetNig1/$2/$2.tbl ./
if (-e /tmp/tetNig1/$2/$2.cat) /bin/cp /tmp/tetNig1/$2/$2.cat ./
/bin/rm -fr /tmp/tetNig1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/tetNig1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/tetNig1
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMTetraodon
mkdir RMRun
cp /dev/null RMRun/RMJobs
foreach c (`cat chrom.lst`)
foreach d ($c/chr${c}_?{,?})
set ctg = $d:t
foreach f ( $d/${ctg}_?{,?}.fa )
set f = $f:t
echo /cluster/data/tetNig1/jkStuff/RMTetraodon \
/cluster/data/tetNig1/$d $f \
'{'check out line+ /cluster/data/tetNig1/$d/$f.out'}' \
>> RMRun/RMJobs
end
end
end
#- Do the run
ssh kk
cd /cluster/data/tetNig1/RMRun
para create RMJobs
para try, para check, para check, para push, para check,...
# para time
# Completed: 940 of 940 jobs
# CPU time in finished jobs: 6654614s 110910.24m 1848.50h 77.02d 0.211 y
# IO & Wait Time: 54843s 914.05m 15.23h 0.63d 0.002 y
# Average job time: 7138s 118.96m 1.98h 0.08d
# Longest job: 11024s 183.73m 3.06h 0.13d
# Submission to last job: 34509s 575.15m 9.59h 0.40d
#- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
ssh kksilo
cd /cluster/data/tetNig1
foreach d (*/chr*_?{,?})
set contig = $d:t
echo $contig
liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
> /dev/null
end
#- Lift pseudo-contigs to chromosome level
foreach c (`cat chrom.lst`)
echo lifting $c
cd $c
if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
> /dev/null
endif
cd ..
end
#- Load the .out files into the database with:
ssh hgwdev
cd /cluster/data/tetNig1
hgLoadOut tetNig1 */chr*.fa.out
# MAKE LIFTALL.LFT (DONE, 2004-08-16, hartera)
ssh kksilo
cd /cluster/data/tetNig1
cat */lift/ordered.lft > jkStuff/liftAll.lft
# SIMPLE REPEATS TRACK (DONE, 2004-08-17, hartera)
# TRF runs pretty quickly now... it takes a few hours total runtime,
# so instead of binrsyncing and para-running, just do this on the
# local fileserver
ssh kksilo
mkdir -p /cluster/data/tetNig1/bed/simpleRepeat
cd /cluster/data/tetNig1/bed/simpleRepeat
mkdir trf
cp /dev/null jobs.csh
foreach d (/cluster/data/tetNig1/*/chr*_?{,?})
set ctg = $d:t
foreach f ($d/${ctg}.fa)
set fout = $f:t:r.bed
echo $fout
echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
>> jobs.csh
end
end
chmod a+x jobs.csh
csh -ef jobs.csh >&! jobs.log &
# check on this with
tail -f jobs.log
wc -l jobs.csh
ls -1 trf | wc -l
endsInLf trf/*
liftUp simpleRepeat.bed /cluster/data/tetNig1/jkStuff/liftAll.lft warn \
trf/*.bed
# Load into the database:
ssh hgwdev
cd /cluster/data/tetNig1/bed/simpleRepeat
hgLoadBed tetNig1 simpleRepeat \
/cluster/data/tetNig1/bed/simpleRepeat/simpleRepeat.bed \
-sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
# PROCESS SIMPLE REPEATS INTO MASK (DONE, 2004-08-17, hartera)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh kksilo
cd /cluster/data/tetNig1/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/chr*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# Lift up filtered trf output to chrom coords as well:
cd /cluster/data/tetNig1
mkdir bed/simpleRepeat/trfMaskChrom
foreach c (`cat chrom.lst`)
if (-e $c/lift/ordered.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/ordered.lst > $c/lift/oTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
endif
if (-e $c/lift/random.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/random.lst > $c/lift/rTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
endif
end
# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF
# (DONE, 2004-08-17, hartera)
ssh kksilo
cd /cluster/data/tetNig1
# Soft-mask (lower-case) the contig and chr .fa's,
# then make hard-masked versions from the soft-masked.
set trfCtg=bed/simpleRepeat/trfMask
set trfChr=bed/simpleRepeat/trfMaskChrom
foreach f (*/chr*.fa)
echo "repeat- and trf-masking $f"
maskOutFa -soft $f $f.out $f
set chr = $f:t:r
maskOutFa -softAdd $f $trfChr/$chr.bed $f
echo "hard-masking $f"
maskOutFa $f hard $f.masked
end
foreach c (`cat chrom.lst`)
echo "repeat- and trf-masking contigs of chr$c"
foreach d ($c/chr*_?{,?})
set ctg=$d:t
set f=$d/$ctg.fa
maskOutFa -soft $f $f.out $f
maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
maskOutFa $f hard $f.masked
end
end
# Build nib files, using the soft masking in the fa
mkdir nib
foreach f (*/chr*.fa)
faToNib -softMask $f nib/$f:t:r.nib
end
# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE, 2004-08-17, hartera)
# Make symbolic links from /gbdb/tetNig1/nib to the real nibs.
ssh hgwdev
cd /cluster/data/tetNig1
mkdir -p /gbdb/tetNig1/nib
foreach f (/cluster/data/tetNig1/nib/chr*.nib)
ln -s $f /gbdb/tetNig1/nib
end
# Load /gbdb/tetNig1/nib paths into database and save size info
# hgNibSeq creates chromInfo table
hgNibSeq -preMadeNib tetNig1 /gbdb/tetNig1/nib */chr*.fa
echo "select chrom,size from chromInfo" | hgsql -N tetNig1 > chrom.sizes
# take a look at chrom.sizes, should be 27 lines
wc chrom.sizes
# Make one big 2bit file as well, and make a link to it in
# /gbdb/tetNig1/nib because hgBlat looks there:
faToTwoBit */chr*.fa tetNig1.2bit
ln -s /cluster/data/tetNig1/tetNig1.2bit /gbdb/tetNig1/nib/
# MAKE GOLD AND GAP TRACKS (DONE, 2004-08-17, hartera)
ssh hgwdev
cd /cluster/data/tetNig1
# the gold and gap tracks are created from the chrN.agp file
hgGoldGapGl -noGl -chromLst=chrom.lst tetNig1 /cluster/data/tetNig1 .
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR TETNIG1
# (DONE, 2004-06-18, hartera)
# UPDATED SOURCE AND ASSEMBLY DATE IN dbDb (DONE, 2004-09-09, hartera)
# Make trackDb table so browser knows what tracks to expect:
ssh hgwdev
mkdir -p ~/kent/src/hg/makeDb/trackDb/tetraodon/tetNig1
cd $HOME/kent/src/hg/makeDb/trackDb
cvs up -d -P
# Edit that makefile to add tetNig1 in all the right places and do
make update
make alpha
cvs commit -m "Added tetNig1." makefile
# Add dbDb and defaultDb entries:
echo 'insert into dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
values("tetNig1", "May 2002", \
"/gbdb/tetNig1/nib", "Tetraodon", "chr1:1000-4000", 1, \
39, "Tetraodon", "Tetraodon nigroviridis", \
"/gbdb/tetNig1/html/description.html", 0, 0, \
"Genoscope and Whitehead Institute, V6");' \
| hgsql -h genome-testdb hgcentraltest
echo 'insert into defaultDb (genome, name) \
values ("Tetraodon", "tetNig1");' \
| hgsql -h genome-testdb hgcentraltest
# add correct assembly date and source (2004-09-08, hartera)
echo 'update dbDb set description = "Feb 2004" where name = "tetNig1";' \
| hgsql -h genome-testdb hgcentraltest
echo 'update dbDb set sourceName = "Genoscope and Broad Institute, V7" \
where name = "tetNig1";' | hgsql -h genome-testdb hgcentraltest
# correction to assembly date format (2004-09-09)
echo 'update dbDb set description = "Feb. 2004" where name = "tetNig1";' \
| hgsql -h genome-testdb hgcentraltest
# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2004-08-23, hartera)
ssh hgwdev
mkdir /cluster/data/tetNig1/html
cd /cluster/data/tetNig1/html
# make a symbolic link from /gbdb/tetNig1/html to /cluster/data/tetNig1/html
mkdir /gbdb/tetNig1
ln -s /cluster/data/tetNig1/html /gbdb/tetNig1/html
# Add a description page for zebrafish
cd /cluster/data/tetNig1/html
cp /cluster/data/fr1/html/*.html .
# Edit this for zebrafish
# create a description.html page here
mkdir -p ~/kent/src/hg/makeDb/trackDb/tetraodon/tetNig1
cd ~/kent/src/hg/makeDb/trackDb/
cvs add tetraodon
cd tetraodon
cvs add tetNig1
# Add description page here too
cp /cluster/data/tetNig1/html/description.html \
$HOME/kent/src/hg/makeDb/trackDb/tetraodon/tetNig1/
chmod a+r \
$HOME/kent/src/hg/makeDb/trackDb/tetraodon/tetNig1/description.html
cd $HOME/kent/src/hg/makeDb/trackDb/tetraodon/tetNig1/
# Check it in and copy (ideally using "make alpha" in trackDb) to
# /gbdb/tetNig1/html
cvs add description.html
cvs commit description.html
# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE, 2004-08-18, hartera)
ssh kkr1u00
# Chrom-level mixed nibs that have been repeat- and trf-masked:
rm -rf /iscratch/i/tetNig1/nib
mkdir -p /iscratch/i/tetNig1/nib
cp -p /cluster/data/tetNig1/nib/chr*.nib /iscratch/i/tetNig1/nib
# Pseudo-contig fa that have been repeat- and trf-masked:
rm -rf /iscratch/i/tetNig1/trfFa
mkdir /iscratch/i/tetNig1/trfFa
foreach d (/cluster/data/tetNig1/*/chr*_?{,?})
cp $d/$d:t.fa /iscratch/i/tetNig1/trfFa
end
rm -rf /iscratch/i/tetNig1/rmsk
mkdir -p /iscratch/i/tetNig1/rmsk
cp -p /cluster/data/tetNig1/*/chr*.fa.out /iscratch/i/tetNig1/rmsk
cp -p /cluster/data/tetNig1/tetNig1.2bit /iscratch/i/tetNig1/
iSync
# add to /cluster/bluearc too
ssh kksilo
# masked contigs
rm -fr /cluster/bluearc/scratch/tetra/tetNig1/trfFa
mkdir -p /cluster/bluearc/scratch/tetra/tetNig1/trfFa
foreach d (/cluster/data/tetNig1/*/chr*_?{,?})
cp $d/$d:t.fa /cluster/bluearc/scratch/tetra/tetNig1/trfFa
end
# masked chrom nibs
rm -fr /cluster/bluearc/scratch/tetra/tetNig1/nib
mkdir -p /cluster/bluearc/scratch/tetra/tetNig1/nib
cp -p /cluster/data/tetNig1/nib/chr*.nib /cluster/bluearc/scratch/tetra/tetNig1/nib
# fasta files - CHECK FOR RANDOMS
cd /cluster/data/tetNig1
rm -fr /cluster/bluearc/scratch/tetra/tetNig1/fasta
mkdir -p /cluster/bluearc/scratch/tetra/tetNig1/fasta
cp -p */*.fa /cluster/bluearc/scratch/tetra/tetNig1/fasta
# RepeatMasker *.out files - CHECK FOR RANDOMS
rm -rf /cluster/bluearc/scratch/tetra/tetNig1/rmsk
mkdir -p /cluster/bluearc/scratch/tetra/tetNig1/rmsk
cp -p ./*/chr*.fa.out /cluster/bluearc/scratch/tetra/tetNig1/rmsk
# CREATE gc5Base wiggle TRACK (DONE, 2004-08-19, hartera)
ssh kki
mkdir /cluster/data/tetNig1/bed/gc5Base
cd /cluster/data/tetNig1/bed/gc5Base
# in the script below, the 'grep -w GC' selects the lines of
# output from hgGcPercent that are real data and not just some
# information from hgGcPercent. The awk computes the number
# of bases that hgGcPercent claimed it measured, which is not
# necessarily always 5 if it ran into gaps, and then the division
# by 10.0 scales down the numbers from hgGcPercent to the range
# [0-100]. Two columns come out of the awk print statement:
# <position> and <value> which are fed into wigAsciiToBinary through
# the pipe. It is set at a dataSpan of 5 because each value
# represents the measurement over five bases beginning with
# <position>. The result files end up in ./wigData5.
# A new script is used (from makeHg17.doc) which gets around the
# problem that wigAsciiToBinary was calculating chromEnd to be
# beyond the real chromosome end
mkdir wigData5 dataLimits5
cat << '_EOF_' > kkRun.sh
#!/bin/sh
NIB=$1
chr=${NIB/.nib/}
chrom=${chr#chr}
hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 tetNig1 \
/iscratch/i/tetNig1/nib | \
grep -w GC | \
awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
wigAsciiToBinary -dataSpan=5 -chrom=${chr} \
-wibFile=wigData5/gc5Base_${chrom} \
-name=${chrom} stdin 2> dataLimits5/${chr}
'_EOF_'
# << this line makes emacs coloring happy
chmod +x kkRun.sh
ls /iscratch/i/tetNig1/nib > nibList
cat << '_EOF_' > gsub
#LOOP
./kkRun.sh $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 nibList single gsub jobList
para create jobList
para try, check, push ... etc
# para time
# Completed: 27 of 27 jobs
# CPU time in finished jobs: 587s 9.79m 0.16h 0.01d 0.000 y
# IO & Wait Time: 16s 0.26m 0.00h 0.00d 0.000 y
# Average job time: 22s 0.37m 0.01h 0.00d
# Longest job: 187s 3.12m 0.05h 0.00d
# Submission to last job: 623s 10.38m 0.17h 0.01d
# load the .wig files back on hgwdev:
ssh hgwdev
cd /cluster/data/tetNig1/bed/gc5Base
# up to here
hgLoadWiggle -pathPrefix=/gbdb/tetNig1/wib/gc5Base \
tetNig1 gc5Base wigData5/*.wig
# and symlink the .wib files into /gbdb
mkdir -p /gbdb/tetNig1/wib/gc5Base
ln -s `pwd`/wigData5/*.wib /gbdb/tetNig1/wib/gc5Base
# And then the zoomed data view
ssh kki
cd /cluster/data/tetNig1/bed/gc5Base
mkdir wigData5_1K dataLimits5_1K
cat << '_EOF_' > kkRunZoom.sh
#!/bin/sh
NIB=$1
chr=${NIB/.nib/}
chrom=${chr#chr}
hgGcPercent -chr=${chr} -doGaps -file=stdout -win=5 tetNig1 \
/iscratch/i/tetNig1/nib | \
grep -w GC | \
awk '{if (($3-$2) >= 5) {printf "%d\t%.1f\n", $2+1, $5/10.0} }' | \
wigZoom -dataSpan=1000 stdin | wigAsciiToBinary -dataSpan=1000 \
-chrom=${chr} -wibFile=wigData5_1K/gc5Base_${chrom}_1K \
-name=${chrom} stdin 2> dataLimits5_1K/${chr}
'_EOF_'
# << this line makes emacs coloring happy
chmod +x kkRunZoom.sh
cat << '_EOF_' > gsubZoom
#LOOP
./kkRunZoom.sh $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 nibList single gsubZoom jobListZoom
para create jobListZoom
para try, check, push, ... etc.
# para time
# Completed: 27 of 27 jobs
# CPU time in finished jobs: 571s 9.51m 0.16h 0.01d 0.000 y
# IO & Wait Time: 40s 0.67m 0.01h 0.00d 0.000 y
# Average job time: 23s 0.38m 0.01h 0.00d
# Longest job: 223s 3.72m 0.06h 0.00d
# Submission to last job: 986s 16.43m 0.27h 0.01d
# Then load these .wig files into the same database as above
ssh hgwdev
cd /cluster/data/tetNig1/bed/gc5Base
hgLoadWiggle -pathPrefix=/gbdb/tetNig1/wib/gc5Base \
-oldTable tetNig1 gc5Base wigData5_1K/*.wig
# and symlink these .wib files into /gbdb
mkdir -p /gbdb/tetNig1/wib/gc5Base
ln -s `pwd`/wigData5_1K/*.wib /gbdb/tetNig1/wib/gc5Base
# CREATE 10.OOC, 11.OOC FILES FOR BLAT (DONE, 2004-08-20, hartera)
# Use -repMatch=130 (based on size - for human 1024 was used, Tetraodon
# is about 380 Mb in length and it is 12.7% the size of the human genome)
ssh kkr1u00
mkdir /cluster/data/tetNig1/bed/ooc
cd /cluster/data/tetNig1/bed/ooc
mkdir -p /cluster/bluearc/tetNig1
ls -1 /cluster/data/tetNig1/nib/chr*.nib > nib.lst
blat nib.lst /dev/null /dev/null -tileSize=11 \
-makeOoc=/cluster/bluearc/tetNig1/11.ooc -repMatch=130
# Wrote 8776 overused 11-mers to /cluster/bluearc/tetNig1/11.ooc
# For 10.ooc, repMatch = 4096 for human, so use 520
blat nib.lst /dev/null /dev/null -tileSize=10 \
-makeOoc=/cluster/bluearc/tetNig1/10.ooc -repMatch=520
# Wrote 1976 overused 11-mers to /cluster/bluearc/tetNig1/11.ooc
# keep copies of ooc files in this directory and copy to iscratch
cp /cluster/bluearc/tetNig1/*.ooc .
cp -p /cluster/bluearc/tetNig1/*.ooc /iscratch/i/tetNig1/
iSync
# AUTO UPDATE GENBANK MRNA AND EST RUN (DONE, 2004-08-24, hartera and markd)
ssh eieio
cd /cluster/data/genbank
# This is a new organism, edit the etc/genbank.conf file and add:
# tetNig1 (Tetraodon)
tetNig1.genome = /iscratch/i/tetNig1/nib/chr*.nib
tetNig1.lift = /cluster/data/tetNig1/jkStuff/liftAll.lft
tetNig1.downloadDir = tetNig1
# Default includes native genbank mRNAs and ESTs,
# genbank xeno mRNAs but no xenoESTs, native RefSeq mRNAs but
# not xeno RefSeq
cvs commit -m "Added Tetraodon." etc/genbank.conf
# Edit src/align/gbBlat to add /iscratch/i/tetNig1/11.ooc
cd ~/kent/src/hg/makeDb/genbank
# Add line: TETNIG_OOC=/iscratch/i/tetNig1/11.ooc
# Add line: tetNig*) oocOpt="-ooc=${TETNIG_OOC}" ;;
cvs diff src/align/gbBlat
make
cvs update src/align/gbBlat
cvs commit -m "Added 11.ooc for tetNig1." src/align/gbBlat
# Edit src/lib/gbGenome.c to add new genome
# ADD: static char *tetNigNames[] = {"Tetraodon nigroviridis", NULL};
# ADD: {"tetNig", tetNigNames, NULL},
# to static struct dbToSpecies dbToSpeciesMap[]
cvs diff src/lib/gbGenome.c
make
cvs update src/lib/gbGenome.c
cvs commit -m "Added Tetraodon, tetNig." src/lib/gbGenome.c
# Install to /cluster/data/genbank:
make install-server
ssh eieio
cd /cluster/data/genbank
# This is an -initial run, RefSeq mRNA only:
nice bin/gbAlignStep -srcDb=refseq -type=mrna -verbose=1 -initial tetNig1
# Load results for RefSeq mRNAs:
ssh hgwdev
cd /cluster/data/genbank
# There are no RefSeq mRNAs for Tetraodon
# -initial for GenBank mRNAs
nice ./bin/gbAlignStep -initial -srcDb=genbank -type=mrna tetNig1
# Load results for GenBank mRNAs
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 tetNig1
cd /cluster/data/genbank/work
mv initial.tetNig1 initial.tetNig1.genbank.mrna
# -initial for ESTs
ssh eieio
cd /cluster/data/genbank
nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial tetNig1
# Load results for GenBank ESTs
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 tetNig1
cd /cluster/data/genbank/work
mv initial.tetNig1 initial.tetNig1.genbank.est
# Everything loaded ok, so clean up
rm -r /cluster/data/genbank/work/initial.tetNig1*
# BLASTZ SWAP FOR hg17 vs tetNig1 BLASTZ TO CREATE tetNig1 vs hg17 BLASTZ
# (DONE, 2004-08-26, hartera)
ssh kolossus
mkdir -p /cluster/data/tetNig1/bed/blastz.hg17.swap
cd /cluster/data/tetNig1/bed/blastz.hg17.swap
# use axtChrom from blastzTetNig1 on hg17
set aliDir = /cluster/data/hg17/bed/blastz.tetNig1
cp $aliDir/S1.len S2.len
cp $aliDir/S2.len S1.len
mkdir unsorted axtChrom
cat $aliDir/axtChrom/chr*.axt \
| axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
| axtSplitByTarget stdin unsorted
# Sort the shuffled .axt files.
foreach f (unsorted/*.axt)
echo sorting $f:t:r
axtSort $f axtChrom/$f:t
end
du -sh $aliDir/axtChrom unsorted axtChrom
# 709M /cluster/data/hg17/bed/blastz.tetNig1/axtChrom
# 710M unsorted
# 709M axtChrom
rm -r unsorted
# translate sorted axt files into psl
ssh kolossus
cd /cluster/data/tetNig1/bed/blastz.hg17.swap
mkdir -p pslChrom
set tbl = "blastzHg17"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastz.hg17.swap/pslChrom
foreach f (./*.psl)
/cluster/bin/i386/hgLoadPsl tetNig1 $f
echo "$f Done"
end
# CHAIN HUMAN (hg17) BLASTZ (DONE, 2004-08-27, hartera)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/tetNig1/bed/blastz.hg17.swap
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Make our own linear gap file with reduced gap penalties,
# in hopes of getting longer chains - works well for species at
# chicken-human distance or greater
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize^V 11
smallSize^V 111
position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V
72111^V 152111^V 252111
qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V
31600^V 56600
tGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V
31600^V 56600
bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V
16000^V 32000^V 57000
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -linearGap=../../chickenHumanTuned.gap $1 \
/iscratch/i/tetNig1/nib \
/iscratch/i/gs.18/build35/bothMaskedNibs $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# para time
# Completed: 27 of 27 jobs
# CPU time in finished jobs: 2207s 36.78m 0.61h 0.03d 0.000 y
# IO & Wait Time: 124s 2.07m 0.03h 0.00d 0.000 y
# Average job time: 86s 1.44m 0.02h 0.00d
# Longest job: 160s 2.67m 0.04h 0.00d
# Submission to last job: 334s 5.57m 0.09h 0.00d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r >> hist5000.out
textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
echo ""
end
# apart from chrUn_random, not too many chains with score < 5000
# load chr 1 into table
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain/chain
hgLoadChain tetNig1 chr1_chainHg17 chr1.chain
# no RefSeqs for Tetraodon so use mrna table
# featureBits -chrom=chr1 tetNig1 all_mrna chainHg17Link -enrichment
# all_mrna 2.637%, chainHg17Link 9.262%, both 0.772%, cover 29.28%, enrich 3.16x
# try filtering with minScore < 5000
ssh kksilo
cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chain
chainSplit chainFilt5k all.chain
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain/chainFilt5k
hgLoadChain tetNig1 chr1_chainHg17Filt5k chr1.chain
# featureBits -chrom=chr1 tetNig1 all_mrna chainHg17Filt5kLink -enrichment
# all_mrna 2.637%, chainHg17Filt5kLink 8.746%, both 0.746%, cover 28.29%,
# enrich 3.23x
# add all chains for minScore=5000 filtered chains
# remove test chain tables for chr1
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain
hgsql -e "drop table chr1_chainHg17;" tetNig1
hgsql -e "drop table chr1_chainHg17Link;" tetNig1
hgsql -e "drop table chr1_chainHg17Filt5k;" tetNig1
hgsql -e "drop table chr1_chainHg17Filt5kLink;" tetNig1
mv chainFilt5k chain
cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain tetNig1 ${c}_chainHg17 $i
echo done $c
end
# NET HUMAN (hg17) BLASTZ (DONE, 2004-08-27, hartera)
ssh kksilo
cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
# memory usage 55447552, utime 259 s/100, stime 33
# Add classification info using db tables:
cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain
# netClass looks for ancient repeats in one of the databases
# hg17 has this table - hand-curated by Arian but this is for
# human-rodent comparisons so do not use here, use -noAr option
mkdir -p /cluster/bluearc/hg17/linSpecRep.notInTetraodon
mkdir -p /cluster/bluearc/tetNig1/linSpecRep.notInHuman
cp -p /iscratch/i/hg17/linSpecRep.notInTetraodon/* \
/cluster/bluearc/hg17/linSpecRep.notInTetraodon
cp -p /iscratch/i/tetNig1/linSpecRep.notInHuman/* \
/cluster/bluearc/tetNig1/linSpecRep.notInHuman
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain
time netClass noClass.net tetNig1 hg17 hg17.net \
-tNewR=/cluster/bluearc/tetNig1/linSpecRep.notInHuman \
-qNewR=/cluster/bluearc/hg17/linSpecRep.notInTetraodon -noAr
# 47.430u 31.160s 2:15.46 58.0% 0+0k 0+0io 197pf+0w
netFilter -minGap=10 hg17.net | hgLoadNet tetNig1 netHg17 stdin
# featureBits tetNig1 all_mrna netHg17 -enrichment
# all_mrna 3.204%, netHg17 42.716%, both 1.698%, cover 53.00%, enrich 1.24x
# create symlinks to make consistent with newer builds 2006-04-09 markd
ln -s all.chain.gz /cluster/data/tetNig1/bed/blastz.hg17/axtChain/tetNig1.hg17.all.chain.gz
ln -s hg17.net.gz /cluster/data/tetNig1/bed/blastz.hg17/axtChain/tetNig1.hg17.net.gz
# BLASTZ FOR FR1 (FUGU) (DONE, 2004-08-31, hartera)
ssh kkr1u00
# blastz requires lineage-specific repeats but there are none
# available between these two fish species to make empty files
mkdir -p /iscratch/i/tetNig1/linSpecRep.notInFugu
mkdir -p /iscratch/i/fugu/linSpecRep.notInZebrafish
cd /cluster/data/tetNig1
# for Tetraodon
foreach c (`cat chrom.lst`)
cp /dev/null /iscratch/i/tetNig1/linSpecRep.notInFugu/chr${c}.out.spec
end
# for Fugu
cp /dev/null /iscratch/i/fugu/linSpecRep.notInZebrafish/chrUn.out.spec
iSync
ssh kk
mkdir -p /cluster/data/tetNig1/bed/blastz.fr1.2004-08-31
ln -s /cluster/data/tetNig1/bed/blastz.fr1.2004-08-31 \
/cluster/data/tetNig1/bed/blastz.fr1
cd /cluster/data/tetNig1/bed/blastz.fr1
cat << '_EOF_' > DEF
# Tetraodon (tetNig1) vs. Fugu (fr1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_L=8000
BLASTZ_H=2200
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0
# TARGET - Tetraodon (tetNig1)
SEQ1_DIR=/iscratch/i/tetNig1/nib
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species so these files are empty
SEQ1_SMSK=/iscratch/i/tetNig1/linSpecRep.notInFugu
SEQ1_FLAG=
SEQ1_IN_CONTIGS=0
# 10 MB chunk for target
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY - Fugu (fr1)
# soft-masked chrom nib
SEQ2_DIR=/cluster/bluearc/fugu/fr1/chromNib
SEQ2_RMSK=
SEQ2_SMSK=/iscratch/i/fugu/linSpecRep.notInZebrafish
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
# 10 Mbase for query
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/tetNig1/bed/blastz.fr1
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
# save the DEF file in the current standard place
chmod +x DEF
cp DEF ~angie/hummus/DEF.tetNig1-fr1.2004-08-31
# setup cluster run
# source the DEF file
bash
. ./DEF
/cluster/data/tetNig1/jkStuff/BlastZ_run0.sh
cd run.0
# check batch looks ok then
para try, check, push, check, ....
# para time
# Completed: 1995 of 1995 jobs
# CPU time in finished jobs: 502856s 8380.93m 139.68h 5.82d 0.016 y
# IO & Wait Time: 1190285s 19838.08m 330.63h 13.78d 0.038 y
# Average job time: 849s 14.14m 0.24h 0.01d
# Longest job: 19130s 318.83m 5.31h 0.22d
# Submission to last job: 19415s 323.58m 5.39h 0.22d
# second cluster run: lift raw alignments -> lav dir
cd /cluster/data/tetNig1/bed/blastz.fr1
bash # if a csh/tcsh user
. ./DEF
/cluster/data/tetNig1/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, check etc.
# para time
# Completed: 57 of 57 jobs
# IO & Wait Time: 511s 8.52m 0.14h 0.01d 0.000 y
# Average job time: 29s 0.48m 0.01h 0.00d
# Longest job: 58s 0.97m 0.02h 0.00d
# Submission to last job: 853s 14.22m 0.24h 0.01d
# third run: lav -> axt
ssh kki
cd /cluster/data/tetNig1/bed/blastz.fr1
mkdir axtChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin /iscratch/i/tetNig1/nib \
/cluster/bluearc/fugu/fr1/chromNib stdout \
| axtSort stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/tetNig1/bed/blastz.fr1/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
\ls -1Sd ../lav/chr* > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
head jobList
para create jobList
para try, check, push, check,...
# para time
# Completed: 27 of 27 jobs
# CPU time in finished jobs: 73s 1.22m 0.02h 0.00d 0.000 y
# IO & Wait Time: 495s 8.25m 0.14h 0.01d 0.000 y
# Average job time: 21s 0.35m 0.01h 0.00d
# Longest job: 77s 1.28m 0.02h 0.00d
# Submission to last job: 318s 5.30m 0.09h 0.00d
# translate sorted axt files into psl
ssh kolossus
cd /cluster/data/tetNig1/bed/blastz.fr1
mkdir -p pslChrom
set tbl = "blastzFr1"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastz.fr1/pslChrom
foreach f (./*.psl)
/cluster/bin/i386/hgLoadPsl -noTNameIx tetNig1 $f
echo "$f Done"
end
# default parameters and H=2000
# featureBits -chrom=chr1 tetNig1 all_mrna blastzFr1 -enrichment
# all_mrna 2.637%, blastzFr1 78.364%, both 2.361%, cover 89.53%, enrich 1.14x
# repeat with H=2000 and L=6000 - blastzL6k
# featureBits -chrom=chr1 tetNig1 all_mrna blastzFr1L6k -enrichment
# all_mrna 2.637%, blastzFr1L6k 77.955%, both 2.348%, cover 89.04%, enrich 1.14x# very little difference
# featureBits -chrom=chr1 danRer1 all_mrna blastzFr1 -enrichment
# all_mrna 0.981%, blastzFr1 5.542%, both 0.592%, cover 60.32%, enrich 10.88x
# blastzFr1L6k
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=6000
# BLASTZ_K=2200
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# featureBits -chrom=chr1 tetNig1 all_mrna blastzFr1L6k2 -enrichment
# all_mrna 2.637%, blastzFr1L6k2 79.917%, both 2.403%, cover 91.12%,
# enrich 1.14x
# H=2200, default parameters
# featureBits -chrom=chr1 tetNig1 all_mrna blastzFr1H2200 -enrichment
# all_mrna 2.637%, blastzFr1H2200 78.225%, both 2.359%, cover 89.46%,
# enrich 1.14x
# rows in chr1_blastzFr1* tables:
# blastzFr1 47079
# blastzFr1L6k 26408
# blastzFr1L6k2 52340
# blastzFr1H2200 44240
# blastzFr1H2200L8k 20502
# blastzFr1H2200L10k 17355
# try increasing L to 8000
# H=2200, L=8000
# featureBits -chrom=chr1 tetNig1 all_mrna blastzFr1H2200L8k -enrichment
# all_mrna 2.637%, blastzFr1H2200L8k 77.714%, both 2.345%, cover 88.94%,
# enrich 1.14x
# H=2200 L=10000
# featureBits -chrom=chr1 tetNig1 all_mrna blastzFr1H2200L10k -enrichment
# all_mrna 2.637%, blastzFr1H2200L10k 77.568%, both 2.340%, cover 88.72%,
# enrich 1.14x
# 6 jobs took a very long time and did not finish
# looking at some alignments in these tracks with the mRNA track on there is not
# much difference between most of them but blastzFr1L6k2 often has more
# alignments and a different strucutre. Use blastzFr1H2200L8k as this has
# fewer alingments with very little loss of coverage and more lower scoring
# alignments can be removed at the chaining step.
# CHAIN FR1 BLASTZ (DONE, 2004-09-01, hartera)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/tetNig1/bed/blastz.fr1
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/tetNig1/bed/blastz.fr1/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
/iscratch/i/tetNig1/nib \
/cluster/bluearc/fugu/fr1/chromNib $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# Completed: 27 of 27 jobs
# CPU time in finished jobs: 414s 6.90m 0.11h 0.00d 0.000 y
# IO & Wait Time: 252s 4.20m 0.07h 0.00d 0.000 y
# Average job time: 25s 0.41m 0.01h 0.00d
# Longest job: 118s 1.97m 0.03h 0.00d
# Submission to last job: 972s 16.20m 0.27h 0.01d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/tetNig1/bed/blastz.fr1/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# take a look at score distr's,try also with larger bin size.
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r >> hist5000.out
textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
echo ""
end
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastz.fr1/axtChain/chain
hgLoadChain tetNig1 chr1_chainFr1 chr1.chain
# no RefSeqs for Tetraodon so use mrna table
# featureBits -chrom=chr1 tetNig1 all_mrna chainFr1Link -enrichment
# all_mrna 2.637%, chainFr1Link 76.099%, both 2.320%, cover 87.97%, enrich 1.16x
# try filtering with minScore < 5000
ssh kksilo
cd /cluster/data/tetNig1/bed/blastz.fr1/axtChain
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chain
chainSplit chainFilt5k all.chain
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastz.fr1/axtChain/chainFilt5k
hgLoadChain tetNig1 chr1_chainFr1Filt5k chr1.chain
# featureBits -chrom=chr1 tetNig1 all_mrna chainFr1Filt5kLink -enrichment
# all_mrna 2.637%, chainFr1Filt5kLink 75.999%, both 2.319%, cover 87.95%,
# enrich 1.16x
# very little difference in the coverage so use the filtered version
ssh hgwdev
rm -r chain
mv chainFilt5k chain
cd /cluster/data/tetNig1/bed/blastz.fr1/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain tetNig1 ${c}_chainFr1 $i
echo done $c
end
# NET FUGU (FR1) BLASTZ (DONE, 2004-09-01, hartera)
# RELOADED NET WITH CORRECT TABLE NAME (2004-09-02, hartera)
ssh kksilo
cd /cluster/data/tetNig1/bed/blastz.fr1/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
# memory usage 300011520, utime 1360 s/100, stime 247
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastz.fr1/axtChain
# netClass looks for ancient repeats in one of the databases
# hg17 has this table - hand-curated by Arian but this is for
# human-rodent comparisons so do not use here, use -noAr option
time netClass noClass.net tetNig1 fr1 fuguFr1.net -noAr
netFilter -minGap=10 fuguFr1.net | hgLoadNet tetNig1 netFr1 stdin
# MAKE Human Proteins track
ssh kksilo
mkdir -p /cluster/data/tetNig1/blastDb
cd /cluster/data/tetNig1
cat */*/*.lft > jkStuff/subLift.lft
for i in */*/*_*_*.fa; do ln -s `pwd`/$i blastDb; done
# remove *_random_#.fa (not listed)
cd blastDb
for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
rm *.fa
ssh kkr1u00
mkdir -p /iscratch/i/tetNig1/blastDb
cd /cluster/data/tetNig1/blastDb
cp * /iscratch/i/tetNig1/blastDb
(~kent/bin/iSync) 2>&1 > sync.out
mkdir -p /cluster/data/tetNig1/bed/tblastn.hg17KG
cd /cluster/data/tetNig1/bed/tblastn.hg17KG
ls -1S /iscratch/i/tetNig1/blastDb/*.nsq | sed "s/\.nsq//" > tetra.lst
exit
# back to kksilo
cd /cluster/data/tetNig1/bed/tblastn.hg17KG
mkdir kgfa
# calculate a reasonable number of jobs
calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(150000/`wc tetra.lst | awk "{print \\\$1}"`\)
# 40262/(150000/979) = 262.776653
split -l 263 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl kgfa/kg
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.1 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/tetNig1/jkStuff/subLift.lft warn $f.2
liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/tetNig1/jkStuff/liftAll.lft warn $f.3
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
chmod +x blastSome
gensub2 tetra.lst kg.lst blastGsub blastSpec
# exit from tcsh
exit
ssh kk
cd /cluster/data/tetNig1/bed/tblastn.hg17KG
para create blastSpec
para try, push
# Completed: 144760 of 144760 jobs
# CPU time in finished jobs: 22218348s 370305.81m 6171.76h 257.16d 0.705 y
# IO & Wait Time: 3507367s 58456.11m 974.27h 40.59d 0.111 y
# Average job time: 178s 2.96m 0.05h 0.00d
# Longest job: 2810s 46.83m 0.78h 0.03d
# Submission to last job: 41709s 695.15m 11.59h 0.48d
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
ssh kki
cd /cluster/data/tetNig1/bed/tblastn.hg17KG
para create chainSpec
para push
# Completed: 261 of 261 jobs
# CPU time in finished jobs: 828s 13.81m 0.23h 0.01d 0.000 y
# IO & Wait Time: 19521s 325.34m 5.42h 0.23d 0.001 y
# Average job time: 78s 1.30m 0.02h 0.00d
# Longest job: 270s 4.50m 0.07h 0.00d
# Submission to last job: 3878s 64.63m 1.08h 0.04d
exit
# back to kksilo
cd /cluster/data/tetNig1/bed/tblastn.hg17KG/blastOut
for i in kg??
do
awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
cat u.*.psl m60* | liftUp -nosort -type=".psl" -nohead stdout /cluster/data/tetNig1/jkStuff/liftAll.lft warn stdin | \
sort -T /tmp -k 14,14 -k 17,17n -k 17,17n | uniq > ../blastHg17KG.psl
cd ..
ssh hgwdev
cd /cluster/data/tetNig1/bed/tblastn.hg17KG
hgLoadPsl tetNig1 blastHg17KG.psl
exit
# back to kksilo
rm -rf blastOut
# End tblastn
# MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2004-09-08, hartera)
ssh kksilo
cd /cluster/data/tetNig1
#- Build the .zip files
cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf zip
mkdir zip
# chrom AGP's
zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp
# chrom RepeatMasker out files
zip -j zip/chromOut.zip [0-9A-Z]*/chr*.fa.out
# soft masked chrom fasta
zip -j zip/chromFa.zip [0-9A-Z]*/chr*.fa
# hard masked chrom fasta
zip -j zip/chromFaMasked.zip [0-9A-Z]*/chr*.fa.masked
# chrom TRF output files
cd bed/simpleRepeat
zip ../../zip/chromTrf.zip trfMask/chr*.bed
cd ../..
# get GenBank native mRNAs
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=tetNig1 -native GenBank mrna \
/cluster/data/tetNig1/zip/mrna.fa
# get GenBank xeno mRNAs
./bin/i386/gbGetSeqs -db=tetNig1 -xeno GenBank mrna \
/cluster/data/tetNig1/zip/xenoMrna.fa
# get native GenBank ESTs
./bin/i386/gbGetSeqs -db=tetNig1 -native GenBank est \
/cluster/data/tetNig1/zip/est.fa
cd /cluster/data/tetNig1/zip
# zip GenBank native and xeno mRNAs and native ESTs
zip -j mrna.zip mrna.fa
zip -j xenoMrna.zip xenoMrna.fa
zip -j est.zip est.fa
'_EOF_'
# << this line makes emacs coloring happy
nice csh ./jkStuff/zipAll.csh |& tee ./jkStuff/zipAll.log
cd jkStuff
#- Look at zipAll.log to make sure all file lists look reasonable.
# Copy the .zip files to
# hgwdev:/usr/local/apache/...
cd /cluster/data/tetNig1/zip
#- Check zip file integrity:
foreach f (*.zip)
unzip -t $f > $f.test
tail -1 $f.test
end
wc -l *.zip.test
# 29 chromAgp.zip.test
# 29 chromFa.zip.test
# 29 chromFaMasked.zip.test
# 29 chromOut.zip.test
# 87 chromTrf.zip.test
# 3 est.zip.test
# 3 mrna.zip.test
# 3 xenoMrna.zip.test
# 212 total
ssh hgwdev
/cluster/data/tetNig1/zip
set gp = /usr/local/apache/htdocs/goldenPath/tetNig1
mkdir -p $gp/bigZips
cp -p *.zip $gp/bigZips
mkdir -p $gp/chromosomes
foreach f (../*/chr*.fa)
zip -j $gp/chromosomes/$f:t.zip $f
end
cd $gp/bigZips
md5sum *.zip > md5sum.txt
cd $gp/chromosomes
md5sum *.zip > md5sum.txt
# Take a look at bigZips/* and chromosomes/*, update their README.txt's
# MAKE VSHG17 DOWNLOADABLES (DONE, 2004-09-08, hartera)
ssh kksilo
# zip chains and nets
cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChain
cp all.chain hg17.chain
zip -j /cluster/data/tetNig1/zip/hg17.chain.zip hg17.chain
rm hg17.chain
zip -j /cluster/data/tetNig1/zip/hg17.net.zip hg17.net
ssh hgwdev
# copy chains and nets to downloads area
set gp = /usr/local/apache/htdocs/goldenPath/tetNig1
mkdir -p $gp/vsHg17
cd $gp/vsHg17
mv /cluster/data/tetNig1/zip/hg17*.zip .
md5sum *.zip > md5sum.txt
# move axt files to downloads area and zip
cd /cluster/data/tetNig1/bed/blastz.hg17.swap/axtChrom
mkdir -p $gp/vsHg17/axtChrom
cp -p *.axt $gp/vsHg17/axtChrom
cd $gp/vsHg17/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# MAKE VSFR1 DOWNLOADABLES (DONE, 2004-09-08, hartera)
ssh kksilo
# zip chains and nets
cd /cluster/data/tetNig1/bed/blastz.fr1/axtChain
cp all.chain fr1.chain
zip -j /cluster/data/tetNig1/zip/fr1.chain.zip fr1.chain
rm fr1.chain
cp fuguFr1.net fr1.net
zip -j /cluster/data/tetNig1/zip/fr1.net.zip fr1.net
rm fr1.net
ssh hgwdev
# copy chains and nets to downloads area
set gp = /usr/local/apache/htdocs/goldenPath/tetNig1
mkdir -p $gp/vsFr1
cd $gp/vsFr1
mv /cluster/data/tetNig1/zip/fr1*.zip .
md5sum *.zip > md5sum.txt
# move axt files to downloads area and zip
cd /cluster/data/tetNig1/bed/blastz.fr1/axtChrom
mkdir -p $gp/vsFr1/axtChrom
cp -p *.axt $gp/vsFr1/axtChrom
cd $gp/vsFr1/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# BLASTZ SWAP FOR mm5 vs tetNig1 BLASTZ TO CREATE tetNig1 vs MOUSE mm5 BLASTZ
# (DONE, 2004-09-08, hartera)
ssh kksilo
mkdir -p /cluster/store8/tetNig1/blastz.mm5.swap.2004-09-08
ln -s /cluster/store8/tetNig1/blastz.mm5.swap.2004-09-08 \
/cluster/data/tetNig1/bed
ln -s /cluster/data/tetNig1/bed/blastz.mm5.swap.2004-09-08 \
/cluster/data/tetNig1/bed/blastz.mm5.swap
ssh kolossus
cd /cluster/data/tetNig1/bed/blastz.mm5.swap
# use axtChrom from blastzTetNig1 on mm5
set aliDir = /cluster/data/mm5/bed/blastz.tetNig1
cp $aliDir/S1.len S2.len
cp $aliDir/S2.len S1.len
mkdir unsorted axtChrom
cat $aliDir/axtChrom/chr*.axt \
| axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
| axtSplitByTarget stdin unsorted
# Sort the shuffled .axt files.
foreach f (unsorted/*.axt)
echo sorting $f:t:r
axtSort $f axtChrom/$f:t
end
du -sh $aliDir/axtChrom unsorted axtChrom
# 522M /cluster/data/mm5/bed/blastz.tetNig1/axtChrom
# 523M unsorted
# 522M axtChrom
rm -r unsorted
# translate sorted axt files into psl
ssh kolossus
cd /cluster/data/tetNig1/bed/blastz.mm5.swap
mkdir -p pslChrom
set tbl = "blastzMm5"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastz.mm5.swap/pslChrom
foreach f (./*.psl)
/cluster/bin/i386/hgLoadPsl tetNig1 $f
echo "$f Done"
end
# featureBits -chrom=chr1 tetNig1 all_mrna blastzMm5 -enrichment
# all_mrna 2.637%, blastzMm5 12.633%, both 0.852%, cover 32.30%, enrich 2.56x
# CHAIN MOUSE (mm5) BLASTZ (DONE, 2004-09-09, hartera)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/tetNig1/bed/blastz.mm5.swap
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Make our own linear gap file with reduced gap penalties,
# in hopes of getting longer chains - works well for species at
# chicken-human distance or greater
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize^V 11
smallSize^V 111
position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V
72111^V 152111^V 252111
qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V
31600^V 56600
tGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V
31600^V 56600
bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V
16000^V 32000^V 57000
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -linearGap=../../chickenHumanTuned.gap $1 \
/iscratch/i/tetNig1/nib \
/iscratch/i/mus/mm5/softNib $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# para time
# Completed: 27 of 27 jobs
# CPU time in finished jobs: 2124s 35.40m 0.59h 0.02d 0.000 y
# IO & Wait Time: 59s 0.99m 0.02h 0.00d 0.000 y
# Average job time: 81s 1.35m 0.02h 0.00d
# Longest job: 123s 2.05m 0.03h 0.00d
# Submission to last job: 3725s 62.08m 1.03h 0.04d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r >> hist5000.out
textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
echo ""
end
# apart from chrUn_random, not too many chains with score < 5000
# load chr 1 into table
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain/chain
hgLoadChain tetNig1 chr1_chainMm5 chr1.chain
# no RefSeqs for Tetraodon so use mrna table
# featureBits -chrom=chr1 tetNig1 all_mrna chainMm5Link -enrichment
# all_mrna 2.637%, chainMm5Link 11.552%, both 0.823%, cover 31.19%, enrich 2.70x
# try filtering with minScore < 5000
ssh kksilo
cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chain
chainSplit chainFilt5k all.chain
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain/chainFilt5k
hgLoadChain tetNig1 chr1_chainMm5Filt5k chr1.chain
# featureBits -chrom=chr1 tetNig1 all_mrna chainMm5Filt5kLink -enrichment
# all_mrna 2.637%, chainMm5Filt5kLink 10.492%, both 0.780%, cover 29.58%,
# enrich 2.82x
# use chains with minScore=5000 filtering
# remove test chain tables for chr1
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain
hgsql -e "drop table chr1_chainMm5;" tetNig1
hgsql -e "drop table chr1_chainMm5Link;" tetNig1
hgsql -e "drop table chr1_chainMm5Filt5k;" tetNig1
hgsql -e "drop table chr1_chainMm5Filt5kLink;" tetNig1
rm -r chain
mv chainFilt5k chain
cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain tetNig1 ${c}_chainMm5 $i
echo done $c
end
# NET MOUSE (mm5) BLASTZ (DONE, 2004-09-09, hartera)
ssh kksilo
cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain
mkdir preNet
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
# memory usage 67969024, utime 336 s/100, stime 36
# Add classification info using db tables:
cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain
# netClass looks for ancient repeats in one of the databases
# hg17 has this table - hand-curated by Arian but this is for
# human-rodent comparisons so do not use here, use -noAr option
mkdir -p /cluster/bluearc/mm5/linSpecRep.notInTetraodon
mkdir -p /cluster/bluearc/tetNig1/linSpecRep.notInMouse
cp -p /iscratch/i/mm5/linSpecRep.notInTetraodon/* \
/cluster/bluearc/mm5/linSpecRep.notInTetraodon
cp -p /iscratch/i/tetNig1/linSpecRep.notInMouse/* \
/cluster/bluearc/tetNig1/linSpecRep.notInMouse
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain
time netClass noClass.net tetNig1 mm5 mouseMm5.net \
-tNewR=/cluster/bluearc/tetNig1/linSpecRep.notInMouse \
-qNewR=/cluster/bluearc/mm5/linSpecRep.notInTetraodon -noAr
# 50.960u 34.880s 2:25.82 58.8% 0+0k 0+0io 200pf+0w
netFilter -minGap=10 mouseMm5.net | hgLoadNet tetNig1 netMm5 stdin
# featureBits tetNig1 all_mrna netHg17 -enrichment
# all_mrna 3.204%, netHg17 42.716%, both 1.698%, cover 53.00%, enrich 1.24x
# MAKE VSMM5 DOWNLOADABLES (DONE, 2004-09-10, hartera)
ssh kksilo
# zip chains and nets
cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChain
cp all.chain mm5.chain
zip -j /cluster/data/tetNig1/zip/mm5.chain.zip mm5.chain
rm mm5.chain
cp mouseMm5.net mm5.net
zip -j /cluster/data/tetNig1/zip/mm5.net.zip mm5.net
rm mm5.net
ssh hgwdev
# copy chains and nets to downloads area
set gp = /usr/local/apache/htdocs/goldenPath/tetNig1
mkdir -p $gp/vsMm5
cd $gp/vsMm5
mv /cluster/data/tetNig1/zip/mm5*.zip .
md5sum *.zip > md5sum.txt
# move axt files to downloads area and zip
cd /cluster/data/tetNig1/bed/blastz.mm5.swap/axtChrom
mkdir -p $gp/vsMm5/axtChrom
cp -p *.axt $gp/vsMm5/axtChrom
cd $gp/vsMm5/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# CLEANUP HUMAN (hg17) BLASTZ (DONE, 2004-09-10, hartera)
ssh kksilo
cd /cluster/data/tetNig1/bed/blastz.hg17.swap
nice rm axtChain/run1/chain/* &
nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} &
# CLEANUP FUGU (fr1) BLASTZ (DONE, 2004-09-10, hartera)
ssh kksilo
cd /cluster/data/tetNig1/bed/blastz.fr1
nice rm -rf raw &
nice rm -rf lav &
nice rm -rf test* &
nice rm axtChain/run1/chain/* &
nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} &
# CLEANUP MOUSE (mm5) BLASTZ (DONE, 2004-09-10, hartera)
ssh kksilo
cd /cluster/data/tetNig1/bed/blastz.mm5
nice rm -rf raw &
nice rm -rf lav &
nice rm -rf test* &
nice rm axtChain/run1/chain/* &
nice gzip {axt,psl}Chrom/* axtChain/{all.chain,*.net} &
# GENSCAN GENE PREDICTION ANNOTATIONS (DONE, 2004-09-10, hartera)
# Provided by Genoscope, Genscan was run on masked Tetraodon genomic DNA
# using parameters calibrated for Tetraodon genes.
ssh kksilo
mkdir -p /cluster/data/tetNig1/bed/Genoscope/genscan
cd /cluster/data/tetNig1/bed/Genoscope/genscan
awk '{if ($2 == "GSC") print;}' /cluster/data/tetNig1/tetraodon.all.gff > genscan.gff
foreach c (`cat /cluster/data/tetNig1/chrom.lst`)
echo "Processing chr$c ... "
awk '{if ($1 == "'chr${c}'") print;}' genscan.gff >> chr${c}.gff
end
# need to remove CDS from name then load
foreach f (./chr*.gff)
perl -pi.bak -e 's/CDS G/G/' $f
end
# load database with gene predictions
ssh hgwdev
cd /cluster/data/tetNig1/bed/Genoscope/genscan
# Reloaded without -genePredExt 1/6/05:
nice ldHgGene tetNig1 genscan chr*.gff
# Read 56120 transcripts in 227294 lines in 27 files
# 56120 groups 27 seqs 1 sources 2 feature types
# 28060 gene predictions
# GENOSCOPE GAZE ANNOTATIONS (DONE, 2004-09-10, hartera)
# GAZE annotations have been automatically generated by the GAZE software
# (Howe et al.(2002). Genome Res., 12: 1418-27) using a custom designed gene
# model. GAZE integrates annotation information from Geneid, Genescan,
# Exofish (Human, Mouse, Fugu), Genewise (Human, Mouse) and cDNAs (Tetraodon).
ssh kksilo
mkdir -p /cluster/data/tetNig1/bed/Genoscope/gaze
cd /cluster/data/tetNig1/bed/Genoscope/gaze
awk '{if ($2 == "GSTEN") print;}' /cluster/data/tetNig1/tetraodon.all.gff > gaze.gff
# remove ; as separators between names of gene, protein, mRNA etc.
perl -pi.bak -e "s/\s\;\s/\t/g;" gaze.gff
foreach c (`cat /cluster/data/tetNig1/chrom.lst`)
echo "Processing chr$c ... "
awk '{if ($1 == "'chr${c}'") print;}' gaze.gff >> chr${c}.gff
end
# need to remove CDS from name then load
foreach f (./chr*.gff)
perl -pi.bak -e 's/CDS G/G/' $f
end
# load database with gene predictions
ssh hgwdev
cd /cluster/data/tetNig1/bed/Genoscope/gaze
# loaded chr1 only into rahseq first to check then load all chroms
# Reloaded without -genePredExt 1/6/05:
nice ldHgGene tetNig1 gaze chr*.gff
# GALT 09-15-04 index on name(10) going to name(17) as all have long prefix
echo "drop index name on gaze ;" | hgsql tetNig1
echo "create index name on gaze (name(17)) ;" | hgsql tetNig1
# Read 91905 transcripts in 265439 lines in 27 files
# 91905 groups 27 seqs 1 sources 4 feature types
# 27918 gene predictions
# HOX GENES TRACK (DONE, 2004-09-10, hartera)
# Provided by Genoscope: Tetraodon HOX genes were annotated by human experts
# using sequence similarities with human, mouse and zebrafish HOX protein and
# nucleic sequence.
ssh kksilo
mkdir -p /cluster/data/tetNig1/bed/Genoscope/hoxGenes
cd /cluster/data/tetNig1/bed/Genoscope/hoxGenes
awk '{if ($2 == "HOX") print;}' /cluster/data/tetNig1/tetraodon.all.gff > hoxGenes.gff
# remove ; as separators between names of gene, protein, mRNA etc.
perl -pi.bak -e "s/\s\;\s/\t/g;" hoxGenes.gff
foreach c (`cat /cluster/data/tetNig1/chrom.lst`)
echo "Processing chr$c ... "
awk '{if ($1 == "'chr${c}'") print;}' hoxGenes.gff >> chr${c}.gff
end
# need to remove CDS from name then load
foreach f (./chr*.gff)
perl -pi.bak -e 's/CDS H/H/' $f
end
# load database with gene predictions
ssh hgwdev
cd /cluster/data/tetNig1/bed/Genoscope/hoxGenes
# loaded chr1 only into rahseq first to check then load all chroms
# Reloaded without -genePredExt 1/6/05:
nice ldHgGene tetNig1 hoxGenes chr*.gff
# Read 96 transcripts in 142 lines in 27 files
# 96 groups 6 seqs 1 sources 2 feature types
# 48 gene predictions
# CYTOKINE GENES TRACK (DONE, 2004-09-10, hartera)
# Provided by Genoscope: Tetraodon Cytokine genes were annotated by human
# experts using sequence similarities with human, mouse and zebrafish
# Cytokine protein and nucleic sequence.
ssh kksilo
mkdir -p /cluster/data/tetNig1/bed/Genoscope/cytokines
cd /cluster/data/tetNig1/bed/Genoscope/cytokines
awk '{if ($2 == "CYT") print;}' /cluster/data/tetNig1/tetraodon.all.gff > cytokines.gff
# remove ; as separators between names of gene, protein, mRNA etc.
perl -pi.bak -e "s/\s\;\s/\t/g;" cytokines.gff
foreach c (`cat /cluster/data/tetNig1/chrom.lst`)
echo "Processing chr$c ... "
awk '{if ($1 == "'chr${c}'") print;}' cytokines.gff >> chr${c}.gff
end
# need to remove CDS from name then load
foreach f (./chr*.gff)
perl -pi.bak -e 's/CDS A/A/' $f
end
# load database with gene predictions
ssh hgwdev
cd /cluster/data/tetNig1/bed/Genoscope/cytokines
# loaded chr1 only into rahseq first to check then load all chroms
# Reloaded without -genePredExt 1/6/05:
nice ldHgGene tetNig1 cytokines chr*.gff
# GENEID GENES (DONE 9/22/04 angie)
ssh kksilo
mkdir /cluster/data/tetNig1/bed/geneid
cd /cluster/data/tetNig1/bed/geneid
foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
wget http://genome.imim.es/genepredictions/T.nigroviridis/tetNigFeb2004/geneid_v1.2/$chr.gtf
wget http://genome.imim.es/genepredictions/T.nigroviridis/tetNigFeb2004/geneid_v1.2/$chr.prot
end
# Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
cp /dev/null geneidPep.fa
foreach f (chr*.prot)
perl -wpe 's/^(>chr\S+)/$1.1/' $f >> geneidPep.fa
end
ssh hgwdev
cd /cluster/data/tetNig1/bed/geneid
ldHgGene -gtf -genePredExt tetNig1 geneid chr*.gtf
hgPepPred tetNig1 generic geneidPep geneidPep.fa
# GENOSCOPE GENEWISE HUMAN IPI (DONE, 2004-10-14, hartera)
ssh kksilo
mkdir -p /cluster/data/tetNig1/bed/Genoscope/humanIPI
cd /cluster/data/tetNig1/bed/Genoscope/humanIPI
# Create README to describe data from Genoscope tetraodon.all.gff.readme
awk '{if ($2 == "GWS_H") print;}' /cluster/data/tetNig1/tetraodon.all.gff \
> humanIPI.gff
# remove "exon" from name field
sed -e 's/exon GWS/GWS/g' humanIPI.gff > humanIPIformat.gff
# load database with gene predictions
ssh hgwdev
cd /cluster/data/tetNig1/bed/Genoscope/humanIPI
nice ldHgGene -genePredExt tetNig1 humanIPI humanIPIformat.gff
# Read 43384 transcripts in 185494 lines in 1 files
# 43384 groups 27 seqs 1 sources 2 feature types
# 21692 gene predictions
# add to trackDb and write html page
# GENOSCOPE HUMAN IPI ECOTIGS (DONE, 2004-10-18, hartera)
ssh kksilo
mkdir -p /cluster/data/tetNig1/bed/Genoscope/humanIPIEcotigs
cd /cluster/data/tetNig1/bed/Genoscope/humanIPIEcotigs
# Create README to describe data from Genoscope tetraodon.all.gff.readme
awk '{if ($2 == "EP3_H") print;}' /cluster/data/tetNig1/tetraodon.all.gff \
> humanIPIEcotigs.gff
# remove "exon" from name field
sed -e 's/exon EP3/EP3/g' humanIPIEcotigs.gff > humanIPIEcotigsformat.gff
# load database with gene predictions - LOAD IN RAHSEQ FIRST TO CHECK
ssh hgwdev
cd /cluster/data/tetNig1/bed/Genoscope/humanIPIEcotigs
nice ldHgGene -genePredExt tetNig1 humanIPIEcotigs humanIPIEcotigsformat.gff
# Read 55800 transcripts in 204355 lines in 1 files
# 55800 groups 27 seqs 1 sources 2 feature types
# 27900 gene predictions
# GENOSCOPE FUGU (Takifugu) ECOTIGS (in progress, 2004-10-18, hartera)
ssh kksilo
mkdir -p /cluster/data/tetNig1/bed/Genoscope/fuguEcotigs
cd /cluster/data/tetNig1/bed/Genoscope/fuguEcotigs
# Create README to describe data from Genoscope tetraodon.all.gff.readme
awk '{if ($2 == "EG3_F") print;}' /cluster/data/tetNig1/tetraodon.all.gff \
> fuguEcotigs.gff
# remove "HSP" from name field, change HSP to "exon" and match to "transcript"
sed -e 's/HSP EG3/EG3/g' fuguEcotigs.gff | sed -e 's/HSP/exon/' \
| sed -e 's/match/transcript/' > fuguEcotigsformat.gff
# load database with gene predictions
# load in when know which version of Fugu this is so can name appropriately
ssh hgwdev
cd /cluster/data/tetNig1/bed/Genoscope/fuguEcotigs
nice ldHgGene -genePredExt tetNig1 fuguEcotigs fuguEcotigsformat.gff
# Read 35552 transcripts in 210128 lines in 1 files
# 35552 groups 27 seqs 1 sources 2 feature types
# 17776 gene predictions
# Adding a photograph to the description
# Permission to use photos obtained from Hugues Roest Crollius:
# Email message attached below
ssh hgwdev
cd /cluster/data/tetNig1/html
wget \
'http://www.genoscope.cns.fr/externe/Francais/Projets/Projet_C/img/tetra23.jpg' \
-O photo_tetraodon.jpg
convert -crop 1320x600+280+40 -sharpen 0 -normalize \
-geometry 300x200 -quality 80 photo_tetraodon.jpg \
Tetraodon_nigroviridis.jpg
cp -p Tetraodon_nigroviridis.jpg /usr/local/apache/htdocs/images
# add links to this image in the description.html page, request push
# After checking with a couple of journals who have used our pictures for
# their covers, it appears that none holds copyrights so you are free to use
# any of the pictures on the followings pages:
# http://www.genoscope.cns.fr/externe/English/Projets/Projet_C/album.html
# For some of the images on the first page we can provide you with high
# resolution versions if needed.
# I hope this helps.
# Best wishes,
# Hugues Roest Crollius
# -------------------------------------------------------------------------
# Hugues Roest Crollius
# Laboratoire Dynamique et Organisation des Genomes (Dyogen)
# CNRS UMR8541
# Departement de Biologie
# Ecole Normale Superieure Tel: +33 (0)1 44 32 23 70
# 46 rue d'Ulm, 75230 Paris Cedex 05 Email: hrc@ens.fr
# SELF BLASTZ (DONE, 2005-01-12, hartera)
ssh kksilo
rm /cluster/data/tetNig1/bed/blastzSelf
mkdir -p /cluster/store8/tetNig1/bed/blastzSelf.2005-01-11
ln -s /cluster/store8/tetNig1/bed/blastzSelf.2005-01-11 \
/cluster/data/tetNig1/bed/blastzSelf
cd /cluster/data/tetNig1/bed/blastzSelf
# try using contigs
mkdir -p /cluster/data/tetNig1/bed/blastzSelf/contigSeqs
cd /cluster/data/tetNig1/bed/blastzSelf/contigSeqs
foreach c (`cat /cluster/data/tetNig1/chrom.lst`)
foreach d (/cluster/data/tetNig1/$c/chr${c}*_?{,?})
echo "splitting $d"
set contig = $d:t
echo "$contig"
~/bin/i386/faSplit gap $d/$contig.fa 500000 ${contig}_ -lift=$contig.lft \
-minGapSize=100
end
end
# 1025 files for chr*.fa and chr*.lft - 940 for chr*.fa, 85 for chr*.lft
cat chr*.fa >> tetNig1Contigs.fa
cat chr*.lft >> 500kbcontigs.lft
rm chr*.fa chr*.lft
ssh kkr1u00
mkdir -p /iscratch/i/tetNig1/contigs
mv /cluster/data/tetNig1/bed/blastzSelf/contigSeqs/tetNig1Contigs.fa \
/iscratch/i/tetNig1/contigs
iSync
ssh kk
cd /cluster/data/tetNig1/bed/blastzSelf
# for this blastz run, M=50
cat << '_EOF_' > DEF
# tetraodon vs. tetraodon
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run1
BLASTZ=blastz
BLASTZ_L=5000
BLASTZ_H=2500
BLASTZ_ABRIDGE_REPEATS=0
# TARGET
# Tetraodon
SEQ1_DIR=/iscratch/i/tetNig1/nib
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=500000
SEQ1_LAP=500
# QUERY
# Tetraodon
SEQ2_DIR=/iscratch/i/tetNig1/contigs
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=
SEQ2_LAP=
BASE=/cluster/data/tetNig1/bed/blastzSelf
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line makes emacs coloring happy
# Save the DEF file in the current standard place - TO BE DONE
chmod +x DEF
cp DEF ~angie/hummus/DEF.tetNig1-tetNig1.2005-01-11
# prepare first cluster run
bash # if a csh/tcsh user
. ./DEF
/cluster/data/tetNig1/jkStuff/BlastZ_run0.sh
cd run.0
# check batch looks ok then
para try, check, push, check, ....
# para time
# Completed: 820 of 820 jobs
# CPU time in finished jobs: 1611164s 26852.74m 447.55h 18.65d 0.051 y
# IO & Wait Time: 11448s 190.80m 3.18h 0.13d 0.000 y
# Average job time: 1979s 32.98m 0.55h 0.02d
# Longest job: 4231s 70.52m 1.18h 0.05d
# Submission to last job: 14753s 245.88m 4.10h 0.17d
# second cluster run: lift raw alignments -> lav dir
cd /cluster/data/tetNig1/bed/blastzSelf
bash # if a csh/tcsh user
. ./DEF
/cluster/data/tetNig1/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, check etc.
# para time
# Completed: 820 of 820 jobs
# CPU time in finished jobs: 2741s 45.68m 0.76h 0.03d 0.000 y
# IO & Wait Time: 13346s 222.44m 3.71h 0.15d 0.000 y
# Average job time: 20s 0.33m 0.01h 0.00d
# Longest job: 106s 1.77m 0.03h 0.00d
# Submission to last job: 926s 15.43m 0.26h 0.01d
# third run: lav -> axt
ssh kki
cd /cluster/data/tetNig1/bed/blastzSelf
mkdir axtChrom run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
cat `ls -1 *.lav | sort -g` \
| lavToAxt -fa stdin /iscratch/i/tetNig1/nib \
/iscratch/i/tetNig1/contigs/tetNig1Contigs.fa stdout \
| axtSort stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/tetNig1/bed/blastzSelf/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
\ls -1Sd ../lav/chr* > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
head jobList
para create jobList
para try, check, push, check,...
# para time
# Completed: 27 of 27 jobs
# CPU time in finished jobs: 337s 5.61m 0.09h 0.00d 0.000 y
# IO & Wait Time: 324s 5.41m 0.09h 0.00d 0.000 y
# Average job time: 24s 0.41m 0.01h 0.00d
# Longest job: 273s 4.55m 0.08h 0.00d
# Submission to last job: 1675s 27.92m 0.47h 0.02d
# do liftup here
#- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
ssh kksilo
cd /cluster/data/tetNig1/bed/blastzSelf
foreach d (axtChrom/chr*.axt)
echo $d
liftUp -axtQ ${d}.out.axt ./contigSeqs/500kbcontigs.lft warn $d \
> /dev/null
end
#- Lift pseudo-contigs to chromosome level
# check this is working correctly
set tetra = "/cluster/data/tetNig1"
foreach c (`cat ${tetra}/chrom.lst`)
echo lifting $c
liftUp -axtQ ./axtChrom/chr${c}.out2.axt \
$tetra/jkStuff/liftAll.lft warn \
./axtChrom/chr${c}.axt.out.axt > /dev/null
end
cd axtChrom
# need to drop Self alignments after lifting up co-ordinates
foreach c (`cat ${tetra}/chrom.lst`)
echo "Processing $c..."
axtDropSelf chr${c}.out2.axt chr${c}.dropped.axt
mv chr${c}.axt chr${c}.axt.old
mv chr${c}.dropped.axt chr${c}.axt
rm chr${c}.axt.out.axt
end
# translate sorted axt files into psl
ssh kolossus
cd /cluster/data/tetNig1/bed/blastzSelf
mkdir -p pslChrom
set tbl = "blastzSelf"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S1.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastzSelf/pslChrom
foreach f (./*.psl)
/cluster/bin/i386/hgLoadPsl -noTNameIx tetNig1 $f
echo "$f Done"
end
# not in contigs: blastzSelf
# featureBits -chrom=chr1 tetNig1 all_mrna blastzSelf -enrichment
# all_mrna 2.601%, blastzSelf 16.728%, both 0.654%, cover 25.13%, enrich 1.50x
# featureBits -chrom=chr1 tetNig1 blastzSelf
# 2704737 bases of 16169091 (16.728%) in intersection
# blastzSelfC, query is in contigs, M=50
# featureBits -chrom=chr1 tetNig1 all_mrna blastzSelfC -enrichment
# all_mrna 2.601%, blastzSelfC 16.208%, both 0.638%, cover 24.51%, enrich 1.51x
# featureBits -chrom=chr1 tetNig1 blastzSelfC
# 2620707 bases of 16169091 (16.208%) in intersection
# blastzSelfCM100, query is in contigs, M=100
# featureBits -chrom=chr1 tetNig1 all_mrna blastzSelfCM100 -enrichment
# all_mrna 2.601%, blastzSelfCM100 16.319%, both 0.639%, cover 24.56%,
# enrich 1.51x
# featureBits -chrom=chr1 tetNig1 blastzSelfCM100
# 2638606 bases of 16169091 (16.319%) in intersection
# gain little by using M=100 except for a lot of extra alignments so use
# M=50 instead.
# L=5000
# featureBits -chrom=chr1 tetNig1 blastzSelfL5k
# 2488953 bases of 16169091 (15.393%) in intersection
# L=8000, L8k
# featureBits -chrom=chr1 tetNig1 blastzSelfL8k
# 2303191 bases of 16169091 (14.244%) in intersection
# rows in table
# blastzSelf: 517242
# blastzSelfC: 86073
# blastzSelfCM100: 132429
# blastzSelfCL5k: 62933
# blastzSelfL8k: 43137
# use blastz with contigs, M=50 and L=5000. Using L=5000 removes some of the
# short low scoring alignments but not too stringently. After chaining,
# further removal of low scoring alignments can be done.
# CHAIN SELF BLASTZ (DONE, 2005-01-14, hartera)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/tetNig1/bed/blastzSelf
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
rm axtChrom/chr*.out2.axt
rm axtChrom/chr*.old
ls -1S /cluster/data/tetNig1/bed/blastzSelf/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# axtChain now using human/chicken gap scoring matrix as default
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 \
/iscratch/i/tetNig1/nib \
/iscratch/i/tetNig1/nib $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# Completed: 27 of 27 jobs
# CPU time in finished jobs: 864s 14.40m 0.24h 0.01d 0.000 y
# IO & Wait Time: 137s 2.28m 0.04h 0.00d 0.000 y
# Average job time: 37s 0.62m 0.01h 0.00d
# Longest job: 559s 9.32m 0.16h 0.01d
# Submission to last job: 1758s 29.30m 0.49h 0.02d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/tetNig1/bed/blastzSelf/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chainRaw all.chain
# apply chainAntiRepeat to remove chains that are the result of repeats
# and degenerate DNA
mkdir chain
cd chainRaw
foreach f (*.chain)
set c = $f:r
echo $c
nice chainAntiRepeat /cluster/bluearc/tetNig1/nib \
/cluster/bluearc/tetNig1/nib $f \
../chain/$c.chain
end
cd ..
# take a look at score distr's,try also with larger bin size.
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r >> hist10k.out
textHistogram -binSize=10000 /tmp/score.$f:t:r >> hist10k.out
echo ""
end
# trim to minScore=10000 to cut some of the fluff
mkdir chainFilt
foreach f (chain/*.chain)
chainFilter -minScore=10000 $f > chainFilt/$f:t
end
# load tables
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastzSelf/axtChain/chainFilt
foreach i (*.chain)
set c = $i:r
hgLoadChain tetNig1 ${c}_chainSelf $i
echo done $c
end
# filtering on minScore=5000
# featureBits -chrom=chr1 tetNig1 chainSelf5kLink
# 2377755 bases of 16169091 (14.706%) in intersection
# rows = 362482
# filtering on minScore=10000, use this
# featureBits -chrom=chr1 tetNig1 chainSelfLink
# 2199465 bases of 16169091 (13.603%) in intersection
# rows = 233729
# NET SELF BLASTZ (DONE, 2005-01-14, hartera)
ssh kksilo
cd /cluster/data/tetNig1/bed/blastzSelf/axtChain
mkdir preNet
cd chainFilt
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i ../../S1.len ../../S1.len \
../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S1.len \
../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
# memory usage 78983168, utime 485 s/100, stime 55
# Add classification info using db tables:
# netClass looks for ancient repeats in one of the databases
# hg17 has this table - hand-curated by Arian but this is for
# human-rodent comparisons so do not use here, use -noAr option
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastzSelf/axtChain
time netClass noClass.net tetNig1 tetNig1 self.net -noAr
# 14.530u 4.550s 0:31.26 61.0% 0+0k 0+0io 218pf+0w
netFilter -minGap=10 self.net | hgLoadNet tetNig1 netSelf stdin
# featureBits tetNig1 netSelf
# 184618536 bases of 342403326 (53.918%) in intersection
# MAKE Human Proteins track (hg17 DONE 2005-1-15 braney)
ssh kksilo
mkdir -p /cluster/data/tetNig1/blastDb
cd /cluster/data/tetNig1/blastDb
for i in `cat ../chrom.lst`; do ls -1 ../$i/*/*.fa ; done | sed "/Un_random/d" > list
for i in `cat list`
do
ln -s $i .
formatdb -i `basename $i` -p F
done
rm *.log *.fa list
cd ..
for i in `cat chrom.lst`; do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft
ssh kkr1u00
mkdir -p /iscratch/i/tetNig1/blastDb
cd /cluster/data/tetNig1/blastDb
for i in nhr nin nsq; do cp *.$i /iscratch/i/tetNig1/blastDb ; echo $i; done
cd
iSync > sync.out
mkdir -p /cluster/data/tetNig1/bed/tblastn.hg17KG
cd /cluster/data/tetNig1/bed/tblastn.hg17KG
echo /iscratch/i/tetNig1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
# back to kksilo
exit
cd /cluster/data/tetNig1/bed/tblastn.hg17KG
calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(300000/`wc query.lst | awk "{print \\\$1}"`\)
# 42156/(300000/593) = 83.328360
mkdir -p /cluster/bluearc/tetNig1/bed/tblastn.hg17KG/kgfa
split -l 83 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/tetNig1/bed/tblastn.hg17KG/kgfa/kg
ln -s /cluster/bluearc/tetNig1/bed/tblastn.hg17KG/kgfa kgfa
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/tetNig1/bed/tblastn.hg17KG/blastOut
ln -s /cluster/bluearc/tetNig1/bed/tblastn.hg17KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/subChr.lft carry $f.2
liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft carry $f.3
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
# I ended up doing the scaffolds separately from the chopped up chrom segments
# but next time I wouldn't
gensub2 scaffold.lst kg.lst blastGsub scaffoldBlastSpec
ssh kk
cd /cluster/data/tetNig1/bed/tblastn.hg17KG
para create blastSpec
para push
# Completed: 301244 of 301244 jobs
# CPU time in finished jobs: 25047824s 417463.74m 6957.73h 289.91d 0.794 y
# IO & Wait Time: 2658184s 44303.06m 738.38h 30.77d 0.084 y
# Average job time: 92s 1.53m 0.03h 0.00d
# Longest job: 226899s 3781.65m 63.03h 2.63d
# Submission to last job: 250711s 4178.52m 69.64h 2.90d
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
ssh kki
cd /cluster/data/tetNig1/bed/tblastn.hg17KG
para create chainSpec
para push
# Completed: 1287 of 1287 jobs
# CPU time in finished jobs: 72236s 1203.94m 20.07h 0.84d 0.002 y
# IO & Wait Time: 22886s 381.43m 6.36h 0.26d 0.001 y
# Average job time: 74s 1.23m 0.02h 0.00d
# Longest job: 3374s 56.23m 0.94h 0.04d
# Submission to last job: 5982s 99.70m 1.66h 0.07d
exit
# back to kksilo
cd /cluster/data/tetNig1/bed/tblastn.hg17KG/blastOut
# again some weirdness because I did the NA and Un scaffolds separately
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > cs60.$i.psl
sort -rn cs60.$i.psl | pslUniq stdin us.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" cs60.$i.psl > ms60.$i.psl
echo $i
done
for i in kg??
do
cat $i/c.*.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
cat us.*.psl ms60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n | uniq > /cluster/data/tetNig1/bed/tblastn.hg17KG/blastHg17KG.psl
cd ..
# lift the chrUn and chrNA scaffolds
ssh hgwdev
cd /cluster/data/tetNig1/bed/tblastn.hg17KG
hgLoadPsl tetNig1 blastHg17KG.psl
# back to kksilo
rm -rf blastOut
# End tblastn
# GENOSCOPE HUMAN (hg17) ECORES (DONE, 2005-02-07, hartera)
ssh kksilo
mkdir -p /cluster/data/tetNig1/bed/Genocscope/ecoresHg17
cd /cluster/data/tetNig1/bed/Genoscope/ecoresHg17
wget --timestamp \
http://www.genoscope.cns.fr/externe//4ucsc/ExofishTnig1Hs35
# this is in gff format
# remove "Ecotig" from name field
sed -e 's/Ecotig EG/EG/g' ExofishTnig1Hs35 > ExofishTnig1Hs35.gff
# need to have tabs between fields not a space to load file into table
sed -e 's/ /\t/g' ExofishTnig1Hs35.gff > Tnig1Hs35format.gff
# if "ecore" is changed to "CDS" and "ecotig" to "transcript" this loads
# correctly into the table.
sed -e 's/ecore/CDS/' Tnig1Hs35format.gff | sed -e 's/ecotig/transcript/' \
> tetNig1vsHg17.gff
ssh hgwdev
cd /cluster/data/tetNig1/bed/Genoscope/ecoresHg17
nice ldHgGene tetNig1 ecoresHg17 tetNig1vsHg17.gff
# Read 36417 transcripts in 184981 lines in 1 files
# 36417 groups 27 seqs 1 sources 2 feature types
# 36417 gene predictions
# added ecoresHg17 entry to trackDb.ra in trackDb/tetraodon
# and created ecoresHg17.html. Genoscope will not be maintaining this
# newest data in their Exofish comparative browser display.
# ACCESSIONS FOR CONTIGS (DONE 2005-03-17 kate)
# Used for Assembly track details, and also ENCODE AGP's
cd /cluster/data/tetNig1/bed
mkdir -p contigAcc
cd contigAcc
# extract WGS contig to accession mapping from Entrez Nucleotide summaries
# To get the summary file, access the Genbank page for the project
# by searching:
# genus[ORGN] AND WGS[KYWD]
# At the end of the page, click on the list of accessions for the contigs.
# Select summary format, and send to file.
# output to file <species>.acc
grep SCAF tetra.acc | wc -l
# 25773
# edit hg/encode/regionAgp/contigAccession.pl to recognize tetra format
# and install in /cluster/bin/scripts
contigAccession tetra.acc > contigAcc.tab
wc -l contigAcc.tab
# 25773
grep SCAF /cluster/data/tetNig1/chr.agp | wc -l
# 25773
hgsql tetNig1 < $HOME/kent/src/hg/lib/contigAcc.sql
echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \
hgsql tetNig1
hgsql tetNig1 -e "SELECT COUNT(*) FROM contigAcc"
# 25773
#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)
# MAKE 2BIT FILES FOR 500 kb SEQUENCE CONTIGS, CHROMS CONTIGS AND RANDOM
# CHROMOSOME SCAFFOLDS
# (DONE, 2005-08-24, hartera)
ssh hgwdev
# 2bit file from 500 kb contigs file used for blastz alignments
faToTwoBit /cluster/bluearc/tetNig1/contigs/tetNig1Contigs.fa \
/cluster/bluearc/tetNig1/contigs/tetNig1Contigs.2bit
# then make FASTA file and 2bit file for just the ordered chroms and
# not the randoms.
ssh kkr1u00
mkdir -p /cluster/data/tetNig1/twoBit
cd /cluster/data/tetNig1/twoBit
grep '>' /iscratch/i/tetNig1/contigs/tetNig1Contigs.fa | sed -e 's/>//' \
> tetNig1.contigs
grep -v "random" tetNig1.contigs > tetNig1.contigs.norandoms
wc -l tetNig1.contigs.norandoms
# 507 tetNig1.contigs.norandoms
# get these sequence without the randoms, run randoms as scaffolds
faSomeRecords /iscratch/i/tetNig1/contigs/tetNig1Contigs.fa \
tetNig1.contigs.norandoms tetNig1ContigNoRandoms.fa
faToTwoBit tetNig1ContigsNoRandoms.fa tetNig1ContigsNoRandoms.2bit
mv tetNig1ContigsNoRandoms.2bit /cluster/bluearc/tetNig1/contigs/
cp /cluster/bluearc/tetNig1/contigs/tetNig1ContigsNoRandoms.2bit \
/iscratch/i/tetNig1/contigs/
# get list of scaffolds for randoms - need to make faFrag list to get
# sequences:
cd /cluster/data/tetNig1/twoBit
mkdir -p /cluster/bluearc/tetNig1/randomScaffolds
mkdir -p /cluster/bluearc/tetNig1/randomFa
mkdir run
cd run
grep 'random' /cluster/data/tetNig1/chrom.lst > chroms.random
foreach c (`cat chroms.random`)
set dir=/cluster/data/tetNig1
echo $c
cp $dir/${c}/chr{$c}.fa /cluster/bluearc/tetNig1/randomFa/
end
cat > getSeqs.csh << 'EOF'
#!/bin/csh -ef
set inFa=$1
set st=$2
set end=$3
set out=$4
faFrag -mixed $inFa $st $end $out > ${out}.log
'EOF'
chmod +x getSeqs.csh
awk '{if (($1 ~ /random/) && ($5 ~ /SCAF/)) print "/cluster/data/tetNig1/twoBit/run/getSeqs.csh","/cluster/bluearc/tetNig1/randomFa/"$1".fa",$2-1,$3,"/cluster/bluearc/tetNig1/randomScaffolds/"$5".fa"}' /cluster/data/tetNig1/chr.agp \
> jobList
para create jobList
awk '{if ($1 ~ /random/ && ($5 !~ /GAP/)) print $5;}' \
/cluster/data/tetNig1/chr.agp > randoms.scaffolds
wc -l jobList
# 24571 jobList
wc -l randoms.scaffolds
# 24571 randoms.scaffolds
para try, check, push, check etc.
# para time
# Completed: 24571 of 24571 jobs
# CPU time in finished jobs: 249907s 4165.12m 69.42h 2.89d 0.008 y
# IO & Wait Time: 93508s 1558.46m 25.97h 1.08d 0.003 y
# Average job time: 14s 0.23m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3689s 61.48m 1.02h 0.04d
# Submission to last job: 3952s 65.87m 1.10h 0.05d
# once have all the sequences, cat together and make a 2bit file
ssh kolossus
cd /cluster/bluearc/tetNig1/randomScaffolds
# replace FASTA header with scaffold name and cat sequences together
foreach f (SCAF*.fa)
set s=$f:r
perl -pi.bak -e "s/>chr[0-9Un]+_random:[0-9]+\-[0-9]+/>$s/" $f
cat $f >> tetraRandomsScaffolds.fa
end
faToTwoBit tetraRandomsScaffolds.fa tetraRandomsScaffolds.2bit
# then cat to the contigs file without randoms and make a 2bit file
cat tetraRandomsScaffolds.fa \
/cluster/data/tetNig1/twoBit/tetNig1ContigNoRandoms.fa \
> tetNig1ChrContigsRandomScafs.fa
faToTwoBit tetNig1ChrContigsRandomScafs.fa \
tetNig1ChrContigsRandomScafs.2bit
ssh kkr1u00
cd /cluster/bluearc/tetNig1/randomScaffolds
# copy 2bit files to iservers
cp *.2bit /iscratch/i/tetNig1/contigs
cp *.2bit /cluster/bluearc/tetNig1/contigs
# rsync to the iservers - this takes a long time these days
# so do this with rsync
foreach i (2 3 4 5 6 7 8)
rsync -a --progress --delete --rsh=rsh \
/iscratch/i/tetNig1/contigs/ kkr${i}u00:/iscratch/i/tetNig1/contigs/
end
# make a 2bit file with the full chroms and the random scaffolds
ssh kolossus
cd /cluster/data/tetNig1/twoBit
grep -v "random" /cluster/data/tetNig1/chrom.lst > chromsNoRandoms.lst
foreach c (`cat chromsNoRandoms.lst`)
cat /cluster/data/tetNig1/${c}/chr${c}.fa >> tetNig1chromsNoRandoms.fa
end
# then add scaffolds FASTAs
cat tetNig1chromsNoRandoms.fa \
/cluster/bluearc/tetNig1/randomScaffolds/tetraRandomsScaffolds.fa \
> tetNig1ChromsRandomScafs.fa
faToTwoBit tetNig1ChromsRandomScafs.fa \
tetNig1ChromsRandomScafs.2bit
mkdir -p /cluster/bluearc/tetNig1/twoBit
mv tetNig1ChromsRandomScafs* /cluster/bluearc/tetNig1/twoBit/
ssh kkstore03
mkdir /cluster/data/tetNig1/liftScaffoldsToChrom
cd /cluster/data/tetNig1/liftScaffoldsToChrom
foreach c (`cat chromsNoRandoms.lst`)
awk 'BEGIN {FS="\t"} {OFS="\t"} {print $2-1,$6,$3,$1;}' \
/cluster/data/tetNig1/${c}/${c}.agp > ${c}.lft
awk '{if ($1 == "'${c}'") print $2;}' chrom.sizes
#########################################################################
# MOUSE NET/CHAINS MM7 - Info contained in makeMm7.doc (200503 Hiram)
#########################################################################
# Prep for the Mm7 net/chains, need a lift file for the randoms
# (DONE - 2005-09-20 - Hiram)
ssh kkstore003
cd /cluster/data/tetNig1
# We need the lift information from the random.agp files
cp -p /cluster/data/cb2/scripts/agpToLift.pl ./jkStuff
mkdir liftScaffoldsToChrom
for AGP in ?_random/*_random.agp ??_random/*_random.agp
do
CHR=`dirname ${AGP}`
grep -v fragment "${AGP}" | ./jkStuff/agpToLift.pl /dev/stdin \
> liftScaffoldsToChrom/${CHR}.lft
done
cat liftScaffoldsToChrom/*.lft > jkStuff/chromsAndScafs.lft
cp -p jkStuff/chromsAndScafs.lft \
/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.lft
# BLASTZ SWAP FOR ZEBRAFISH (danRer3) (DONE, 2005-10-20, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS.
# do swap of danRer3 vs. tetNig1 chain and net alignments to
# create tetNig1 vs. danRer3. see makeDanRer3.doc for details.
ssh hgwdev
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
-bigClusterHub=pk \
-swap -chainMinScore=5000 >& doSwap.log &
# Start: Thu Oct 20 11:59 Finished: Oct 20 12:25
# Parameters used for danRer3 vs tetNig1 BLASTZ:
# BLASTZ_M=50
# BLASTZ_H=2500
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# #BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
# BLASTZ_ABRIDGE_REPEATS=0
# Add entries for danRer3 chains and net to trackDb.ra for monDom2 and
# Modify track descriptions to describe the process using scaffolds for danRer3
# chrNA and chrUn and the fact that dynamic masking was used for the Blastz
# alignments - see makeDanRer3.doc for details.
############################################################################
## BLASTZ swap from mm8 alignments (DONE - 2006-02-19 - Hiram)
ssh kk
cd /cluster/data/mm8/bed/blastzTetNig1.2006-02-19
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF > swap.out 2>&1 &
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-swap -continue=net `pwd`/DEF > swap-net.out 2>&1 &
time nice -n +19 featureBits tetNig1 chainMm8Link
# 47024263 bases of 342403326 (13.734%) in intersection
###########################################################################
# BLASTZ SWAP TO CREATE CHAIN AND NET ALIGNMENTS FOR ZEBRAFISH (danRer4)
# AND CREATE AXNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS
# (DONE, 2006-04-30, hartera)
ssh pk
# Used these BLASTZ parameters - see also makeDanRer4.doc
# BLASTZ_H=2500
# BLASTZ_M=50
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# and no lineage-specific repeats.
# swap directory for alignments is:
# /cluster/data/tetNig1/bed/blastz.danRer4.swap
cd /cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29
nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF >& doSwap.log &
# Took about 22 minutes.
# Check coverage compared to danRer3:
featureBits tetNig1 chainDanRer4Link
# 82462283 bases of 342403326 (24.083%) in intersection
featureBits tetNig1 chainDanRer3Link
# 76373873 bases of 342403326 (22.305%) in intersection
# No refGene table for tetNig1 so use mrna:
featureBits -chrom=chr1 tetNig1 mrna chainDanRer4Link -enrichment
# mrna 2.624%, chainDanRer4Link 22.894%, both 1.297%, cover 49.42%,
# enrich 2.16x
featureBits -chrom=chr1 tetNig1 mrna chainDanRer3Link -enrichment
# mrna 2.624%, chainDanRer3Link 21.321%, both 1.205%, cover 45.92%,
# enrich 2.15x
featureBits -chrom=chr1 tetNig1 mrna netDanRer4 -enrichment
# mrna 2.624%, netDanRer4 78.279%, both 2.278%, cover 86.81%, enrich 1.11x
featureBits -chrom=chr1 tetNig1 mrna netDanRer3 -enrichment
# mrna 2.624%, netDanRer3 95.070%, both 2.435%, cover 92.80%, enrich 0.98x
# Similar coverage for chains and nets for danRer4 and danRer3.
# nets coverage is a little lower for danRer4 than for danRer3.
# Try also Human Proteins (hg17):
featureBits -chrom=chr1 tetNig1 blastHg17KG netDanRer4 -enrichment
# blastHg17KG 5.728%, netDanRer4 78.279%, both 5.666%, cover 98.92%,
# enrich 1.26x
featureBits -chrom=chr1 tetNig1 blastHg17KG netDanRer3 -enrichment
# blastHg17KG 5.728%, netDanRer3 95.070%, both 5.687%, cover 99.29%,
# enrich 1.04x
# Similar coverage here too. Looked at chain and net tracks for danRer4
# on the tetNig1 Browser and they look ok.
#########################################################################
## Reorder Fish organisms (DONE - 2006-12-22 - Hiram)
hgsql -h genome-testdb hgcentraltest \
-e "update dbDb set orderKey = 460 where name = 'tetNig1';"
#########################################################################
# BLASTZ/CHAIN/NET FR2 (DONE - 2007-01-29 - Hiram)
## Align to fr2 scaffolds,
## results lifted to fr2 chrUn coordinates
ssh kkstore03
mkdir /cluster/data/tetNig1/bed/blastz.fr2.2007-01-25
cd /cluster/data/tetNig1/bed/blastz.fr2.2007-01-25
cat << '_EOF_' > DEF
# Tetraodon vs. Fugu
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Tetraodon tetNig1
SEQ1_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=1
# QUERY: Fugu fr2
# Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates
SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/tetNig1/bed/blastz.fr2.2007-01-25
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-verbose=2 -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/tetNig1Fr2 > do.log 2>&1 &
# real 259m39.548s
## Swap to fr2
mkdir /cluster/data/fr2/bed/blastz.tetNig1.swap
cd /cluster/data/fr2/bed/blastz.tetNig1.swap
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/tetNig1/bed/blastz.fr2.2007-01-25/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -swap > swap.log 2>&1 &
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/tetNig1/bed/blastz.fr2.2007-01-25/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-continue=net -bigClusterHub=pk -swap > net_swap.log 2>&1 &
# real 40m40.471s
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastz.fr2.2007-01-25
time nice -n +19 featureBits tetNig1 chainFr2Link \
> fb.tetNig1.chainFr2Link.txt 2>&1
# 246828605 bases of 342403326 (72.087%) in intersection
cd /cluster/data/fr2/bed/blastz.tetNig1.swap
time nice -n +19 featureBits fr2 chainTetNig1Link \
> fb.fr2.chainTetNig1.txt 2>&1
# 247086553 bases of 393312790 (62.822%) in intersection
#########################################################################
## SWAP oryLat1 to tetNig1 (DONE - 2007-01-18 - Hiram)
ssh kkstore04
cd /cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-13
cat fb.oryLat1.chainTetNig1Link.txt
# 160388236 bases of 700386597 (22.900%) in intersection
# And the swap
ssh kkstore04
cd /cluster/data/tetNig1/bed
mv blastz.oryLat1.swap blastz.oryLat1.swap.2006-12-08
mkdir blastz.oryLat1.swap
cd /cluster/data/tetNig1/bed/blastz.oryLat1.swap
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-13/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -bigClusterHub=kk > swap.log 2>&1 &
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/oryLat1/bed/blastz.tetNig1.2006-12-13/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-continue=net -swap -bigClusterHub=kk > net-swap.log 2>&1 &
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastz.oryLat1.swap
nice -n 19 featureBits -noRandom tetNig1 chainOryLat1Link \
>fb.tetNig1.oryLat1.txt 2>&1
# 130327582 bases of 342403326 (38.063%) in intersection
##########################################################################
# N-SCAN gene predictions (nscanGene) - (2007-08-16 markd)
cd /cluster/data/tetNig1/bed/nscan/
# obtained NSCAN predictions from michael brent's group
# at WUSTL
wget http://mblab.wustl.edu/predictions/Tetraodon/tetNig1.gtf
wget http://mblab.wustl.edu/predictions/Tetraodon/tetNig1.prot.fa
bzip2 tetNig1.*
# load tracks. Note that these have *utr features, rather than
# exon features. currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -bin -gtf -genePredExt tetNig1 nscanGene tetNig1.gtf.bz2
# load protein
hgPepPred tetNig1 generic nscanPep tetNig1.prot.fa.bz2
rm *.tab
# update trackDb; need a tetNig1-specific page to describe informants
tetraodon/tetNig1/nscanGene.html (copy from mm8 and edit)
tetraodon/tetNig1/trackDb.ra
##############################################################################
# SWAP mm9 alignments to tetNig1 (DONE - 2007-09-07 - Hiram)
# the original was done at:
cd /cluster/data/mm9/bed/blastzTetNig1.2007-09-06
cat fb.mm9.chainTetNig1Link.txt
# 46206292 bases of 2620346127 (1.763%) in intersection
mkdir /cluster/data/tetNig1/bed/blastz.mm9.swap
cd /cluster/data/tetNig1/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/mm9/bed/blastzTetNig1.2007-09-06/DEF \
-chainMinScore=5000 \
-qRepeats=windowmaskerSdust -chainLinearGap=loose \
-swap -bigClusterHub=kk > swap.log 2>&1 &
# real 19m58.885s
cat fb.tetNig1.chainMm9Link.txt
# 42256263 bases of 342403326 (12.341%) in intersection
##############################################################################
# SWAP DANRER5 BLASTZ RESULT TO GET DANRER5 CHAINS, NETS,
# AXTNET, MAFNET AND ALIGNMENT DOWNLOADS (DONE, 2007-09-17, hartera)
ssh kkstore04
mkdir /cluster/data/tetNig1/bed/blastz.swap.danRer5
cd /cluster/data/tetNig1/bed/blastz.swap.danRer5
# blastz parameters used to align tetNig1 as query to danRer5 as target:
# BLASTZ_H=2500
# BLASTZ_M=50
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# Results for tetNig1 blastz on danRer5 are in:
# /cluster/data/danRer5/bed/blastz.tetNig1.2007-09-16
time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust -swap \
/cluster/data/danRer5/bed/blastz.tetNig1.2007-09-16/DEF \
>& swap.log &
# 0.141u 0.097s 12:30.60 0.0% 0+0k 0+0io 7pf+0w
ssh hgwdev
cat \
/cluster/data/tetNig1/bed/blastz.danRer5.swap/fb.tetNig1.chainDanRer5Link.txt
# 70448527 bases of 342403326 (20.575%) in intersection
# look at coverage of mrna, there is no native RefSeqs
# track for tetraodon, tetNig1.
featureBits tetNig1 all_mrna chainDanRer5Link -enrichment
# all_mrna 3.104%, chainDanRer5Link 20.575%, both 1.382%, cover 44.53%, enrich
# 2.16x
featureBits tetNig1 all_mrna chainDanRer4Link -enrichment
# all_mrna 3.104%, chainDanRer4Link 24.083%, both 1.522%, cover 49.02%, enrich
# 2.04x
featureBits tetNig1 all_mrna netDanRer5 -enrichment
# all_mrna 3.104%, netDanRer5 70.075%, both 2.400%, cover 77.31%, enrich 1.10x
featureBits tetNig1 all_mrna netDanRer4 -enrichment
# all_mrna 3.104%, netDanRer4 69.480%, both 2.445%, cover 78.78%, enrich 1.13x
# clean up a little
ssh kkstore04
cd /cluster/data/tetNig1/bed
mv ./blastz.swap.danRer5/swap.log ./blastz.danRer5.swap
rm -r blastz.swap.danRer5
ln -s blastz.danRer5.swap blastz.danRer5
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-07-18)
ssh kkstore03
# bash if not using bash shell already
mkdir /cluster/data/tetNig1/blastDb
cd /cluster/data/tetNig1
cat noUn/chr*fa > temp.fa
faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
rm temp.fa
cat randomContigs/*.fa > temp.fa
faSplit sequence temp.fa 150 blastDb/y
rm temp.fa
cd blastDb
for i in *.fa
do
/cluster/bluearc/blast229/formatdb -i $i -p F
done
rm *.fa
mkdir -p /san/sanvol1/scratch/tetNig1/blastDb
cd /cluster/data/tetNig1/blastDb
for i in nhr nin nsq;
do
echo $i
cp *.$i /san/sanvol1/scratch/tetNig1/blastDb
done
mkdir -p /cluster/data/tetNig1/bed/tblastn.hg18KG
cd /cluster/data/tetNig1/bed/tblastn.hg18KG
echo /san/sanvol1/scratch/tetNig1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 386 query.lst
# we want around 150000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(150000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(150000/386) = 94.510813
mkdir -p /cluster/bluearc/tetNig1/bed/tblastn.hg18KG/kgfa
split -l 95 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/tetNig1/bed/tblastn.hg18KG/kgfa/kg
ln -s /cluster/bluearc/tetNig1/bed/tblastn.hg18KG/kgfa kgfa
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/tetNig1/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/tetNig1/bed/tblastn.hg18KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
cd /cluster/data/tetNig1/bed/tblastn.hg18KG
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/tetNig1/blastDb.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
exit
ssh pk
cd /cluster/data/tetNig1/bed/tblastn.hg18KG
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 136074 of 136074 jobs
# CPU time in finished jobs: 2945512s 49091.87m 818.20h 34.09d 0.093 y
# IO & Wait Time: 569398s 9489.96m 158.17h 6.59d 0.018 y
# Average job time: 26s 0.43m 0.01h 0.00d
# Longest finished job: 881s 14.68m 0.24h 0.01d
# Submission to last job: 146804s 2446.73m 40.78h 1.70d
ssh kkstore03
cd /cluster/data/tetNig1/bed/tblastn.hg18KG
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=100000 stdin /cluster/bluearc/tetNig1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS /cluster/bluearc/tetNig1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
ssh memk
cd /cluster/data/tetNig1/bed/tblastn.hg18KG/chainRun
para create chainSpec
para try, check, push, check etc.
# Completed: 387 of 387 jobs
# CPU time in finished jobs: 1982s 33.04m 0.55h 0.02d 0.000 y
# IO & Wait Time: 25746s 429.10m 7.15h 0.30d 0.001 y
# Average job time: 72s 1.19m 0.02h 0.00d
# Longest finished job: 165s 2.75m 0.05h 0.00d
# Submission to last job: 2784s 46.40m 0.77h 0.03d
ssh kkstore03
cd /cluster/data/tetNig1/bed/tblastn.hg18KG/blastOut
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/tetNig1/bed/tblastn.hg18KG/unliftBlastHg18KG.psl
cd ..
pslCheck unliftBlastHg18KG.psl
liftUp -nohead temp.psl ../../randomContigs/tetNig1.randomContigs.lift carry unliftBlastHg18KG.psl
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n temp.psl > blastHg18KG.psl
rm temp.psl
# load table
ssh hgwdev
cd /cluster/data/tetNig1/bed/tblastn.hg18KG
hgLoadPsl tetNig1 blastHg18KG.psl
# check coverage
featureBits tetNig1 blastHg18KG
# 19028060 bases of 342403326 (5.557%) in intersection
featureBits tetNig1 ensGene:cds blastHg18KG -enrichment
# ensGene:cds 9.909%, blastHg18KG 5.557%, both 4.906%, cover 49.51%, enrich 8.91x
featureBits tetNig1 nscanGene:cds blastHg18KG -enrichment
# nscanGene:cds 9.388%, blastHg18KG 5.557%, both 3.568%, cover 38.01%, enrich 6.84x
ssh kkstore03
rm -rf /cluster/data/tetNig1/bed/tblastn.hg18KG/blastOut
rm -rf /cluster/bluearc/tetNig1/bed/tblastn.hg18KG/blastOut
#end tblastn
##########################################################################
# SWAP BLASTZ Medaka oryLat2 (DONE - 2008-08-27 - Hiram)
ssh kkstore03 # not too important since everything moved to hive
screen # use screen to control this job
cd /cluster/data/oryLat2/bed/blastz.tetNig1.2008-08-26
cat fb.oryLat2.chainTetNig1Link.txt
# 163171121 bases of 700386597 (23.297%) in intersection
# And, for the swap:
mkdir /cluster/data/tetNig1/bed/blastz.oryLat2.swap
cd /cluster/data/tetNig1/bed/blastz.oryLat2.swap
time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
/cluster/data/oryLat2/bed/blastz.tetNig1.2008-08-26/DEF \
-swap -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 &
# real 22m48.350s
cat fb.tetNig1.chainOryLat2Link.txt
# 139059872 bases of 342403326 (40.613%) in intersection
##########################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram)
hgsql -e 'update dbDb set
sourceName="Genoscope and Broad Institute V7 (NCBI project 12350, CAAE01000000)" where name="tetNig1";' hgcentraltest
###########################################################################
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
+
############################################################################
+# LIFTOVER TO TetNig2 (DONE - 2010-01-15 - Hiram )
+ mkdir /hive/data/genomes/tetNig1/bed/blat.tetNig2.2010-01-15
+ cd /hive/data/genomes/tetNig1/bed/blat.tetNig2.2010-01-15
+ # -debug run to create run dir, preview scripts...
+ doSameSpeciesLiftOver.pl -debug tetNig1 tetNig2
+ # Real run:
+ time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
+ -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
+ tetNig1 tetNig2 > do.log 2>&1
+ # real 36m16.693s
+#############################################################################