src/hg/makeDb/doc/danRer2.txt 1.6
1.6 2009/11/25 21:48:38 hiram
change autoScaleDefault to autoScale
Index: src/hg/makeDb/doc/danRer2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/danRer2.txt,v
retrieving revision 1.5
retrieving revision 1.6
diff -b -B -U 1000000 -r1.5 -r1.6
--- src/hg/makeDb/doc/danRer2.txt 28 Mar 2007 17:35:39 -0000 1.5
+++ src/hg/makeDb/doc/danRer2.txt 25 Nov 2009 21:48:38 -0000 1.6
@@ -1,5375 +1,5375 @@
# for emacs: -*- mode: sh; -*-
# Danio rerio (zebrafish) from Sanger, version Zv4 (released 6/30/04)
# Project website:
# http://www.sanger.ac.uk/Projects/D_rerio/
# Assembly notes:
# http://www.sanger.ac.uk/Projects/D_rerio/Zv4_assembly_information.shtml
# NOTE: Error in scaffolds agp file. Notified Sanger and got new scaffolds
# agp and recreated FASTA files from this new one (2004-11-29)
# Previous agp file was missing a scaffold from the end of most chromosomes.
# There is also a chrUn set of scaffolds that are in the chunks agp file - these
# just have the identifier Zv4_scaffoldN instead of a chromosome number and
# they are scaffolds that correspond to FPC contigs but their position is
# unknown so they are not mapped to a chromosome.
# DOWNLOAD SEQUENCE (DONE, 2004-10-18, hartera)
# ERRORS IN SCAFFOLDS AGP SO GET NEW AGP FROM SANGER AND DOWNLOAD
# (hartera, 2004-11-29) from Mario Caccamo: mc2@sanger.ac.uk
ssh kksilo
mkdir /cluster/store8/danRer2
ln -s /cluster/store8/danRer2 /cluster/data
cd /cluster/data/danRer2
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/README
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/stats
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/Zv4.chunks.agp
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/Zv4.scaffolds.agp
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/Zv4.fasta
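# (Optional sanity check, not in the original log: tally the scaffold
# classes described in the notes above - chrom-mapped scaffolds, unmapped
# Zv4_NA contigs, and unplaced Zv4_scaffoldN scaffolds. Counts are
# illustrative and were not recorded.)
awk '{print $1}' Zv4.chunks.agp | sort -u | \
awk '{if ($1 ~ /^Zv4_NA/) na++; \
      else if ($1 ~ /^Zv4_scaffold/) un++; \
      else chrom++;} \
     END {print "chroms:", chrom, "NA:", na, "Un:", un}'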
# get new agp file and download to /cluster/data/danRer2 (hartera, 2004-11-29)
# Remove all chrom directories to start processing with new agp file
ssh kksilo
cd /cluster/data/danRer2
foreach c (`cat chrom.lst`)
rm -r $c
end
# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE, 2004-10-18, hartera)
# DOWNLOAD SEQUENCE AND CREATE FILES AGAIN (hartera, 2004-11-30)
# ADD chrM.chunks.agp (DONE, 2004-12-06, hartera)
# Error reported by a user: chrM.agp is space delimited rather
# than tab delimited so correct this. NCBI defines the agp format as
# being tab delimited. (DONE, 2005-04-25, hartera)
ssh kksilo
mkdir -p /cluster/data/danRer2/M
cd /cluster/data/danRer2/M
# go to http://www.ncbi.nih.gov/ and search Nucleotide for
# "Danio mitochondrion genome". That shows the gi number:
# 8576324 for the accession, AC024175
# Use that number in the entrez linking interface to get fasta:
wget -O chrM.fa \
'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=8576324&dopt=FASTA'
# Edit chrM.fa: make sure the header line says it is the
# Danio rerio mitochondrion complete genome, and then replace the
# header line with just ">chrM".
perl -pi.bak -e 's/>.+/>chrM/' chrM.fa
rm *.bak
# Make a "pseudo-contig" for processing chrM too:
mkdir ./chrM_1
sed -e 's/chrM/chrM_1/' ./chrM.fa > ./chrM_1/chrM_1.fa
mkdir ./lift
echo "chrM_1/chrM_1.fa.out" > ./lift/oOut.lst
echo "chrM_1" > ./lift/ordered.lst
echo "0 M/chrM_1 16596 chrM 16596" > ./lift/ordered.lft
# create a .agp file for chrM as hgGoldGapGl and other
# programs require a .agp file so create chrM.agp
cat << '_EOF_' > ./chrM.agp
chrM 1 16596 1 F AC024175.3 1 16596 +
'_EOF_'
# Create a chrM.chunks.agp (hartera, 2004-12-06)
cd /cluster/data/danRer2/M/agps
awk 'BEGIN {OFS="\t"} \
{print $1, $2, $3, $4, $5, $6, $7, $8, $1, $7, $8}' ../chrM.agp \
> chrM.chunks.agp
# chrM.agp is space delimited instead of tab delimited
# so correct this (hartera, 2005-04-25)
cd /cluster/data/danRer2/M
# edit chrM.agp and change field delimiters from spaces to tabs.
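# a one-liner like the following would do it (sketch; the exact edit
# used was not recorded):
perl -pi.bak -e 's/ +/\t/g' chrM.agp
rm chrM.agp.bak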
# add this new file to zipped files of agp and get it pushed to the
# downloads area - see "MAKE DOWNLOADABLE SEQUENCE FILES" section of
# this make doc.
# Create list of chromosomes (DONE, 2004-10-18, hartera)
# Add "M" for mitochondrion chromosome (2004-10-25, hartera)
# Add "Un" for chrUn (2004-11-29, hartera)
ssh kksilo
cd /cluster/data/danRer2
awk '{print $1;}' Zv4.scaffolds.agp | sort -n | uniq > chrom.lst
# add NA - these are contigs in the chunks agp
echo "NA" >> chrom.lst
# add chrM
echo "M" >> chrom.lst
# add chrUn
echo "Un" >> chrom.lst
# SPLIT AGP FILES BY CHROMOSOME (DONE, 2004-10-19, hartera)
# AGP USED TO CREATE FASTA WAS SCAFFOLDS AGP
# RE-DO SPLITTING AGP FILES AFTER GETTING NEW SCAFFOLDS AGP
# (hartera, 2004-11-29)
ssh kksilo
cd /cluster/data/danRer2
# There are 2 .agp files: one for scaffolds (supercontigs on danRer1) and
# then one for chunks (contigs on danRer1) showing how they map on to
# scaffolds.
# add "chr" prefix for the agp files
perl -pi -e 's/^([0-9]+)/chr$1/' ./*.agp
# for chromosomes:
foreach c (`cat chrom.lst`)
mkdir $c
perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
./Zv4.chunks.agp \
> $c/chr$c.chunks.agp
perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
./Zv4.scaffolds.agp \
> $c/chr$c.scaffolds.agp
end
# CREATE AGP AND FASTA FOR chrNA (DONE, 2004-10-25, hartera)
# REMAKE AGP WITH NEW SCAFFOLDS AGP FILE AND CREATE chrUn AGP
# (hartera, 2004-11-29)
ssh kksilo
cd /cluster/data/danRer2
# for NA make agp files
grep "Zv4_NA" Zv4.chunks.agp > NA.chunks.agp
# make a scaffolds agp for NA - use first 9 fields of chunks file
# and remove ".1" from 6th field
awk 'BEGIN {OFS="\t"} {print $1, $2, $3, $4, $5, $6, $7, $8, $9}' \
NA.chunks.agp | perl -pe 's/(Zv4_NA[0-9]+)\.1+/$1/' \
> NA.scaffolds.agp
# move agps to NA directory created above
mv NA.scaffolds.agp NA.chunks.agp ./NA
# from scaffolds agp, get name of scaffolds to get from FASTA file for NA
foreach c (NA)
awk '{print $1;}' $c/$c.scaffolds.agp > $c/chr$c.scaffolds.lst
$HOME/bin/i386/faSomeRecords /cluster/data/danRer2/Zv4.fasta \
$c/chr$c.scaffolds.lst $c/chr$c.fa
end
# create agp with 1000Ns between scaffolds as contig gaps for chrNA
foreach c (NA)
$HOME/bin/i386/scaffoldFaToAgp $c/chr$c.fa
mv $c/chr$c.fa $c/chr$c.scaffolds.fa
perl -pi -e "s/chrUn/chr$c/" $c/chr$c.*
end
# change the type to "W" in the agp for WGS
perl -pi.bak -e "s/D/W/;" NA/chrNA.agp
cd NA
mv chrNA.agp chrNA.scaffolds.agp
rm *.bak
# use this chrNA.scaffolds.agp to create chrNA.chunks.agp
cat << '_EOF_' > /cluster/data/danRer2/jkStuff/createChunksAgp.pl
#!/usr/bin/perl -w
use strict;
# input is a list of scaffolds and chunks, and a chrN.scaffolds.agp file with Ns
my $accs = $ARGV[0];
my $scaf = $ARGV[1];
open (ACCS, $accs) || die "Can not open $accs: $!";
open (SCAFS, $scaf) || die "Can not open $scaf: $!";
my %accsHash;
while (<ACCS>) {
   chomp;
   my @fi = split(/\t/);
   $accsHash{$fi[0]} = $fi[1];
}
close ACCS;
while (my $line = <SCAFS>) {
   chomp $line;
   my @f = split(/\t/, $line);
   my $acc;
   if ($f[4] ne "N") {
      # non-gap row: look up the accession for this scaffold
      if (exists($accsHash{$f[5]})) {
         $acc = $accsHash{$f[5]};
      }
      else {
         print "$f[5] can not be found\n";
      }
      print "$f[0]\t$f[1]\t$f[2]\t$f[3]\t$f[4]\t$acc\t$f[6]\t$f[7]\t$f[8]";
      print "\t$f[5]\t$f[6]\t$f[7]";
   }
   else {
      # gap rows pass through unchanged
      print $line;
   }
   print "\n";
}
'_EOF_'
chmod +x /cluster/data/danRer2/jkStuff/createChunksAgp.pl
awk 'BEGIN {OFS="\t";} { if ($1 ~ /^Zv4_NA/) print $1,$6 }' \
/cluster/data/danRer2/Zv4.chunks.agp > NA.accs
perl ../jkStuff/createChunksAgp.pl NA.accs chrNA.scaffolds.agp \
> chrNA.chunks.agp
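# (Optional check, not in the original log: the script echoes a
# "can not be found" line for any chunk missing from NA.accs;
# there should be none.)
grep -c "can not be found" chrNA.chunks.agp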
# Also create agp for chrUn - these are scaffolds that map to FPC contigs
# but are unplaced on the chromosomes
# for Un, make agp files
ssh kksilo
cd /cluster/data/danRer2
mkdir Un
# make a scaffolds agp for Un - use first 9 fields of chunks file
# and remove ".1" from 6th field
awk 'BEGIN {OFS="\t"} {if ($1 ~ /Zv4_scaffold/) \
print $1, $2, $3, $4, $5, $6, $7, $8, $9}' \
Zv4.chunks.agp | perl -pe 's/(Zv4_scaffold[0-9]+)\.1+/$1/' \
> Un/Un.scaffolds.agp
# from scaffolds agp, get name of scaffolds to get from FASTA file for Un
foreach c (Un)
awk '{print $1;}' $c/$c.scaffolds.agp > $c/chr$c.scaffolds.lst
$HOME/bin/i386/faSomeRecords /cluster/data/danRer2/Zv4.fasta \
$c/chr$c.scaffolds.lst $c/chr$c.fa
end
# create agp with 1000Ns between scaffolds as contig gaps for chrUn
foreach c (Un)
$HOME/bin/i386/scaffoldFaToAgp $c/chr$c.fa
mv $c/chr$c.fa $c/chr$c.scaffolds.fa
end
# change the type to "W" in the agp for WGS
perl -pi.bak -e "s/D/W/;" Un/chrUn.agp
cd Un
mv chrUn.agp chrUn.scaffolds.agp
rm *.bak
# create chunks agp for chrUn
# this includes accessions so need to get from Zv4.chunks.agp
# modify perl script above to do this (2004-12-06, hartera)
awk 'BEGIN {OFS="\t";} { if ($1 ~ /^Zv4_/) print $1,$6 }' \
/cluster/data/danRer2/Zv4.chunks.agp > Un.accs
perl ../jkStuff/createChunksAgp.pl Un.accs chrUn.scaffolds.agp \
> chrUn.chunks.agp
# BUILD CHROM-LEVEL SEQUENCE (DONE, 2004-10-21, hartera)
# Move scaffolds files for NA into scaffolds directory (2004-11-22, hartera)
# RE-BUILD SEQUENCE WITH NEW AGPS FROM CORRECTED SCAFFOLDS AGP
# (2004-11-30, hartera)
ssh kksilo
cd /cluster/data/danRer2
# Sequence is already in upper case so no need to change
foreach c (`cat chrom.lst`)
echo "Processing ${c}"
$HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.scaffolds.agp chr$c \
$c/chr$c.fa ./Zv4.fasta
echo "${c} - DONE"
end
# some Ns in sequence files are in lower case so change to upper case
foreach c (`cat chrom.lst`)
cat $c/chr${c}.fa | tr 'n' 'N' > $c/chr${c}.fa.tr
if ($c == "Un") then
perl -pi.bak -e 's/^>chrUN/>chrUn/' $c/chr${c}.fa.tr
endif
mv $c/chr${c}.fa.tr $c/chr${c}.fa
end
# move scaffolds agp to be chrom agp and clean up (2004-11-30)
foreach c (`cat chrom.lst`)
cd $c
rm *.bak
cp chr${c}.scaffolds.agp chr${c}.agp
mkdir agps
mv chr${c}.*.agp ./agps/
cd ..
end
# move scaffolds files for NA into scaffolds directory (2004-11-22)
# and again (2004-11-30)
foreach c (NA Un)
mkdir -p /cluster/data/danRer2/$c/scaffolds
cd /cluster/data/danRer2/$c
mv chr$c.scaffolds.* ./scaffolds
rm $c.*.agp
cd ..
end
# CHECK CHROM AND VIRTUAL CHROM SEQUENCES (DONE, 2004-10-21, hartera)
# CHECKED THESE ARE OK (hartera, 2004-11-30)
# Check that the size of each chromosome .fa file is equal to the
# last coord of the .agp:
ssh hgwdev
cd /cluster/data/danRer2
foreach c (`cat chrom.lst`)
foreach f ( $c/chr$c.agp )
set agpLen = `tail -1 $f | awk '{print $3;}'`
set h = $f:r
set g = $h:r
echo "Getting size of $g.fa"
set faLen = `faSize $g.fa | awk '{print $1;}'`
if ($agpLen == $faLen) then
echo " OK: $f length = $g length = $faLen"
else
echo "ERROR: $f length = $agpLen, but $g length = $faLen"
endif
end
end
# all are OK so the FASTA files are the expected size
# BREAK UP SEQUENCE INTO 5MB CHUNKS AT CONTIGS/GAPS FOR CLUSTER RUNS
# (DONE, 2004-10-25, hartera)
# RE-DONE (2004-11-30, hartera)
ssh kksilo
cd /cluster/data/danRer2
foreach c (`cat chrom.lst`)
foreach agp ($c/chr$c.agp)
if (-e $agp) then
set fa = $c/chr$c.fa
echo splitting $agp and $fa
cp -p $agp $agp.bak
cp -p $fa $fa.bak
splitFaIntoContigs $agp $fa . -nSize=5000000
endif
end
end
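# (Optional tally, not in the original log: count the 5MB pseudo-contigs
# made for each chrom; lift/ordered.lst lists one per line.)
foreach c (`cat chrom.lst`)
    echo $c `wc -l < $c/lift/ordered.lst`
end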
# MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2004-10-25, hartera)
# This used to hold scripts -- better to keep them inline here
# Now it should just hold lift file(s) and
# temporary scripts made by copy-paste from this file.
mkdir /cluster/data/danRer2/jkStuff
# This is where most tracks will be built:
mkdir /cluster/data/danRer2/bed
# CREATING DATABASE (DONE, 2004-10-25, hartera)
# Create the database.
# next machine
ssh hgwdev
echo 'create database danRer2' | hgsql ''
# if you need to delete that database: !!! WILL DELETE EVERYTHING !!!
echo 'drop database danRer2' | hgsql danRer2
# Delete and re-create database as above (hartera, 2004-11-30)
# Use df to make sure there is at least 10 gig free on /var/lib/mysql:
df -h /var/lib/mysql
# Before loading data:
# Filesystem Size Used Avail Use% Mounted on
# /dev/sdc1 1.8T 637G 1023G 39% /var/lib/mysql
# CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2004-10-25, hartera)
# RECREATE GRP TABLE (hartera, 2004-11-30)
# next machine
ssh hgwdev
# the following command copies all the data from the table
# grp in the database danRer1 to the new database danRer2
echo "create table grp (PRIMARY KEY(NAME)) select * from danRer1.grp" \
| hgsql danRer2
# if you need to delete that table: !!! WILL DELETE ALL grp data !!!
echo 'drop table grp;' | hgsql danRer2
# REPEAT MASKING - Run RepeatMasker on chroms (DONE, 2004-10-26, hartera)
# There is a new Repeat library at WUSTL that has new repeats for Danio rerio
# This is Dr.repeats.020501
# Add a README about these repeats
ssh kksilo
cd /cluster/data/danRer2
wget --timestamp \
http://www.genetics.wustl.edu/fish_lab/repeats/Dr.repeats.020501
mv Dr.repeats.020501 /cluster/bluearc/RepeatMasker/Libraries/danioRW.lib
# add danioRW.lib to danio.lib
cd /cluster/bluearc/RepeatMasker/Libraries
cp danio.lib danioRMandRW.lib
cat danioRW.lib >> danioRMandRW.lib
# add type as "unknown" to this file as these repeats are not classified
perl -pi.bak -e 's/^(>Dr[0-9]+)/$1#Unknown/' danioRMandRW.lib
# these new repeats are not classified by type so add "DrRW" as type later
# Add a README about these repeats from WUSTL
wget --timestamp \
http://www.genetics.wustl.edu/fish_lab/repeats/Readme.txt
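# (Optional check, not in the original log: the merged library should have
# picked up the 1225 WUSTL repeats, each with the #Unknown class added;
# counts are illustrative and were not recorded.)
grep -c '^>Dr' danioRW.lib
grep -c '#Unknown' danioRMandRW.lib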
#- Split contigs into 500kb chunks, at gaps if possible:
foreach c (`cat chrom.lst`)
foreach d ($c/chr${c}*_?{,?})
cd $d
echo "splitting $d"
set contig = $d:t
~/bin/i386/faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
-minGapSize=100
cd ../..
end
end
#- Make the run directory and job list:
cd /cluster/data/danRer2
# use RepeatMasker from January 2004
cat << '_EOF_' > jkStuff/RMZebrafish
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/danRer2/$2
/bin/cp $2 /tmp/danRer2/$2/
cd /tmp/danRer2/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -lib danioRMandRW.lib $2
popd
/bin/cp /tmp/danRer2/$2/$2.out ./
if (-e /tmp/danRer2/$2/$2.align) /bin/cp /tmp/danRer2/$2/$2.align ./
if (-e /tmp/danRer2/$2/$2.tbl) /bin/cp /tmp/danRer2/$2/$2.tbl ./
if (-e /tmp/danRer2/$2/$2.cat) /bin/cp /tmp/danRer2/$2/$2.cat ./
/bin/rm -fr /tmp/danRer2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer2
'_EOF_'
chmod +x jkStuff/RMZebrafish
mkdir RMRun
cp /dev/null RMRun/RMJobs
foreach c (`cat chrom.lst`)
foreach d ($c/chr${c}_?{,?})
set ctg = $d:t
foreach f ( $d/${ctg}_?{,?}.fa )
set f = $f:t
echo /cluster/data/danRer2/jkStuff/RMZebrafish \
/cluster/data/danRer2/$d $f \
'{'check out line+ /cluster/data/danRer2/$d/$f.out'}' \
>> RMRun/RMJobs
end
end
end
#- Do the run
ssh kk
cd /cluster/data/danRer2/RMRun
para create RMJobs
para try, para check, para check, para push, para check,...
# para time
# CPU time in finished jobs: 10326858s 172114.29m 2868.57h 119.52d 0.327 y
# IO & Wait Time: 33702s 561.71m 9.36h 0.39d 0.001 y
# Average job time: 3081s 51.35m 0.86h 0.04d
# Longest job: 4065s 67.75m 1.13h 0.05d
# Submission to last job: 39673s 661.22m 11.02h 0.46d
#- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
ssh kksilo
cd /cluster/data/danRer2
foreach d (*/chr*_?{,?})
set contig = $d:t
echo $contig
liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
> /dev/null
end
#- Lift pseudo-contigs to chromosome level
foreach c (`cat chrom.lst`)
echo lifting $c
cd $c
if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
> /dev/null
endif
cd ..
end
#- Load the .out files into the database with:
ssh hgwdev
cd /cluster/data/danRer2
hgLoadOut danRer2 */chr*.fa.out
# When trying to mask the sequence (see below) there are 60 instances of negative
# co-ordinates:
# 2 Dr000074
# 48 Dr000158
# 6 Dr000375
# 1 Dr000511
# 1 Dr000759
# 1 Dr000975
# 5 Dr001181
# Sent sample output from chr1_3_21 to Arian Smit and he suggested that
# Dr000158 is a Satellite. When this classification is added to the FASTA
# header: >Dr000158#Satellite, the negative co-ordinates disappeared.
# If the classification is changed to "Unknown" then there are negative
# co-ordinates.
# Took a look at these 7 repeats above and found overlapping matches
# Yi Zhou at Boston Children's Hospital looked at the repeats and split
# them up into smaller chunks: danioRMandRWsplit.lib
# Running RepeatMasker with this library removed a lot of negative co-ordinates
# but some new ones appeared. There are 7 instances of negative co-ordinates.
# TDR5, (TAGA)n, Dr000355, Dr001182
# Dr001182 has two repeats, the second being an imperfect replicate of the
# first so this was split into two repeats and RepeatMasker run again (11/18/04)
# This time there were TDR5, (TAGA)n, Dr000355, Dr000509 with negative repeats
# but only 5 instances.
# 11/13/04
# try RepeatMasker with 7 repeats with negative co-ords split into smaller
# repeats. get list of repeat names without these then get those sequences
ssh kksilo
cd /cluster/data/danRer2/bed/
$HOME/bin/i386/faSomeRecords \
/cluster/bluearc/RepeatMasker/Libraries/danioRMandRW.lib \
rmandrw.txt danioRMandRWsplit.lib
# splitreps is list of split repeats
cat splitreps >> danioRMandRWsplit.lib
mv danioRMandRWsplit.lib /cluster/bluearc/RepeatMasker/Libraries/
# then run repeat masker on problem repeat areas e.g. chr1_3_21.fa
mkdir /cluster/data/danRer2/RMRun/testSplitReps
cd /cluster/data/danRer2/RMRun/testSplitReps
nice /cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -lib danioRMandRWsplit.lib chr1_3_21.fa
# works well so change all the classes to unknown and re-run with this library
perl -pi.bak -e 's/^>(Dr[0-9a-z\-]+)#DrRW/>$1#Unknown/' danioRMandRWsplit.lib
# then run RepeatMasker as above with new split library (DONE, 2004-11-16)
# edit RMZebrafish to use this library (library is danioRMandRWsplit.lib)
# and run from RMRun2 directory
# need to remove chrN.fa.masked files
# then convert chrN.fa back to upper case
ssh kksilo
cd /cluster/data/danRer2
foreach c (`cat chrom.lst`)
cd $c
echo "Processing $c ..."
rm chr${c}.fa.masked
cat chr${c}.fa | tr '[a-z]' '[A-Z]' > chr${c}.tmp
perl -pi.bak -e 's/^>CHR([0-9A-Z]+)/>chr$1/' chr${c}.tmp
mv chr${c}.tmp chr${c}.fa
rm chr${c}.tmp.bak
cd ..
end
# still get some negative co-ordinates when trying to mask
# Dr001182 is a repeat sequence which contains two instances of a repeat with
# the second one not being a perfect match to the first
# split these into two repeats and then re-run RepeatMasker
#
# e-mailed Kerstin Jekosch at Sanger as it looks like they have used this
# WUSTL Repeat library to mask the Zv4 assembly for Ensembl. Kerstin recommended
# downloading this new library from RepBase as it has been properly
# formatted for RepBase version 10. In RepBase, it is the Zebrafish Unclassified
# library and it consists of 958 unfinished consensus sequences of unclassified
# repeats extracted from the library at
# http://www.genetics.wustl.edu/fish_lab/repeats/Dr.repeats.020501
# which has a total of 1225 repeats. 267 repeats present in the library have
# been replaced by a set of consensus sequences of classified transposable
# elements that are reported in Repbase Update and Repbase Reports.
# DOWNLOAD NEW VERSION OF THE WUSTL ZEBRAFISH REPEATS FROM REPBASE
# (DONE, 2004-11-18, hartera)
# Go to http://www.girinst.org/server/RepBase/
# and select zebunc (Zebrafish unclassified library)
# click on repeatmaskerlibraries and then download
# repeatmaskerlibrariesJuly2004.tar.gz
gunzip repeatmaskerlibrariesJuly2004.tar.gz
tar xvf repeatmaskerlibrariesJuly2004.tar
perl -pi.bak -e 's/^>(Dr[0-9]+)/>$1#Unknown/' zebunc.ref
cat danio.lib zebunc.ref >> danioandzebunc.lib
# REDO REPEATMASKER RUN AND LOAD NEW RESULTS (DONE, 2004-11-22, hartera)
# REDONE (hartera, 2004-12-01)
# sequences already split into 500kb chunks - see above
# use RepeatMasker open-3.0 version, Sep 2004, this was in
# /cluster/bluearc/RepeatMasker.040909 and is now the default
# new zebrafish library was downloaded from RepBase - zebunc.ref
# copy to /cluster/bluearc/RepeatMasker.040909/Libraries
# add "Unknown" as classification for these repeats
perl -pi.bak -e 's/>(Dr[0-9]+)/>$1#Unknown \@danio [S:]/' zebunc.ref
# add to RepeatMasker library
mv RepeatMasker.lib RepeatMasker.lib.old
cat RepeatMasker.lib.old zebunc.ref >> RepeatMasker.lib
cat << '_EOF_' > jkStuff/RMZebrafish
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/danRer2/$2
/bin/cp $2 /tmp/danRer2/$2/
cd /tmp/danRer2/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -species danio $2
popd
/bin/cp /tmp/danRer2/$2/$2.out ./
if (-e /tmp/danRer2/$2/$2.align) /bin/cp /tmp/danRer2/$2/$2.align ./
if (-e /tmp/danRer2/$2/$2.tbl) /bin/cp /tmp/danRer2/$2/$2.tbl ./
if (-e /tmp/danRer2/$2/$2.cat) /bin/cp /tmp/danRer2/$2/$2.cat ./
/bin/rm -fr /tmp/danRer2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer2
'_EOF_'
chmod +x jkStuff/RMZebrafish
rm -r RMRun
mkdir -p RMRun
cp /dev/null RMRun/RMJobs
foreach c (`cat chrom.lst`)
foreach d ($c/chr${c}_?{,?})
set ctg = $d:t
foreach f ( $d/${ctg}_?{,?}.fa )
set f = $f:t
echo /cluster/data/danRer2/jkStuff/RMZebrafish \
/cluster/data/danRer2/$d $f \
'{'check out line+ /cluster/data/danRer2/$d/$f.out'}' \
>> RMRun/RMJobs
end
end
end
#- Do the run
ssh kk
cd /cluster/data/danRer2/RMRun
para create RMJobs
para try, para check, para check, para push, para check,...
# para time
# Completed: 3899 of 3899 jobs
# CPU time in finished jobs: 11636116s 193935.26m 3232.25h 134.68d 0.369 y
# IO & Wait Time: 39078s 651.31m 10.86h 0.45d 0.001 y
# Average job time: 2994s 49.91m 0.83h 0.03d
# Longest job: 4064s 67.73m 1.13h 0.05d
# Submission to last job: 19022s 317.03m 5.28h 0.22d
#- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
ssh kksilo
cd /cluster/data/danRer2
foreach d (*/chr*_?{,?})
set contig = $d:t
echo $contig
liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
> /dev/null
end
#- Lift pseudo-contigs to chromosome level
foreach c (`cat chrom.lst`)
echo lifting $c
cd $c
if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
> /dev/null
endif
cd ..
end
#- Load the .out files into the database with:
ssh hgwdev
cd /cluster/data/danRer2
hgLoadOut danRer2 */chr*.fa.out
# Note: 23 records dropped due to repStart > repEnd (2004-12-01)
# Processing 1/chr1.fa.out
# bad rep range in: 1354 157 0 0 chr1 7313059 7313288 -54894735 - (0) (917) (917) 689 5 3
# bad rep range in: 794 247 63 0 chr1 22624284 22624474 -39583549 - (0) (860) (860) 659 0 1 *
# bad rep range in: 841 234 44 0 chr1 27730487 27730692 -34477331 - (0) (555) (555) 342 5 1
# Processing 11/chr11.fa.out
# bad rep range in: 2007 124 74 108 chr11 6853360 6853376 -30760002 + HATN10_DR DNA hAT 167 166 -289 2
# Processing 13/chr13.fa.out
# bad rep range in: 939 65 0 158 chr13 11182894 11182923 -26476056 + DIRS1_DR LTR DIRS1 6133 6132 0 1
# Processing 14/chr14.fa.out
# bad rep range in: 350 125 29 137 chr14 39592288 39592300 -20407524 + HAT1_DR DNA hAT 612 611 -2681 8
# Processing 16/chr16.fa.out
# bad rep range in: 311 225 0 136 chr16 38708823 38708835 -4054346 + Dr000294 Unknown Unknown 403 400 -549 6
# Processing 18/chr18.fa.out
# bad rep range in: 9780 128 42 58 chr18 12479604 12480762 -35227702 + Dr000158 Unknown Unknown 45 -2229 -3930 1
# Processing 19/chr19.fa.out
# bad rep range in: 249 169 0 167 chr19 25584255 25584266 -26439321 + Dr000331 Unknown Unknown 314 311 -844 3
# Processing 2/chr2.fa.out
# bad rep range in: 596 206 44 0 chr2 24978519 24978655 -27097055 - (1) (317) (317) 176 5 4
# bad rep range in: 326 56 19 0 chr2 40153004 40153058 -11922652 - (129) (79) (79) 25 5 2
# bad rep range in: 1454 56 0 0 chr2 50993722 50993901 -1081809 - (0) (1155) (1155) 977 5 8
# Processing 3/chr3.fa.out
# bad rep range in: 605 76 0 11 chr3 42820214 42820307 -1974931 - (6) (5062) (5062) 4971 5 3
# Processing 4/chr4.fa.out
# bad rep range in: 1072 143 21 100 chr4 920087 920366 -32359418 - (1) (540) (540) 284 5 1
# bad rep range in: 330 194 109 29 chr4 1398685 1398823 -31880961 - (204) (375) (375) 227 5 14
# bad rep range in: 978 212 58 90 chr4 24351201 24351580 -8928204 - (594) (553) (553) 187 5 1
# Processing 5/chr5.fa.out
# bad rep range in: 4380 113 49 44 chr5 4937637 4937659 -62846677 + TDR23 DNA DNA 889 888 -259 1
# Processing 6/chr6.fa.out
# bad rep range in: 649 14 0 0 chr6 7782737 7782809 -25429807 - (956) (419) (419) 348 5 1
# Processing 7/chr7.fa.out
# bad rep range in: 272 158 0 50 chr7 19882140 19882200 -42635991 - (800) (419) (419) 363 5 7
# Processing NA/chrNA.fa.out
# bad rep range in: 1068 181 113 122 chrNA 75025954 75025966 -243271514 + TDR18 DNA DNA 188 187 -385 5
# bad rep range in: 493 109 3 153 chrNA 202800523 202800585 -115496895 + Dr000876 Unknown Unknown 1 -32 -157 1
# bad rep range in: 444 271 4 164 chrNA 249521533 249521548 -68775932 + CR1-1_DR LINE L2 4833 4832 -490 9
# Processing Un/chrUn.fa.out
# bad rep range in: 237 149 0 152 chrUn 96855194 96855206 -76596190 + Dr000331 Unknown Unknown 312 311 -844 1
# To display the new repeats which are classed as "Unknown", add this class
# to $HOME/kent/src/hg/hgTracks/rmskTrack.c
# to the repeatClassNames and repeatClasses arrays
# check coverage of repeats masked
# featureBits -chrom=chr1 danRer1 rmsk
# 11589712 bases of 40488791 (28.624%) in intersection
# featureBits -chrom=chr1 danRer2 rmsk
# 26879295 bases of 61678023 (43.580%) in intersection
# MAKE LIFTALL.LFT (DONE, 2004-10-26, hartera)
# RE-DONE (hartera, 2004-12-01)
ssh kksilo
cd /cluster/data/danRer2
cat */lift/ordered.lft > jkStuff/liftAll.lft
# SIMPLE REPEAT [TRF] TRACK (DONE, 2004-10-26, hartera)
# RE-DONE (DONE, hartera, 2004-12-02)
# TRF runs pretty quickly now... it takes a few hours total runtime,
# so instead of binrsyncing and para-running, just do this on the
# local fileserver
ssh kksilo
rm -r /cluster/data/danRer2/bed/simpleRepeat
mkdir -p /cluster/data/danRer2/bed/simpleRepeat
cd /cluster/data/danRer2/bed/simpleRepeat
mkdir trf
cp /dev/null jobs.csh
foreach d (/cluster/data/danRer2/*/chr*_?{,?})
set ctg = $d:t
foreach f ($d/${ctg}.fa)
set fout = $f:t:r.bed
echo $fout
echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
>> jobs.csh
end
end
chmod a+x jobs.csh
csh -ef jobs.csh >&! jobs.log &
# check on this with
tail -f jobs.log
wc -l jobs.csh
ls -1 trf | wc -l
endsInLf trf/*
# kksilo overloaded so take chrNA_35 onwards as jobs2.csh and run on kolossus
liftUp simpleRepeat.bed /cluster/data/danRer2/jkStuff/liftAll.lft warn \
trf/*.bed
# Load into database
ssh hgwdev
cd /cluster/data/danRer2/bed/simpleRepeat
hgLoadBed danRer2 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
# PROCESS SIMPLE REPEATS INTO MASK (DONE, 2004-10-26, hartera)
# RE-DONE (2004-12-02, hartera)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh kksilo
cd /cluster/data/danRer2/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/chr*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# Lift up filtered trf output to chrom coords as well:
cd /cluster/data/danRer2
mkdir bed/simpleRepeat/trfMaskChrom
foreach c (`cat chrom.lst`)
if (-e $c/lift/ordered.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/ordered.lst > $c/lift/oTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
endif
if (-e $c/lift/random.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/random.lst > $c/lift/rTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
endif
end
# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF
# (DONE, 2004-11-22, hartera)
# RE-DONE (2004-12-02, hartera)
# RE-MAKE MASKED CHR1 SEQUENCE AS IT WAS INCOMPLETELY MASKED (2005-06-06, hartera)
# Although there was an error in the RepeatMasker output for chr1, this
# loaded ok for the chr1_rmsk table so that is complete. It is just the
# actual sequence that was not masked properly.
ssh kksilo
cd /cluster/data/danRer2
# Soft-mask (lower-case) the contig and chr .fa's,
# then make hard-masked versions from the soft-masked.
set trfCtg=bed/simpleRepeat/trfMask
set trfChr=bed/simpleRepeat/trfMaskChrom
# errors in the RepeatMasker output with negative co-ordinates
# (60 instances) - mainly for specific sequences in the new zebrafish
# library so took a look at these together with Yi Zhou from Boston
# Children's Hospital who split these repeats into smaller sequences.
# These were used to replace the original repeats in the repeat library and
# RepeatMasker was re-run. Finally, it was found that a newer version of
# RepeatMasker open-3.0.5 version reduced the negative co-ordinates to
# 2 instances and a cleaned up version of the new zebrafish library
# from RepBase was also used - see above.
foreach f (*/chr*.fa)
echo "repeat- and trf-masking $f"
maskOutFa -soft $f $f.out $f
set chr = $f:t:r
maskOutFa -softAdd $f $trfChr/$chr.bed $f
echo "hard-masking $f"
maskOutFa $f hard $f.masked
end
# with the new version of RepeatMasker, there are 2 instances of negative
# co-ordinates which can just be ignored.
# WARNING: negative rEnd: -2229 chr18:12479605-12480762 Dr000158
# WARNING: negative rEnd: -32 chrNA:202800524-202800585 Dr000876
# with new agp and chrom sequence, there is also an instance of a * instead
# of a co-ordinate (2004-12-02)
# repeat- and trf-masking 1/chr1.fa
# invalid signed number: "*"
# mask contigs also
foreach c (`cat chrom.lst`)
echo "repeat- and trf-masking contigs of chr$c"
foreach d ($c/chr*_?{,?})
set ctg=$d:t
set f=$d/$ctg.fa
maskOutFa -soft $f $f.out $f
maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
maskOutFa $f hard $f.masked
end
end
# same warnings here too:
# WARNING: negative rEnd: -2229 chr18_3:2110746-2111903 Dr000158
# WARNING: negative rEnd: -32 chrNA_41:1844381-1844442 Dr000876
# repeat- and trf-masking contigs of chr1
# invalid signed number: "*" (2004-12-02) and the co-ordinates for chr18_3
# are: WARNING: negative rEnd: -2229 chr18_3:1862490-1863647 Dr000158
# Build nib files, using the soft masking in the fa
mkdir nib
foreach f (*/chr*.fa)
faToNib -softMask $f nib/$f:t:r.nib
end
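# (Optional spot check, not in the original log: faSize reports the
# lower-case (masked) fraction of each chrom; values were not recorded.)
foreach f (*/chr*.fa)
    faSize $f
end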
# Re-do masking for chr1.fa. A user noticed that the masking for this
# is very low compared to other chroms (2% vs almost 50%). Re-running
# maskOutFa and debugging revealed that the error reported above
# "invalid signed number: "*" " is causing the program to abort
# so the chrom is incompletely masked. This row of the RepeatMasker file
# (chr1.fa.out) is incorrect and the maskOutFa program fails when it
# tries to load this line into a data structure.
# Solution: remove the erroneous line (line 52575) and re-run maskOutFa
ssh kkstore01
mkdir /cluster/data/danRer2/tmpMask
cd /cluster/data/danRer2/tmpMask
cp /cluster/data/danRer2/1/chr1.fa .
cp /cluster/data/danRer2/1/chr1.fa.out .
# make sure it is all upper case
cat chr1.fa | tr '[a-z]' '[A-Z]' > chr1.tmp
sed -e 's/>CHR1/>chr1/' chr1.tmp > chr1.fa
# remove line 52575 from chr1.fa.out
# re-run maskOutFa
set trfCtg=/cluster/data/danRer2/bed/simpleRepeat/trfMask
set trfChr=/cluster/data/danRer2/bed/simpleRepeat/trfMaskChrom
foreach f (chr1.fa)
echo "repeat- and trf-masking $f"
maskOutFa -soft $f $f.out $f
set chr = $f:t:r
maskOutFa -softAdd $f $trfChr/$chr.bed $f
echo "hard-masking $f"
maskOutFa $f hard $f.masked
end
# then add chr1New as FASTA heading and copy to the directory for chr1
sed -e 's/>chr1/>chr1New/' chr1.fa > chr1New.fa
sed -e 's/>chr1/>chr1New/' chr1.fa.masked > chr1New.fa.masked
mv chr1.fa.out /cluster/data/danRer2/1/chr1New.fa.out
mv chr1New.fa /cluster/data/danRer2/1/
mv chr1New.fa.masked /cluster/data/danRer2/1/
cd ..
rm -r tmpMask
cd 1
faSize chr1.fa
# 62208023 bases (3421437 N's 58786586 real 57507587 upper 1278999 lower)
# 2.1% are in lower case
faSize chr1New.fa
# 62208023 bases (3421437 N's 58786586 real 31874160 upper 26912426 lower)
# 43% masked as lower case
faSize chr1.fa.masked
# 62208023 bases (4700436 N's 57507587 real 57507587 upper 0 lower)
# 7.6% are Ns
faSize chr1New.fa.masked
# 62208023 bases (30333863 N's 31874160 real 31874160 upper 0 lower)
# 49% are Ns
# just use this new masked sequence for downloads and replace the
# masked chr1 sequences for downloads - see DOWNLOADABLE SEQUENCE
# section below
# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE, 2004-11-22, hartera)
# RE-DONE (2004-12-02, hartera)
# MOVE danRer2.2bit out of gbdb nib directory (2004-12-15, hartera)
# Make symbolic links from /gbdb/danRer2/nib to the real nibs.
ssh hgwdev
cd /cluster/data/danRer2
mkdir -p /gbdb/danRer2/nib
foreach f (/cluster/data/danRer2/nib/chr*.nib)
ln -s $f /gbdb/danRer2/nib
end
# Load /gbdb/danRer2/nib paths into database and save size info
# hgNibSeq creates chromInfo table
hgNibSeq -preMadeNib danRer2 /gbdb/danRer2/nib */chr*.fa
echo "select chrom,size from chromInfo" | hgsql -N danRer2 > chrom.sizes
# take a look at chrom.sizes, should be 28 lines
wc chrom.sizes
# Make one big 2bit file as well, and make a link to it in
# /gbdb/danRer2/nib because hgBlat looks there:
faToTwoBit */chr*.fa danRer2.2bit
ln -s /cluster/data/danRer2/danRer2.2bit /gbdb/danRer2/nib/
# Move this link out of nib directory (2004-12-15, hartera)
rm /gbdb/danRer2/nib/danRer2.2bit
ln -s /cluster/data/danRer2/danRer2.2bit /gbdb/danRer2/
# MAKE GOLD AND GAP TRACKS (DONE, 2004-11-22, hartera)
# RE-DONE (2004-12-03, hartera)
ssh hgwdev
cd /cluster/data/danRer2
# the gold and gap tracks are created from the chrN.agp file and this is
# the scaffolds or supercontigs agp
# move other agp files out of the way to an agps directory
foreach c (`cat chrom.lst`)
mkdir ./$c/agps
mv ./$c/chr${c}.chunks.agp ./$c/agps/
mv ./$c/chr${c}.scaffolds.agp ./$c/agps/
end
# move the scaffolds agp for NA to agps directory
mv ./NA/scaffolds/chrNA.scaffolds.agp ./NA/agps/
# the gold and gap tracks are created from the chrN.agp file
hgGoldGapGl -noGl -chromLst=chrom.lst danRer2 /cluster/data/danRer2 .
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR DANRER2
# (DONE, 2004-11-22, hartera)
# Correct nibPath for danRer2 in dbDb table (2004-12-15, hartera)
# Make trackDb table so browser knows what tracks to expect:
ssh hgwdev
mkdir -p ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer2
cd $HOME/kent/src/hg/makeDb/trackDb
cvs up -d -P
# Edit that makefile to add danRer2 in all the right places and do
make update
make alpha
cvs commit -m "Added danRer2." makefile
# Add dbDb and defaultDb entries:
echo 'insert into dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
values("danRer2", "June 2004", \
"/gbdb/danRer1/nib", "Zebrafish", "chr2:16,330,443-16,335,196", 1, \
37, "Zebrafish", "Danio rerio", \
"/gbdb/danRer2/html/description.html", 0, 0, \
"Sanger Centre, Danio rerio Sequencing Project Zv4");' \
| hgsql -h genome-testdb hgcentraltest
# set danRer2 to be the default assembly for Zebrafish
echo 'update defaultDb set name = "danRer2" \
where genome = "Zebrafish";' \
| hgsql -h genome-testdb hgcentraltest
# dbDb table has danRer1 instead of danRer2 in nib path. However, this
# should point to the position of the danRer2.2bit file
# (2004-12-15, hartera)
echo 'update dbDb set nibPath="/gbdb/danRer2" \
where name = "danRer2";' | hgsql hgcentraltest
# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2004-11-22, hartera)
ssh hgwdev
mkdir /cluster/data/danRer2/html
# make a symbolic link from /gbdb/danRer2/html to /cluster/data/danRer2/html
ln -s /cluster/data/danRer2/html /gbdb/danRer2/html
# Add a description page for zebrafish
cd /cluster/data/danRer2/html
cp $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer1/description.html .
# Edit this for zebrafish danRer2
# create a description.html page here
mkdir -p ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer2
cd ~/kent/src/hg/makeDb/trackDb/zebrafish
cvs add danRer2
cvs commit danRer2
# Add description page here too
cd danRer2
cp /cluster/data/danRer2/html/description.html .
cvs add description.html
cvs commit -m "First draft of description page for danRer2." \
description.html
# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE, 2004-11-22, hartera)
# RE-DONE (2004-12-03, hartera)
# PUT MASKED SEQUENCE ON BLUEARC (DONE, 2004-12-22 and 2004-12-23, hartera)
ssh kkr1u00
# Chrom-level mixed nibs that have been repeat- and trf-masked:
rm -rf /iscratch/i/danRer2/nib
mkdir -p /iscratch/i/danRer2/nib
cp -p /cluster/data/danRer2/nib/chr*.nib /iscratch/i/danRer2/nib
# Pseudo-contig fa that have been repeat- and trf-masked:
rm -rf /iscratch/i/danRer2/trfFa
mkdir /iscratch/i/danRer2/trfFa
foreach d (/cluster/data/danRer2/*/chr*_?{,?})
cp -p $d/$d:t.fa /iscratch/i/danRer2/trfFa
end
rm -rf /iscratch/i/danRer2/rmsk
mkdir -p /iscratch/i/danRer2/rmsk
cp -p /cluster/data/danRer2/*/chr*.fa.out /iscratch/i/danRer2/rmsk
cp -p /cluster/data/danRer2/danRer2.2bit /iscratch/i/danRer2/
iSync
# add to the bluearc
ssh kksilo
mkdir -p /cluster/bluearc/danRer2/nib
cp -p /cluster/data/danRer2/nib/chr*.nib /cluster/bluearc/danRer2/nib
mkdir -p /cluster/bluearc/danRer2/trfFa
foreach d (/cluster/data/danRer2/*/chr*_?{,?})
cp -p $d/$d:t.fa /cluster/bluearc/danRer2/trfFa
end
# CREATE gc5Base wiggle TRACK (DONE, 2004-12-03, hartera)
ssh kksilo
mkdir -p /cluster/data/danRer2/bed/gc5Base
cd /cluster/data/danRer2/bed/gc5Base
# The number of bases that hgGcPercent claimed it measured is calculated,
# which is not necessarily always 5 if it ran into gaps, and then the
# division by 10.0 scales down the numbers from hgGcPercent to the range
# [0-100]. wigEncode now replaces wigAsciiToBinary and the previous
# processing step between these two programs. The result file is *.wig.
# Each value represents the measurement over five bases beginning with
# <position>. wigEncode also calculates the zoomed set of data.
nice hgGcPercent -wigOut -doGaps -file=stdout -win=5 danRer2 \
/iscratch/i/danRer2/nib | \
wigEncode stdin gc5Base.wig gc5Base.wib
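# (Optional check, not in the original log: the .wib holds one byte per
# 5-base window, so its size should be roughly a fifth of the genome.)
ls -l gc5Base.wig gc5Base.wib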
# load the .wig file back on hgwdev:
ssh hgwdev
cd /cluster/data/danRer2/bed/gc5Base
hgLoadWiggle -pathPrefix=/gbdb/danRer2/wib/gc5Base \
danRer2 gc5Base gc5Base.wig
# and symlink the .wib file into /gbdb
mkdir -p /gbdb/danRer2/wib/gc5Base
ln -s `pwd`/gc5Base.wib /gbdb/danRer2/wib/gc5Base
# MAKE 10.OOC, 11.OOC FILE FOR BLAT (DONE, 2004-12-03, hartera)
# Use -repMatch=512 (based on size -- for human we use 1024, and
# the zebrafish genome is ~50% of the size of the human genome)
ssh kkr1u00
mkdir /cluster/data/danRer2/bed/ooc
cd /cluster/data/danRer2/bed/ooc
mkdir -p /cluster/bluearc/danRer2
ls -1 /cluster/data/danRer2/nib/chr*.nib > nib.lst
blat nib.lst /dev/null /dev/null -tileSize=11 \
-makeOoc=/cluster/bluearc/danRer2/11.ooc -repMatch=512
# Wrote 44008 overused 11-mers to /cluster/bluearc/danRer2/11.ooc
# For 10.ooc, repMatch = 4096 for human, so use 2048
blat nib.lst /dev/null /dev/null -tileSize=10 \
-makeOoc=/cluster/bluearc/danRer2/10.ooc -repMatch=2048
# Wrote 10639 overused 10-mers to /cluster/bluearc/danRer2/10.ooc
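# (Illustrative arithmetic, not in the original log: repMatch scales with
# genome size, so roughly half the human settings are used here.)
@ repMatch11 = 1024 / 2
@ repMatch10 = 4096 / 2
echo "11.ooc: $repMatch11  10.ooc: $repMatch10"
# 11.ooc: 512  10.ooc: 2048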
# keep copies of ooc files in this directory and copy to iscratch
cp /cluster/bluearc/danRer2/*.ooc .
cp -p /cluster/bluearc/danRer2/*.ooc /iscratch/i/danRer2/
iSync
# ADD CONTIGS TRACK (DONE, 2004-12-06, hartera)
# make ctgPos2 (contig name, size, chrom, chromStart, chromEnd) from lift
# RELOAD CONTIGS TRACK - NOT ALL CONTIGS HAD LOADED INTO DATABASE TABLE
# (DONE, 2005-06-15, hartera)
ssh kksilo
mkdir -p /cluster/data/danRer2/bed/ctgPos2
cd /cluster/data/danRer2/bed/ctgPos2
# ctgPos2 .sql .as .c and .h files exist - see makeDanRer1.doc
foreach c (`cat /cluster/data/danRer2/chrom.lst`)
awk 'BEGIN {OFS="\t"} \
{if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \
/cluster/data/danRer2/$c/agps/chr${c}.chunks.agp >> ctgPos2.tab
end
ssh hgwdev
cd /cluster/data/danRer2/bed/ctgPos2
# Reload table (2005-06-16, hartera) - not all the contigs had loaded
# into the table because the unique primary key for ctgPos2.sql consisted
# of only the first 14 characters of the contig name. However, contigs for
# danRer2 have names such as "Zv4_scaffold99.1" and "Zv4_scaffold99.3" for
# which the first 14 characters are not unique so only the first one is
# loaded. Also, there are cases where the contig has an accession and more
# than one contig share the same accession as their name so names are not
# unique. Therefore, this was changed in ctgPos2.sql so that the first
# 20 characters of the contig name and the chromStart are the composite
# primary key: PRIMARY KEY(contig(20),chromStart),
echo "drop table ctgPos2" | hgsql danRer2
hgsql danRer2 < ~/kent/src/hg/lib/ctgPos2.sql
echo "load data local infile 'ctgPos2.tab' into table ctgPos2" \
| hgsql danRer2
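# (Optional check, not in the original log: show why the old 14-character
# key collided while contig(20) plus chromStart is unique.)
awk '{print substr($1,1,14)}' ctgPos2.tab | sort | uniq -d | wc -l
# non-zero: 14-character contig prefixes are not unique
awk '{print substr($1,1,20), $4}' ctgPos2.tab | sort | uniq -d | wc -l
# should be 0: the composite key is unique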
# Add trackDb.ra entry and ctgPos2.html
# ENSEMBL GENES (DONE, 2004-12-06, hartera)
ssh hgwdev
mkdir -p /cluster/data/danRer2/bed/ensembl
cd /cluster/data/danRer2/bed/ensembl
# Get the ensembl protein data from
# http://www.ensembl.org/Danio_rerio/martview
# Follow this sequence through the pages:
# Page 1) Make sure that the Danio_rerio choice is selected. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Structures" box.
# Page 4) Choose GTF as the output. Choose gzip compression. Hit export.
# Save as ensemblGene.gtf.gz
# the Ensembl gene predictions are mapped to chromosomes except for
# chrNA and chrUn. So, a lift file needs to be created to lift from the
# scaffold to chromosome co-ordinates
ssh kksilo
mkdir -p /cluster/data/danRer2/bed/ensembl/liftSupertoChrom
cd /cluster/data/danRer2/bed/ensembl/liftSupertoChrom
# the lift files for scaffolds to chrom for NA and Un are already created
cp /cluster/data/danRer2/Un/chrUn.lft .
cp /cluster/data/danRer2/NA/chrNA.lft .
cat *.lft >> liftUnScaffoldToChrom.lft
# get chrUn and chrNA records
cd /cluster/data/danRer2/bed/ensembl
awk '$1 ~ /^Zv4_NA[0-9]+/ || $1 ~ /^Zv4_scaffold[0-9]+/' ensemblGene.gtf \
> ensemblGenechrUns.gtf
# get records for all other chroms
awk '$1 ~ /^[0-9]+/' ensemblGene.gtf > ensemblGenechroms.gtf
liftUp -type=.gtf ensemblGenechrUns.lifted \
./liftSupertoChrom/liftUnScaffoldToChrom.lft warn ensemblGenechrUns.gtf
# Got 38882 lifts in ./liftSupertoChrom/liftUnScaffoldToChrom.lft
sed -e "s/^/chr/" ensemblGenechroms.gtf > ensGene.gtf
cat ensemblGenechrUns.lifted >> ensGene.gtf
# check some of the lifted co-ordinates
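# (Sketch of a check, not the original commands: liftUp in warn mode should
# not drop records, so line counts must match, and the first record's start
# should shift by that scaffold's offset in the lift file.)
wc -l ensemblGenechrUns.gtf ensemblGenechrUns.lifted
head -1 ensemblGenechrUns.gtf
head -1 ensemblGenechrUns.lifted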
# load into database
ssh hgwdev
cd /cluster/data/danRer2/bed/ensembl
/cluster/bin/i386/ldHgGene danRer2 ensGene \
/cluster/data/danRer2/bed/ensembl/ensGene.gtf
# Read 32062 transcripts in 592139 lines in 1 files
# 32062 groups 27 seqs 1 sources 4 feature types
# 32062 gene predictions
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format. Result name ensGtp.
gunzip ensGtp.tsv.gz
hgsql danRer2 < ~/kent/src/hg/lib/ensGtp.sql
# remove the header line from ensGtp.tsv
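# e.g. (sketch; the exact edit used was not recorded):
sed -e '1d' ensGtp.tsv > ensGtp.tmp
mv ensGtp.tmp ensGtp.tsv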
echo "load data local infile 'ensGtp.tsv' into table ensGtp" \
| hgsql -N danRer2
# Get the ensembl peptide sequences from
# http://www.ensembl.org/Danio_rerio/martview
# Choose Danio Rerio as the organism
# Follow this sequence through the pages:
# Page 1) Choose the Ensembl Genes choice. Hit next.
# Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
# Page 3) Choose the "Sequences" tab.
# Page 4) Choose Transcripts/Proteins and peptide Only as the output,
# choose text/fasta and gzip compression,
# name the file ensemblPep.fa.gz and then hit export.
gunzip ensemblPep.fa.gz
hgPepPred danRer2 ensembl ensemblPep.fa
# AFFYMETRIX ZEBRAFISH GENOME ARRAY CHIP (DONE, 2004-12-06, hartera)
# sequences already downloaded for danRer1
ssh hgwdev
cd /projects/compbio/data/microarray/affyZebrafish
cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
/cluster/bluearc/affy/
# Set up cluster job to align Zebrafish consensus sequences to danRer2
ssh kkr1u00
mkdir -p /cluster/data/danRer2/bed/affyZebrafish.2004-12-06
ln -s /cluster/data/danRer2/bed/affyZebrafish.2004-12-06 \
/cluster/data/danRer2/bed/affyZebrafish
cd /cluster/data/danRer2/bed/affyZebrafish
mkdir -p /iscratch/i/affy
cp /cluster/bluearc/affy/Zebrafish_consensus.fa /iscratch/i/affy
iSync
ssh kk
cd /cluster/data/danRer2/bed/affyZebrafish
ls -1 /iscratch/i/affy/Zebrafish_consensus.fa > affy.lst
ls -1 /iscratch/i/danRer2/trfFa/ > genome.lst
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/iscratch/i/danRer2/11.ooc /iscratch/i/danRer2/trfFa/$(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 genome.lst affy.lst template.sub para.spec
mkdir psl
para create para.spec
para try, check, push ... etc.
# para time
# Completed: 299 of 299 jobs
# CPU time in finished jobs: 4702s 78.37m 1.31h 0.05d 0.000 y
# IO & Wait Time: 3077s 51.28m 0.85h 0.04d 0.000 y
# Average job time: 26s 0.43m 0.01h 0.00d
# Longest job: 128s 2.13m 0.04h 0.00d
# Submission to last job: 685s 11.42m 0.19h 0.01d
ssh kksilo
cd /cluster/data/danRer2/bed/affyZebrafish
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyZebrafish.psl
pslSort dirs raw.psl tmp psl
# only use alignments that have at least
# 95% identity in aligned region.
# do not use minCover since a lot of sequence is in Un, NA and Finished
# so genes may be split up so good to see all alignments
pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyZebrafish.psl ../../jkStuff/liftAll.lft warn contig.psl
# shorten names in psl file
sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.bak
mv affyZebrafish.psl.bak affyZebrafish.psl
pslCheck affyZebrafish.psl
# psl is good
# load track into database
ssh hgwdev
cd /cluster/data/danRer2/bed/affyZebrafish
hgLoadPsl danRer2 affyZebrafish.psl
# Add consensus sequences for Zebrafish chip
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/affyProbes
ln -s \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
/gbdb/hgFixed/affyProbes
hgLoadSeq -abbr=Zebrafish: danRer2 \
/gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
# Clean up
rm batch.bak contig.psl raw.psl
# add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/zebrafish/danRer2
# RH MAP TRACK (DONE, 2004-12-07, hartera)
# create sym links to RH sequences in /gbdb/danRer2 (2004-12-17, hartera)
# RE-DO RH MAP TRACK WITH MORE STRINGENT POST-BLAT FILTERING
# (DONE, 2005-05-15, hartera)
# Data from Leonard Zon's lab at the Childrens Hospital, Boston
# Radiation hybrid (RH) map data consists of 8707 genomic sequences:
# (1) 2835 BAC clone ends (Max Planck Inst. Robert
# Geisler's lab, Tuebingen, Germany,
# (2) 2015 NCBI ESTs (submitted from around the planet,
# (3) 171 RH shotgun sequences (Max Planck Inst. & Children's
# Hosp. Boston,
# (4) 3686 WZ compiled ESTs from WashU
#
ssh kkr1u00
mkdir -p /cluster/data/danRer2/bed/ZonLab/rhMap
cd /cluster/data/danRer2/bed/ZonLab/rhMap
# sequences for the RH map are in /cluster/data/danRer1/bed/ZonLab/rhmap/seq/rh.fa
# rhcontigs.fa has all the sequences in one file with formatted headers
# copy to iscratch if not there already
mkdir -p /iscratch/i/danRer2/rhmap
cp -p /cluster/data/danRer1/bed/ZonLab/rhmap/seq/rhcontigs.fa \
/iscratch/i/danRer2/rhmap
iSync
# do the blat run to map RH map sequences to danRer2
ssh kk
cd /cluster/data/danRer2/bed/ZonLab/rhmap
mkdir psl
ls -1S /iscratch/i/danRer2/rhmap/rhcontigs.fa > rhmap.lst
ls -1S /iscratch/i/danRer2/trfFa/*.fa > genome.lst
# try same parameters as for bacEnds
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -tileSize=10 -ooc=/iscratch/i/danRer2/10.ooc {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 genome.lst rhmap.lst gsub spec
para create spec
para try
para check
para push, try ... etc.
# Completed: 299 of 299 jobs
# CPU time in finished jobs: 7977s 132.94m 2.22h 0.09d 0.000 y
# IO & Wait Time: 3114s 51.91m 0.87h 0.04d 0.000 y
# Average job time: 37s 0.62m 0.01h 0.00d
# Longest job: 54s 0.90m 0.01h 0.00d
# Submission to last job: 155s 2.58m 0.04h 0.00d
ssh kksilo
cd /cluster/data/danRer2/bed/ZonLab/rhMap
# Make & check the psl table
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create rhMap.psl
pslSort dirs raw.psl tmp psl
# only use alignments that have at least 80% identity in aligned region.
# these are the parameters used for STS markers
# pslReps -nearTop=0.0001 -minAli=0.8 -noIntrons raw.psl \
# contig.psl /dev/null
# Processed 667164 alignments
# try more stringent filtering for these: minAli=0.96 and minCover=0.40
# was suggested by Jim (DONE, 2005-05-15, hartera) - these parameters
# worked well for danRer1 - see makeDanRer1.doc
mv rhMap.psl rhMap.psl.old
mv contig.psl contig.psl.old
pslReps -nearTop=0.0001 -minAli=0.96 -minCover=0.40 raw.psl \
contig.psl /dev/null
# Processed 667164 alignments
liftUp rhMap.psl /cluster/data/danRer2/jkStuff/liftAll.lft warn contig.psl
pslCheck rhMap.psl
# psl is ok.
# for the previous parameters, the sequence with the most alignments had 1128;
# over 1400 sequences had multiple alignments and 7064 had 1 alignment.
# Now the most is 8: 78 sequences have 3-8 alignments, 820 have 2 and
# 6709 have 1 alignment.
# for the new filtered psl there are 8609 alignments, there were 13821
# for the previous parameters (8492 sequences aligned, 98%)
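# (How counts like these can be derived; not the original commands, and
# any psl header lines would need to be skipped first.)
awk '{print $10}' rhMap.psl | sort | uniq -c | sort -nr | head -1
# highest count is the most-aligned sequence (8 here)
awk '{print $10}' rhMap.psl | sort | uniq -c | awk '{print $1}' | sort -n | uniq -c
# tallies how many sequences align once, twice, etc.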
# Load sequence alignments into database.
# Reload rhMap alignments - more stringent filtering (hartera, 2005-05-15)
ssh hgwdev
cd /cluster/data/danRer2/bed/ZonLab/rhMap
echo "drop table rhMap;" | hgsql danRer2
hgLoadPsl danRer2 rhMap.psl
hgsql -N -e "select qName from rhMap;" danRer2 | sort | uniq | wc
# 7607 out of 8690 sequences aligned (88%)
# Add RH Map sequences
# Copy sequences to gbdb if they are not there already
# create sym link in /gbdb/danRer2 directory instead of /gbdb/danRer1
# (2004-12-17, hartera)
mkdir -p /gbdb/danRer2/rhMap
ln -s \
/cluster/data/danRer1/bed/ZonLab/rhmap/seq/rhcontigs.fa \
/gbdb/danRer2/rhMap
cd /cluster/data/danRer2/bed/ZonLab/rhMap
hgLoadSeq danRer2 /gbdb/danRer1/rhmap/rhcontigs.fa
# change extFile entry
echo 'update extFile set path = "/gbdb/danRer2/rhMap/rhcontigs.fa" where id = 15513;' | hgsql danRer2
# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR DANRER2
# (DONE, 2004-12-08, hartera)
# hgcentraltest is now on hgwdev
ssh hgwdev
# DNA port is "0", trans prot port is "1"
echo 'insert into blatServers values("danRer2", "blat14", "17789", "0", "0"); insert into blatServers values("danRer2", "blat14", "17788", "1", "0");' \
| hgsql hgcentraltest
# if you need to delete those entries
echo 'delete from blatServers where db="danRer2";' \
| hgsql hgcentraltest
# to check the entries:
echo 'select * from blatServers where db="danRer2";' \
| hgsql hgcentraltest
# AUTO UPDATE GENBANK MRNA AND EST RUN (DONE, 2004-12-09, hartera)
ssh eieio
cd /cluster/data/genbank
# This is a new assembly, edit the etc/genbank.conf file and add:
# danRer2 (zebrafish)
danRer2.genome = /iscratch/i/danRer2/nib/chr*.nib
danRer2.lift = /cluster/data/danRer2/jkStuff/liftAll.lft
danRer2.downloadDir = danRer2
# Default includes native genbank mRNAs and ESTs,
# genbank xeno mRNAs but no xenoESTs, native RefSeq mRNAs but
# not xeno RefSeq
cvs commit -m "Added danRer2" etc/genbank.conf
# No need to edit ~/kent/src/hg/makeDb/genbank/src/align/gbBlat
# since the 11.ooc file for danRer1 will be good for both assemblies.
# as no new sequence was added, just an improvement on the mapping of
# unmapped sequence to chromosomes
# ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains
# danRer genome information
# Install to /cluster/data/genbank:
make install-server
ssh eieio
cd /cluster/data/genbank
# This is an -initial run, all sequences:
nice bin/gbAlignStep -verbose=1 -initial danRer2
# Load results for all sequences:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad danRer2
# Results need to be moved out of the way
cd /cluster/data/genbank/work
mv initial.danRer2 initial.danRer2.allseqs
# BLASTZ SWAP FOR hg17 vs. danRer2 BLASTZ TO CREATE danRer2 vs. hg17
# (DONE, 2004-12-09, hartera)
# USE RESCORED ALIGNMENTS (see makeHg17.doc)
ssh kolossus
mkdir -p /cluster/data/danRer2/bed/blastz.hg17.swap
cd /cluster/data/danRer2/bed/blastz.hg17.swap
# use rescored axtChrom from blastzDanRer2 on hg17
set aliDir = /cluster/data/hg17/bed/blastz.danRer2
cp $aliDir/S1.len S2.len
cp $aliDir/S2.len S1.len
mkdir unsorted axtChrom
cat $aliDir/axtChrom/chr*.axt \
| axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
| axtSplitByTarget stdin unsorted
# Sort the shuffled .axt files.
foreach f (unsorted/*.axt)
echo sorting $f:t:r
axtSort $f axtChrom/$f:t
end
du -sh $aliDir/axtChrom unsorted axtChrom
# 1.1G /cluster/data/hg17/bed/blastz.danRer2/axtChrom
# 1.1G unsorted
# 1.1G axtChrom
rm -r unsorted
# translate sorted axt files into psl
cd /cluster/data/danRer2/bed/blastz.hg17.swap
mkdir -p pslChrom
set tbl = "blastzHg17"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.hg17.swap/pslChrom
foreach f (./*.psl)
/cluster/bin/i386/hgLoadPsl danRer2 $f
echo "$f Done"
end
# CHAIN HUMAN (hg17) BLASTZ (DONE, 2004-12-10, hartera)
# APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE RESULTS OF REPEATS
# AND DEGENERATE DNA (DONE, 2004-12-21, hartera)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/danRer2/bed/blastz.hg17.swap
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
# Reuse gap penalties from hg16 vs chicken run.
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
tGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../chickenHumanTuned.gap $1 \
/iscratch/i/danRer2/nib \
/iscratch/i/gs.18/build35/bothMaskedNibs $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# create input list and batch for cluster run
ls -1S /cluster/data/danRer2/bed/blastz.hg17.swap/axtChrom/*.axt \
> input.lst
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 2416s 40.27m 0.67h 0.03d 0.000 y
# IO & Wait Time: 351s 5.85m 0.10h 0.00d 0.000 y
# Average job time: 99s 1.65m 0.03h 0.00d
# Longest job: 144s 2.40m 0.04h 0.00d
# Submission to last job: 524s 8.73m 0.15h 0.01d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r >> hist5000.out
textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
echo ""
end
# only chrNA has a large number of chains with score <= 5000 (>200,000 chains)
# chrUn has about 90,000 chains with score <=5000
# apply filter with minScore=5000
rm -r chain
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chain
chainSplit chain all.chain
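# (Optional tally, not in the original log: chain counts before and after
# the minScore filter.)
grep -c '^chain' all.chain.unfiltered all.chain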
# remove repeats from chains and reload into database
# (2004-12-21, hartera)
mv chain chainRaw
mkdir chain
cd chainRaw
foreach f (*.chain)
set c = $f:r
echo $c
nice chainAntiRepeat /iscratch/i/danRer2/nib \
/iscratch/i/gs.18/build35/bothMaskedNibs $f \
../chain/$c.chain
end
cd ..
chainMergeSort ./chain/*.chain > all.chain.antirepeat
chainSplit chainAR all.chain.antirepeat
# load filtered chains and check
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain/chainAR
foreach i (*.chain)
set c = $i:r
hgLoadChain danRer2 ${c}_chainHg17 $i
echo done $c
end
# featureBits -chrom=chr1 danRer2 refGene:cds chainHg17 -enrichment
# refGene:cds 0.512%, chainHg17 34.242%, both 0.444%, cover 86.63%,
# enrich 2.53x
# featureBits -chrom=chr1 danRer2 refGene:cds chainHg17Link -enrichment
# refGene:cds 0.512%, chainHg17Link 4.760%, both 0.399%, cover 77.80%,
# enrich 16.3
# featureBits -chrom=chr1 danRer1 refGene:cds chainHg17Link -enrichment
# refGene:cds 0.529%, chainHg17Link 3.924%, both 0.409%, cover 77.24%,
# enrich 19.69x
# number of rows for chr1:
# chainHg17Link 185694
# after using chainAntiRepeat
# chainHg17ARLink 176455
# after chainAntiRepeat done on filtered chains:
# featureBits -chrom=chr1 danRer2 refGene:cds chainHg17ARLink -enrichment
# refGene:cds 0.512%, chainHg17ARLink 4.625%, both 0.398%, cover 77.79%,
# enrich 16.82x
# NET HUMAN (hg17) BLASTZ (DONE, 2004-12-10,hartera)
# RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-21,hartera)
ssh kksilo
cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
mkdir preNet
cd chainAR
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
# memory usage 126734336, utime 657 s/100, stime 69
# Add classification info using db tables:
# netClass looks for ancient repeats in one of the databases
# hg17 has this table - hand-curated by Arian but this is for
# human-rodent comparisons so do not use here, use -noAr option
# linSpecRep.notInHuman and linSpecRep.notInZebrafish exist
# (see makeHg17.doc)
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
time netClass noClass.net danRer2 hg17 humanhg17.net \
-tNewR=/cluster/bluearc/danRer2/linSpecRep.notInHuman \
-qNewR=/cluster/bluearc/hg17/linSpecRep.notInZebrafish -noAr
# 87.940u 51.110s 4:05.95 56.5% 0+0k 0+0io 631pf+0w
# load net into database
netFilter -minGap=10 humanhg17.net | hgLoadNet danRer2 netHg17 stdin
# featureBits danRer1 refGene:cds netHg17 -enrichment
# refGene:cds 0.462%, netHg17 27.868%, both 0.384%, cover 83.21%, enrich 2.99x
# featureBits danRer2 refGene:cds netHg17 -enrichment
# refGene:cds 0.468%, netHg17 30.608%, both 0.406%, cover 86.82%, enrich 2.84x
# BLASTZ SWAP FOR mm5 vs danRer2 BLASTZ TO CREATE danRer2 vs mm5 BLASTZ
# (DONE, 2004-12-13, hartera)
ssh kolossus
mkdir -p /cluster/data/danRer2/bed/blastz.mm5.swap
cd /cluster/data/danRer2/bed/blastz.mm5.swap
# use rescored axtChrom from the blastz run of danRer2 on mm5
set aliDir = /cluster/data/mm5/bed/blastz.danRer2
cp $aliDir/S1.len S2.len
cp $aliDir/S2.len S1.len
mkdir unsorted axtChrom
cat $aliDir/axtChrom/chr*.axt \
| axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
| axtSplitByTarget stdin unsorted
# Sort the shuffled .axt files.
foreach f (unsorted/*.axt)
echo sorting $f:t:r
axtSort $f axtChrom/$f:t
end
du -sh $aliDir/axtChrom unsorted axtChrom
# 919M /cluster/data/mm5/bed/blastz.danRer2/axtChrom
# 921M unsorted
# 919M axtChrom
rm -r unsorted
# translate sorted axt files into psl
ssh kolossus
cd /cluster/data/danRer2/bed/blastz.mm5.swap
mkdir -p pslChrom
set tbl = "blastzMm5"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.mm5.swap/pslChrom
foreach f (./*.psl)
/cluster/bin/i386/hgLoadPsl danRer2 $f
echo "$f Done"
end
# CHAIN MOUSE (mm5) BLASTZ (DONE, 2004-12-13, hartera)
# APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE RESULTS OF REPEATS
# AND DEGENERATE DNA (DONE, 2004-12-21, hartera)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/danRer2/bed/blastz.mm5.swap
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/danRer2/bed/blastz.mm5.swap/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Reuse gap penalties from hg16 vs chicken run.
cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize^V 11
smallSize^V 111
position^V 1^V 2^V 3^V 11^V 111^V 2111^V 12111^V 32111^V 72111^V 152111^V 252111
qGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600
tGap^V 325^V 360^V 400^V 450^V 600^V 1100^V 3600^V 7600^V 15600^V 31600^V 56600
bothGap^V 625^V 660^V 700^V 750^V 900^V 1400^V 4000^V 8000^V 16000^V 32000^V 57000
'_EOF_'
# << this line makes emacs coloring happy
# create chains with only default filtering on score - minScore = 1000
cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
-linearGap=../../chickenHumanTuned.gap $1 \
/iscratch/i/danRer2/nib \
/cluster/bluearc/scratch/mus/mm5/softNib $2 >& $3
'_EOF_'
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 2405s 40.08m 0.67h 0.03d 0.000 y
# IO & Wait Time: 447s 7.46m 0.12h 0.01d 0.000 y
# Average job time: 102s 1.70m 0.03h 0.00d
# Longest job: 147s 2.45m 0.04h 0.00d
# Submission to last job: 523s 8.72m 0.15h 0.01d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r >> hist5000.out
textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
echo ""
end
# only chrUn and chrNA have >100,000 chains with score <=5000
# filter on minScore=5000
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chainFilt5k
rm -r chain
chainSplit chain all.chainFilt5k
# remove repeats from chains and reload into database
# (2004-12-21, hartera)
ssh kksilo
cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain
mv chain chainRaw
mkdir chain
cd chainRaw
foreach f (*.chain)
set c = $f:r
echo $c
nice chainAntiRepeat /iscratch/i/danRer2/nib \
/cluster/bluearc/scratch/mus/mm5/softNib $f \
../chain/$c.chain
end
cd ..
chainMergeSort ./chain/*.chain > all.chain.antirepeat
chainSplit chainAR all.chain.antirepeat
# load chains into database
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain/chainAR
foreach i (*.chain)
set c = $i:r
hgLoadChain danRer2 ${c}_chainMm5 $i
echo done $c
end
# featureBits -chrom=chr1 danRer2 refGene:cds chainMm5 -enrichment
# refGene:cds 0.512%, chainMm5 34.341%, both 0.436%, cover 85.08%, enrich 2.48x
# featureBits -chrom=chr1 danRer2 refGene:cds chainMm5Link -enrichment
# refGene:cds 0.512%, chainMm5Link 4.588%, both 0.395%, cover 77.08%,
# enrich 16.80x
# featureBits -chrom=chr1 danRer1 refGene:cds chainMm5 -enrichment
# refGene:cds 0.529%, chainMm5 42.554%, both 0.459%, cover 86.70%, enrich 2.04x
# featureBits -chrom=chr1 danRer1 refGene:cds chainMm5Link -enrichment
# refGene:cds 0.529%, chainMm5Link 7.948%, both 0.412%, cover 77.80%,
# enrich 9.79x
# after chainAntiRepeats:
# featureBits -chrom=chr1 danRer2 refGene:cds chainMm5Link -enrichment
# refGene:cds 0.512%, chainMm5Link 4.499%, both 0.395%, cover 77.08%,
# enrich 17.13x
# before chainAntiRepeat:
# chr1_chainMm5Link 167661
# after chainAntiRepeat:
# chr1_chainMm5Link 162853
# NET MOUSE (mm5) BLASTZ (DONE, 2004-12-13, hartera)
# RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-21,hartera)
ssh kksilo
cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain
mkdir preNet
cd chainAR
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
# memory usage 119873536, utime 849 s/100, stime 81
# Add classification info using db tables:
# netClass looks for ancient repeats in one of the databases
# hg17 has this table - hand-curated by Arian but this is for
# human-rodent comparisons so do not use here, use -noAr option
mkdir -p /cluster/bluearc/mm5/linSpecRep.notInZebrafish
mkdir -p /cluster/bluearc/danRer2/linSpecRep.notInMouse
cp /iscratch/i/mm5/linSpecRep.notInZebrafish/* \
/cluster/bluearc/mm5/linSpecRep.notInZebrafish
cp /iscratch/i/danRer2/linSpecRep.notInMouse/* \
/cluster/bluearc/danRer2/linSpecRep.notInMouse
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain
time netClass noClass.net danRer2 mm5 mouseMm5.net \
-tNewR=/cluster/bluearc/danRer2/linSpecRep.notInMouse \
-qNewR=/cluster/bluearc/mm5/linSpecRep.notInZebrafish -noAr
# 86.210u 51.710s 4:02.29 56.9% 0+0k 0+0io 206pf+0w
netFilter -minGap=10 mouseMm5.net | hgLoadNet danRer2 netMm5 stdin
# featureBits danRer2 refGene:cds netMm5 -enrichment
# refGene:cds 0.468%, netMm5 30.706%, both 0.404%, cover 86.40%, enrich 2.81x
# featureBits danRer1 refGene:cds netMm5 -enrichment
# refGene:cds 0.461%, netMm5 36.622%, both 0.395%, cover 85.70%, enrich 2.34x
# add html and trackDb.ra entries
# after chainAntiRepeat:
# featureBits danRer2 refGene:cds netMm5 -enrichment
# refGene:cds 0.468%, netMm5 30.565%, both 0.405%, cover 86.40%, enrich 2.83x
# MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2004-12-14, hartera)
# CORRECTION MADE TO chrM.agp FILE - MADE TAB DELIMITED INSTEAD OF
# SPACE DELIMITED SO REPLACE OLD AGP FILE IN DOWNLOADS
# (DONE, 2005-04-25, hartera)
# REPLACE chr1 WITH chr1New IN chromFa and chromFaMasked - chr1 was
# incompletely masked due to an error in the RepeatMasker output for chr1.
# See section on "MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF"
# (2005-06-06, hartera)
# UPDATED README FILES WITH NEW SEQUENCE FTP FOR SANGER (2005-08-04,hartera)
ssh kksilo
cd /cluster/data/danRer2
#- Build the .zip files
cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf zip
mkdir zip
# chrom AGP's
zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp
# chrom RepeatMasker out files
zip -j zip/chromOut.zip */chr*.fa.out
# soft masked chrom fasta
zip -j zip/chromFa.zip */chr*.fa
# hard masked chrom fasta
zip -j zip/chromFaMasked.zip */chr*.fa.masked
# chrom TRF output files
cd bed/simpleRepeat
zip ../../zip/chromTrf.zip trfMaskChrom/chr*.bed
cd ../..
# get GenBank native mRNAs
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=danRer2 -native GenBank mrna \
/cluster/data/danRer2/zip/mrna.fa
# get GenBank xeno mRNAs
./bin/i386/gbGetSeqs -db=danRer2 -xeno GenBank mrna \
/cluster/data/danRer2/zip/xenoMrna.fa
# get native RefSeq mRNAs
./bin/i386/gbGetSeqs -db=danRer2 -native refseq mrna \
/cluster/data/danRer2/zip/refMrna.fa
# get native GenBank ESTs
./bin/i386/gbGetSeqs -db=danRer2 -native GenBank est \
/cluster/data/danRer2/zip/est.fa
cd /cluster/data/danRer2/zip
# zip GenBank native and xeno mRNAs, native ESTs and RefSeq mRNAs
zip -j mrna.zip mrna.fa
zip -j xenoMrna.zip xenoMrna.fa
zip -j refMrna.zip refMrna.fa
zip -j est.zip est.fa
'_EOF_'
# << this line makes emacs coloring happy
csh ./jkStuff/zipAll.csh |& tee ./jkStuff/zipAll.log
cd zip
#- Look at zipAll.log to make sure all file lists look reasonable.
# Make upstream files and Copy the .zip files to
# hgwdev:/usr/local/apache/...
ssh hgwdev
cd /cluster/data/danRer2/zip
# make upstream files for zebrafish RefSeq
featureBits danRer2 refGene:upstream:1000 -fa=upstream1000.fa
zip upstream1000.zip upstream1000.fa
featureBits danRer2 refGene:upstream:2000 -fa=upstream2000.fa
zip upstream2000.zip upstream2000.fa
#- Check zip file integrity:
foreach f (*.zip)
unzip -t $f > $f.test
tail -1 $f.test
end
wc -l *.zip.test
set gp = /usr/local/apache/htdocs/goldenPath/danRer2
mkdir -p $gp/bigZips
cp -p *.zip $gp/bigZips
mkdir -p $gp/chromosomes
foreach f (../*/chr*.fa)
zip -j $gp/chromosomes/$f:t.zip $f
end
cd $gp/bigZips
md5sum *.zip > md5sum.txt
cd $gp/chromosomes
md5sum *.zip > md5sum.txt
# Take a look at bigZips/* and chromosomes/*, update their README.txt's
# Remake just the chromAgp.zip with correct chrM.agp (2005-04-25,hartera)
ssh kksilo
cd /cluster/data/danRer2
rm /cluster/data/danRer2/zip/chromAgp.zip
# chrom AGP's
zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp
ssh hgwdev
cd /cluster/data/danRer2/zip
#- Check zip file integrity:
foreach f (chromAgp.zip)
unzip -t $f > $f.test
tail -1 $f.test
end
wc -l chromAgp.zip.test
# looks good
set gp = /usr/local/apache/htdocs/goldenPath/danRer2
rm $gp/bigZips/chromAgp.zip
rm $gp/bigZips/md5sum.txt
cp -p chromAgp.zip $gp/bigZips
# go to directory with zip files and remake the md5sum.txt file
cd $gp/bigZips
md5sum *.zip > md5sum.txt
# Remake chr1 zips for masked file with chr1New.fa (2005-06-06, hartera)
ssh kkstore01
cd /cluster/data/danRer2
# move old chr1 files temporarily out of the way
mkdir 1/tmp
mv 1/chr1.fa.out 1/tmp
mv 1/chr1.fa 1/tmp
mv 1/chr1.fa.masked 1/tmp
rm zip/chromOut.zip zip/chromFa.zip zip/chromFaMasked.zip
# chrom RepeatMasker out files
zip -j zip/chromOut.zip */chr*.fa.out
# soft masked chrom fasta
zip -j zip/chromFa.zip */chr*.fa
# hard masked chrom fasta
zip -j zip/chromFaMasked.zip */chr*.fa.masked
cd /cluster/data/danRer2/zip
#- Check zip file integrity:
foreach f (*.zip)
unzip -t $f > $f.test
tail -1 $f.test
end
wc -l *.zip.test
ssh hgwdev
cd /cluster/data/danRer2/zip
set gp = /usr/local/apache/htdocs/goldenPath/danRer2
# copy over just those modified zip files
cp -p chromOut.zip $gp/bigZips
cp -p chromFa.zip $gp/bigZips
cp -p chromFaMasked.zip $gp/bigZips
# remove chr1.fa.zip and zip chr1New
rm $gp/chromosomes/chr1.fa.zip
zip -j $gp/chromosomes/chr1New.fa.zip ../1/chr1New.fa
# replace md5sum
cd $gp/bigZips
rm md5sum.txt
md5sum *.zip > md5sum.txt
cd $gp/chromosomes
rm md5sum.txt
md5sum *.zip > md5sum.txt
# Update README.txt for each to state that a new masked chr1 was produced
# updated bigZips and chromosomes README.txt files with new ftp site
# for the assembly sequence at Sanger (2005-08-04, hartera):
# now ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv4release/
# MAKE VSHG17 DOWNLOADABLES (DONE, 2004-12-14, hartera)
# REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat
# (DONE, 2004-12-21, hartera)
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChrom
set gp = /usr/local/apache/htdocs/goldenPath/danRer2
mkdir -p $gp/vsHg17/axtChrom
cp -p *.axt $gp/vsHg17/axtChrom
cd $gp/vsHg17/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# copy chains and net to downloads area
# re-make chains and net downloadables (2004-12-21, hartera)
rm $gp/vsHg17/human*.gz $gp/vsHg17/md5sum.txt
cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
gzip -c all.chain.antirepeat > /cluster/data/danRer2/zip/humanHg17.chain.gz
gzip -c humanhg17.net > /cluster/data/danRer2/zip/humanHg17.net.gz
cd $gp/vsHg17
mv /cluster/data/danRer2/zip/human*.gz .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# MAKE VSMM5 DOWNLOADABLES (DONE, 2004-12-14, hartera)
# REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat
# (DONE, 2004-12-21, hartera)
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChrom
set gp = /usr/local/apache/htdocs/goldenPath/danRer2
mkdir -p $gp/vsMm5/axtChrom
cp -p *.axt $gp/vsMm5/axtChrom
cd $gp/vsMm5/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# copy chains and nets to downloads area
# re-make chains and net downloadables (2004-12-21, hartera)
rm $gp/vsMm5/mouse*.gz $gp/vsMm5/md5sum.txt
cd /cluster/data/danRer2/bed/blastz.mm5.swap/axtChain
gzip -c all.chain.antirepeat > /cluster/data/danRer2/zip/mouseMm5.chain.gz
gzip -c mouseMm5.net > /cluster/data/danRer2/zip/mouseMm5.net.gz
cd $gp/vsMm5
mv /cluster/data/danRer2/zip/mouse*.gz .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# BLASTZ HG17 CLEANUP (DONE, 2004-12-14, hartera)
# RE-DONE (DONE, 2004-12-21, hartera)
ssh kksilo
cd /cluster/data/danRer2/bed/blastz.hg17.swap
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/noClass.net &
nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/*.net &
nice gzip axtChain/all.chain.antirepeat axtChain/chainAR/*.chain &
nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet &
# BLASTZ MM5 CLEANUP (DONE, 2004-12-14, hartera)
# RE-DONE (DONE, 2004-12-21, hartera)
ssh kksilo
cd /cluster/data/danRer2/bed/blastz.mm5.swap
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/noClass.net &
nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/*.net &
nice gzip axtChain/all.chain.antirepeat axtChain/chainAR/*.chain &
nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet &
# BLASTZ FOR FUGU (fr1) (DONE, 2004-12-14, hartera)
# Fugu is quite distant from zebrafish, more distant than human/chicken
# so use the HoxD55.q matrix in future for blastz (2005-06-01, hartera)
# Blastz requires lineage-specific repeats but there are none
# for these two fish.
ssh kk
mkdir -p /cluster/data/danRer2/bed/blastz.fr1.2004-12-13
ln -s /cluster/data/danRer2/bed/blastz.fr1.2004-12-13 \
/cluster/data/danRer2/bed/blastz.fr1
cd /cluster/data/danRer2/bed/blastz.fr1
# use same parameters as for danRer1 vs fr1
cat << '_EOF_' > DEF
# zebrafish (danRer2) vs. Fugu (fr1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0
# TARGET - zebrafish (danRer2)
SEQ1_DIR=/iscratch/i/danRer2/nib
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_IN_CONTIGS=0
# 10 MB chunk for target
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY - Fugu (fr1)
# soft-masked chrom nib
SEQ2_DIR=/cluster/bluearc/fugu/fr1/chromNib
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
# 10 Mbase for query
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/danRer2/bed/blastz.fr1
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
# save the DEF file in the current standard place
chmod +x DEF
cp DEF ~angie/hummus/DEF.danRer2-fr1.2004-12-13
# setup cluster run
# source the DEF file
bash
. ./DEF
/cluster/data/danRer2/jkStuff/BlastZ_run0.sh
cd run.0
# check batch looks ok then
para try, check, push, check, ....
# para time
# Completed: 6055 of 6055 jobs
# CPU time in finished jobs: 1440485s 24008.08m 400.13h 16.67d 0.046 y
# IO & Wait Time: 132074s 2201.24m 36.69h 1.53d 0.004 y
# Average job time: 260s 4.33m 0.07h 0.00d
# Longest job: 2054s 34.23m 0.57h 0.02d
# Submission to last job: 4060s 67.67m 1.13h 0.05d
ssh kki
cd /cluster/data/danRer2/bed/blastz.fr1
bash # if a csh/tcsh user
. ./DEF
/cluster/data/danRer2/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# para time
# Completed: 173 of 173 jobs
# CPU time in finished jobs: 249s 4.16m 0.07h 0.00d 0.000 y
# IO & Wait Time: 695s 11.58m 0.19h 0.01d 0.000 y
# Average job time: 5s 0.09m 0.00h 0.00d
# Longest job: 16s 0.27m 0.00h 0.00d
# Submission to last job: 296s 4.93m 0.08h 0.00d
# Third cluster run to convert lav's to axt's
ssh kki
cd /cluster/data/danRer2/bed/blastz.fr1
mkdir axtChrom
# a new run directory
mkdir run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh
cd $1
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin /iscratch/i/danRer2/nib \
/cluster/bluearc/fugu/fr1/chromNib stdout \
| axtSort stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/danRer2/bed/blastz.fr1/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
\ls -1Sd ../lav/chr* > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
head jobList
para create jobList
para try, check, push, check,...
# para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 71s 1.19m 0.02h 0.00d 0.000 y
# IO & Wait Time: 765s 12.75m 0.21h 0.01d 0.000 y
# Average job time: 30s 0.50m 0.01h 0.00d
# Longest job: 66s 1.10m 0.02h 0.00d
# Submission to last job: 223s 3.72m 0.06h 0.00d
# translate sorted axt files into psl
ssh kolossus
cd /cluster/data/danRer2/bed/blastz.fr1
mkdir -p pslChrom
set tbl = "blastzFr1"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.fr1/pslChrom
foreach f (./*.psl)
/cluster/bin/i386/hgLoadPsl danRer2 $f
echo "$f Done"
end
# featureBits -chrom=chr1 danRer2 refGene:cds blastzFr1 -enrichment
# refGene:cds 0.512%, blastzFr1 10.880%, both 0.439%, cover 85.71%, enrich 7.88x
# featureBits -chrom=chr1 danRer1 refGene:cds blastzFr1 -enrichment
# refGene:cds 0.529%, blastzFr1 5.542%, both 0.463%, cover 87.50%, enrich 15.79x
# CHAIN FUGU (fr1) BLASTZ (DONE, 2004-12-14, hartera)
# APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE RESULTS OF REPEATS
# AND DEGENERATE DNA (DONE, 2004-12-21, hartera)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/danRer2/bed/blastz.fr1
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/danRer2/bed/blastz.fr1/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/danRer2/nib \
/cluster/bluearc/fugu/fr1/chromNib $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 402s 6.71m 0.11h 0.00d 0.000 y
# IO & Wait Time: 276s 4.59m 0.08h 0.00d 0.000 y
# Average job time: 24s 0.40m 0.01h 0.00d
# Longest job: 56s 0.93m 0.02h 0.00d
# Submission to last job: 101s 1.68m 0.03h 0.00d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/danRer2/bed/blastz.fr1/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r >> hist5000.out
textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
echo ""
end
# filter with minScore=5000
mv all.chain all.chain.unfiltered
chainFilter -minScore=5000 all.chain.unfiltered > all.chainFilt5k
rm -r chain
chainSplit chain all.chainFilt5k
# remove repeats from chains and reload into database
# (2004-12-21, hartera)
ssh kksilo
cd /cluster/data/danRer2/bed/blastz.fr1/axtChain
gunzip all.chainFilt5k.gz
chainSplit chainRaw all.chainFilt5k
mkdir chain
cd chainRaw
foreach f (*.chain)
set c = $f:r
echo $c
nice chainAntiRepeat /iscratch/i/danRer2/nib \
/cluster/bluearc/fugu/fr1/chromNib $f \
../chain/$c.chain
end
cd ..
chainMergeSort ./chain/*.chain > all.chain.antirepeat
chainSplit chainAR all.chain.antirepeat
# Load chains into database
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.fr1/axtChain/chainAR
foreach i (*.chain)
set c = $i:r
hgLoadChain danRer2 ${c}_chainFr1 $i
echo done $c
end
# featureBits -chrom=chr1 danRer2 refGene:cds chainFr1 -enrichment
# refGene:cds 0.512%, chainFr1 23.334%, both 0.451%, cover 88.07%, enrich 3.77x
# featureBits -chrom=chr1 danRer2 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.512%, chainFr1Link 7.794%, both 0.426%, cover 83.16%, enrich 10.67x
# featureBits -chrom=chr1 danRer1 refGene:cds chainFr1 -enrichment
# refGene:cds 0.529%, chainFr1 19.003%, both 0.479%, cover 90.41%, enrich 4.76x
# featureBits -chrom=chr1 danRer1 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.529%, chainFr1Link 4.686%, both 0.450%, cover 85.11%, enrich 18.16x
# after chainAntiRepeat:
# featureBits -chrom=chr1 danRer2 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.512%, chainFr1Link 7.726%, both 0.426%, cover 83.16%,
# enrich 10.76x
# NET FUGU (fr1) BLASTZ (DONE, 2004-12-14, hartera)
# REMADE NET FOR FUGU (2004-12-15, hartera)
# RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-21,hartera)
ssh kksilo
cd /cluster/data/danRer2/bed/blastz.fr1/axtChain
mkdir preNet
cd chainAR
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
# memory usage 108306432, utime 690 s/100, stime 58
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.fr1/axtChain
# use -noAr option - don't look for ancient repeats
# redo net as danRer1 was used for netClass instead of danRer2 (2004-12-15)
# and load the new net into the database
netClass -noAr noClass.net danRer2 fr1 fuguFr1.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.fr1/axtChain
netFilter -minGap=10 fuguFr1.net | hgLoadNet danRer2 netFr1 stdin
# featureBits danRer2 refGene:cds netFr1 -enrichment
# refGene:cds 0.468%, netFr1 19.389%, both 0.411%, cover 87.95%, enrich 4.54x
# featureBits danRer1 refGene:cds netFr1 -enrichment
# refGene:cds 0.461%, netFr1 17.530%, both 0.391%, cover 84.78%, enrich 4.84x
# after chainAntiRepeat:
# featureBits danRer2 refGene:cds netFr1 -enrichment
# refGene:cds 0.468%, netFr1 19.372%, both 0.412%, cover 87.97%, enrich 4.54x
# MAKE VSFR1 DOWNLOADABLES (DONE, 2004-12-15, hartera)
# REPLACE FR1 NET WITH NEW VERSION IN DOWNLOADS (2004-12-15, hartera)
# REMAKE FOR CHAINS AND NET AFTER USING chainAntiRepeat
# (DONE, 2004-12-21, hartera)
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.fr1/axtChrom
set gp = /usr/local/apache/htdocs/goldenPath/danRer2
mkdir -p $gp/vsFr1/axtChrom
cp -p *.axt $gp/vsFr1/axtChrom
cd $gp/vsFr1/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# copy chains and nets to downloads area
# re-make chains and net downloadables (2004-12-21, hartera)
rm $gp/vsFr1/fugu*.gz $gp/vsFr1/md5sum.txt
cd /cluster/data/danRer2/bed/blastz.fr1/axtChain
gzip -c all.chain.antirepeat > /cluster/data/danRer2/zip/fuguFr1.chain.gz
gzip -c fuguFr1.net > /cluster/data/danRer2/zip/fuguFr1.net.gz
cd $gp/vsFr1
mv /cluster/data/danRer2/zip/fugu*.gz .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# BLASTZ FR1 CLEANUP (DONE, 2004-12-15, hartera)
# RE-DONE (DONE, 2004-12-21, hartera)
ssh kksilo
cd /cluster/data/danRer2/bed/blastz.fr1
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/noClass.net &
nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/all.chain.unfiltered axtChain/all.chainFilt5k axtChain/*.net &
nice gzip axtChain/all.chain.antirepeat axtChain/chainAR/*.chain &
nice rm -fr axtChain/chain axtChain/chainRaw axtChain/preNet &
# BLASTZ FOR TETRAODON (tetNig1) (DONE, 2004-12-21, hartera)
# Tetraodon is quite distant from zebrafish, more distant than human/chicken
# so use the HoxD55.q matrix in future for blastz (2005-06-01, hartera)
# blastz requires lineage-specific repeats but there are none
# available between these two fish species
ssh kk
mkdir -p /cluster/data/danRer2/bed/blastz.tetNig1.2004-12-21
ln -s /cluster/data/danRer2/bed/blastz.tetNig1.2004-12-21 \
/cluster/data/danRer2/bed/blastz.tetNig1
cd /cluster/data/danRer2/bed/blastz.tetNig1
# use tetraodon sequence in contigs for dynamic masking - see below
# for dynamic masking: M=50
cat << '_EOF_' > DEF
# zebrafish (danRer2) vs. tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
# use BLASTZ_M=50 in blastz-run1
ALIGN=blastz-run1
BLASTZ=blastz
BLASTZ_H=2500
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0
# TARGET - zebrafish (danRer2)
SEQ1_DIR=/iscratch/i/danRer2/nib
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_IN_CONTIGS=0
# 500 kb chunk for target
SEQ1_CHUNK=500000
SEQ1_LAP=500
# QUERY - Tetraodon (tetNig1)
# soft-masked chrom nib
SEQ2_DIR=/iscratch/i/tetNig1/contigs
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=
SEQ2_LAP=
BASE=/cluster/data/danRer2/bed/blastz.tetNig1
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
# save the DEF file in the current standard place
chmod +x DEF
cp DEF ~angie/hummus/DEF.danRer2-tetNig1.2004-12-21
# setup cluster run
# source the DEF file
bash
. ./DEF
/cluster/data/danRer2/jkStuff/BlastZ_run0.sh
cd run.0
# check batch looks ok then
para try, check, push, check, ....
# para time
# Completed: 3193 of 3193 jobs
# CPU time in finished jobs: 4460310s 74338.50m 1238.97h 51.62d 0.141 y
# IO & Wait Time: 41176s 686.27m 11.44h 0.48d 0.001 y
# Average job time: 1410s 23.50m 0.39h 0.02d
# Longest job: 2398s 39.97m 0.67h 0.03d
# Submission to last job: 12372s 206.20m 3.44h 0.14d
ssh kki
cd /cluster/data/danRer2/bed/blastz.tetNig1
bash # if a csh/tcsh user
. ./DEF
/cluster/data/danRer2/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# para time
# Completed: 3193 of 3193 jobs
# CPU time in finished jobs: 327s 5.46m 0.09h 0.00d 0.000 y
# IO & Wait Time: 8483s 141.38m 2.36h 0.10d 0.000 y
# Average job time: 3s 0.05m 0.00h 0.00d
# Longest job: 9s 0.15m 0.00h 0.00d
# Submission to last job: 560s 9.33m 0.16h 0.01d
# Third cluster run to convert lav's to axt's
ssh kki
cd /cluster/data/danRer2/bed/blastz.tetNig1
mkdir axtChrom
# a new run directory
mkdir run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh
cd $1
cat `ls -1 *.lav | sort -g` \
| lavToAxt -fa stdin /iscratch/i/danRer2/nib \
/iscratch/i/tetNig1/contigs/tetNig1Contigs.fa stdout \
| axtSort stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/danRer2/bed/blastz.tetNig1/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
\ls -1Sd ../lav/chr* > chrom.list
gensub2 chrom.list single gsub jobList
wc -l jobList
head jobList
para create jobList
para try, check, push, check,...
# para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 224s 3.74m 0.06h 0.00d 0.000 y
# IO & Wait Time: 1240s 20.66m 0.34h 0.01d 0.000 y
# Average job time: 52s 0.87m 0.01h 0.00d
# Longest job: 148s 2.47m 0.04h 0.00d
# Submission to last job: 157s 2.62m 0.04h 0.00d
# lift up query sequences
#- Lift the query side of the alignments up from the 500 kb chunks
ssh kksilo
cd /cluster/data/danRer2/bed/blastz.tetNig1
foreach d (axtChrom/chr*.axt)
echo $d
liftUp -axtQ ${d}.out.axt \
/cluster/data/tetNig1/bed/blastzSelf/contigSeqs/500kbcontigs.lft warn $d \
> /dev/null
end
#- Lift pseudo-contigs to chromosome level
# check this is working correctly
set dr = "/cluster/data/danRer2"
foreach c (`cat ${dr}/chrom.lst`)
echo lifting $c
liftUp -axtQ ./axtChrom/chr${c}.out2.axt \
/cluster/data/tetNig1/jkStuff/liftAll.lft warn \
./axtChrom/chr${c}.axt.out.axt > /dev/null
end
cd axtChrom
foreach c (`cat ${dr}/chrom.lst`)
mv chr${c}.axt chr${c}.axt.old
mv chr${c}.out2.axt chr${c}.axt
rm chr${c}.axt.out.axt
end
# translate sorted axt files into psl
ssh kolossus
cd /cluster/data/danRer2/bed/blastz.tetNig1
mkdir -p pslChrom
# need to copy S2.len for whole tetNig1 genome - S2contigs.len
cp /cluster/data/tetNig1/bed/blastzSelf/S1.len S2contigs.len
set tbl = "blastzTetNig1"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f \
S1.len S2contigs.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.tetNig1/pslChrom
foreach f (./*.psl)
/cluster/bin/i386/hgLoadPsl danRer2 $f
echo "$f Done"
end
# H=2000
# featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1 -enrichment
# refGene:cds 0.512%, blastzTetNig1 22.057%, both 0.435%, cover 84.94%,
# enrich 3.85x
# H=2000 and L=8000
# featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1L8k -enrichment
# refGene:cds 0.512%, blastzTetNig1L8k 16.326%, both 0.401%, cover 78.28%,
# enrich 4.80x
# use query in contigs in one file and use dynamic masking with M=100;
# this should mask out more sequence in the query - tetraodon has no
# species-specific repeat library so it is not fully repeat masked.
# H=2500, M=100
# featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1Contigs -enrichment
# refGene:cds 0.512%, blastzTetNig1Contigs 13.052%, both 0.427%, cover 83.43%,
# enrich 6.39x
# contigs used as above but H=2000
#featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1ContigsH2k -enrichment
# refGene:cds 0.512%, blastzTetNig1ContigsH2k 17.517%, both 0.433%,
# cover 84.51%, enrich 4.82x
# contigs used with H=2500, L=6000
# refGene:cds 0.512%, blastzTetNig1ContigsL6k 11.380%, both 0.415%,
# cover 80.94%, enrich 7.11x
# use M=50 and H=2500
#featureBits -chrom=chr1 danRer2 refGene:cds blastzTetNig1ContigsM50 -enrichment
# refGene:cds 0.512%, blastzTetNig1ContigsM50 12.643%, both 0.427%, cover 83.42%,
# enrich 6.60x
# using contigs and dynamic masking improves the enrichment and makes little
# difference to coverage. The blastz track % is lower since more sequence
# is being masked; since coverage is not much different, the sequence is
# probably being masked in non-coding regions.
# At chr1:6,128,109-6,135,734 there are a lot of short alignments in low
# complexity regions that are removed using L=6000.
# Use contigs with H=2500; keep lower-scoring alignments until after chaining
# Number of rows:
# blastzTetNig1 2360350
# blastzTetNig1L8k 1177874
# blastzTetNig1Contigs 725076
# blastzTetNig1ContigsH2k 825927
# blastzTetNig1ContigsL6k 538149
# blastzTetNig1ContigsM50 479228
# Using M=50 reduces the number of alignments but coverage is the same as
# for using M=100 so use the dynamic masking with M=50 and H=2500
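# Row counts like those above can be read back from the loaded split
# tables with hgsql (illustrative, not part of the original log):
#   hgsql -N -e 'select count(*) from chr1_blastzTetNig1' danRer2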
# CHAIN TETRAODON (tetNig1) BLASTZ (DONE, 2004-12-21, hartera)
# Run axtChain on little cluster
ssh kki
cd /cluster/data/danRer2/bed/blastz.tetNig1
mkdir -p axtChain/run1
cd axtChain/run1
mkdir out chain
ls -1S /cluster/data/danRer2/bed/blastz.tetNig1/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /iscratch/i/danRer2/nib \
/iscratch/i/tetNig1/nib $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
para try, check, push, check...
# para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 446s 7.43m 0.12h 0.01d 0.000 y
# IO & Wait Time: 355s 5.92m 0.10h 0.00d 0.000 y
# Average job time: 29s 0.48m 0.01h 0.00d
# Longest job: 98s 1.63m 0.03h 0.00d
# Submission to last job: 98s 1.63m 0.03h 0.00d
# now on the cluster server, sort chains
ssh kksilo
cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain
mkdir chain
# remove repeats from chains
cd run1/chain
foreach f (*.chain)
set c = $f:r
echo $c
nice chainAntiRepeat /iscratch/i/danRer2/nib \
/iscratch/i/tetNig1/nib $f \
../../chain/$c.chain
end
cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain
chainMergeSort ./chain/*.chain > all.chain.antirepeat
chainSplit chainAR all.chain.antirepeat
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r >> hist5000.out
textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
echo ""
end
# filter with minScore=5000
mv all.chain.antirepeat all.chainAR.unfiltered
chainFilter -minScore=5000 all.chainAR.unfiltered > all.chainARFilt5k
rm -r chain
chainSplit chainAR all.chainARFilt5k
# before filtering, only chr1 had more than 100,000 chains with a score
# of 5000 or less
# Load chains into database
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain/chainAR
foreach i (*.chain)
set c = $i:r
hgLoadChain danRer2 ${c}_chainTetNig1 $i
echo done $c
end
# featureBits -chrom=chr1 danRer2 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.512%, chainTetNig1Link 11.005%, both 0.416%, cover 81.22%,
# enrich 7.38x
# NET TETRAODON (tetNig1) BLASTZ (DONE, 2004-12-21, hartera)
ssh kksilo
cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain
mkdir preNet
cd chainAR
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2contigs.len \
../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len \
../../S2contigs.len ../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
# memory usage 102502400, utime 670 s/100, stime 69
# Add classification info using db tables:
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain
# use -noAr option - don't look for ancient repeats
netClass -noAr noClass.net danRer2 tetNig1 tetraTetNig1.net
# Load the nets into database
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain
netFilter -minGap=10 tetraTetNig1.net | hgLoadNet danRer2 netTetNig1 stdin
# featureBits danRer2 refGene:cds netTetNig1 -enrichment
# refGene:cds 0.468%, netTetNig1 19.068%, both 0.410%, cover 87.50%,
# enrich 4.59x
# BACENDS (DONE, 2005-05-20, hartera)
# Removed incorrect row from bacCloneXRef table (DONE, 2006-04-19, hartera)
# BAC end data provided by Anthony DiBiase, Yi Zhou and Leonard Zon at
# the Boston Children's Hospital. Collaborated with them on this track.
# Anthony DiBiase:adibiase@enders.tch.harvard.edu
# Yi Zhou:yzhou@enders.tch.harvard.edu
# BAC clone end sequences are from Robert Geisler's lab,
# Max Planck Institute for Developmental Biology, Tuebingen, Germany
# REMAKE AND RELOAD bacCloneAlias AND bacCloneXRef TABLES AFTER FIXING
# BUGS IN THE zfishBacClonesandSts PROGRAM TO STORE ALL THE ALIASES FOR EACH
# SANGER STS NAME AND TO STORE CORRECT INTERNAL AND EXTERNAL NAMES IN ALL
# CASES. DATA FILE OF UniSTS IDS AND SANGER STS NAMES WAS ALSO RE-CREATED
# AS THIS WAS INCORRECT. (DONE, 2005-05-24, hartera)
# ALSO DO TESTING OF THE DATA IN THESE TABLES TO MAKE SURE IT IS CORRECT - SEE
# SECTION ON TESTING BELOW.
ssh kksilo
mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends
cd /cluster/data/danRer2/bed/ZonLab/bacends
# copy BAC ends sequence from previous assembly
cp /cluster/data/danRer1/bed/ZonLab/bacends/bacends.fa .
faSize bacends.fa
# 486978153 bases (39070196 N's 447907957 real 447907957 upper 0 lower)
# in 594614 sequences in 1 files
# Total size: mean 819.0 sd 230.3 min 0 (zKp108-H09.za) max 5403 (zC259G13.zb) median 796
# N count: mean 65.7 sd 154.7
# U count: mean 753.3 sd 302.6
# L count: mean 0.0 sd 0.0
ssh kkr1u00
cd /cluster/data/danRer2/bed/ZonLab/bacends
mkdir -p /iscratch/i/danRer2/bacends
# if not split already, split up sequence for cluster runs
faSplit sequence bacends.fa 20 /iscratch/i/danRer2/bacends/bacends
# iSync bacends to kilokluster
/cluster/bin/iSync
ssh kk
cd /cluster/data/danRer2/bed/ZonLab/bacends
ls -1S /iscratch/i/danRer2/bacends/*.fa > bacends.lst
ls -1S /iscratch/i/danRer2/trfFa/*.fa > genome.lst
cat << '_EOF_' > template
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -tileSize=10 -ooc=/iscratch/i/danRer2/10.ooc {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
mkdir psl
gensub2 genome.lst bacends.lst template jobList
para create jobList
para try, check, push, check, ...
# para time
# Completed: 5980 of 5980 jobs
# CPU time in finished jobs: 5317533s 88625.55m 1477.09h 61.55d 0.169 y
# IO & Wait Time: 34446s 574.10m 9.57h 0.40d 0.001 y
# Average job time: 895s 14.92m 0.25h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 9892s 164.87m 2.75h 0.11d
# Submission to last job: 12309s 205.15m 3.42h 0.14d
# back on kksilo (now kkstore01), filter and lift results:
ssh kksilo
cd /cluster/data/danRer2/bed/ZonLab/bacends
pslSort dirs raw.psl temp psl
# took 9 hours
# With minCover=0.4, 78% of sequences are aligned.
# With no minCover, 90% of sequences align, but with a large number
# of extra alignments - a lot of noise even compared to using minCover of
# 0.05.
# Total BAC end sequences: 594614
# minCover minAli #alignments #sequences aligned %total sequences aligned
# 0 0.85 13969946 535465 90
# 0.05 0.85 8140775 531299 89
# 0.10 0.75 5099823 519238 87
# 0.10 0.85 5098714 519084 87
# 0.10 0.90 5096205 518616 87
# 0.10 0.92 4984170 515024 87
# 0.10 0.95 4264695 498380 84
# 0.20 0.85 4515869 501415 84
# 0.40 0.85 3806487 464067 78
# Using minCover of 0.10 seems good at first: 55017 extra BAC ends
# are aligned, with an extra 1292227 alignments, compared to 0.40; this
# is an average of 23.5 alignments per sequence compared to 8 alignments
# per sequence on average for minCover of 0.40.
# Using minAli of 0.90 with minCover 0.10 gives 468 fewer sequences
# aligned and 2509 fewer alignments. Using 0.75 for minAli increases
# the number of alignments by about 1100 compared to using 0.85, while
# sequences aligned only increase by about 150. Using minAli of 0.95
# with minCover of 0.10 gives 834,019 fewer alignments, with 20704 fewer
# sequences aligned, than using minAli of 0.85.
# For minAli=0.92 and minCover=0.10, there are 114,544 fewer alignments
# than for minAli=0.85 and 4060 fewer sequences are aligned.
# Tried minAli=0.85 with minCover=0.40 and also minAli=0.92 with
# minCover=0.10 and compared. More alignments are gained - see below -
# but minCover=0.10 means there are a lot of alignments where < 40% of the
# BAC end aligns, which does not seem reliable, so stick with minCover=0.4
# Stats for minCover=0.10, minAli=0.92:
# Pairs: there are 205360 orphans and 97458 pairs, 1014 long, 19738 mismatch
# and 239 slop.
# Singles: 17829 bacEnds.orphan and 205360 from the pair analysis, so a
# total of 223189 orphans, 7828 more than for danRer1 and more than for
# using minCover=0.40 and minAli=0.85.
# For minCov=0.10, minAli=0.92:
# 90450 pairs.txt
# 69699 pairs.txt.uniq - about 4500 more than for minCover=0.40 and minAli=0.85
# 202661 singles.txt
# 178337 singles.txt.uniq - about 5399 more than for minCover=0.40,minAli=0.85
# Below, used the same parameters as for danRer1: minCover=0.40 and minAli=0.85.
# All the files and tables below are derived from the alignments using
# these parameters for pslReps filtering.
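# Each row of the minCover/minAli table above can be regenerated with a
# run like the following (a sketch, not from the original log; the psl
# output carries a 5-line header that must be skipped when counting):
#   nice pslReps -nearTop=0.02 -minCover=0.10 -minAli=0.92 -noIntrons \
#       raw.psl test.psl /dev/null
#   tail -n +6 test.psl | wc -l                                # alignments
#   tail -n +6 test.psl | awk '{print $10}' | sort -u | wc -l  # sequences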
# location of store8 has changed to kkstore01 instead of kksilo so now
# login there (2005-04-28)
ssh kkstore01
cd /cluster/data/danRer2/bed/ZonLab/bacends
nice pslReps -nearTop=0.02 -minCover=0.40 -minAli=0.85 -noIntrons raw.psl \
bacEnds.psl /dev/null
liftUp bacEnds.lifted.psl /cluster/data/danRer2/jkStuff/liftAll.lft \
warn bacEnds.psl
# Got 299 lifts in /cluster/data/danRer2/jkStuff/liftAll.lft
wc -l bacEnds.lifted.psl
# 3806457 bacEnds.lifted.psl
# 4984175 (for minCov=0.10, minAli=0.92)
rm -r temp
pslCheck bacEnds.lifted.psl >& pslCheck.log
# 536 problems with overlapping exons reported by pslCheck
awk '{print $10;}' bacEnds.lifted.psl | sort | uniq > bacEnds.names.uniq
# edit to remove header lines
wc -l bacEnds.names.uniq
# 464067 bacEnds.names.uniq
grep '>' bacends.fa | wc -l
# 594614 bacends.fa
# 78% of BAC ends have aligned
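# 78% = 464067/594614, e.g. (illustrative):
#   awk 'BEGIN {printf "%.1f%%\n", 100*464067/594614}'   # prints 78.0%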
# Need to add accession information and BAC end sequence pairs
mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1
cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1
grep ">" ../bacends.fa > bacEnds
# remove '>' at beginning of each line
perl -pi.bak -e 's/>//' bacEnds
# remove ^M char from end of lines
ssh hgwdev
cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1
# copy over necessary files from danRer1
set dr1Bacs = "/cluster/data/danRer1/bed/ZonLab/bacends"
cp $dr1Bacs/bacends.1/stripEndChar.pl.gz .
cp $dr1Bacs/bacends.1/BACEnd_accessions.txt.gz .
cp $dr1Bacs/bacends.1/zfish_accs.txt .
cp $dr1Bacs/bacends.1/getBacEndInfo.pl.gz .
gunzip *.gz
ssh kkstore01
cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1
perl stripEndChar.pl < bacEnds > allBacEnds.names
rm bacEnds bacEnds.bak
# Kerstin Jekosch at the Sanger Centre: kj2@sanger.ac.uk
# provided accession data for BAC End clones - zfish_accs.txt
# and for BAC End pairs- BACEnd_accessions.txt - put these in bacends.1 dir
# write and use perl script to split BAC Ends into pairs and singles
# bacEndPairs.txt and bacEndSingles.txt
# For pslPairs, the reverse primer (T7 or .z*) should be in the first column
# and the forward primer (SP6 or .y*) should be in the second column.
# There are several reads for some sequences and these have similar names.
# Reads for the same sequence should be in a comma separated list.
# Sequence read names can be translated to external clone names as found
# in NCBI's clone registry using the name without the suffix and the
# translation of prefixes as follows after removal of dashes
# and extra zeros in the name:
# library external prefix internal prefix
# CHORI-211 CH211- zC
# DanioKey DKEY- zK
# DanioKey Pilot DKEYP- zKp
# RZPD-71 RP71- bZ
# BUSM1 (PAC) BUSM1- dZ
# e.g. zC001-A03.za becomes CH211-1A03
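# A minimal sketch of that translation (hypothetical helper, not part of
# the original pipeline); it assumes read names like zC001-A03.za and
# drops leading zeros only from the plate number:
cat << '_EOF_' > internalToExternal.pl
#!/usr/bin/perl -w
use strict;
# internal prefix -> external (NCBI clone registry) prefix
my %prefix = ('zKp' => 'DKEYP-', 'zC' => 'CH211-', 'zK' => 'DKEY-',
              'bZ' => 'RP71-', 'dZ' => 'BUSM1-');
while (<STDIN>) {
    chomp;
    # e.g. zC001-A03.za -> CH211-1A03
    if (/^(zKp|zC|zK|bZ|dZ)0*(\d+)-?([A-Z]\d+)\./) {
        print "$prefix{$1}$2$3\n";
    }
}
'_EOF_'
# << this line makes emacs coloring happy
# echo 'zC001-A03.za' | perl internalToExternal.pl   # CH211-1A03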
# make table of accessions for BAC ends to load into bacEndAlias table
# find all suffixes used for sequence names
cat << '_EOF_' > getSuffixes.pl
#!/usr/bin/perl -w
use strict;
while(<STDIN>)
{
chomp;
my $l = $_;
if ($l =~ /^[t1_]*[zZbd][CKZ]p?[0-9]+\-?[a-zA-Z]+[0-9]+\.?([A-Z0-9a-z]+)/)
{
my $suffix = $1;
print "$suffix\n";
}
}
'_EOF_'
chmod +x getSuffixes.pl
perl getSuffixes.pl < allBacEnds.names > allBacEnds.suffixes
sort allBacEnds.suffixes | uniq > allBacEnds.suffixes.uniq
wc -l allBacEnds.suffixes.uniq
# 22 different suffixes
# SP6, SP6A,SP6W,T7,T7A,T7W,Za,p1kSP6,p1kSP6w,q1kT7,q1kT7w,sp6,t7,ya
# yb,yc,yd,yt,za,zb,zc,zd
# get suffixes used in BAC ends accessions file
perl getSuffixes.pl < BACEnd_accessions.txt > BACEnd.names.suffixes
sort BACEnd.names.suffixes | uniq
# SP6, SP6A, T7 and T7A - all these endings occur in the list of BAC ends
# in the FASTA file; if the ending differs, "A" needs to be added
# to SP6 and T7 and this ending also checked in the BAC end accessions list
# edit getBacEndInfo.pl to make sure it includes all these suffixes;
# it also treats zK32A7SP6 and zK32A7SP6A as different reads
perl getBacEndInfo.pl allBacEnds.names BACEnd_accessions.txt > bacEnds.log
wc -l *.txt
# 31317 BACEnd_accessions.txt
# 180280 bacEndPairs.txt
# 16020 bacEndSingles.txt
# 594614 direction.txt
wc -l bacEndAccs.aliases
# 47558 bacEndAccs.aliases
# bacEndAccs.aliases contains sequence read names and their
# Genbank accessions.
# checked that all names from allBacEnds.names appear in either
# bacEndPairs.txt or bacEndSingles.txt
# First process BAC end alignments
ssh kkstore01
mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/pairs
cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs
set bacDir = /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1
# the BAC clone inserts vary in size from around 2800 bp to 626,000 bp,
# hence -min=2000 and -max=650000 below
~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \
-max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \
-mismatch -verbose ../bacEnds.lifted.psl $bacDir/bacEndPairs.txt \
all_bacends bacEnds
wc -l bacEnds.*
# There are 1411 more orphans and 18147 more pairs than for danRer1
# 980 bacEnds.long
# 18083 bacEnds.mismatch
# 201646 bacEnds.orphan
# 90967 bacEnds.pairs
# 0 bacEnds.short
# 190 bacEnds.slop
# create header required by "rdb" tools
ssh hgwdev
cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes'\
> ../header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header
# make pairs bed file
cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairs.bed
# also need to process bacEndSingles.txt into a database table
# for singles in bacEndSingles.txt, create a dummy file where each single
# is given zJA11B12T7 as a dummy sequence mate. If the single is a forward
# sequence, put the dummy sequence in the second column; if the single is
# a reverse sequence, put it in the first column. Use a perl script to do
# this (see the sketch below).
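# A guess at what formatSingles.pl does (assumed behavior, assumed
# two-column input of read name and clone name; the real script is
# copied over from danRer1 below):
cat << '_EOF_' > formatSinglesSketch.pl
#!/usr/bin/perl -w
use strict;
my $dummy = 'zJA11B12T7';   # dummy mate, as noted above
while (<STDIN>) {
    chomp;
    my ($read, $clone) = split(/\t/);
    if ($read =~ /T7|t7|\.z/) {        # reverse read -> first column
        print "$read\t$dummy\t$clone\n";
    } else {                           # forward read -> second column
        print "$dummy\t$read\t$clone\n";
    }
}
'_EOF_'
# << this line makes emacs coloring happy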
ssh hgwdev
cd /cluster/data/danRer2/bed/ZonLab/bacends
set bacDir = /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1
mkdir singles
cd singles
cp /cluster/data/danRer1/bed/ZonLab/bacends/singles/formatSingles.pl.gz .
gunzip formatSingles.pl.gz
perl formatSingles.pl $bacDir/bacEndSingles.txt > \
$bacDir/bacEndSingles.format
# then run pslPairs on this formatted sequence
~/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \
-max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \
-mismatch -verbose ../bacEnds.lifted.psl $bacDir/bacEndSingles.format \
all_bacends bacEnds
wc -l bacEnds*
# 0 bacEnds.long
# 0 bacEnds.mismatch
# 15853 bacEnds.orphan
# 0 bacEnds.pairs
# 0 bacEnds.short
# 0 bacEnds.slop
# there are 15853 orphans here (14126 in danRer1) and 201646 from the
# pair analysis, so a total of 217499 orphans (2138 more than danRer1)
cat bacEnds.orphan ../pairs/bacEnds.orphan > bacEnds.singles
wc -l bacEnds.singles
# 217499 bacEnds.singles
# make singles bed file
cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndSingles.bed
cp bacEndSingles.bed ../pairs
cd ../pairs
# all slop, short, long, mismatch and orphan pairs go into bacEndPairsBad
# since orphans are already in bacEndSingles, do not add these
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
bacEnds.orphan | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairsBad.bed
# add bacEndSingles.bed to bacEnds.load.psl - must not add pair orphans
# twice so create a bed file of bacEndPairsBadNoOrphans.bed without orphans
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
| row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairsBadNoOrphans.bed
extractPslLoad -noBin ../bacEnds.lifted.psl bacEndPairs.bed \
bacEndPairsBadNoOrphans.bed bacEndSingles.bed | \
sorttbl tname tstart | headchg -del > bacEnds.load.psl
# There are rows where the alignments were the same but the lfNames are
# different. This is due to the presence of multiple reads for the
# same BAC end sequence. Sometimes they are slightly different lengths
# so the alignments are a little different. It would be good to consolidate
# all of these. Firstly, the identical rows were merged into one with a
# list of all the lfNames corresponding to that alignment.
# load all alignments into temporary database
ssh hgwdev
echo "create database bacsDr2_rah;" | hgsql danRer2
cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs
hgLoadBed bacsDr2_rah bacEndPairs bacEndPairs.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# Loaded 84405 elements of size 11
# Loaded 72272 elements of size 11 (danRer1)
# create a bacEndSingles table like bacEndPairs if not created already
cp $HOME/kent/src/hg/lib/bacEndPairs.sql ../singles/bacEndSingles.sql
# edit to give correct table name
# or copy over table definition from danRer1
cp /cluster/data/danRer1/bed/ZonLab/bacends/singles/bacEndSingles.sql.gz \
../singles/
gunzip ../singles/bacEndSingles.sql.gz
hgLoadBed bacsDr2_rah bacEndSingles bacEndSingles.bed \
-sqlTable=../singles/bacEndSingles.sql
# Loaded 196492 elements of size 11
# Loaded 203683 elements of size 11 (danRer1)
# load all_bacends later
# NOTE - this track isn't pushed to RR, just used for assembly QA
# Use bacEndPairsBadNoOrphans.bed as orphans are in the singles bed file
hgLoadBed bacsDr2_rah bacEndPairsBad bacEndPairsBadNoOrphans.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 18544 elements of size 11
# Need to consolidate similar rows for bacEndPairs and bacEndSingles - same
# name, different lfNames and same alignments.
mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/duplicates
cd /cluster/data/danRer2/bed/ZonLab/bacends/duplicates
hgsql -N -e "select chrom, chromStart, chromEnd, name, strand from \
bacEndPairs order by name, chrom, chromStart;" \
bacsDr2_rah > pairs.txt
sort pairs.txt | uniq -c > pairs.txt.uniq
wc -l pairs*
# 84405 pairs.txt
# 65105 pairs.txt.uniq
# for danRer1:
# 72272 pairs.txt
# 53992 pairs.txt.uniq
# for replicate rows, find all the unique lfNames and put these
# in one row with the relevant lfStarts, lfSizes and correct lfCount
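# The replicated alignment rows themselves can be listed directly
# (illustrative, not from the original log):
#   sort pairs.txt | uniq -d | head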
cp \
/cluster/data/danRer1/bed/ZonLab/bacends/duplicates/removeReplicates.pl.gz .
gunzip removeReplicates.pl.gz
# modify script so that the database to use is supplied as an argument
nice perl removeReplicates.pl pairs.txt.uniq bacEndPairs bacsDr2_rah \
> pairsNoReps.bed
# repeat for singles
hgsql -N -e "select chrom, chromStart, chromEnd, name, strand from \
bacEndSingles order by name, chrom, chromStart;" bacsDr2_rah \
> singles.txt
sort singles.txt | uniq -c > singles.txt.uniq
wc -l singles*
# 196492 singles.txt
# 173025 singles.txt.uniq
# 203683 singles.txt (danRer1)
# 179170 singles.txt.uniq (danRer1)
nice perl removeReplicates.pl singles.txt.uniq bacEndSingles bacsDr2_rah \
> singlesNoReps.bed
# Using badPairs with no orphans
hgsql -N -e "select chrom, chromStart, chromEnd, name, strand from \
bacEndPairsBad order by name, chrom, chromStart;" bacsDr2_rah \
> badPairs.txt
sort badPairs.txt | uniq -c > badPairs.txt.uniq
wc -l badPairs*
# 18544 badPairs.txt
# 13803 badPairs.txt.uniq
nice perl removeReplicates.pl badPairs.txt.uniq bacEndPairsBad bacsDr2_rah \
> badPairsNoReps.bed
# sort by chrom, chromStart - done already
# Wrote a perl script to choose 2 BAC ends to represent each BAC clone.
# Since there is often more than one read for each BAC end in this set,
# 2 reads were chosen for each BAC pair, or 1 for the singles, based on
# which had the largest region aligned (using lfSizes); see the sketch
# after this section for how lfSizes can be totalled.
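# lfSizes is a comma-separated list (column 10 in the noBin bed layout
# used above), so the total aligned bases per row can be summed like
# this (a sketch, not from the original log):
#   awk '{n = split($10, a, ","); t = 0;
#         for (i = 1; i <= n; i++) t += a[i];
#         print $4, t}' pairsNoReps.bed | head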
ssh kkstore01
cd /cluster/data/danRer2/bed/ZonLab/bacends/duplicates
# run perl script: input bed file, pairs or singles, name of output file
perl pickLfNames.pl pairsNoReps.bed pairs pairs2lfNames.bed
mv error.log log.pairs
# this log.pairs lists the 30 cases where alignments for a BAC clone use
# a different pair of sequence reads for the ends. These were checked out
# and, in most cases, the alignments are almost identical or overlap for
# the most part so it is ok to have these extra alignments missing.
# CH211-69C14: zC069-C14.ya hits almost 100,000 bp downstream
# of the .yb version. It is slightly longer and it has a good hit by BLAT
# also to chrom 15. This is the alignment that is missing of the two for
# this clone.
# run script for singles:
perl pickLfNames.pl singlesNoReps.bed singles singles1lfName.bed
mv error.log log.singles
# Singles: there are 80 cases where the alignments for the BAC clone use
# a different sequence for the reads. badPairs: there are 13 cases.
# These were not included in the new bed file and they were checked out -
# the BAC clone names for these are in the log file.
# For Singles: In all but 3 cases, the alignments were almost
# identical or mostly overlapped the alignments for the other sequence
# read chosen to represent that BAC end in the bed file.
# CH211-98J20, CH211-98O21 and CH211-98G4 have alignments for other
# sequence reads for the same end represented but the alignments are to
# different chromosomes (see singlesNoReps.bed).
perl pickLfNames.pl badPairsNoReps.bed pairs badPairs2lfNames.bed
mv error.log log.badPairs
# for each of these new bed files, checks were made that there are
# only 2 BAC ends per alignment for pairs and 1 for singles.
# For each pair, there should only be 2 ends which can appear either
# way round depending on the orientation and there should be 1 end for
# the beginning (suffix T7, t7 or z) and one end for the end
# (suffix SP6, sp6 or y) for each BAC clone. These can appear as e.g.
# either zK7B23T7,zK7B23SP6 or zK7B23SP6,zK7B23T7 for the opposite
# orientation. For singles, there should be a single BAC end for each
# alignment and for each BAC clone, a sequence for either or both types
# of ends may appear e.g. zK153P14SP6 and zK153P14T7 appear in separate
# alignments.
# Finally overlaps in BAC clone names were checked. All BAC clones
# represented in each of the pairs, badPairs and singles bed files are
# unique to that file. Between all three bed files, 173188 BAC clones
# have alignments.
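# Those checks can be redone along these lines (sketch, not from the
# original log), using the name column of the noReps bed files:
#   awk '{print $4}' pairs2lfNames.bed | sort -u > pairs.names
#   awk '{print $4}' singles1lfName.bed | sort -u > singles.names
#   awk '{print $4}' badPairs2lfNames.bed | sort -u > badPairs.names
#   comm -12 pairs.names singles.names | wc -l     # expect 0
#   comm -12 pairs.names badPairs.names | wc -l    # expect 0
#   comm -12 singles.names badPairs.names | wc -l  # expect 0
#   sort -u pairs.names singles.names badPairs.names | wc -l  # 173188 clones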
ssh hgwdev
# NOTE: using sort and uniq on hgwdev produces tab delimited output
# After merging rows with the same BAC name, the scoring is now
# wrong in the bed files.
# Scores should be 1000 if there is 1 row for that name, else
# 1500/number of rows for that sequence name - as calculated by pslPairs.
# Correct the scores; the rule is sketched below.
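# The rule itself is simple enough to state as awk over the "count name"
# output of uniq -c, like the pairs.hits file built below (illustrative
# only; the real fix is correctScores2.pl):
#   awk '{s = ($1 == 1) ? 1000 : 1500/$1;
#         printf "%s\t%d\n", $2, s}' pairs.hits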
mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/scores
cd /cluster/data/danRer2/bed/ZonLab/bacends/scores
# copy over correctScores.pl script from danRer1
cp /cluster/data/danRer1/bed/ZonLab/bacends/scores/correctScores.pl.gz .
cp /cluster/data/danRer1/bed/ZonLab/bacends/scores/checkScores.pl.gz .
gunzip *.gz
# modify correctScores.pl to correctScores2.pl, which takes an extra
# argument to specify whether there is a bin field.
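# (correctScores2.pl is not reproduced in this doc; below is only a minimal
# sketch of the scoring rule described above, under the hypothetical name
# correctScoresSketch.pl, for the noBin case where name and score are bed
# fields 4 and 5 and the counts file is `uniq -c` output)
cat << '_EOF_' > correctScoresSketch.pl
#!/usr/bin/perl -w
# usage: perl correctScoresSketch.pl file.bed file.hits > fixedScores.bed
# score = 1000 if a name has a single alignment, else 1500/(number of rows)
use strict;
my ($bed, $counts) = @ARGV;
my %count;
open(COUNTS, $counts) || die "Can not open $counts: $!\n";
while (<COUNTS>)
{
if (/^\s*(\d+)\s+(\S+)/) { $count{$2} = $1; }
}
close COUNTS;
open(BED, $bed) || die "Can not open $bed: $!\n";
while (<BED>)
{
chomp;
my @f = split(/\t/);
my $n = $count{$f[3]};
die "no count for $f[3]\n" if (!defined($n));
$f[4] = ($n == 1) ? 1000 : 1500/$n;
print join("\t", @f), "\n";
}
close BED;
'_EOF_'
# << for emacs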
awk '{print $4}' ../duplicates/pairs2lfNames.bed \
| sort | uniq -c > pairs.hits
perl correctScores2.pl ../duplicates/pairs2lfNames.bed pairs.hits noBin \
> bacEndPairsGoodScores.bed
# same for singles
awk '{print $4}' ../duplicates/singles1lfName.bed \
| sort | uniq -c > singles.hits
perl correctScores2.pl ../duplicates/singles1lfName.bed singles.hits noBin \
> bacEndSinglesGoodScores.bed
# and for badPairs
awk '{print $4}' ../duplicates/badPairs2lfNames.bed \
| sort | uniq -c > badPairs.hits
perl correctScores2.pl ../duplicates/badPairs2lfNames.bed badPairs.hits \
noBin > bacEndPairsBadGoodScores.bed
# check that the scores are correct now:
awk 'BEGIN {OFS = "\t"} {print $4, $5}' bacEndPairsGoodScores.bed \
| sort | uniq -c > pairs.count
perl checkScores.pl < pairs.count
# all the BAC clones should be in good.txt and none in bad.txt
# wc -l should give same number of lines in good.txt as in pairs.hits
# check all the GoodScores bed files in this manner - all are correct
# for singles, 3 end up in bad.txt but it is a rounding error,
# there are 7 alignments for each of these: CH211-210N9, CH211-30H16
# and DKEY-31H18. Score is: 214.285714285714
# this will be rounded to 214 when loading the database table
# Now load database tables:
hgLoadBed danRer2 bacEndPairs bacEndPairsGoodScores.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# Loaded 65075 elements of size 11
hgLoadBed danRer2 bacEndSingles bacEndSinglesGoodScores.bed \
-sqlTable=../singles/bacEndSingles.sql
# Loaded 172945 elements of size 11
# load of bacEndSingles did not go as planned: 172945 record(s),
# 0 row(s) skipped, 29 warning(s) loading bed.tab
# the cause of the warnings is unknown, but all of the bed file loaded
# and the number of warnings is small, so ignore
hgLoadBed danRer2 bacEndPairsBad bacEndPairsBadGoodScores.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 13790 elements of size 11
# load BAC end sequences into seq table so alignments may be viewed
# symlink to bacends.fa sequences in danRer1
mkdir -p /gbdb/danRer2/bacends
ln -s /cluster/data/danRer1/bed/ZonLab/bacends/bacends.fa \
/gbdb/danRer2/bacends/BACends.fa
hgLoadSeq danRer2 /gbdb/danRer2/bacends/BACends.fa
ssh kkstore01
cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs
# for all_bacends table, just load the alignments for those sequences
# represented in the bacEndPairs, bacEndSingles and bacEndPairsBad tables
# bacEnds.load.psl is the file of alignments
# get all the names of sequences
foreach f (../scores/*GoodScores.bed)
echo $f
awk '{print $11;}' $f >> allBacEnds.names
end
wc -l allBacEnds.names
# 251810 allBacEnds.names
# this is the total number of lines in the *GoodScores.bed files
perl -pi.bak -e 's/,/\n/g' allBacEnds.names
sort allBacEnds.names | uniq > allBacEnds.names.uniq
# write script to go through and pick alignments
cat > getAlignments.pl << 'EOF'
#!/usr/bin/perl -w
use strict;
my $bacAligns = $ARGV[0]; # psl alignments of BAC ends
my $bacs = $ARGV[1]; # list of BAC ends for which to select alignments
open(ALIGNS, $bacAligns) || die "Can not open $bacAligns: $!\n";
open(BACS, $bacs) || die "Can not open $bacs:$!\n";
my %alignsHash;
# read alignments and store
while (<ALIGNS>)
{
my $l = $_;
my @fields = split(/\t/, $l);
# hash key is BAC end name
push (@{$alignsHash{$fields[9]}}, $l);
}
close ALIGNS;
while (<BACS>)
{
chomp;
my $end = $_;
if (exists($alignsHash{$end}))
{
my @als = @{$alignsHash{$end}};
foreach my $a (@als)
{
print $a;
}
}
}
close BACS;
'EOF'
chmod +x getAlignments.pl
# get alignments for just the BAC ends that are in the database tables
perl getAlignments.pl bacEnds.load.psl allBacEnds.names.uniq \
> bacEnds.load.inTables.psl
# check that alignments are present for all the BAC ends in the list
# alignments for all BAC ends in allBacEnds.names.uniq are there
ssh hgwdev
cd /cluster/data/danRer2/bed/ZonLab/bacends/pairs
# load all_bacends table
hgLoadPsl danRer2 -table=all_bacends bacEnds.load.inTables.psl
# load of all_bacends did not go as planned: 2615155 record(s), 0 row(s)
# skipped, 81 warning(s) loading psl.tab
ssh kkstore01
cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1
# make bacEndAlias table with Genbank accessions for ends
# need to run getBacEndInfo.pl for the BAC end names in the
# BAC tables.
# in the pairs directory, there is the allBacEnds.names.uniq file
# so use this.
mkdir bacEndAccs
cd bacEndAccs
perl ../getBacEndInfo.pl ../../pairs/allBacEnds.names.uniq \
../BACEnd_accessions.txt > bacEndAccs.aliases.log
mv bacEndAccs.aliases bacEndAccs.aliases.intables
ssh hgwdev
cd /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1/bacEndAccs
echo "drop table bacEndAlias" | hgsql danRer2
hgsql danRer2 < $HOME/kent/src/hg/lib/bacEndAlias.sql
echo "load data local infile 'bacEndAccs.aliases.intables' into table \
bacEndAlias" | hgsql danRer2
# clonemarkers.02.12.04.txt, ctgnames.02.12.04.txt and markers.02.12.04.txt
# already downloaded from Sanger for danRer1 (see makeDanRer1.doc)
ssh kkstore01
mkdir -p /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
set dr1=/cluster/data/danRer1/bed/ZonLab/bacends/cloneandStsAliases
set dr2=/cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
ln -s $dr1/clonemarkers.02.12.04.txt $dr2
ln -s $dr1/ctgnames.02.12.04.txt $dr2
ln -s $dr1/markers.02.12.04.txt $dr2
ln -s $dr1/zfish_accs.txt $dr2
ssh hgwdev
cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
hgsql -N -e 'select name from bacEndPairs;' danRer2 > bacEnds.names
hgsql -N -e 'select name from bacEndSingles;' danRer2 >> bacEnds.names
hgsql -N -e 'select name from bacEndPairsBad;' danRer2 >> bacEnds.names
sort bacEnds.names | uniq > bacEnds.names.uniq
# 173188 bacEnds.names.uniq
# 594614 bacEnd sequences to start with
# some of the bacEnd sequences are replicates so count lfNames from table
hgsql -N -e 'select lfNames from bacEndPairs;' danRer2 > bacEnds.lfNames
hgsql -N -e 'select lfNames from bacEndSingles;' danRer2 >> bacEnds.lfNames
hgsql -N -e 'select lfNames from bacEndPairsBad;' danRer2 >> bacEnds.lfNames
perl -pi.bak -e 's/,/\n/g' bacEnds.lfNames
sort bacEnds.lfNames | uniq > bacEnds.lfNames.uniq
# 303946 bacEnds.lfNames.uniq
# pslCheck reports 536 bad alignments in the psl file
# get the sequence names from the psl file
awk '{print $10;}' ../bacEnds.psl > bacEndsPsl.names
# edit to remove first few lines with no names
sort bacEndsPsl.names | uniq > bacEndsPsl.names.uniq
# 464067 bacEndsPsl.names.uniq
# about 78% align - after filtering
# Add an alias table for BAC clones
# bacCloneAlias.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
# Add a xref table to give external clone registry names, internal names
# sanger name, relationship between STS and BAC clone (method of finding
# STS), UniSTS ID, chromosomes(s) to which BAC clone is mapped by BLAT,
# Genbank accession and STS primer sequences
# bacCloneXRef.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
ssh hgwdev
cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
wc -l *.02.12.04.txt
#  28305 clonemarkers.02.12.04.txt
# 167441 ctgnames.02.12.04.txt
#  12008 markers.02.12.04.txt
# create list of BAC clones in the tables with chromosome aligned to
hgsql -N -e 'select name, chrom from bacEndPairs;' danRer2 \
> bacClones.namesandchrom
hgsql -N -e 'select name, chrom from bacEndSingles;' danRer2 \
>> bacClones.namesandchrom
sort bacClones.namesandchrom | uniq > bacClones.namesandchrom.uniq
# use zfish_accs.txt provided by the Sanger Centre to create a list
# of BAC clone names, internal names and Genbank accessions
ssh kkstore01
cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
# get list of UniSTS IDs by using aliases to search the alias file;
# print Sanger name, alias and UniSTS ID, using print_markers2.pl, a
# modified version of a program written by Tony DiBiase (Boston
# Children's Hospital)
# create find_markers3.pl as the previous version was finding too
# many UniSTS IDs for some markers since it used grep (2005-05-24, hartera)
cat << '_EOF_' > find_markers3.pl
# example:
# perl find_markers3.pl UniSTS.aliases markers.02.12.04.txt
use strict;
my $verbose = 0;
my ($a, $b, $f, $m, $s, $t, $aliases, @alias, @rest);
my $aliasFile = $ARGV[0];
my $markersFile = $ARGV[1];
open(ALIAS, $aliasFile) || die "Can not open $aliasFile\n";
open(MARKERS, $markersFile) || die "Can not open $markersFile\n";
# store aliases from aliasFile
my ($id, $al, @alsArray, %aliasHash);
while (<ALIAS>)
{
chomp;
($id, $al) = split /\t/;
@alsArray = split(/;/, $al);
foreach my $as (@alsArray)
{
push (@{$aliasHash{$as} }, $id);
}
}
close ALIAS;
while (<MARKERS>) {
my @idArray;
($f, $t, $m, $idArray[0]) = 0;
my @ids;
chomp; ($a, $b, $aliases, @rest) = split /\|/;
if ($verbose > 3) { printf "aliases $aliases \n"; }
@alias = split /;/, $aliases;
ALIAS: foreach $s (@alias) {
if ($s =~ /[\D]+/) {
if ($verbose > 5) { printf "this $s \n"; }
if (exists($aliasHash{$s}))
{
@idArray = @{$aliasHash{$s}};
}
if ($idArray[0]) {
$f = 1; $t = $s; @ids = @idArray;
if ($verbose) { printf "this $s found $m \n"; }
last ALIAS;
}
}
}
if ($f)
{
my @sNames = split(/;/, $b);
foreach my $sn (@sNames)
{
foreach my $i (@ids)
{
printf "$sn\t$i\n";
}
}
}
}
close MARKERS;
'_EOF_'
chmod +x find_markers3.pl
perl find_markers3.pl /cluster/data/ncbi/UniSTS.2005-04-12/UniSTS.aliases \
markers.02.12.04.txt > sangerandUniSTSId.txt
# No need to reformat this for zfishBacClonesandSts
# FPC contig information (i.e. FPC contig number) from ctgnames file is
# not included in the tables as these are dynamic and constantly
# changing with the assembly.
# Received a new version of the zfish_accs.txt file:
# zfish_accs050605.txt (from Mario Caccamo at Sanger), move this to
# /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
awk 'BEGIN {OFS="\t"} {print $1, $2}' zfish_accs.txt | sort \
> zfishOld.accs.sort
awk 'BEGIN {OFS="\t"} {print $1, $2}' zfish_accs050605.txt \
| sort > zfishNew.accs.sort
comm -13 zfishOld.accs.sort zfishNew.accs.sort > zfishNewAccs.only
comm -23 zfishOld.accs.sort zfishNew.accs.sort > zfishOldAccs.only
# 1974 zfishNewAccs.only
# 303 zfishOldAccs.only
# this has 1974 new BAC clone and accession pairs, and 303 of the previous
# BAC internal name and accession pairs are missing compared to
# zfish_accs.txt
awk '{print $1};' zfishOldAccs.only | sort > zfishOld.namesonly.sort
awk '{print $1};' zfishNewAccs.only | sort > zfishNew.namesonly.sort
# also uniq and check no name has more than one accession - each BAC
# name only appears once in the file
comm -12 zfishOld.namesonly.sort zfishNew.namesonly.sort \
> accs.OldandNew.common
wc -l accs.OldandNew.common
# 24 accs.OldandNew.common
# therefore 24 BAC names have new accessions in the new file
# zfishBacClonesandSts.c only reads the first two fields of each line
# in the zfish_accs.txt file so create a new file with just these 2 fields.
# Take the 303 BAC clones and accessions from zfish_accs.txt that are
# not in zfish_accs050605.txt and add to 9000 accs in zfishNew.accs.sort
cat zfishNew.accs.sort zfishOldAccs.only > zfish_accsMerged.txt
# 9303 zfish_accsMerged.txt - BAC clone internal names and accessions
# use zfishBacClonesandSts to create tab files for loading into
# bacCloneAlias and bacCloneXRef tables
mkdir -p /cluster/bluearc/danRer2/bacEnds/out
# added code so that output directory now must be specified
# fixed bug in code so that all aliases are now stored for each Sanger
# STS name and the correct BAC clone external name is stored for internal
# names in all cases.
# remake tab files using the new file mapping Sanger names and UniSTS IDs
# and with the corrected program (2005-05-24, hartera)
nice $HOME/bin/i386/zfishBacClonesandSts ctgnames.02.12.04.txt \
clonemarkers.02.12.04.txt markers.02.12.04.txt \
bacClones.namesandchrom.uniq zfish_accsMerged.txt \
sangerandUniSTSId.txt \
/cluster/bluearc/danRer2/bacEnds/out > zfishBacs.out
# output is in /cluster/bluearc/danRer2/bacEnds/out so copy over
# sort alias tab file by sangerName
sort -k2 /cluster/bluearc/danRer2/bacEnds/out/bacAlias.tab \
> bacAlias.sort.tab
cp /cluster/bluearc/danRer2/bacEnds/out/bacXRef.tab .
wc -l *.tab
# 55836 bacAlias.sort.tab
# 233418 bacXRef.tab
# there are 234056 entries for danRer1 bacXRef.tab - the difference
# reflects differences in BAC ends mapped to the assemblies
ssh hgwdev
cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases
# Remove and reload tables after bug fixes in the zfishBacClonesandSts
# program and the new UniSTS IDs and Sanger names mapping file which
# were used to create these tab files (2005-05-25, hartera)
hgsql danRer2 -e 'drop table bacCloneXRef'
hgsql danRer2 -e 'drop table bacCloneAlias'
hgsql danRer2 < $HOME/kent/src/hg/lib/bacCloneAlias.sql
hgsql danRer2 < $HOME/kent/src/hg/lib/bacCloneXRef.sql
nice hgsql danRer2 -e \
'load data local infile "bacAlias.sort.tab" into table bacCloneAlias'
nice hgsql danRer2 -e \
'load data local infile "bacXRef.tab" into table bacCloneXRef'
cd $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer2
# checked data in tables to verify that everything was correctly loaded
# from the files; errors were corrected - see TEST section below
# edit trackDb.ra to add bacEnds tracks and searches for the bacEndPairs
# and bacEndSingles tracks as for danRer1. copy over html from danRer1
# for bacEndPairs and bacEndSingles tracks. Edit to include description
# of filtering alignments so that there is only one pair of sequence reads
# for each BAC pair and only one sequence read for each BAC single alignment.
# Remove incorrect row from bacCloneXRef table (hartera, 2006-04-19)
# There is one row where the name is "CH211-155M11__" and the
# intName is "zC155M11__" which is incorrect.
# There is a correct row for intName zC155M11 and zC155M11__ and
# CH211-155M11__ is not in bacEnd{Singles,Pairs,PairsBad} tables and
# zC155M11__ is not an alias in bacEndAlias.
hgsql -e 'delete from bacCloneXRef where name = "CH211-155M11__";' danRer2
# corrected termRegex for some bacCloneXRef searches so that they work
# correctly (bacPairsSangerSts and bacSinglesSangerSts)
# (2006-04-19, hartera)
# BACENDS: TESTING FOR bacCloneAlias and bacCloneXRef TABLES
# (DONE, 2005-05-25, hartera)
# ADDED TEST TO CHECK SANGERNAMES IN ALIAS AND XREF TABLES
# (DONE, 2005-05-26, hartera)
# The following tests were carried out to check that all the data
# in the bacCloneAlias and bacCloneXRef tables is correct.
mkdir -p \
/cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases/testTables
cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases/testTables
# Check that the correct aliases are associated with their Sanger STS names
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $3;}' \
../markers.02.12.04.txt > sNameandaliases
# write script to get one Sanger name and one alias on each line
chmod +x getSangerAndAlias.pl
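# (getSangerAndAlias.pl is not recorded in this doc; a minimal sketch of
# the reformatting it performs, under the hypothetical name
# getSangerAndAliasSketch.pl, assuming both columns from the markers file
# are ';'-separated lists:)
cat << '_EOF_' > getSangerAndAliasSketch.pl
#!/usr/bin/perl -w
# input: sangerName1;sangerName2<TAB>alias1;alias2;...
# output: one sangerName<TAB>alias pair per line
use strict;
while (<>)
{
chomp;
my ($names, $aliases) = split(/\t/);
foreach my $n (split(/;/, $names))
    {
    foreach my $a (split(/;/, $aliases))
        {
        print "$n\t$a\n";
        }
    }
}
'_EOF_'
# << for emacs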
perl getSangerAndAlias.pl < sNameandaliases > sNameandaliases.format
sort sNameandaliases.format | uniq > sNameandaliases.sort
# get Sanger names and aliases from database
hgsql -N -e 'select sangerName, alias from bacCloneAlias;' danRer2 \
| sort | uniq > alias.db.sort
wc -l alias.db.sort
# 55836 alias.db.sort
diff sNameandaliases.sort alias.db.sort
# No difference between data file and data from database so ok
# Check Sanger STS names correspond in bacAlias and bacCloneXRef tables
# (2005-05-26, hartera)
# get Sanger names from alias table
hgsql -N -e 'select sangerName from bacCloneAlias;' danRer2 \
| sort | uniq > sName.alias.sort
wc -l sName.alias.sort
# 14940 sName.alias.sort
# get Sanger names from xRef table
hgsql -N -e 'select sangerName from bacCloneXRef where sangerName \
is not null;' danRer2 | sort | uniq > sName.xRef.sort
wc -l sName.xRef.sort
# 15153 sName.xRef.sort
comm -23 sName.alias.sort sName.xRef.sort
# nothing in alias file only so all sanger names in the alias table are
# also in the xRef table
comm -13 sName.alias.sort sName.xRef.sort > sNamexRefNotAlias
wc -l sNamexRefNotAlias
# 213 sNamexRefNotAlias - 213 sangerNames in xRef table not in alias table
# get Sanger names from clonemarkers file
awk 'BEGIN {FS="|"}{print $2}' ../clonemarkers.02.12.04.txt | sort | uniq \
> clonemarkers.sNames.sort
# get Sanger names from markers file
awk 'BEGIN {FS="|"}{print $2}' ../markers.02.12.04.txt > markers.sNames
# remove semi-colons and sort
sed -e 's/;/\n/g' markers.sNames | sort | uniq > markers.sNames.sort
# sanger names unique to markers file
comm -13 clonemarkers.sNames.sort markers.sNames.sort
# there are none
comm -23 clonemarkers.sNames.sort markers.sNames.sort \
> sNames.clonemarkersOnly
wc -l sNames.clonemarkersOnly
# 213 sNames.clonemarkersOnly
diff sNames.clonemarkersOnly sNamexRefNotAlias
# no difference so all the extra Sanger Names in the xRef table are
# from the clonemarkers file and these have no aliases in the markers file
# so they are not in the alias table so this is all ok.
# Check that Sanger STS names and primers are associated correctly
cd /cluster/data/danRer2/bed/ZonLab/bacends/cloneandStsAliases/testTables
# get sanger names and primers from markers file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $4, $5;}' \
../markers.02.12.04.txt > sNameandPrimers
# write script to reformat and write with one Sanger name per line
chmod +x getSangerandPrimers.pl
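# (getSangerandPrimers.pl is also not recorded here; a minimal sketch,
# under the hypothetical name getSangerandPrimersSketch.pl, assuming the
# Sanger name column is a ';'-separated list as in the markers file:)
cat << '_EOF_' > getSangerandPrimersSketch.pl
#!/usr/bin/perl -w
# input: sangerName1;sangerName2<TAB>leftPrimer<TAB>rightPrimer
# output: one line per sangerName with its primer pair
use strict;
while (<>)
{
chomp;
my ($names, $left, $right) = split(/\t/);
foreach my $n (split(/;/, $names))
    {
    print "$n\t$left\t$right\n";
    }
}
'_EOF_'
# << for emacs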
perl getSangerandPrimers.pl < sNameandPrimers > sNameandPrimers.format
sort sNameandPrimers.format > sNameandPrimers.format.sort
wc -l sNameandPrim*
# 12008 sNameandPrimers
# 14940 sNameandPrimers.format
# 14940 sNameandPrimers.format.sort
# get Sanger names and primers from database
hgsql -N -e \
'select sangerName, leftPrimer, rightPrimer from bacCloneXRef \
where sangerName is not null and leftPrimer is not null and \
rightPrimer is not null;' danRer2 | sort | uniq \
> sNamesandprimers.fromdb.sort
wc -l sNamesandprimers.fromdb.sort
# 14940 sNamesandprimers.fromdb.sort
diff sNamesandprimers.fromdb.sort sNameandPrimers.format.sort
# No difference so ok
# Check that UniSTS IDs and Sanger STS names are associated correctly
# get Sanger names and UniSTS IDs from the database
hgsql -N -e 'select sangerName, uniStsId from bacCloneXRef where \
uniStsId is not null;' danRer2 | sort | uniq > sNameUniSTS.fromdb.sort
wc -l sNameUniSTS.fromdb.sort
# 5456 sNameUniSTS.fromdb.sort
# Need to reformat the sNameUniSTS.fromdb.sort
chmod +x formatUniSts.pl
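# (formatUniSts.pl is not recorded in this doc; a minimal sketch, under the
# hypothetical name formatUniStsSketch.pl - the db stores comma-separated
# lists in one field, which need one value per line to match the data files:)
cat << '_EOF_' > formatUniStsSketch.pl
#!/usr/bin/perl -w
# input: name<TAB>value1,value2,... ; output: one name<TAB>value per line
use strict;
while (<>)
{
chomp;
my ($name, $vals) = split(/\t/);
foreach my $v (split(/,/, $vals))
    {
    print "$name\t$v\n";
    }
}
'_EOF_'
# << for emacs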
perl formatUniSts.pl < sNameUniSTS.fromdb.sort | sort \
> sNameUniSTS.fromdb.format.sort
# get Sanger names from data file and see how many UniSTS IDs there are
# for each name
awk '{print $1}' ../sangerandUniSTSId.txt | sort | uniq -c | sort -nr \
> sangerandUniSTSId.count
# the most is 3 UniSTS IDs
# 3 etID9056.23
# 3 etID9042.2
# 3 etID8627.2
# 3 etID8281.9
# 3 etID11096.5
# 2 etID9986.14
# 2 etID9968.17
sort ../sangerandUniSTSId.txt > sangerandUniSTSId.txt.sort
diff sangerandUniSTSId.txt.sort sNameUniSTS.fromdb.format.sort \
> sangerandUniSTSIdvsdb
# No difference between data from original file and that in database so ok
# Check that chrom mappings and external BAC clone names are correct
# get extNames and chroms they map to from the database
hgsql -N -e 'select name, chroms from bacCloneXRef where \
chroms is not null;' danRer2 | sort | uniq \
> nameandchromsfromdb.sort
# reformat nameandchromsfromdb.sort
perl formatUniSts.pl < nameandchromsfromdb.sort | sort \
> nameandchromsfromdb.format.sort
# compare extNames and chroms from db to those in data file
diff ../bacClones.namesandchrom.uniq nameandchromsfromdb.format.sort
# no difference - all ok
# Check Genbank accessions and internal BAC clone names
hgsql -N -e 'select intName,genbank from bacCloneXRef where \
genbank is not null;' danRer2 | sort | uniq \
> intNamesandAccs.fromdb.sort
# this should be a subset of zfish_accsMerged.txt - not all BAC clones
# listed in that file appear in either our BAC ends tracks or the markers
# files.
sort ../zfish_accsMerged.txt > zfish_accsMerged.sort
comm -23 intNamesandAccs.fromdb.sort zfish_accsMerged.sort
# there is nothing in the database that is not in zfish_accsMerged.sort
comm -13 intNamesandAccs.fromdb.sort zfish_accsMerged.sort > onlyinzfishAccs
wc -l onlyinzfishAccs
# 87 onlyinzfishAccs
hgsql -N -e 'select intName from bacCloneXRef where genbank is null;' danRer2 \
| sort | uniq > intNamesNoAcc.fromdb.sort
awk '{print $1;}' zfish_accsMerged.sort | sort > intNames.withAccs.sort
comm -12 intNamesNoAcc.fromdb.sort intNames.withAccs.sort \
> indbNoAccsandAccs.out
# none of these names are common to both so all accessions from
# zfish_accsMerged.txt are in the database for the internal names stored
# where there are accessions available.
# Test Sanger STS names, internal names and external names are all correct
# Test Sanger STS name and internal BAC clone names are associated correctly
# get internal names and Sanger names from data file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$2}' ../clonemarkers.02.12.04.txt \
| sort | uniq > intNameandSanger.sort
hgsql -N -e 'select intName, sangerName from bacCloneXRef \
where sangerName is not null;' danRer2 \
| sort | uniq > intNameandSanger.fromdb.sort
diff intNameandSanger.sort intNameandSanger.fromdb.sort
# No difference between data from file and that from database so ok
# Check BAC clone internal name and relationship fields
# get internal names and relationships from data file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$3}' ../clonemarkers.02.12.04.txt \
| sort | uniq > intNameandRelation.sort
# get internal names and relationships from database
hgsql -N -e 'select intName, relationship from bacCloneXRef \
where relationship != 0;' danRer2 \
| sort | uniq > intNameandrelation.fromdb.sort
# differences unique to database file
comm -13 intNameandRelation.sort intNameandrelation.fromdb.sort \
> intNameRelation.indbonly
# differences unique to data file
comm -23 intNameandRelation.sort intNameandrelation.fromdb.sort \
> intNameRelation.incloneMarkersonly
wc -l intNameRelation*
# 4345 intNameRelation.incloneMarkersonly
# 4345 intNameRelation.indbonly
awk '{print $1}' intNameRelation.indbonly > intNameRelation.indbonly.names
awk '{print $1}' intNameRelation.incloneMarkersonly \
> intNameRelation.incloneMarkersonly.names
diff intNameRelation.indbonly.names intNameRelation.incloneMarkersonly.names
# there is no difference in the internal names that have relationship
# fields; the only place these files should differ is the second column,
# which should always be 3 in the data from the database only. This is
# because relationship entries that were blank in the clonemarkers file
# were changed to 3 when entered into the database.
awk '{print $2}' intNameRelation.indbonly | sort | uniq
# 3 - correct so all ok
# all the differences should be that those that are blank in clonemarkers
# are 3 in the database.
# check that those that have 0 in the database bacCloneXRef relationship
# field are not in the list from cloneMarkers
# select these internal names with 0 relationship from the database
hgsql -N -e 'select intName from bacCloneXRef where relationship = 0;' \
danRer2 | sort | uniq > intNameNoRelation.fromdb.sort
# get all the internal names from the data file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.02.12.04.txt \
| sort | uniq > intNamefromCloneMarkers.sort
comm -12 intNameNoRelation.fromdb.sort intNamefromCloneMarkers.sort
# nothing in common between these two files as expected so there are
# no internal names in the db with 0 in the relationship field that
# appear in the clonemarkers file.
# Check all BAC clone internal names and external names from the
# ctgnames file are in the database
# get intName and extName from ctgnames file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2,$3}' ../ctgnames.02.12.04.txt \
| sort | uniq > intNameandextNamefromCtgNames.sort
# get intName and extName from database
hgsql -N -e 'select intName,name from bacCloneXRef;' danRer2 \
| sort | uniq > intNameandextName.fromdb.sort
wc -l intNameandextName*
# 218100 intNameandextName.fromdb.sort
# 167441 intNameandextNamefromCtgNames.sort
comm -12 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
> intandextindbAndCtgNames
wc -l intandextindbAndCtgNames
# 167441 intandextindbAndCtgNames
# there are 167441 name pairs common between the file and the database
# and this is the same number of name pairs as in the data file
diff intandextindbAndCtgNames intNameandextNamefromCtgNames.sort
# no difference between those name pairs from the data file and those that
# are common between the data file and the database so all internal and
# external names from ctgNames file are in the database
# get the list of extra ones from db
comm -23 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
> intandextNamesindbNotinCtgNames
wc -l intandextNamesindbNotinCtgNames
# 50659 intandextNamesindbNotinCtgNames
# get list of internal names from the clonemarkers file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.02.12.04.txt | sort | uniq \
> clonemarkers.intName.sort
wc -l clonemarkers.intName.sort
# 12987 clonemarkers.intName.sort
# compare these intNames to those from the database not in the ctgnames file
comm -12 clonemarkers.intName.sort intandextNamesindbNotinCtgNames
# none of these clone markers internal names are in this list so they
# must all be in the ctgnames file too. These extra internal names will be
# translations of external names found in the list of mappings of BAC clones
# to chroms.
# Check that all the BAC clone external names from the list of chromosome
# mappings and from the ctgnames file are in the database.
# get all extNames from baclones.namesandchrom.uniq and from ctgnames
awk '{print $1}' ../bacClones.namesandchrom.uniq > \
extNames.ctgnamesandbacClones
awk 'BEGIN {FS="|"} {print $3;}' ../ctgnames.02.12.04.txt \
>> extNames.ctgnamesandbacClones
wc -l extNames.ctgnamesandbacClones
# 384421 extNames.ctgnamesandbacClones
sort extNames.ctgnamesandbacClones | uniq \
> extNames.ctgnamesandbacClones.sort
wc -l extNames.ctgnamesandbacClones.sort
# 218100 extNames.ctgnamesandbacClones.sort
# get extNames from the database
hgsql -N -e 'select name from bacCloneXRef;' danRer2 | sort | uniq \
> extNames.fromdb.sort
wc -l extNames.fromdb.sort
# 218100 extNames.fromdb.sort
comm -12 extNames.fromdb.sort extNames.ctgnamesandbacClones.sort \
> extNames.fromdbandfiles
wc -l extNames.fromdbandfiles
# 218100 extNames.fromdbandfiles
# find extNames in common from data files and database
diff extNames.fromdb.sort extNames.fromdbandfiles
# no difference, all extNames from files are in db
# Check that all BAC clone internal names from the ctgnames and clonemarkers
# files are in the database
# get internal names from ctgnames and clonemarkers files
awk 'BEGIN {FS="|"} {print $2;}' ../ctgnames.02.12.04.txt \
> intNames.ctgnamesandclonemarkers
awk 'BEGIN {FS="|"} {print $1;}' ../clonemarkers.02.12.04.txt \
>> intNames.ctgnamesandclonemarkers
wc -l intNames.ctgnamesandclonemarkers
# 195746 intNames.ctgnamesandclonemarkers
sort intNames.ctgnamesandclonemarkers | uniq \
> intNames.ctgnamesandclonemarkers.sort
wc -l intNames.ctgnamesandclonemarkers.sort
# 167441 intNames.ctgnamesandclonemarkers.sort
# get internal names from database
hgsql -N -e 'select intName from bacCloneXRef;' danRer2 | sort | uniq \
> intNames.fromdb.sort
wc -l intNames.fromdb.sort
# 218100 intNames.fromdb.sort
# some of these intNames are derived from the corresponding extNames
# all of the intNames from the file should be in the db
comm -12 intNames.fromdb.sort intNames.ctgnamesandclonemarkers.sort \
> intNames.fromdbandfiles
wc -l intNames.fromdbandfiles
# 167441 intNames.fromdbandfiles
diff intNames.fromdbandfiles intNames.ctgnamesandclonemarkers.sort
# no difference, all intNames from files are in db
# Check that all translations are correct between BAC clone
# external and internal names.
# write script to get the prefixes from internal and external names
chmod +x getNamePrefixes.pl
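# (getNamePrefixes.pl is not recorded in this doc; a minimal sketch, under
# the hypothetical name getNamePrefixesSketch.pl, assuming external names
# are prefix-dash-position like CH211-69C14 and internal names begin with
# an alphabetic prefix like zC069C14:)
cat << '_EOF_' > getNamePrefixesSketch.pl
#!/usr/bin/perl -w
# input: extName<TAB>intName, e.g. CH211-69C14<TAB>zC069C14
# output: the two name prefixes, i.e. everything before the '-' in the
# external name and the leading non-digit characters of the internal name
use strict;
while (<>)
{
chomp;
my ($ext, $int) = split(/\t/);
my ($extPre) = ($ext =~ /^([^-]+)-/);
my ($intPre) = ($int =~ /^(\D+)/);
print "$extPre\t$intPre\n" if (defined($extPre) && defined($intPre));
}
'_EOF_'
# << for emacs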
hgsql -N -e 'select name, intName from bacCloneXRef;' danRer2 \
| sort | uniq > extandintNames.fromdb.sort
perl getNamePrefixes.pl < extandintNames.fromdb.sort \
> extandintNames.prefixes
sort extandintNames.prefixes | uniq > extandintNames.prefixes.uniq
# these all look good
# BUSM1 dZ
# CH211 zC
# CH211 zc
# CH73 zH
# CT7 bP
# DKEY zK
# DKEY zk
# DKEYP zKp
# RP71 bZ
# XX bY
# zk is an internal name prefix for the external name prefix, DKEY-. There
# is only one example where this is used (DKEY-81G7); it is in the
# ctgnames file and is in the bacCloneXRef table so that is ok.
# All data looks good in these tables now.
# Pick up photo from NHGRI (DONE - 2004-12-22 - Hiram)
ssh hgwdev
cd /tmp
wget \
'http://www.genome.gov/Pages/News/Photos/Science/zebrafish_image.jpg' \
-O zebrafish_image.jpg
# no crop images for this one, make the thumbnail directly:
convert -geometry 300x200 -quality 80 -sharpen 0 -normalize \
zebrafish_image.jpg Danio_rerio.jpg
cp -p Danio_rerio.jpg /usr/local/apache/htdocs/images
# add links to this image in the description.html page, request push
# EXTRACT AXTs and MAFs FROM FUGU (fr1) NET (DONE, 2004-12-22, hartera)
# GZIP AXTs and MAFs (DONE, 2005-05-16, hartera)
ssh kksilo
# create axts
cd /cluster/data/danRer2/bed/blastz.fr1/axtChain
gunzip fuguFr1.net.gz
gunzip chainAR/*.chain.gz
netSplit fuguFr1.net fuguFr1Net
mkdir -p ../axtNet
cat > axtNet.csh << 'EOF'
foreach f (fuguFr1Net/chr*.net)
set c = $f:t:r
echo "axtNet on $c"
netToAxt fuguFr1Net/$c.net chainAR/$c.chain \
/cluster/data/danRer2/nib \
/cluster/data/fr1/nib ../axtNet/$c.axt
echo "Complete: $c.net -> $c.axt"
end
'EOF'
chmod +x axtNet.csh
csh axtNet.csh >&! axtNet.log &
tail -100f axtNet.log
# sort axts before making mafs - must be sorted for multiz
cd /cluster/data/danRer2/bed/blastz.fr1
mv axtNet axtNet.unsorted
mkdir axtNet
foreach f (axtNet.unsorted/*.axt)
set c = $f:t:r
echo "Sorting $c"
axtSort $f axtNet/$c.axt
end
# create maf
ssh kksilo
cd /cluster/data/danRer2/bed/blastz.fr1
cd axtNet
mkdir ../mafNet
cat > makeMaf.csh << 'EOF'
foreach f (chr*.axt)
set maf = $f:t:r.fr1.maf
echo translating $f to $maf
axtToMaf $f \
/cluster/data/danRer2/chrom.sizes /cluster/data/fr1/chrom.sizes \
../mafNet/$maf -tPrefix=danRer2. -qPrefix=fr1.
end
'EOF'
chmod +x makeMaf.csh
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
cd /cluster/data/danRer2/bed/blastz.fr1/axtChain
nice gzip fuguFr1.net chainAR/*.chain &
# gzip axts and mafs (hartera, 2005-05-16)
ssh kkstore01
cd /cluster/data/danRer2/bed/blastz.fr1
nice gzip axtNet/*.axt mafNet/*.maf &
# EXTRACT AXTs and MAFs FROM TETRAODON (tetNig1) NET (DONE, 2004-12-22, hartera)
# GZIP AXTs and MAFs AND CHAINS (DONE, 2005-05-16, hartera)
ssh kksilo
# create axts
cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain
netSplit tetraTetNig1.net tetraTetNig1Net
mkdir -p ../axtNet
cat > axtNet.csh << 'EOF'
foreach f (tetraTetNig1Net/chr*.net)
set c = $f:t:r
echo "axtNet on $c"
netToAxt tetraTetNig1Net/$c.net chainAR/$c.chain \
/cluster/data/danRer2/nib \
/cluster/data/tetNig1/nib ../axtNet/$c.axt
echo "Complete: $c.net -> $c.axt"
end
'EOF'
chmod +x axtNet.csh
csh axtNet.csh >&! axtNet.log &
tail -100f axtNet.log
# sort axts before making mafs - must be sorted for multiz
cd /cluster/data/danRer2/bed/blastz.tetNig1
mv axtNet axtNet.unsorted
mkdir axtNet
foreach f (axtNet.unsorted/*.axt)
set c = $f:t:r
echo "Sorting $c"
axtSort $f axtNet/$c.axt
end
# create maf
ssh kksilo
cd /cluster/data/danRer2/bed/blastz.tetNig1
cd axtNet
mkdir ../mafNet
cat > makeMaf.csh << 'EOF'
foreach f (chr*.axt)
set maf = $f:t:r.tetNig1.maf
echo translating $f to $maf
axtToMaf $f \
/cluster/data/danRer2/chrom.sizes /cluster/data/tetNig1/chrom.sizes \
../mafNet/$maf -tPrefix=danRer2. -qPrefix=tetNig1.
end
'EOF'
chmod +x makeMaf.csh
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
# gzip axts and mafs (hartera, 2005-05-16)
ssh kkstore01
cd /cluster/data/danRer2/bed/blastz.tetNig1
nice gzip axtNet/*.axt mafNet/*.maf &
# gzip chains again
cd axtChain
nice gzip chainAR/*.chain &
# TIGR GENE INDEX (DONE, 2004-12-22, hartera)
# Data from rsultana@tigr.org (Razvan Sultana at TIGR)
ssh kksilo
mkdir -p /cluster/data/danRer2/bed/tigr
cd /cluster/data/danRer2/bed/tigr
wget ftp://ftp.tigr.org/pub/data/tgi/Danio_rerio/TGI_track_danRer2_12-2004.tgz
tar xvzf TGI*.tgz
foreach f (*g_gallus*)
set f1 = `echo $f | sed -e 's/g_gallus/chicken/g'`
mv $f $f1
end
foreach f (*drosoph*)
set f1 = `echo $f | sed -e 's/drosoph/Dmelano/g'`
mv $f $f1
end
foreach o (Dmelano chicken elegans human mouse rat zfish)
echo $o
setenv O $o
foreach f (chr*_$o*s)
tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
end
end
ssh hgwdev
cd /cluster/data/danRer2/bed/tigr
hgsql danRer2 -e "drop table tigrGeneIndex"
nice ldHgGene -exon=TC danRer2 tigrGeneIndex *.gff
# Read 114013 transcripts in 395310 lines in 196 files
# 114013 groups 28 seqs 1 sources 1 feature types
# 114013 gene predictions
hgsql danRer2 -e "update tigrGeneIndex set cdsStart = txStart;"
hgsql danRer2 -e "update tigrGeneIndex set cdsEnd = txEnd;"
checkTableCoords danRer2 tigrGeneIndex
/cluster/bin/scripts/runGeneCheck /cluster/data/danRer2/bed/tigr
# 135739 badUtrSplice
# 114013 noCDS - fixed these in table as above
# 30191 gap
gzip *.gff *TCs
# make changes in doTigrGeneIndex function in hgc to add original species
# name back when constructing URL to TIGR web site.
# 3-WAY MULTIZ MULTIPLE ALIGNMENT
# (zebrafish danRer2, fugu fr1 and tetraodon tetNig1)
# (DONE, 2004-12-23, hartera)
# use v.8 of multiz (see makeHg17.doc for 8-WAY alignment)
ssh kksilo
set multizDir = multiz.2004-12-22
set workingDir = /cluster/bluearc/danRer2/$multizDir
ln -s $workingDir /cluster/bluearc/danRer2/multiz3way
mkdir -p $workingDir
mkdir -p /cluster/data/danRer2/bed/$multizDir
ln -s /cluster/data/danRer2/bed/$multizDir \
/cluster/data/danRer2/bed/multiz3way
cd /cluster/data/danRer2/bed/multiz3way
# wrapper script for multiz
# NOTE: first arg is pairwise, 2nd arg is multiple (to add to)
# NOTE: next time, modify script so it only needs one arg -- saves the
# multiple dirname in a file for use by the next run
cat << 'EOF' > doMultiz.csh
#!/bin/csh -fe
mkdir -p $3:h
/cluster/bin/penn/multiz $1 $2 - > $3
'EOF'
# << for emacs
cat << 'EOF' > gsub
#LOOP
../doMultiz.csh {check in line /cluster/bluearc/danRer2/multiz3way/$(dir1)/$(root2).maf} {check in line /cluster/bluearc/danRer2/multiz3way/$(root1)/$(root2).maf} {check out line+ /cluster/bluearc/danRer2/multiz3way/$(root1)$(dir1)/$(root2).maf}
#ENDLOOP
'EOF'
# << for emacs
chmod +x doMultiz.csh
ssh kksilo
set workingDir = /cluster/bluearc/danRer2/multiz3way
# copy mafs to bluearc for tetraodon (tetNig1)
mkdir $workingDir/tetNig1
cd /cluster/data/danRer2/bed/blastz.tetNig1/mafNet
foreach f (*.maf)
set c = $f:t:r
set g = $c:t:r
echo $g
cp $f $workingDir/tetNig1/${g}.maf
end
cd /cluster/data/danRer2/bed/multiz3way
ls $workingDir/tetNig1/*.maf > chrom.lst
# fugu (fr1)
mkdir $workingDir/fr1
cd /cluster/data/danRer2/bed/blastz.fr1/mafNet
foreach f (*.maf)
set c = $f:t:r
set g = $c:t:r
echo $g
cp $f $workingDir/fr1/${g}.maf
end
# one multiz - add in fugu to zebrafish/tetraodon
ssh kki
set workingDir = /cluster/bluearc/danRer2/multiz3way
cd /cluster/data/danRer2/bed/multiz3way
mkdir run.fr1
cd run.fr1
echo "fr1/tetNig1" > species.lst
gensub2 species.lst ../chrom.lst ../gsub jobList
para create jobList
# 28 jobs in batch
para try, check, push, check ...
# para time
# CPU time in finished jobs: 337s 5.62m 0.09h 0.00d 0.000 y
# IO & Wait Time: 153s 2.55m 0.04h 0.00d 0.000 y
# Average job time: 18s 0.29m 0.00h 0.00d
# Longest job: 66s 1.10m 0.02h 0.00d
# Submission to last job: 134s 2.23m 0.04h 0.00d
cd ..
# copy 3-way mafs to build directory
ssh kksilo
set workingDir = /cluster/bluearc/danRer2/multiz3way
ln -s $workingDir/tetNig1fr1 $workingDir/maf
cd /cluster/data/danRer2/bed/multiz3way
mkdir maf
cp $workingDir/maf/*.maf maf
# WZ EST CLUSTERS ALIGNMENTS (DONE, 2004-12-23, hartera)
# WZ ESTs are compiled ESTs from WashU. They were provided by
# Anthony DiBiase from Leonard Zon's lab at the Children's Hospital, Harvard
# Contact: adibiase@enders.tch.harvard.edu
# http://zon.tchlab.org
ssh kksilo
mkdir -p /cluster/data/danRer2/bed/ZonLab/wzESTs
cd /cluster/data/danRer2/bed/ZonLab/wzESTs
# put WZ ESTs in this directory as wzcontigs.txt -
# obtained from the Zon Lab, these are unmasked.
# There are 42857 ESTs in this file.
# Translated to upper case for danRer1
cp /cluster/data/danRer1/bed/ZonLab/wzESTs/wzcontigs.fa .
cd /cluster/data/danRer2/bed/ZonLab/wzESTs
mkdir -p /cluster/bluearc/danRer2/wzESTs
faSplit sequence wzcontigs.fa 20 /cluster/bluearc/danRer2/wzESTs/wzcontigs
ssh kki
cd /cluster/data/danRer2/bed/ZonLab/wzESTs
mkdir psl
ls -1 /cluster/bluearc/danRer2/wzESTs/*.fa > est.lst
ls -1S /cluster/bluearc/danRer2/trfFa/*.fa > genome.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} -ooc={check in exists /iscratch/i/danRer2/11.ooc} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 genome.lst est.lst gsub spec
para create spec
para try, check, push, try ....
# para time
# Completed: 5980 of 5980 jobs
# CPU time in finished jobs: 17176s 286.27m 4.77h 0.20d 0.001 y
# IO & Wait Time: 22870s 381.16m 6.35h 0.26d 0.001 y
# Average job time: 7s 0.11m 0.00h 0.00d
# Longest job: 49s 0.82m 0.01h 0.00d
# Submission to last job: 2890s 48.17m 0.80h 0.03d
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create wzEsts.psl
ssh kksilo
cd /cluster/data/danRer2/bed/ZonLab/wzESTs
pslSort dirs raw.psl tmp psl
# only use alignments that have at least
# 96% identity in aligned region. use parameters used by auto
# GenBank update for native ESTs
pslReps -minAli=0.96 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp wz_ests.psl /cluster/data/danRer2/jkStuff/liftAll.lft warn contig.psl
# check psl
pslCheck wz_ests.psl >& pslCheck.log
# looks good
# Load EST alignments into database.
ssh hgwdev
cd /cluster/data/danRer2/bed/ZonLab/wzESTs
hgLoadPsl danRer2 wz_ests.psl
# Add WZ EST sequences
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/danRer2/wzESTs
ln -s \
/cluster/data/danRer1/bed/ZonLab/wzESTs/wzcontigs.fa \
/gbdb/danRer2/wzESTs
hgLoadSeq danRer2 /gbdb/danRer2/wzESTs/wzcontigs.fa
# MAKE Human Proteins track (hg17 DONE 2004-12-15 braney)
ssh kksilo
mkdir -p /cluster/data/danRer2/blastDb
cd /cluster/data/danRer2/blastDb
cut -f 1 ../chrom.sizes | sed "s/chr//" | sed "/NA/d" | sed "/Un/d" > chrom.list
for i in `cat chrom.list`; do ls -1 ../$i/*/*.fa . ; done | sed -n "/.*_.*_.*_.*/p" > list
ln -s `cat list` .
for i in *.fa
do
formatdb -i $i -p F
done
rm *.log *.fa list
cd ..
for i in `cat blastDb/chrom.list`; do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft
rm blastDb/chrom.list
mkdir /cluster/data/danRer2/scaffoldBlastDb
cd /cluster/data/danRer2/scaffoldBlastDb
cat ../Un/scaffolds/*.fa ../NA/scaffolds/*.fa | faSplit sequence stdin 500 scaf
for i in *.fa
do
formatdb -i $i -p F
done
rm *.log *.fa
ssh kkr1u00
mkdir -p /iscratch/i/danRer2/blastDb
cd /cluster/data/danRer2/blastDb
for i in nhr nin nsq; do cp *.$i /iscratch/i/danRer2/blastDb ; echo $i; done
mkdir -p /iscratch/i/danRer2/scaffoldBlastDb
cd /cluster/data/danRer2/scaffoldBlastDb
for i in nhr nin nsq; do cp *.$i /iscratch/i/danRer2/scaffoldBlastDb ; echo $i; done
cd
(iSync) > sync.out
mkdir -p /cluster/data/danRer2/bed/tblastn.hg17KG
cd /cluster/data/danRer2/bed/tblastn.hg17KG
echo /iscratch/i/danRer2/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
echo /iscratch/i/danRer2/scaffoldBlastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > scaffold.lst
# back to kksilo
exit
# we want around 250000 jobs
cd /cluster/data/danRer2/bed/tblastn.hg17KG
calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\)
# 42156/(250000/3899) = 657.464976
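# i.e. splitting the 42156 hg17KG psl lines into chunks of 657 lines gives
# ceil(42156/657) = 65 query files; 65 files x 3899 blast databases in
# query.lst = 253435 jobs (see the para output below)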
mkdir -p /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa
split -l 657 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa/kg
ln -s /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa kgfa
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/danRer2/bed/tblastn.hg17KG/blastOut
ln -s /cluster/bluearc/danRer2/bed/tblastn.hg17KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/subChr.lft carry $f.2
liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft carry $f.3
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
# I ended up doing the scaffolds separately from the chopped up chrom segments
# but next time I wouldn't
gensub2 scaffold.lst kg.lst blastGsub scaffoldBlastSpec
ssh kk
cd /cluster/data/danRer2/bed/tblastn.hg17KG
para create blastSpec
para push
# Completed: 253435 of 253435 jobs
# CPU time in finished jobs: 58003988s 966733.13m 16112.22h 671.34d 1.839 y
# IO & Wait Time: 13196517s 219941.96m 3665.70h 152.74d 0.418 y
# Average job time: 281s 4.68m 0.08h 0.00d
# Longest job: 6238s 103.97m 1.73h 0.07d
# Submission to last job: 174503s 2908.38m 48.47h 2.02d
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
ssh kki
cd /cluster/data/danRer2/bed/tblastn.hg17KG
para create chainSpec
para push
# Completed: 1287 of 1287 jobs
# CPU time in finished jobs: 72236s 1203.94m 20.07h 0.84d 0.002 y
# IO & Wait Time: 22886s 381.43m 6.36h 0.26d 0.001 y
# Average job time: 74s 1.23m 0.02h 0.00d
# Longest job: 3374s 56.23m 0.94h 0.04d
# Submission to last job: 5982s 99.70m 1.66h 0.07d
exit
# back to kksilo
cd /cluster/data/danRer2/bed/tblastn.hg17KG/blastOut
# again some weirdness because I did the NA and Un scaffolds separately
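# in the awk filters below, psl field 11 is qSize, 12 is qStart, 13 is
# qEnd and 1 is matches: ($13-$12)/$11 > 0.6 keeps alignments spanning
# more than 60% of the protein query, $1/$11 > 0.60 keeps those matching
# more than 60% of it, and pslUniq keeps the first (best, after the
# sort -rn) alignment for each query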
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > cs60.$i.psl
sort -rn cs60.$i.psl | pslUniq stdin us.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" cs60.$i.psl > ms60.$i.psl
echo $i
done
for i in kg??
do
cat $i/c.*.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > /cluster/data/danRer2/bed/tblastn.hg17KG/blastHg17KG.psl
cd ..
# lift the chrUn and chrNA scaffolds
ssh hgwdev
cd /cluster/data/danRer2/bed/tblastn.hg17KG
hgLoadPsl danRer2 blastHg17KG.psl
# back to kksilo
rm -rf blastOut
# End tblastn
# ACCESSIONS FOR CONTIGS (WORKING 2005-03-17 kate)
# Used for Assembly track details, and also ENCODE AGP's
cd /cluster/data/danRer2/bed
mkdir -p contigAcc
cd contigAcc
# extract WGS contig to accession mapping from Entrez Nucleotide summaries
# To get the summary file, access the Genbank page for the project
# by searching:
# genus[ORGN] AND WGS[KYWD]
# At the end of the page, click on the list of accessions for the contigs.
# Select summary format, and send to file.
# output to file <species>.acc
grep scaffold zfish.acc | wc -l
# 21333
# edit hg/encode/regionAgp/contigAccession.pl to recognize zfish format
# and install in /cluster/bin/scripts
contigAccession zfish.acc > contigAcc.tab
wc -l contigAcc.tab
# 21333
grep Zv4 /cluster/data/danRer2/?{,?}/*.agp | wc -l
# 21333
hgsql danRer2 < $HOME/kent/src/hg/lib/contigAcc.sql
echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \
hgsql danRer2
hgsql danRer2 -e "SELECT COUNT(*) FROM contigAcc"
# 21333
# KNOWN GENES TRACK (in progress, 2005-02-14, hartera)
# Found number of loci for Ensembl and mRNA by clustering
cd $HOME/kent/src/hg/near/hgClusterGenes
hgClusterGenes -noProt danRer2 ensGene ensCluster ensCanonical
# creates 23675 clusters, there are 32062 entries in ensGene
# remove extra tables created by this program
echo 'drop table ensCluster;' | hgsql danRer2
echo 'drop table ensCanonical;' | hgsql danRer2
cd $HOME/kent/src/hg/geneBounds/clusterRna
clusterRna danRer2 rna.bed est.bed -noEst
wc -l rna.bed
# 10,383 clusters, RefGene has 8397 accessions so that is about right
rm *.bed
# Therefore Ensembl should be the basis for the Known Genes tracks
# since it represents the most loci.
# move Ensembl to be the first track on the browser display and
# with default visibility of pack
# edit zebrafish/danRer2/trackDb.ra
# track ensGene
# shortLabel Ensembl Genes
# longLabel Ensembl Gene Predictions
# group genes
# priority 32.8
# visibility pack
# color 150,0,0
# type genePred ensPep
# url http://www.ensembl.org/perl/transview?transcript=$$
# hgGene on
cd kent/src/hg/hgGene
# edit hgGene.c to add special case of ensGene table for Zebrafish
# from kent/src/hg/near/makeNear.doc
ssh hgwdev
cd kent/src/hg/near/hgNear/hgNearData
mkdir Zebrafish
cvs add Zebrafish
cd Zebrafish
cp ../C_elegans/genome.ra .
# edit this to be specific for zebrafish and commit to CVS
cvs add genome.ra
cvs commit genome.ra
cd $HOME/kent/src/hg/near/hgClusterGenes
# Cluster together various alt splice forms
hgClusterGenes -noProt danRer2 ensGene ensIsoforms ensCanonical
# Got 23675 clusters, from 32062 genes in 28 chromosomes
# need to add genome.ra in hgGeneData
cd $HOME/kent/src/hg/hgGene/hgGeneData
mkdir Zebrafish
cd Zebrafish
cp ../C_elegans/genome.ra .
# edit genome.ra and commit to CVS
cp ../C_elegans/links.ra .
# edit links.ra and commit to CVS
# download protein accessions from Ensembl
# go to http://www.ensembl.org/Multi/martview
# select Ensembl 29 and Danio rerio genes (ZFISH 4)
# go to Features and select Ensembl Gene ID from Ensembl Attributes,
# then from External References, select UniProt/Swiss-Prot AC,
# UniProt AC and RefSeq DNA accession
# copy to this directory and unzip
gunzip dr2EnsemblUniProt.gz
wc -l dr2EnsemblUniProt
# 34115 dr2EnsemblUniProt
awk 'BEGIN{FS="\t"} {print $3;}' dr2EnsemblUniProt > ensembl.uniprot
sort ensembl.uniprot | uniq > ensembl.uniprot.uniq
# 4361
# download uniProt AC, UniProt/SPTREMBL ID, UniProt/Swiss-Prot ID
# remove blank lines from ensembl.uniprot
sed -e '/^$/d' ensembl.uniprot > ensembl.uniprot.accsonly
# there are 6171 UniProt accessions so some Ensembl IDs share accessions
#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)
# EXTRACT AXTs and MAFs FROM MOUSE (mm6) NET - already done, see
# /cluster/data/danRer2/bed/blastz.mm6.swap/mafNet and makeMm6.doc
# EXTRACT AXTs and MAFs FROM OPOSSUM (monDom1) NET
# (DONE, 2005-05-13, hartera) - see makeMonDom1.doc
# EXTRACT AXTs AND MAF's FROM HUMAN NET (DONE, 2005-05-15, hartera)
ssh kkstore01
# create axts
cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
gunzip humanhg17.net.gz
gunzip chainAR/*.chain.gz
netSplit humanhg17.net humanHg17Net
mkdir -p ../axtNet
cat > axtNet.csh << 'EOF'
foreach f (humanHg17Net/chr*.net)
set c = $f:t:r
echo "axtNet on $c"
netToAxt humanHg17Net/$c.net chainAR/$c.chain \
/cluster/data/danRer2/nib \
/cluster/data/hg17/nib ../axtNet/$c.axt
echo "Complete: $c.net -> $c.axt"
end
'EOF'
chmod +x axtNet.csh
csh axtNet.csh >&! axtNet.log &
tail -100f axtNet.log
# sort axts before making mafs - must be sorted for multiz
cd /cluster/data/danRer2/bed/blastz.hg17.swap
mv axtNet axtNet.unsorted
mkdir axtNet
foreach f (axtNet.unsorted/*.axt)
set c = $f:t:r
echo "Sorting $c"
axtSort $f axtNet/$c.axt
end
# create maf
ssh kkstore01
cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtNet
mkdir ../mafNet
cat > makeMaf.csh << 'EOF'
foreach f (chr*.axt)
set maf = $f:t:r.hg17.maf
echo translating $f to $maf
axtToMaf $f \
/cluster/data/danRer2/chrom.sizes /cluster/data/hg17/chrom.sizes \
../mafNet/$maf -tPrefix=danRer2. -qPrefix=hg17.
end
'EOF'
chmod +x makeMaf.csh
csh makeMaf.csh >&! makeMaf.log &
tail -100f makeMaf.log
cd /cluster/data/danRer2/bed/blastz.hg17.swap/axtChain
nice gzip humanhg17.net chainAR/*.chain &
cd /cluster/data/danRer2/bed/blastz.hg17.swap
nice gzip axtNet/*.axt mafNet/*.maf &
# PREP FOR LIFTOVER CHAINS FOR THIS ASSEMBLY (DONE, 2005-05-15, hartera)
# split into 3k chunks
ssh kkr1u00
cd /cluster/data/danRer2
set liftDir = /iscratch/i/danRer2/liftOver/liftSplit
mkdir -p $liftDir
cd $liftDir
mkdir -p split lift
cat > split.csh << 'EOF'
set liftDir = /iscratch/i/danRer2/liftOver/liftSplit
cd /cluster/data/danRer2
foreach n (`ls ?{,?}/*.fa`)
set d = $n:h
set c = $n:t:r
echo $c
faSplit -lift=$liftDir/lift/$c.lft size \
/cluster/data/danRer2/$d/$c.fa -oneFile 3000 $liftDir/split/$c
end
'EOF'
# << for emacs
chmod +x split.csh
csh split.csh >&! split.log &
tail -100f split.log
mkdir /iscratch/i/danRer2/liftOver/liftSplit/split/UnandNA
mv /iscratch/i/danRer2/liftOver/liftSplit/split/chrUn.fa \
/iscratch/i/danRer2/liftOver/liftSplit/split/chrNA.fa \
/iscratch/i/danRer2/liftOver/liftSplit/split/UnandNA
iSync
# MAKE VSTETNIG1 DOWNLOADABLES (DONE, 2005-05-16, hartera)
ssh hgwdev
cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChrom
set gp = /usr/local/apache/htdocs/goldenPath/danRer2
mkdir -p $gp/vsTetNig1/axtChrom
cp -p *.axt $gp/vsTetNig1/axtChrom
cd $gp/vsTetNig1/axtChrom
gzip *.axt
md5sum *.gz > md5sum.txt
# copy chains and net to downloads area
cd /cluster/data/danRer2/bed/blastz.tetNig1/axtChain
gzip -c all.chainARFilt5k > /cluster/data/danRer2/zip/tetraTetNig1.chain.gz
gzip -c tetraTetNig1.net > /cluster/data/danRer2/zip/tetraTetNig1.net.gz
cd $gp/vsTetNig1
mv /cluster/data/danRer2/zip/tetra*.gz .
md5sum *.gz > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# 6-WAY VAR_MULTIZ ALIGNMENTS (DONE, 2005-05-20, hartera)
# Species: zebrafish(danRer2), human (hg17), mouse(mm6), opossum(monDom1),
# fugu(fr1) and tetraodon(tetNig1)
# Tetraodon and Fugu are quite distant from zebrafish, more distant than
# human/chicken so use the HoxD55.q matrix in future for blastz with these
# species. Here the pairwise alignments of zebrafish with Tetraodon and Fugu
# were done using the default matrix for blastz. (2005-06-01, hartera)
# Prepared tree image for Conservation track details page (2005-06-07, hartera)
ssh kkstore01
mkdir -p /cluster/data/danRer2/bed/multiz6way
cd /cluster/data/danRer2/bed/multiz6way
mkdir mafLinks
# set up directories for links to mafs for each pairwise alignment
mkdir mafLinks/hg17
mkdir mafLinks/mm6
mkdir mafLinks/monDom1
mkdir mafLinks/fr1
mkdir mafLinks/tetNig1
set dir=/cluster/data/danRer2/bed
set dirMonDom=/cluster/data/monDom1/bed
# need to make all the links so that they are just chrN.maf.gz
# without the species db in the middle e.g. chrN.hg17.maf.gz
ln -s $dir/blastz.mm6.swap/mafNet/*.maf.gz ./mafLinks/mm6
echo "$dir/blastz.hg17.swap" > dir.lst
echo "$dirMonDom/blastz.monDom1.to.danRer2" >> dir.lst
echo "$dir/blastz.fr1" >> dir.lst
echo "$dir/blastz.tetNig1" >> dir.lst
foreach d (`cat dir.lst`)
foreach m ($d/mafNet/*.maf.gz)
foreach c (`cat /cluster/data/danRer2/chrom.lst`)
foreach sp (hg17 monDom1 fr1 tetNig1)
set maf = ${d}/mafNet/chr${c}.${sp}.maf.gz
if ($m == $maf) then
set s = $sp
echo $s
endif
end
ln -s $d/mafNet/chr${c}.${s}.maf.gz ./mafLinks/$s/chr${c}.maf.gz
end
end
end
# Copy MAFS to Iservers for kluster run
ssh kkr1u00
mkdir /iscratch/i/danRer2/multiz6way
cd /iscratch/i/danRer2/multiz6way
rsync -a --copy-links --progress \
/cluster/data/danRer2/bed/multiz6way/mafLinks/ .
# 321 Mb of data - takes seconds
mkdir penn
cp -p /cluster/bin/penn/psuCVS/multiz-tba/multiz penn
cp -p /cluster/bin/penn/maf_project penn
/cluster/bin/iSync
# Progressive alignment up the tree w/o stager,
# using multiz.v10 (var_multiz)
# Method: align internal subtrees (using 0 flag to var_multiz)
# Then, align these to human (using 1 flag to var_multiz)
# NOTE: must use maf_project after each multiz run, in order
# to order output. Single-cov guaranteed by use of net MAF's,
# so it is not necessary to run single_cov2.
# make output dir and run dir
ssh kki
cd /cluster/data/danRer2/bed/multiz6way
mkdir -p maf
mkdir -p run
cd run
# create scripts to run var_multiz on cluster
cat > oneMultiz.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set multi = /scratch/danRer2/multiz6way.$c
set pairs = /iscratch/i/danRer2/multiz6way
# special mode --
# with 1 arg, cleanup
if ($#argv == 1) then
rm -fr $multi
exit
endif
# special mode --
# with 3 args, saves an alignment file
if ($#argv == 3) then
cp $multi/$2/$c.maf $3
exit
endif
set s1 = $2
set s2 = $3
set flag = $4
# locate input files -- in pairwise dir, or multiple dir
set d1 = $multi
set d2 = $multi
if (-d $pairs/$s1) then
set d1 = $pairs
set f1 = $d1/$s1/$c.maf.gz
set t1 = /tmp/$s1.$c.maf
zcat $f1 > $t1
else
set f1 = $d1/$s1/$c.maf
set t1 = /tmp/$s1.$c.maf
cp -p $f1 $t1
endif
if (-d $pairs/$s2) then
set d2 = $pairs
set f2 = $d2/$s2/$c.maf.gz
set t2 = /tmp/$s2.$c.maf
zcat $f2 > $t2
else
set f2 = $d2/$s2/$c.maf
set t2 = /tmp/$s2.$c.maf
cp -p $f2 $t2
endif
# write to output dir
set out = $multi/${s1}${s2}
mkdir -p $out
# check for empty input file
if (-s $t1 && -s $t2) then
echo "Aligning $f1 $f2 $flag"
/iscratch/i/danRer2/multiz6way/penn/multiz $t1 $t2 $flag $out/$c.unused1.maf $out/$c.unused2.maf > $out/$c.full.maf
cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \
$out/$c.tmp.maf
echo "Ordering $c.maf"
/iscratch/i/danRer2/multiz6way/penn/maf_project $out/$c.tmp.maf danRer2.$c > $out/$c.maf
rm -f $t1 $t2
else if (-s $t1) then
cp -p $t1 $out/$c.maf
rm -f $t1
else if (-s $t2) then
cp -p $t2 $out/$c.maf
rm -f $t2
endif
'EOF'
# << keep emacs coloring happy
chmod +x oneMultiz.csh
# Copy this script to iscratch
ssh kkr1u00
cd /iscratch/i/danRer2/multiz6way/penn
cp -p /cluster/data/danRer2/bed/multiz6way/run/oneMultiz.csh .
/cluster/bin/iSync
# back to run the job
ssh kki
cd /cluster/data/danRer2/bed/multiz6way/run
# This tree.nh was used in the distant past for early versions
# of phastCons. Now, this is merely a convenient reference to the
# tree under construction. This is also used to draw a graphic
# tree as species.nh, see below.
cat << '_EOF_' > tree.nh
(((hg17,mm6),monDom1),((tetNig1,fr1),danRer2))
'_EOF_'
# << this line keeps emacs coloring happy
cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe
# multiple alignment steps:
set c = $1
/iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c hg17 mm6 0
/iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c monDom1 hg17mm6 0
/iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c tetNig1 fr1 1
/iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c tetNig1fr1 monDom1hg17mm6 1
# get final alignment file
/iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c tetNig1fr1monDom1hg17mm6 /cluster/data/danRer2/bed/multiz6way/maf/$c.maf
#cleanup
/iscratch/i/danRer2/multiz6way/penn/oneMultiz.csh $c
'EOF'
# << keep emacs coloring happy
chmod +x allMultiz.csh
cat << 'EOF' > template
#LOOP
./allMultiz.csh $(root1) {check out line+ /cluster/data/danRer2/bed/multiz6way/maf/$(root1).maf}
#ENDLOOP
'EOF'
cd /cluster/data/danRer2/bed/multiz6way/run
awk '{print $1}' ../../../chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
para try; para check
para push
# para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 5903s 98.39m 1.64h 0.07d 0.000 y
# IO & Wait Time: 227s 3.78m 0.06h 0.00d 0.000 y
# Average job time: 219s 3.65m 0.06h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1328s 22.13m 0.37h 0.02d
# Submission to last job: 2098s 34.97m 0.58h 0.02d
# do not filter mafs as this only removes a small fraction of alignments;
# better to keep them all. Check for single-column alignments (these
# have just a single base for each species in the alignment); if any
# are present, a glueing step is needed.
ssh hgwdev
# create tab file for chr1.maf
cd /cluster/data/danRer2/bed/multiz6way
hgLoadMaf danRer2 multiz6way -test=./maf/chr1.maf
awk 'BEGIN {OFS="\t"} {print $2, $3, $4, $5, $6, $4-$3;}' \
multiz6way.tab > chr1.tab.lengths
sort -n -k6 chr1.tab.lengths > chr1.tab.lengths.sort
# chr1 - 10 single column alignments
# also looked at chrUn - 14 single column alignments
# there are also a small number of 2,3,4 column alignments.
# This is ok as it is such a small number.
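# for the record, one way to produce the counts above from the
# lengths file created earlier:
awk '$6 == 1' chr1.tab.lengths | wc -l
# ($6 is the span computed above; use $6 <= 4 to count the 2-4
# column alignments as well)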
# use mafAddIRows to 'chain' the alignments
ssh kkstore01
cd /cluster/data/danRer2/bed/multiz6way
mkdir mafAddIRows
foreach m (./maf/*.maf)
set n=$m:t
echo "Processing $n ..."
mafAddIRows $m /cluster/data/danRer2/danRer2.2bit ./mafAddIRows/$n
end
# cat the maf files into one file for the alignments without iRows;
# the alignments with iRows are not used in the browser at the moment
catDir maf > multiz6way.maf
# 978 Mb of data
# -rw-rw-r-- 1 hartera protein 977796776 May 20 12:06 multiz6way.maf
# Load into database
ssh hgwdev
cd /cluster/data/danRer2/bed/multiz6way
mkdir /gbdb/danRer2/multiz6way
ln -s /cluster/data/danRer2/bed/multiz6way/multiz6way.maf \
/gbdb/danRer2/multiz6way
hgLoadMaf danRer2 multiz6way
# Loaded 1230321 mafs in 1 files from /gbdb/danRer2/multiz6way
# took about 2 minutes to load
hgLoadMafSummary -minSize=10000 -mergeGap=500 -maxSize=50000 danRer2 \
multiz6waySummary multiz6way.maf
# Processed 2565228 components in 1230321 mafs from multiz6way.maf
# took about 2 minutes to load
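# a quick optional spot-check of the loaded tables:
hgsql danRer2 -N -e 'select count(*) from multiz6way'
hgsql danRer2 -N -e 'select count(*) from multiz6waySummary'
# the first count should match the "Loaded 1230321 mafs" message above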
# Dropped unused indexes (2006-05-09 kate)
# NOTE: this is not required in the future, as the loader
# has been fixed to not generate these indexes
hgsql danRer2 -e "alter table multiz6waySummary drop index chrom_2"
hgsql danRer2 -e "alter table multiz6waySummary drop index chrom_3"
# create tree image - like tree.nh but with common names; zebrafish
# added to the tree (2005-06-07, hartera):
cat << '_EOF_' > species6.nh
(((human,mouse),opossum),((tetraodon,fugu),zebrafish))
'_EOF_'
/cluster/bin/phast/draw_tree -b -s species6.nh > species6.ps
# used GIMP to reduce the amount of whitespace to make the image
# smaller, then saved it as a jpg
display species6.jpg
# use the Transform->Chop and then Direction->horizontal or vertical
# to reduce the tree size while keeping label sizes the same
cp species6.jpg /usr/local/apache/htdocs/images/phylo/DanRer2_6way.jpg
# change permissions for display
chmod +r /usr/local/apache/htdocs/images/phylo/DanRer2_6way.jpg
# add all.joiner entry for danRer2 multiz6way (2005-05-31, hartera)
# add trackDb.ra entry
# track multiz6way
# shortLabel 6-Way Conservation
# longLabel 6-Way Vertebrate Multiz Alignment & Conservation
# group compGeno
# priority 109
# visibility hide
# color 0, 10, 100
# altColor 0,90,10
# type wigMaf 0.0 1.0
# maxHeightPixels 100:40:11
# #wiggle phastCons6way
# yLineOnOff Off
# autoScale Off
# summary multiz6waySummary
# speciesGroups mammal vertebrate
# sGroup_mammal hg17 mm6 monDom1
# sGroup_vertebrate fr1 tetNig1
# treeImage phylo/DanRer2_6way.jpg
# PHYLO-HMM (PHASTCONS) CONSERVATION TRACK FOR 6-WAY ALIGNMENT
# (DONE, 2005-06-01, hartera)
# Tetraodon and Fugu are quite distant from zebrafish (more distant than
# human is from chicken), so use the HoxD55.q matrix in future for blastz
# with these species. Here, the pairwise alignments of zebrafish with
# Tetraodon and Fugu that were used for the 6-way multiz were made with
# the default blastz matrix. (2005-06-01, hartera)
ssh kkstore01
mkdir /cluster/data/danRer2/bed/multiz6way/cons
cd /cluster/data/danRer2/bed/multiz6way/cons
# create a starting-tree.mod based on chr5 (67 Mb), the largest
# chromosome apart from chrNA and chrUn
/cluster/bin/phast/msa_split ../maf/chr5.maf \
--refseq ../../../5/chr5.fa --in-format MAF \
--windows 100000000,1000 --out-format SS \
--between-blocks 5000 --out-root s1
# takes about 2 minutes
/cluster/bin/phast/phyloFit -i SS s1.*.ss \
--tree "((tetNig1,fr1),((mm6,hg17),monDom1))" \
--out-root starting-tree
# only takes about 5 minutes
rm s1.*.ss
# Get genome-wide average GC content (for all species together,
# not just the reference genome). If you have a globally
# estimated tree model, as above, you can get this from the
# BACKGROUND line in the .mod file. E.g.,
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.309863 0.189473 0.189412 0.311252
# This implies a GC content of 0.1895 + 0.1895 = 0.379
# If you do *not* have a global tree model and you do not know
# your GC content, you can get it directly from the MAFs with
# a command like:
# /cluster/bin/phast/msa_view \
# --aggregate danRer2,tetNig1,fr1,mm6,hg17,monDom1 -i MAF \
# --summary-only /cluster/data/danRer2/bed/multiz6way/maf/chr*.maf \
# > maf_summary.txt
# this gives a GC content of 0.4026, similar to the value above
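# the GC figure can also be pulled straight from the .mod file; the
# BACKGROUND line lists frequencies in A C G T order, so GC is
# field 3 plus field 4 (a quick check, not part of the original run):
awk '/^BACKGROUND/ {printf "%.4f\n", $3 + $4}' starting-tree.mod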
# break up the genome-wide MAFs into pieces
ssh kkstore01
mkdir -p /cluster/bluearc/danRer2/cons/ss
cd /cluster/bluearc/danRer2/cons/ss
bash
for C in `awk '{print $1}' /cluster/data/danRer2/chrom.sizes`
do
if [ -s /cluster/data/danRer2/bed/multiz6way/maf/${C}.maf ]; then
echo msa_split $C
chrN=${C/chr/}
/cluster/bin/phast/msa_split \
/cluster/data/danRer2/bed/multiz6way/maf/${C}.maf \
--refseq /cluster/data/danRer2/${chrN}/${C}.fa \
--in-format MAF --windows 1000000,0 --between-blocks 5000 \
--out-format SS -I 1000 --out-root ${C}
fi
done
# took 15 minutes to run
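# a quick check (optional) that every chrom with a maf produced .ss
# files, assuming msa_split's root.start-end.ss naming:
ls *.ss | sed -e 's/\.[0-9-]*\.ss$//' | sort -u | wc -l
# compare to the number of .maf files in the multiz6way/maf directory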
# Create a random list of 50 1 Mb regions (excluding chrNA and chrUn)
ls -l | grep -v "NA" > fileList
cat fileList | grep -v "Un" > fileList2
mv fileList2 fileList
cat fileList | awk '$5 > 4000000 {print $9;}' | \
randomLines stdin 50 ../randomSs
rm fileList
# Set up parasol directory to calculate trees on these 50 regions
ssh kk9
mkdir -p /cluster/bluearc/danRer2/cons/treeRun1
cd /cluster/bluearc/danRer2/cons/treeRun1
mkdir tree log
# now set up a cluster job to estimate model parameters. Parameters
# are estimated separately for each alignment fragment and then
# combined across fragments.
# When tuning these parameters, come back to this point to recalculate.
# Create little script that calls phastCons with right arguments
cat > makeTree << '_EOF_'
/cluster/bin/phast/phastCons ../ss/$1.ss \
/cluster/data/danRer2/bed/multiz6way/cons/starting-tree.mod \
--gc 0.403 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-lengths 12 --target-coverage 0.17 \
--quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
# emacs happy
chmod a+x makeTree
# The --gc value (0.403) is the genome-wide GC content obtained from
# msa_view above, as this estimate is for the whole genome. Also, notice
# the target coverage of 0.17: here we are going to aim for 65% coverage
# of coding regions by conserved elements.
# Create gensub file
cat > template << '_EOF_'
#LOOP
makeTree $(root1)
#ENDLOOP
'_EOF_'
# happy emacs
# Make cluster job and run it
gensub2 ../randomSs single template jobList
para create jobList
para try/push/check/etc
# para time
# Completed: 50 of 50 jobs
# CPU time in finished jobs: 4013s 66.89m 1.11h 0.05d 0.000 y
# IO & Wait Time: 139s 2.31m 0.04h 0.00d 0.000 y
# Average job time: 83s 1.38m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 135s 2.25m 0.04h 0.00d
# Submission to last job: 5260s 87.67m 1.46h 0.06d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ls tree/*.cons.mod > cons.txt
/cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
--output-average ../ave.cons.mod > cons_summary.txt
ls tree/*.noncons.mod > noncons.txt
/cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
--output-average ../ave.noncons.mod > noncons_summary.txt
cd ..
cp -p ave.*.mod /cluster/data/danRer2/bed/multiz6way/cons
# measuring entropy
# usage: consEntropy <target coverage> <expected lengths> \
#     ave.cons.mod ave.noncons.mod [--NH 9.78]
# (the program never stops when run with the --NH argument)
# target entropy should be L_min*H = 9.8 bits; the expected length
# that produces this entropy is the one to use for phastCons.
/cluster/bin/phast/consEntropy 0.17 12 \
ave.cons.mod ave.noncons.mod
# the above steps were repeated with the target coverage and expected lengths
# parameters set as below:
# -target-coverage=0.17 -expected-lengths 12
#Transition parameters:gamma=0.170000,omega=12.000000, mu=0.083333, nu=0.017068
# Relative entropy: H=0.768727 bits/site
# Expected min. length: L_min=13.932145 sites
# Expected max. length: L_max=8.869545 sites
# Total entropy: L_min*H=10.710017 bits
# -target-coverage=0.20 -expected-lengths 10
#Transition parameters:gamma=0.200000,omega=10.000000, mu=0.100000, nu=0.025000
# Relative entropy: H=0.799627 bits/site
# Expected min. length: L_min=12.358880 sites
# Expected max. length: L_max=7.593192 sites
# Total entropy: L_min*H=9.882496 bits
# -target-coverage=0.20 -expected-lengths=9
#Transition parameters:gamma=0.200000, omega=9.000000, mu=0.111111, nu=0.027778
# Relative entropy: H=0.805459 bits/site
# Expected min. length: L_min=12.022440 sites
# Expected max. length: L_max=7.111297 sites
# Total entropy: L_min*H=9.683580 bits
# -target-coverage=0.25 -expected-lengths=10
#Transition parameters:gamma=0.250000, omega=10.000000, mu=0.100000, nu=0.033333
# Relative entropy: H=0.843206 bits/site
# Expected min. length: L_min=10.846871 sites
# Expected max. length: L_max=6.973477 sites
# Total entropy: L_min*H=9.146148 bits
# -target-coverage=0.35 -expected-lengths=12
#Transition parameters:gamma=0.350000, omega=12.000000, mu=0.083333, nu=0.044872
# Relative entropy: H=0.917684 bits/site
# Expected min. length: L_min=9.169807 sites
# Expected max. length: L_max=6.687135 sites
# Total entropy: L_min*H=8.414989 bits
# -target-coverage=0.35 -expected-lengths=14
#Transition parameters:gamma=0.350000,omega=14.000000, mu=0.071429, nu=0.038462
# Relative entropy: H=0.903639 bits/site
# Expected min. length: L_min=9.778765 sites
# Expected max. length: L_max=7.300778 sites
# Total entropy: L_min*H=8.836478 bits
# -target-coverage=0.35 -expected-lengths=18
#Transition parameters:gamma=0.350000,omega=18.000000, mu=0.055556, nu=0.029915
# Relative entropy: H=0.881986 bits/site
# Expected min. length: L_min=10.798324 sites
# Expected max. length: L_max=8.320602 sites
# Total entropy: L_min*H=9.523968 bits
# -target-coverage=0.35 -expected-lengths=19
#Transition parameters:gamma=0.350000,omega=19.000000, mu=0.052632, nu=0.028340
# Relative entropy: H=0.877572 bits/site
# Expected min. length: L_min=11.021340 sites
# Expected max. length: L_max=8.542773 sites
# Total entropy: L_min*H=9.672025 bits
# -target-coverage=0.32 -expected-lengths=20
#Transition parameters:gamma=0.320000, omega=20.000000, mu=0.050000, nu=0.023529
# Relative entropy: H=0.853466 bits/site
# Expected min. length: L_min=11.824480 sites
# Expected max. length: L_max=9.084203 sites
# Total entropy: L_min*H=10.091797 bits
#### !!! THESE PARAMETERS BELOW WERE THOSE THAT WERE FINALLY USED ####
# -target-coverage=0.32 -expected-lengths=18
#Transition parameters:gamma=0.320000,omega=18.000000, mu=0.055556, nu=0.026144
# Relative entropy: H=0.860401 bits/site
# Expected min. length: L_min=11.402969 sites
# Expected max. length: L_max=8.648243 sites
# Total entropy: L_min*H=9.811131 bits
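# for reference, the mu and nu values in the outputs above follow
# directly from the two tuning parameters: mu = 1/omega (omega is the
# expected length) and nu = gamma*mu/(1 - gamma) (gamma is the target
# coverage); a quick check for the final parameter set:
awk 'BEGIN {g = 0.32; w = 18; mu = 1/w; nu = g*mu/(1 - g); printf "mu=%.6f nu=%.6f\n", mu, nu}'
# mu=0.055556 nu=0.026144 -- matching the final parameter set above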
# need to iterate and get the right coverage and parameters
# try running phastCons below with parameters used above and check the
# coverage of coding regions by the most conserved elements
# Create cluster dir to do main phastCons run
ssh kk9
mkdir /cluster/bluearc/danRer2/cons/consRun1
cd /cluster/bluearc/danRer2/cons/consRun1
mkdir ppRaw bed
# Create script to run phastCons with right parameters
# This job is I/O intensive in its output files. To make this
# cluster safe, it would be better to do this work somewhere over
# in /tmp/... and copy the final result back. kk9 can do this
# run, but kk cannot.
# Run whole genome, this goes fast in this case.
cat > doPhast << '_EOF_'
mkdir -p ppRaw/$2
/cluster/bin/phast/phastCons ../ss/$1.ss ../ave.cons.mod,../ave.noncons.mod \
--expected-lengths 12 --target-coverage 0.17 --quiet --seqname $2 \
--idpref $2 --viterbi bed/$1.bed --score --require-informative 0 > \
ppRaw/$2/$1.pp
'_EOF_'
# emacs happy
chmod a+x doPhast
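# per the I/O note above, a /tmp-based variant of doPhast would look
# something like this (a sketch only, not what was run here):
cat > doPhastTmp << '_EOF_'
mkdir -p ppRaw/$2
cp -p ../ss/$1.ss /tmp/$1.ss
/cluster/bin/phast/phastCons /tmp/$1.ss ../ave.cons.mod,../ave.noncons.mod \
--expected-lengths 12 --target-coverage 0.17 --quiet --seqname $2 \
--idpref $2 --viterbi /tmp/$1.bed --score --require-informative 0 > /tmp/$1.pp
cp -p /tmp/$1.bed bed/$1.bed
cp -p /tmp/$1.pp ppRaw/$2/$1.pp
rm -f /tmp/$1.ss /tmp/$1.bed /tmp/$1.pp
'_EOF_'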
# Create gsub file
cat > template << '_EOF_'
#LOOP
doPhast $(file1) $(root1)
#ENDLOOP
'_EOF_'
# happy emacs
# Create parasol batch and run it for the whole genome
ls -1 ../ss | sed 's/.ss//' > in.lst
gensub2 in.lst single template jobList
para create jobList
para try/check/push/etc.
# para time
# Completed: 1604 of 1604 jobs
# CPU time in finished jobs: 11005s 183.42m 3.06h 0.13d 0.000 y
# IO & Wait Time: 30912s 515.19m 8.59h 0.36d 0.001 y
# Average job time: 26s 0.44m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 55s 0.92m 0.02h 0.00d
# Submission to last job: 608s 10.13m 0.17h 0.01d
# combine predictions and transform scores to be in 0-1000 interval
# it uses a lot of memory, so on kolossus:
ssh kolossus
cd /cluster/bluearc/danRer2/cons/consRun
catDir bed | awk ' \
{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > \
/cluster/data/danRer2/bed/multiz6way/mostConserved.bed
# Figure out how much is actually covered by the bed files as follows:
ssh kkstore01
cd /cluster/data/danRer2/bed/multiz6way
foreach f ( `cat /cluster/data/danRer2/chrom.lst` )
cat $f/chr${f}.fa >> allChroms.fa
end
faSize allChroms.fa
# 1589273282 bases (101846391 N's 1487426891 real 813899141 upper
# 673527750 lower) in 28 sequences in 1 files
# Total size: mean 56759760.1 sd 58536773.4 min 16596 (chrM)
# max 318297480 (chrNA) median 42763181
awk '{sum+=$3-$2} \
END{printf "%% %.2f = 100.0*%d/1487426891\n",100.0*sum/1487426891,sum}' \
mostConserved.bed
# -target-coverage 0.17: % 1.99 = 100.0*29631918/1487426891 length 12
# -target-coverage 0.20: % 1.96 = 100.0*29183170/1487426891 length 10
# -target-coverage 0.25: % 2.11 = 100.0*31391360/1487426891 length 10
# -target-coverage 0.35: % 2.69 = 100.0*39940068/1487426891 length 19
# -target-coverage 0.32: % 2.60 = 100.0*38730791/1487426891 length 18
# the non-N genome size, from faSize on all chroms: 1487426891
# check with featureBits
ssh hgwdev
cd /cluster/data/danRer2/bed/multiz6way
# get an or of refGene and mgcGenes CDS regions
featureBits danRer2 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed
# 9745850 bases of 1560497282 (0.625%) in intersection
featureBits danRer2 refSeqOrMgcCds.bed mostConserved.bed -enrichment
# featureBits danRer2 refSeqOrMgcCds.bed mostConserved.bed -enrichment
# -target-coverage=0.17 -expected-lengths=12
# refSeqOrMgcCds.bed 0.625%, mostConserved.bed 1.899%, both 0.350%,
# cover 56.07%, enrich 29.53x
# -target-coverage=0.20 -expected-lengths=10
# refSeqOrMgcCds.bed 0.625%, mostConserved.bed 1.870%, both 0.347%,
# cover 55.54%, enrich 29.70x
# -target-coverage=0.25 -expected-length=10
# refSeqOrMgcCds.bed 0.625%, mostConserved4.bed 2.012%, both 0.364%,
# cover 58.35%, enrich 29.01x
# -target-coverage=0.35 -expected-lengths=19
# refSeqOrMgcCds.bed 0.625%, mostConserved5.bed 2.559%, both 0.431%,
# cover 68.98%, enrich 26.95x
# -target-coverage=0.32 -expected-lengths=18
# refSeqOrMgcCds.bed 0.625%, mostConserved6.bed 2.482%, both 0.423%,
# cover 67.75%, enrich 27.30x
# looking at ensGene:
# featureBits danRer2 ensGene:cds mostConserved.bed -enrichment
# ensGene:cds 2.135%, mostConserved.bed 2.482%, both 1.210%,
# cover 56.67%, enrich 22.83x
# so use this result with -target-coverage=0.32 -expected-lengths=18:
# total entropy is 9.81, which is good (aiming for 9.8), and coverage of
# coding regions by most conserved elements is 67.8% (aiming for 65%)
# Load most conserved track into database
ssh hgwdev
cd /cluster/data/danRer2/bed/multiz6way
hgLoadBed danRer2 phastConsElements mostConserved.bed
# Loaded 393654 elements of size 5
# less than 1 minute load time
featureBits danRer2 -enrichment refSeqOrMgcCds.bed phastConsElements
# refSeqOrMgcCds.bed 0.625%, phastConsElements 2.482%, both 0.423%,
# cover 67.75%, enrich 27.30x
featureBits danRer2 mgcGenes:cds phastConsElements -enrichment
# mgcGenes:cds 0.486%, phastConsElements 2.482%, both 0.336%,
# cover 69.06%, enrich 27.82x
featureBits danRer2 refGene:cds phastConsElements -enrichment
# refGene:cds 0.597%, phastConsElements 2.482%, both 0.400%,
# cover 66.97%, enrich 26.98x
# Create merged posterior probability file and wiggle track data files
ssh kkstore01
cd /cluster/bluearc/danRer2/cons/consRun
# interesting sort here on the chr name and position.
# First convert all . and - characters to the special strings x/ and x_/
# to get a consistent delimiter of / for all fields to be sorted.
# Then sort on the chrom name and the start position; after the
# sort, convert the special strings x_/ and x/ back to - and .
# respectively. This gets everything in order by chrom name and
# chrom start.
find ./ppRaw -type f | sed -e "s#\.#x/#g; s#-#x_/#g" | \
sort -t"/" -k4,4 -k6,6n | sed -e "s#x_/#-#g; s#x/#.#g" | xargs cat | \
wigEncode stdin phastCons6way.wig phastCons6way.wib
# about 3 minutes for above
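# to see why the sort works, trace one file name through the sed
# (a hypothetical name, assuming msa_split's root.start-end naming):
echo ./ppRaw/chr1/chr1.1-1000000.pp | sed -e "s#\.#x/#g; s#-#x_/#g"
# x//ppRaw/chr1/chr1x/1x_/1000000x/pp
# with / as the delimiter, field 4 is the chrom and field 6 begins with
# the window start coordinate, hence the sort -t"/" -k4,4 -k6,6n above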
ssh kkstore01
cd /cluster/bluearc/danRer2/cons/consRun
cp -p phastCons6way.wi? /cluster/data/danRer2/bed/multiz6way/cons
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/danRer2/bed/multiz6way/cons
mkdir -p /gbdb/danRer2/wib
ln -s `pwd`/phastCons6way.wib /gbdb/danRer2/wib/phastCons6way.wib
# use this if the table needs to be reloaded
hgsql -e 'drop table phastCons6way;' danRer2
# load table
hgLoadWiggle danRer2 phastCons6way phastCons6way.wig
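# optional spot-check that the wiggle table loaded:
hgsql danRer2 -N -e 'select count(*) from phastCons6way'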
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/danRer2/bed/multiz6way/cons
bash
time hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=danRer2 phastCons6way > histogram.data 2>&1
# real 3m1.185s user 2m19.440s sys 0m11.850s
# create plot of histogram:
cat << '_EOF_' > histo.gp
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Zebrafish danRer2 Histogram phastCons6 track"
set xlabel " phastCons6 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# happy emacs
gnuplot histo.gp > histo.png
display histo.png &
# add the line "wiggle phastCons6way" to the trackDb.ra entry for
# multiz6way to display the wiggle for the conservation track.
# copy over the html for multiz and edit.
# PHASTCONS SCORES DOWNLOADABLES (DONE, 2005-05-31, hartera)
# prepare compressed copy of ascii data values for downloads
ssh kkstore01
cd /cluster/bluearc/danRer2/cons/consRun
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
mkdir -p phastCons6wayScores
ls ppRaw | while read D
do
out=${TOP}/phastCons6wayScores/${D}.gz
echo -n "$out ... "
cd ${TOP}/ppRaw/${D}
gzip -c `ls *.pp | sed -e "s#-#.x-x.#g;" | \
sort -t"." -k1,1 -k2,2n | sed -e "s#.x-x.#-#g;"` > ${out}
echo "done"
done
'_EOF_'
# happy emacs
chmod +x gzipAscii.sh
time ./gzipAscii.sh
# 104.769u 8.789s 3:00.18 63.0% 0+0k 0+0io 11pf+0w
# 182 Mb of data
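# quick completeness check (optional): one .gz per chrom directory
ls ppRaw | wc -l
ls phastCons6wayScores/*.gz | wc -l
# the two counts should match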
# copy scores for downloads
ssh kkstore01
mkdir /cluster/data/danRer2/bed/multiz6way/phastCons6wayScores
cd /cluster/data/danRer2/bed/multiz6way/phastCons6wayScores
rsync -a --progress \
/cluster/bluearc/danRer2/cons/consRun/phastCons6wayScores/ .
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/danRer2/phastCons6wayScores
cd /usr/local/apache/htdocs/goldenPath/danRer2/phastCons6wayScores
ln -s /cluster/data/danRer2/bed/multiz6way/phastCons6wayScores/*.gz .
md5sum *.gz > md5sum.txt
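# the downloads can later be verified against this file with:
md5sum -c md5sum.txt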
# copy over and edit README.txt.
# MULTIZ 6-WAY DOWNLOADABLES (DONE, 2005-05-31, hartera)
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/danRer2
mkdir -p multiz6way
cd multiz6way
foreach f (/cluster/data/danRer2/bed/multiz6way/maf/*.maf)
set c = $f:r:t
echo $c
nice gzip -c $f > $c.maf.gz
end
# copy README and edit
# Create upstream files for download
ssh hgwdev
cd /cluster/data/danRer2/bed/multiz6way
echo danRer2 tetNig1 fr1 mm6 hg17 monDom1 > org.txt
# mafFrags is quick for these
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits danRer2 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
nice mafFrags danRer2 multiz6way up.bed upstream$i.maf -orgs=org.txt
rm up.bed
end
# took less than 3 minutes
ssh kkstore01
cd /cluster/data/danRer2/bed/multiz6way
nice gzip upstream{1000,2000,5000}.maf
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/danRer2
mv /cluster/data/danRer2/bed/multiz6way/upstream*.maf.gz multiz6way
cd multiz6way
md5sum *.gz > md5sum.txt
###########################################################################
# LIFTOVER CHAINS TO DANRER3 (DONE, 2006-04-18, hartera)
# Split (using makeLoChain-split) of danRer3 is doc'ed in makeDanRer3.doc
# Do what makeLoChain-split says to do next (start blat alignment)
ssh kk
mkdir -p /cluster/data/danRer2/bed/liftOver
cd /cluster/data/danRer2/bed/liftOver
makeLoChain-align danRer2 /iscratch/i/danRer2/nib danRer3 \
/iscratch/i/danRer3/split10k \
/iscratch/i/danRer3/danRer3_11.ooc >&! align.log &
# Took about 4 minutes.
# Do what its output says to do next (start cluster job)
cd /cluster/data/danRer2/bed/blat.danRer3.2006-04-17/run
para try, check, push, check, ...
para time >&! run.time
# Completed: 784 of 784 jobs
# CPU time in finished jobs: 4022758s 67045.97m 1117.43h 46.56d 0.128 y
# IO & Wait Time: 71872s 1197.86m 19.96h 0.83d 0.002 y
# Average job time: 5223s 87.05m 1.45h 0.06d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 42935s 715.58m 11.93h 0.50d
# Submission to last job: 42935s 715.58m 11.93h 0.50d
# lift alignments
ssh kkr1u00
cd /cluster/data/danRer2/bed/liftOver
makeLoChain-lift danRer2 danRer3 >&! lift.log &
# Took about 10 minutes to run.
# chain alignments
ssh kki
cd /cluster/data/danRer2/bed/liftOver
makeLoChain-chain danRer2 /iscratch/i/danRer2/nib \
danRer3 /iscratch/i/danRer3/nib >&! chain.log &
# Do what its output says to do next (start cluster job)
cd /cluster/data/danRer2/bed/blat.danRer3.2006-04-17/chainRun
para try, check, push ...
para time >&! run.time
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 2697s 44.95m 0.75h 0.03d 0.000 y
# IO & Wait Time: 548s 9.13m 0.15h 0.01d 0.000 y
# Average job time: 116s 1.93m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 439s 7.32m 0.12h 0.01d
# Submission to last job: 439s 7.32m 0.12h 0.01d
# net alignment chains
ssh kkstore04
cd /cluster/data/danRer2/bed/liftOver
makeLoChain-net danRer2 danRer3 >&! net.log &
# Took about 20 minutes to run.
# load reference to over.chain into database table,
# and create symlinks /gbdb and download area
ssh hgwdev
cd /cluster/data/danRer2/bed/liftOver
makeLoChain-load danRer2 danRer3 >&! load.log &
# test by converting a region using the "convert" link on
# the browser, and comparing to blat of the same region
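# a command-line spot-check is also possible with the liftOver utility
# (chain file path assumed from the makeLoChain-load convention):
liftOver someRegions.bed \
/cluster/data/danRer2/bed/liftOver/danRer2ToDanRer3.over.chain \
someRegions.danRer3.bed someRegions.unMapped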
###########################################################################
# SPLIT SEQUENCE FOR LIFTOVER CHAINS FROM DANRER3 (DONE, 2006-04-25, hartera)
# followed instructions used in makePanTro2.doc
ssh kkr1u00
cd /cluster/data/danRer2/bed
mkdir -p liftOver
cd liftOver
makeLoChain-split danRer2 /cluster/data/danRer2/nib >&! split.log &
# Took about 30 minutes to run.
#########################################################################
## Reorder Fish organisms (DONE - 2006-12-22 - Hiram)
hgsql -h genome-testdb hgcentraltest \
-e "update dbDb set orderKey = 452 where name = 'danRer2';"
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna danRer2
##########################################################################
# BACENDS CLEANUP (DONE, 2007-03-27, hartera)
ssh kkstore04
cd /cluster/data/danRer2/bed/
du -sh ZonLab
# 150G ZonLab/
cd ZonLab
# Remove old BAC ENDS directories
nice rm -r bacends2 bacends3
cd bacends
rm bacEndsCov* bacEndsMinCov* bl2seq.out blast.out diff bacEnds \
bacEnds.names.uniq jobList
nice rm -r psl err minCov10MinAli92
# bacEnds.lifted.psl is the lifted version of bacEnds.psl, so remove the latter
rm bacEnds.psl batch batch.bak *.lst para.results names.test template *.log
cd bacends.1
rm accs* bacEndAccs.aliases *.log allBacEnds* zfishaccs* diffaccs bacEnds \
BACEnd*
rm -r test test2
nice gzip *.psl
cd ../pairs
# bacEndSingles.bed is already in singles directory
rm bacEnds.* bed.tab bacEndSingles.bed psl.tab bacEndSingles.sql *.bak
rm -r old test
cd ../singles
rm bacEnds.*
rm -r old
cd ../scores
rm *.counts *.count *.hits good.txt error.log bed.tab bad.txt
rm -r test
cd ../duplicates
rm -r old2 test test1
rm error* sing *.counts singles.log.may10 log* out* bed.tab err
rm list list.aligns singles* *.uniq *.tmp *.tmp2 100names* pairs.* \
pairsTest*
rm -r badPrsTest pairsTest
cd ../cloneandStsAliases
rm *.bak dr1* dr2* xref.names.uniq zfishNew* zfishOld* *.tab accs.*
mv testTables/*.pl .
rm -r test testTables
cd /cluster/data/danRer2/bed/
du -sh ZonLab
# 46G ZonLab/
# removed 104 GB
##########################################################################