src/hg/makeDb/doc/danRer3.txt 1.17
1.17 2009/11/25 21:48:38 hiram
change autoScaleDefault to autoScale
Index: src/hg/makeDb/doc/danRer3.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/danRer3.txt,v
retrieving revision 1.16
retrieving revision 1.17
diff -b -B -U 1000000 -r1.16 -r1.17
--- src/hg/makeDb/doc/danRer3.txt 17 Oct 2008 01:06:31 -0000 1.16
+++ src/hg/makeDb/doc/danRer3.txt 25 Nov 2009 21:48:38 -0000 1.17
@@ -1,8705 +1,8705 @@
# for emacs: -*- mode: sh; -*-
# Danio rerio (zebrafish) from Sanger, version Zv5 (released 5/20/05)
# Project website:
# http://www.sanger.ac.uk/Projects/D_rerio/
# Assembly notes:
# http://www.sanger.ac.uk/Projects/D_rerio/Zv5_assembly_information.shtml
# DOWNLOAD SEQUENCE (DONE, 2005-06-06, hartera)
# MOVE DANRER3 DIRECTORY AND CONTENTS TO STORE11 AS STORE3 IS FULL
# (DONE, 2005-07-22, hartera)
ssh kkstore01
mkdir /cluster/store9/danRer3
ln -s /cluster/store9/danRer3 /cluster/data
cd /cluster/data/danRer3
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/README
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/Zv5.stats
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/Zv5.chunks.agp
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/Zv5.scaffolds.agp
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv5release/Zv5.fa
# 2005-07-22 MOVE danRer3
# store9 is 100% full, move danRer3 to store11 which is 10% full
ssh kkstore02
cd /cluster/store9
nohup nice mv danRer3 /cluster/store11 &
# make link to /cluster/data/danRer3
ln -s /cluster/store11/danRer3 /cluster/data
# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE, 2005-06-13, hartera)
ssh kkstore01
mkdir -p /cluster/data/danRer3/M
cd /cluster/data/danRer3/M
# go to http://www.ncbi.nih.gov/ and search Nucleotide for
# "Danio mitochondrion genome". That shows the gi number:
# 8576324 for the accession, AC024175
# Use that number in the entrez linking interface to get fasta:
wget -O chrM.fa \
'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=8576324&dopt=FASTA'
# Edit chrM.fa: make sure the header line says it is the
# Danio rerio mitochondrion complete genome, and then replace the
# header line with just ">chrM".
perl -pi.bak -e 's/>.+/>chrM/' chrM.fa
rm *.bak
# Make a "pseudo-contig" for processing chrM too:
mkdir ./chrM_1
sed -e 's/chrM/chrM_1/' ./chrM.fa > ./chrM_1/chrM_1.fa
mkdir ./lift
echo "chrM_1/chrM_1.fa.out" > ./lift/oOut.lst
echo "chrM_1" > ./lift/ordered.lst
echo "0 M/chrM_1 16596 chrM 16596" > ./lift/ordered.lft
# make sure this is tab delimited
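# (a hedged alternative, not in the original log: printf emits real tabs,
# avoiding hand-editing the echo output)
printf "0\tM/chrM_1\t16596\tchrM\t16596\n" > ./lift/ordered.lft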
# create a .agp file for chrM as hgGoldGapGl and other
# programs require a .agp file so create chrM.agp
cat << '_EOF_' > ./chrM.agp
chrM 1 16596 1 F AC024175.3 1 16596 +
'_EOF_'
# Create a chrM.chunks.agp
mkdir -p /cluster/data/danRer3/M/agps
cd /cluster/data/danRer3/M/agps
awk 'BEGIN {OFS="\t"} \
{print $1, $2, $3, $4, $5, $6, $7, $8, $1, $7, $8}' ../chrM.agp \
> chrM.chunks.agp
# make sure that all these above files are tab delimited
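# quick sanity check (not in the original log): print any lines that do
# not have the expected number of tab-separated fields
awk -F'\t' 'NF != 9' ../chrM.agp
awk -F'\t' 'NF != 11' chrM.chunks.agp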
# Create list of chromosomes (DONE, 2005-06-08, hartera)
ssh kkstore01
cd /cluster/data/danRer3
awk '{if ($1 !~ /Zv5/) print $1;}' Zv5.scaffolds.agp \
| sort -n | uniq > chrom.lst
cp chrom.lst chrom1to25.lst
# add chrM
echo "M" >> chrom.lst
# add chrUn
echo "Un" >> chrom.lst
# add NA
echo "NA" >> chrom.lst
# MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2005-06-09, hartera)
ssh kkstore01
cd /cluster/data/danRer3
# This used to hold scripts -- better to keep them inline here
# Now it should just hold lift file(s) and
# temporary scripts made by copy-paste from this file.
mkdir /cluster/data/danRer3/jkStuff
# This is where most tracks will be built:
mkdir /cluster/data/danRer3/bed
# GET ADDITIONAL ZEBRAFISH REPBASE LIBRARY FOR REPEATMASKER
# (DONE, 2005-05-10, hartera)
# Go to http://www.girinst.org/server/RepBase/RepBase10.04.fasta
# and download zebunc.ref containing unclassified zebrafish repeats.
# Need username and password. Copy to /cluster/bluearc/RepeatMasker/Libraries/
ssh hgwdev
cd /cluster/bluearc/RepeatMasker/Libraries/
perl -pi.bak -e 's/>(Dr[0-9]+)/>$1#Unknown \@danio [S:]/' zebunc.ref
# add to RepeatMasker library
cat zebunc.ref >> RepeatMasker.lib
# This is all in: /cluster/bluearc/RepeatMasker050305/Libraries
# CHECK AGP FILES AND FASTA SIZE CONSISTENCY (DONE, 2005-06-10, hartera)
# The script, createAgpWithGaps.pl (see next section for creating
# agps and FASTAs for chrNA and chrUn), was used to create a scaffolds
# agp file for chrUn to test the program. Comparing its agp output to
# that from scaffoldFaToAgp showed a difference: scaffoldFaToAgp used
# 990568 as the end co-ordinate for Zv5_scaffold1475 instead of 976101
# as in the output from the script, so the co-ordinate numbering differs
# from there on. Since scaffoldFaToAgp creates the agp file from the
# FASTA file, perhaps the sequence is a different size than stated in
# the agp file.
# Get sequence and find the size:
ssh kkstore01
cd /cluster/data/danRer3
mkdir test
cd test
faOneRecord ../Zv5.fa Zv5_scaffold1475 > Zv5_scaffold1475.fa
faSize Zv5_scaffold1475.fa
# 990568 bases
rm Zv5_scaffold1475.fa
# reported this inconsistency to Mario Caccamo at Sanger
# mc2@sanger.ac.uk (2005-06-09) and new scaffolds and chunks agp files
# were sent on 2005-06-10. There was a chunk (contig) missing from the
# chunks agp file and the scaffold therefore had the wrong end
# co-ordinate in the agp files.
# check all sizes of scaffold sequences against those in the agp files
ssh kkr1u00
cd /cluster/data/danRer3
mkdir -p /iscratch/i/danRer3/scaffolds
cp Zv5.fa /iscratch/i/danRer3/scaffolds/
iSync
ssh kk
mkdir -p /cluster/data/danRer3/scaffolds/run
cd /cluster/data/danRer3/scaffolds/run
grep '>' ../Zv5.fa | sed -e 's/>//' > Zv5.scaffolds.lst
cat << '_EOF_' > getSizes.csh
#!/bin/csh -fe
set dir=/cluster/bluearc/danRer3/scaffolds
mkdir -p $dir
faOneRecord /iscratch/i/danRer3/scaffolds/Zv5.fa $1 > $dir/$1.fa
echo $1 >> $dir/$1.size
faSize $dir/$1.fa >> $dir/$1.size
rm $dir/$1.fa
'_EOF_'
# << this line makes emacs coloring happy
chmod +x getSizes.csh
cat << '_EOF_' > gsub
#LOOP
getSizes.csh $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 Zv5.scaffolds.lst single gsub jobList
para create jobList
para try,check,push,check etc...
ssh kkstore01
cd /cluster/bluearc/danRer3/scaffolds
foreach f (*.size)
cat $f >> Zv5.scaffolds.sizes
end
cd /cluster/data/danRer3/scaffolds
mv /cluster/bluearc/danRer3/scaffolds/Zv5.scaffolds.sizes .
# Check that these sizes correspond to the sizes in the scaffolds agp file
# use script compareSizes.pl
cat << '_EOF_' > compareSizes.pl
#!/usr/bin/perl -w
use strict;
my ($file, $agp);
$file = $ARGV[0];
$agp = $ARGV[1];
open(FILE, $file) || die "Can not open $file: $!\n";
open(AGP, $agp) || die "Can not open $agp: $!\n";
open(OUT, ">log.txt") || die "Can not create log.txt: $!\n";
my ($l, $name, $size, %scafsHash);
while (<FILE>)
{
$l = $_;
if ($l =~ /^(Zv5_(scaffold|NA)[0-9]+)/)
{
$name = $1;
}
elsif ($l =~ /^([0-9]+)\sbases/)
{
$size = $1;
$scafsHash{$name} = $size;
}
}
close FILE;
while (<AGP>)
{
my ($line, @fi, $scaf, $end);
$line = $_;
@fi = split(/\t/, $line);
$scaf = $fi[5];
$end = $fi[7];
if (exists($scafsHash{$scaf}))
{
if ($scafsHash{$scaf} eq $end)
{
print OUT "$scaf - ok\n";
}
else
{
print OUT "$scaf - different size to sequence\n";
}
}
else
{
print OUT "$scaf - does not exist in list of sizes\n";
}
}
close AGP;
close OUT;
'_EOF_'
# << happy emacs
chmod +x compareSizes.pl
perl compareSizes.pl Zv5.scaffolds.sizes ../Zv5.scaffolds.agp
# the only lines where no ID was found in the list of scaffolds with sizes
# were those lines for gaps.
grep "different" Zv5_scaffold1475
# Zv5_scaffold1475 - different size to sequence
# so only this scaffold is a different size in the agp to the sequence
# need to check that sizes are consistent between agp files
# check also new agp file for scaffolds - newAgps/Zv5.scaffolds.agp
perl compareSizes.pl Zv5.scaffolds.sizes ../newAgps/Zv5.scaffolds.agp
# these are all consistent with the sequence sizes
cd /cluster/data/danRer3/newAgps/
# print out scaffold names where the co-ordinates are not consistent
# with sizes given
awk '{if ($6 ~ /^Zv5/ && (($3-$2+1) != $8)) print $6;}' Zv5.scaffolds.agp \
> Zv5.scaffolds.coordCheck
# this file is empty so they are ok. do the same for the chunks.agp file
awk '{if ($6 ~ /^Zv5/ && (($3-$2+1) != $8)) print $6;}' Zv5.chunks.agp \
> Zv5.chunks.coordCheck
# also empty so ok. check that the difference between $7 and $8 is the
# same as the difference between $11 and $12 fields
awk '{if ($6 != 5000 && (($8 - $7) != ($12 - $11))) print $6;}' \
Zv5.chunks.agp > Zv5.chunks.coordCheck2
# these are all ok
rm Zv5.*.coord*
cat << '_EOF_' > checkSizesInAgps.pl
#!/usr/bin/perl -w
use strict;
my ($ch, $sc, %scafsHash);
$sc = $ARGV[0]; # scaffolds agp
$ch = $ARGV[1]; # chunks or contigs agp
open(SCAFS, $sc) || die "Can not open $sc: $!\n";
open(CHUNKS, $ch) || die "Can not open $ch: $!\n";
while (<SCAFS>)
{
my ($l, @f, $name, $e);
$l = $_;
@f = split(/\t/, $l);
if ($f[5] =~ /^Zv5/)
{
$name = $f[5];
$e = $f[2];
$scafsHash{$name} = $e;
}
}
close SCAFS;
my $scaf = "";
my $prev = "";
my $prevEnd = 0;
while (<CHUNKS>)
{
my ($line, @fi);
$line = $_;
@fi = split(/\t/, $line);
if ($fi[5] ne "5000")
{
$scaf = $fi[9];
if (($scaf ne $prev) && ($prev ne ""))
{
checkCoords($prev, $prevEnd);
}
$prev = $scaf;
$prevEnd = $fi[2];
}
}
# check last entry in file
checkCoords($prev, $prevEnd);
close CHUNKS;
sub checkCoords {
my ($name, $end) = @_;
if (exists($scafsHash{$name}))
{
if ($scafsHash{$name} != $end)
{
print "Scaffold $name is not consistent between agps\n";
}
else
{
print "Scaffold $name - ok\n";
}
}
}
'_EOF_'
# << happy emacs
chmod +x checkSizesInAgps.pl
./checkSizesInAgps.pl Zv5.scaffolds.agp Zv5.chunks.agp \
> Zv5.scafsvschunks
grep "not consistent" Zv5.scafsvschunks
# no lines where inconsistency was reported
wc -l Zv5.scafsvschunks
# 16214 Zv5.scafsvschunks
grep "Zv5" Zv5.scaffolds.agp | wc -l
# 16214
# so all the scaffolds were checked and were ok.
cd /cluster/data/danRer3
mv ./newAgps/Zv5.scaffolds.agp .
mv ./newAgps/Zv5.chunks.agp .
mv ./scaffolds/compareSizes.pl ./jkStuff/
mv ./newAgps/checkSizesInAgps.pl ./jkStuff/
rm -r newAgps
# SPLIT AGP FILES BY CHROMOSOME (DONE, 2005-06-13, hartera)
# FASTA WAS CREATED USING SCAFFOLDS AGP
ssh kkstore01
cd /cluster/data/danRer3
# There are 2 .agp files: one for scaffolds (supercontigs on danRer1) and
# then one for chunks (contigs on danRer1) showing how they map on to
# scaffolds.
# get list of scaffolds from FASTA file and check these are in agp
grep '>' Zv5.fa | sed -e 's/>//' | sort | uniq > Zv5FaScafs.lst
# get list of scaffolds from agp - do not print from gap lines
awk '{if ($7 !~ /contig/) print $6;}' Zv5.scaffolds.agp \
| sort | uniq > Zv5AgpScafs.lst
diff Zv5FaScafs.lst Zv5AgpScafs.lst
# no difference so all scaffolds are in the FASTA file
# add "chr" prefix for the agp files
perl -pi -e 's/^([0-9]+)/chr$1/' ./*.agp
# for chromosomes:
foreach c (`cat chrom1to25.lst`)
echo "Processing $c ..."
mkdir $c
perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
./Zv5.chunks.agp \
> $c/chr$c.chunks.agp
perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
./Zv5.scaffolds.agp \
> $c/chr$c.scaffolds.agp
end
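# optional sanity check (not in the original log): the per-chrom agp line
# counts should add up to the number of chr-prefixed lines in the
# genome-wide agps
cat [0-9]*/chr*.scaffolds.agp | wc -l
grep -c '^chr[0-9]' Zv5.scaffolds.agp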
# CREATE AGP FILES FOR chrNA AND chrUn (DONE, 2005-06-13, hartera)
ssh kkstore01
# chrNA consists of WGS contigs that could not be related to any
# FPC contig and the scaffolds and contigs are named Zv5_NAN in the
# first field of the agp files
cd /cluster/data/danRer3
mkdir ./NA
awk '{if ($1 ~ /Zv5_NA/) print;}' Zv5.chunks.agp \
> ./NA/NA.chunks.agp
awk '{if ($1 ~ /Zv5_NA/) print;}' Zv5.scaffolds.agp \
> ./NA/NA.scaffolds.agp
# change the first field to "chrUn" then can use agpToFa to process
perl -pi.bak -e 's/Zv5_NA[0-9]+/chrNA/' ./NA/*.agp
# check files and remove backup files
rm ./NA/*.bak
# then process chrUn.
# Re-make chrUn with new agp files - this is made from scaffolds and
# contigs where the name is Zv5_scaffoldN in the first field of the
# agp files. These scaffolds and contigs are unmapped to chromosomes
# in the agp file. chrUn is made up of WGS scaffolds that mapped to
# FPC contigs, but the chromosome is unknown.
rm -r Un
mkdir ./Un
awk '{if ($1 ~ /Zv5_scaffold/) print;}' Zv5.chunks.agp \
> ./Un/Un.chunks.agp
awk '{if ($1 ~ /Zv5_scaffold/) print;}' Zv5.scaffolds.agp \
> ./Un/Un.scaffolds.agp
# change the first field to "chrUn" then can use agpToFa to process
perl -pi.bak -e 's/Zv5_scaffold[0-9]+/chrUn/' ./Un/*.agp
# check files and remove backup files
rm ./Un/*.bak
# get FASTA file of sequences for NA and Un and create agp with
# Ns between scaffolds
# from scaffolds agp, get name of scaffolds to retrieve from the FASTA
# file to make the NA and Un chromosomes.
foreach c (NA Un)
awk '{print $6;}' $c/$c.scaffolds.agp > $c/chr$c.scaffolds.lst
$HOME/bin/i386/faSomeRecords /cluster/data/danRer3/Zv5.fa \
$c/chr$c.scaffolds.lst $c/chr$c.fa
end
# check that all scaffolds in list are in FASTA file for NA and Un - ok
# edit scaffoldFaToAgp.c so that it creates agp with 500 Ns between
# scaffolds as contig gaps for chrNA and compile. chrNA is already large
# so the number of Ns is reduced to keep the size down.
foreach c (NA Un)
$HOME/bin/i386/scaffoldFaToAgp $c/chr$c.fa
mv $c/chr$c.fa $c/chr$c.scaffolds.fa
end
# change chrUn to chrNA for NA and D to W for NA and Un
sed -e 's/chrUn/chrNA/' ./NA/chrNA.agp | sed -e 's/D/W/' \
> ./NA/chrNA.scaffolds.agp
sed -e 's/D/W/' ./Un/chrUn.agp > ./Un/chrUn.scaffolds.agp
# edit ./NA/chrNA.scaffolds.agp and ./Un/chrUn.scaffolds.agp and
# remove last line as this just adds an extra 500 Ns at the
# end of the sequence.
rm ./NA/chrNA.agp ./Un/chrUn.agp
cat << '_EOF_' > /cluster/data/danRer3/jkStuff/createAgpWithGaps.pl
#!/usr/bin/perl
use strict;
# This script takes a chunks agp and inserts Ns between scaffolds for
# the chunks (contigs) agp file. Could also insert Ns between scaffolds
# for scaffolds agp.
my ($name, $prev, $st, $end, $prevEnd, $id);
my $chrom = $ARGV[0]; # chromosome name
my $numN = $ARGV[1]; # number of Ns to be inserted
my $type = $ARGV[2]; # contigs or scaffolds
$prev = "";
$st = 1;
$prevEnd = 0;
$id = 0;
while (<STDIN>)
{
my $l = $_;
my @f = split(/\t/, $l);
if ($type eq "contigs")
{
$name = $f[9];
}
else
{
$name = $f[5];
}
my $currSt = $f[1];
my $currEnd = $f[2];
my $size = $currEnd - $currSt;
$id++;
if (($prev ne "") && ($prev ne $name))
{
$st = $prevEnd + 1;
$end = ($st + $numN) - 1;
print "$chrom\t$st\t$end\t$id\tN\t$numN\tcontig\tno\n";
$prevEnd = $end;
$id++;
}
$st = $prevEnd + 1;
$end = $st + $size;
print "$chrom\t$st\t$end\t$id\t$f[4]\t$f[5]\t$f[6]\t$f[7]\t$f[8]";
if ($type eq "contigs")
{
print "\t$f[9]\t$f[10]\t$f[11]";
}
$prevEnd = $end;
$prev = $name;
}
'_EOF_'
chmod +x /cluster/data/danRer3/jkStuff/createAgpWithGaps.pl
cd /cluster/data/danRer3
foreach c (NA Un)
cd $c
perl ../jkStuff/createAgpWithGaps.pl chr${c} 500 contigs \
< ${c}.chunks.agp > chr${c}.chunks.agp
cd ..
end
# check co-ordinates
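# e.g. a minimal contiguity check (not in the original log): each agp line
# should start one base after the previous line ends
foreach c (NA Un)
awk -F'\t' '$2 != prev + 1 {print FILENAME ": break at line " NR} {prev = $3}' \
$c/chr${c}.chunks.agp
end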
# clean up
foreach c (NA Un)
rm $c/${c}.scaffolds.agp $c/${c}.chunks.agp $c/chr${c}.scaffolds.fa \
$c/${c}.scaffolds.lst
end
# BUILD CHROM-LEVEL SEQUENCE (DONE, 2005-06-13, hartera)
ssh kkstore01
cd /cluster/data/danRer3
# Sequence is already in upper case so no need to change
foreach c (`cat chrom.lst`)
echo "Processing ${c}"
$HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.scaffolds.agp chr$c \
$c/chr$c.fa ./Zv5.fa
echo "${c} - DONE"
end
# move scaffolds agp to be chrom agp and clean up
foreach c (`cat chrom.lst`)
cd $c
rm *.bak
cp chr${c}.scaffolds.agp chr${c}.agp
mkdir -p agps
mv chr${c}.*.agp ./agps/
cd ..
end
# CHECK CHROM AND VIRTUAL CHROM SEQUENCES (DONE, 2005-06-13, hartera)
# Check that the size of each chromosome .fa file is equal to the
# last coord of the .agp:
ssh hgwdev
cd /cluster/data/danRer3
foreach c (`cat chrom.lst`)
foreach f ( $c/chr$c.agp )
set agpLen = `tail -1 $f | awk '{print $3;}'`
set h = $f:r
set g = $h:r
echo "Getting size of $g.fa"
set faLen = `faSize $g.fa | awk '{print $1;}'`
if ($agpLen == $faLen) then
echo " OK: $f length = $g length = $faLen"
else
echo "ERROR: $f length = $agpLen, but $g length = $faLen"
endif
end
end
# all are OK so the FASTA files are the expected size
# CREATING DATABASE (DONE, 2005-06-13, hartera)
# Create the database.
# next machine
ssh hgwdev
echo 'create database danRer3' | hgsql ''
# if you need to delete that database: !!! WILL DELETE EVERYTHING !!!
echo 'drop database danRer3' | hgsql danRer3
# Delete and re-create database as above (hartera, 2004-11-30)
# Use df to make sure there is at least 10 gig free on /var/lib/mysql:
df -h /var/lib/mysql
# Before loading data:
# Filesystem Size Used Avail Use% Mounted on
# /dev/sdc1 1.8T 927G 734G 56% /var/lib/mysql
# CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2005-06-13, hartera)
# next machine
ssh hgwdev
# the following command copies all the data from the table
# grp in the database danRer2 to the new database danRer3
echo "create table grp (PRIMARY KEY(NAME)) select * from danRer2.grp" \
| hgsql danRer3
# if you need to delete that table: !!! WILL DELETE ALL grp data !!!
echo 'drop table grp;' | hgsql danRer3
# BREAK UP SEQUENCE INTO 5MB CHUNKS AT CONTIGS/GAPS FOR CLUSTER RUNS
# (DONE, 2005-06-14, hartera)
ssh kkstore01
cd /cluster/data/danRer3
foreach c (`cat chrom.lst`)
foreach agp ($c/chr$c.agp)
if (-e $agp) then
set fa = $c/chr$c.fa
echo splitting $agp and $fa
cp -p $agp $agp.bak
cp -p $fa $fa.bak
splitFaIntoContigs $agp $fa . -nSize=5000000
endif
end
end
# MAKE LIFTALL.LFT (DONE, 2005-06-14, hartera)
ssh kkstore01
cd /cluster/data/danRer3
cat */lift/ordered.lft > jkStuff/liftAll.lft
# SIMPLE REPEAT [TRF] TRACK (DONE, 2005-06-14, hartera)
# TRF can be run in parallel with RepeatMasker on the file server
# since it doesn't require masked input sequence.
# Run this on the kilokluster. Need to mask contig and chromosome
# sequences so run trf using contig sequences.
# First copy over contig sequences to iscratch and then iSync to cluster.
ssh kkr1u00
mkdir -p /iscratch/i/danRer3/contigsNoMask
cd /cluster/data/danRer3
foreach d (/cluster/data/danRer3/*/chr*_?{,?})
set ctg = $d:t
foreach f ($d/${ctg}.fa)
echo "Copyig $f ..."
cp $f /iscratch/i/danRer3/contigsNoMask/
end
end
# 288 sequence files
/cluster/bin/iSync
ssh kk
mkdir -p /cluster/data/danRer3/bed/simpleRepeat
cd /cluster/data/danRer3/bed/simpleRepeat
mkdir trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
# << keep emacs coloring happy
chmod +x runTrf
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
# << keep emacs coloring happy
ls -1S /iscratch/i/danRer3/contigsNoMask/chr*.fa > genome.lst
gensub2 genome.lst single gsub jobList
# 288 jobs
para create jobList
para try, check, push, check etc...
para time
# Completed: 288 of 288 jobs
# CPU time in finished jobs: 70742s 1179.03m 19.65h 0.82d 0.002 y
# IO & Wait Time: 1263s 21.05m 0.35h 0.01d 0.000 y
# Average job time: 250s 4.17m 0.07h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 6722s 112.03m 1.87h 0.08d
# Submission to last job: 10037s 167.28m 2.79h 0.12d
# lift up to chrom level
liftUp simpleRepeat.bed /cluster/data/danRer3/jkStuff/liftAll.lft warn \
trf/*.bed
# Load into the database
ssh hgwdev
cd /cluster/data/danRer3/bed/simpleRepeat
hgLoadBed danRer3 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Loaded 757119 elements of size 16
# PROCESS SIMPLE REPEATS INTO MASK (DONE, 2005-06-14, hartera)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh kkstore01
cd /cluster/data/danRer3/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/chr*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# Lift up filtered trf output to chrom coords as well:
cd /cluster/data/danRer3
mkdir bed/simpleRepeat/trfMaskChrom
foreach c (`cat chrom.lst`)
if (-e $c/lift/ordered.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/ordered.lst > $c/lift/oTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
endif
if (-e $c/lift/random.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/random.lst > $c/lift/rTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
endif
end
# REPEAT MASKING - Run RepeatMasker on chroms (DONE, 2005-06-15, hartera)
# When a new library is added for this version of RepeatMasker, check in
# /cluster/bluearc/RepeatMasker/Libraries for a directory named by date,
# e.g. 20050112 here; inside it are directories for species for which
# RepeatMasker has already been run. In the species directory RepeatMasker
# creates a specieslib of the danio repeats. If this exists, it is used for
# the RepeatMasker run for that species, so if new repeats are added to the
# library they will not get used unless the cached specieslib is deleted
# and a new one is created from the new library on the first run for danio.
ssh kkstore01
rm -r /cluster/bluearc/RepeatMasker/Libraries/20050112/danio/
cd /cluster/data/danRer3
#- Split contigs into 500kb chunks, at gaps if possible:
foreach c (`cat chrom.lst`)
foreach d ($c/chr${c}*_?{,?})
cd $d
echo "splitting $d"
set contig = $d:t
~/bin/i386/faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
-minGapSize=100
cd ../..
end
end
# For RepeatMasking, use RepeatMasker "open-3.0" with repeat library
# version RepBase Update 9.11, RM database version 20050112 with the
# addition of the zebrafish unclassified repeats (zebunc.ref) - see above
# section on getting this additional zebrafish RepeatMasker library.
#- Make the run directory and job list:
cd /cluster/data/danRer3
cat << '_EOF_' > jkStuff/RMZebrafish
#!/bin/csh -fe
cd $1
pushd .
/bin/mkdir -p /tmp/danRer3/$2
/bin/cp $2 /tmp/danRer3/$2/
cd /tmp/danRer3/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -species danio $2
popd
/bin/cp /tmp/danRer3/$2/$2.out ./
if (-e /tmp/danRer3/$2/$2.align) /bin/cp /tmp/danRer3/$2/$2.align ./
if (-e /tmp/danRer3/$2/$2.tbl) /bin/cp /tmp/danRer3/$2/$2.tbl ./
if (-e /tmp/danRer3/$2/$2.cat) /bin/cp /tmp/danRer3/$2/$2.cat ./
/bin/rm -fr /tmp/danRer3/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer3/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer3
'_EOF_'
chmod +x jkStuff/RMZebrafish
mkdir -p RMRun
cp /dev/null RMRun/RMJobs
foreach c (`cat chrom.lst`)
foreach d ($c/chr${c}_?{,?})
set ctg = $d:t
foreach f ( $d/${ctg}_?{,?}.fa )
set f = $f:t
echo /cluster/data/danRer3/jkStuff/RMZebrafish \
/cluster/data/danRer3/$d $f \
'{'check out line+ /cluster/data/danRer3/$d/$f.out'}' \
>> RMRun/RMJobs
end
end
end
# Do the run
ssh kk
cd /cluster/data/danRer3/RMRun
para create RMJobs
para try, para check, para check, para push, para check,...
para time
# Completed: 4069 of 4069 jobs
# CPU time in finished jobs: 13726314s 228771.90m 3812.87h 158.87d 0.435 y
# IO & Wait Time: 45762s 762.70m 12.71h 0.53d 0.001 y
# Average job time: 3385s 56.41m 0.94h 0.04d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 4549s 75.82m 1.26h 0.05d
# Submission to last job: 56947s 949.12m 15.82h 0.66d
# This is slow. It should have taken about 5 hours.
#- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
ssh kkstore01
cd /cluster/data/danRer3
foreach d (*/chr*_?{,?})
set contig = $d:t
echo $contig
liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
> /dev/null
end
#- Lift pseudo-contigs to chromosome level
foreach c (`cat chrom.lst`)
echo lifting $c
cd $c
if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
> /dev/null
endif
cd ..
end
#- Load the .out files into the database with:
ssh hgwdev
cd /cluster/data/danRer3
hgLoadOut danRer3 */chr*.fa.out -verbose=2
# bad rep range [689, 602] line 105524 of 16/chr16.fa.out
# bad rep range [147, 146] line 124027 of 16/chr16.fa.out
# bad rep range [280, 258] line 754 of 17/chr17.fa.out
# bad rep range [280, 258] line 76417 of 17/chr17.fa.out
# bad rep range [314, 311] line 99427 of 19/chr19.fa.out
# bad rep range [367, 366] line 88398 of 23/chr23.fa.out
# bad rep range [41, 40] line 51509 of 25/chr25.fa.out
# bad rep range [1133, 1132] line 62610 of 9/chr9.fa.out
# bad rep range [6133, 6132] line 122359 of NA/chrNA.fa.out
# bad rep range [6133, 6132] line 160183 of NA/chrNA.fa.out
# bad rep range [292, 291] line 252829 of NA/chrNA.fa.out
# bad rep range [751, 599] line 261276 of NA/chrNA.fa.out
# bad rep range [360, 359] line 259794 of Un/chrUn.fa.out
# bad rep range [360, 359] line 259796 of Un/chrUn.fa.out
# bad rep range [360, 359] line 259798 of Un/chrUn.fa.out
# bad rep range [1, -56] line 379516 of Un/chrUn.fa.out
# note: 16 records dropped due to repStart > repEnd
# check coverage of repeats masked
# featureBits -chrom=chr1 danRer1 rmsk
# 11589712 bases of 40488791 (28.624%) in intersection
# featureBits -chrom=chr1 danRer2 rmsk
# 26879295 bases of 61678023 (43.580%) in intersection
# featureBits -chrom=chr1 danRer3 rmsk
# 25822888 bases of 55805710 (46.273%) in intersection
# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF AND BUILD NIB FILES
# (DONE, 2005-06-15, hartera)
ssh kkstore01
cd /cluster/data/danRer3
# Soft-mask (lower-case) the contig and chr .fa's,
# then make hard-masked versions from the soft-masked.
set trfCtg=bed/simpleRepeat/trfMask
set trfChr=bed/simpleRepeat/trfMaskChrom
# for the chromosomes:
foreach f (*/chr*.fa)
echo "repeat- and trf-masking $f"
maskOutFa -soft $f $f.out $f
set chr = $f:t:r
maskOutFa -softAdd $f $trfChr/$chr.bed $f
echo "hard-masking $f"
maskOutFa $f hard $f.masked
end
# This warning is extremely rare -- if it indicates a problem, it is only with
# the repeat annotation and does not affect the masking:
# repeat- and trf-masking Un/chrUn.fa
# WARNING: negative rEnd: -56 chrUn:153329594-153329609 MOSAT_DR
# for the contigs:
foreach c (`cat chrom.lst`)
echo "repeat- and trf-masking contigs of chr$c"
foreach d ($c/chr*_?{,?})
set ctg=$d:t
set f=$d/$ctg.fa
maskOutFa -soft $f $f.out $f
maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
maskOutFa $f hard $f.masked
end
end
# same warning here too:
# repeat- and trf-masking contigs of chrUn
# WARNING: negative rEnd: -56 chrUn_26:1159145-1159160 MOSAT_DR
# check percent sequence masked
faSize /cluster/data/danRer3/1/chr1.fa
# 55805710 bases (1047706 N's 54758004 real 28887275 upper 25870729 lower)
# 46% is in lower case so masked
# for danRer2:
faSize /cluster/data/danRer2/1/chr1New.fa
# 62208023 bases (3421437 N's 58786586 real 31874160 upper 26912426 lower)
# 43% is in lower case so masked
# Build nib files, using the soft masking in the fa
mkdir nib
foreach f (*/chr*.fa)
faToNib -softMask $f nib/$f:t:r.nib
end
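# spot check (not in the original log; assumes the kent nibSize utility is
# installed): nib record sizes should match the fa sizes
nibSize nib/chr1.nib
faSize 1/chr1.fa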
# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE, 2005-06-15, hartera)
# Added link from danRer3.2bit file to the danRer3 gbdb directory
# (2005-06-17, hartera)
# Make symbolic links from /gbdb/danRer3/nib to the real nibs
ssh hgwdev
cd /cluster/data/danRer3
mkdir -p /gbdb/danRer3/nib
foreach f (/cluster/data/danRer3/nib/chr*.nib)
ln -s $f /gbdb/danRer3/nib
end
# Load /gbdb/danRer3/nib paths into database and save size info
# hgNibSeq creates chromInfo table
hgNibSeq -preMadeNib danRer3 /gbdb/danRer3/nib */chr*.fa
echo "select chrom,size from chromInfo" | hgsql -N danRer3 > chrom.sizes
# take a look at chrom.sizes, should be 28 lines
wc chrom.sizes
# 28 56 409 chrom.sizes
# Make one big 2bit file as well, and make a link to it in
# /gbdb/danRer3/nib because hgBlat looks there:
faToTwoBit */chr*.fa danRer3.2bit
# add link to this 2bit file from gbdb danRer3 directory (2005-06-17)
ln -s /cluster/data/danRer3/danRer3.2bit /gbdb/danRer3/
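# optional check (not in the original log): sequence sizes in the 2bit
# file should match chrom.sizes from chromInfo
twoBitInfo danRer3.2bit stdout | sort > 2bit.sizes
sort chrom.sizes | diff - 2bit.sizes
# no output expected; clean up
rm 2bit.sizes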
# also make 2 bit files for chrUn and chrNA later on - need masked seq
# make 2 bit files for chrUn and chrNA scaffolds (2005-06-17)
ssh kkstore01
cd /cluster/data/danRer3
# make scaffolds files
foreach c (NA Un)
cd $c
echo "Processing $c ..."
mkdir scafSeqs
awk '{if ($5 != "N") print $6;}' chr${c}.agp > scafSeqs/scaffolds.lst
cd ..
end
cd /cluster/data/danRer3/NA/scafSeqs
cat << '_EOF_' > getSeqs.csh
#!/bin/csh -fe
set dir=/cluster/bluearc/danRer3/scaffolds
faOneRecord /iscratch/i/danRer3/scaffolds/Zv5.fa $1 > $dir/$1.fa
'_EOF_'
# << this line makes emacs coloring happy
chmod +x getSeqs.csh
cat << '_EOF_' > gsub
#LOOP
getSeqs.csh $(path1)
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ssh kk
cd /cluster/data/danRer3/NA/scafSeqs
gensub2 scaffolds.lst single gsub jobList
para create jobList
para try,check,push,check etc...
ssh kkstore01
cd /cluster/bluearc/danRer3/scaffolds
faToTwoBit ./chrNA/scafSeqs/*.fa danRer3ChrNA.2bit
faToTwoBit ./chrUn/scafSeqs/*.fa danRer3ChrUn.2bit
# MAKE GOLD AND GAP TRACKS (DONE, 2005-06-15, hartera)
# Add trackDb entry and html page for gold and gap tracks (2005-06-16, hartera)
ssh hgwdev
cd /cluster/data/danRer3
# the gold and gap tracks are created from the chrN.agp file and this is
# the scaffolds or supercontigs agp
hgGoldGapGl -noGl -chromLst=chrom.lst danRer3 /cluster/data/danRer3 .
# featureBits danRer3 gold
# 1630323462 bases of 1630323462 (100.000%) in intersection
# featureBits danRer2 gold
# 1560497282 bases of 1560497282 (100.000%) in intersection
# featureBits danRer1 gold
# 1459132082 bases of 1459132082 (100.000%) in intersection
# featureBits danRer3 gap
# 13709500 bases of 1630323462 (0.841%) in intersection
# featureBits danRer2 gap
# 28776000 bases of 1560497282 (1.844%) in intersection
# featureBits danRer1 gap
# 64174000 bases of 1459132082 (4.398%) in intersection
# Add trackDb.ra entries for gold and gap tracks and also create
# gap.html and gold.html pages.
# MAKE TRACKDB ENTRY FOR DANRER3 (DONE, 2005-06-16, hartera)
ssh hgwdev
# Make trackDb table so browser knows what tracks to expect:
mkdir -p ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer3
cd ~/kent/src/hg/makeDb/trackDb/zebrafish
cvs add danRer3
cvs commit danRer3
cd ~/kent/src/hg/makeDb/trackDb
cvs up -d -P
# Edit that makefile to add danRer3 in all the right places and do
make update
make alpha
cvs commit -m "Added danRer3." makefile
# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2005-06-16, hartera)
ssh hgwdev
mkdir /cluster/data/danRer3/html
# make a symbolic link from /gbdb/danRer3/html to /cluster/data/danRer3/html
ln -s /cluster/data/danRer3/html /gbdb/danRer3/html
# Add a description page for zebrafish
cd /cluster/data/danRer3/html
cp $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer2/description.html .
# Edit this for zebrafish danRer3
# create a description.html page here
cd ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer3
# Add description page here too
cp /cluster/data/danRer3/html/description.html .
cvs add description.html
cvs commit -m "First draft of description page for danRer3." \
description.html
cd ~/kent/src/hg/makeDb/trackDb
make update
make alpha
# MAKE HGCENTRALTEST ENTRY FOR DANRER3 (DONE, 2005-06-16, hartera)
# UPDATE ENTRY TO ADD DANRER3 TO GENE SORTER (DONE, 2006-06-09, hartera)
# Make dbDb and defaultDb entries so test browser knows about it:
ssh hgwdev
# Add dbDb and defaultDb entries:
echo 'insert into dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
values("danRer3", "May 2005", \
"/gbdb/danRer3", "Zebrafish", "chr2:15,906,734-15,926,406", 1, \
37, "Zebrafish", "Danio rerio", \
"/gbdb/danRer3/html/description.html", 0, 0, \
"Sanger Centre, Danio rerio Sequencing Project Zv5");' \
| hgsql -h genome-testdb hgcentraltest
# set danRer3 to be the default assembly for Zebrafish
echo 'update defaultDb set name = "danRer3" \
where genome = "Zebrafish";' \
| hgsql -h genome-testdb hgcentraltest
# Update dbDb entry for danRer3 to add it to Gene Sorter
# (hartera, 2006-06-09)
echo 'update dbDb set hgNearOk = 1 where name = "danRer3";' \
| hgsql -h genome-testdb hgcentraltest
# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS AND ON BLUEARC
# (DONE, 2005-06-16, hartera)
ssh kkr1u00
# Chrom-level mixed nibs that have been repeat- and trf-masked:
rm -rf /iscratch/i/danRer3/nib
mkdir -p /iscratch/i/danRer3/nib
cp -p /cluster/data/danRer3/nib/chr*.nib /iscratch/i/danRer3/nib
# Pseudo-contig fa that have been repeat- and trf-masked:
rm -rf /iscratch/i/danRer3/trfFa
mkdir /iscratch/i/danRer3/trfFa
foreach d (/cluster/data/danRer3/*/chr*_?{,?})
cp -p $d/$d:t.fa /iscratch/i/danRer3/trfFa
end
rm -rf /iscratch/i/danRer3/rmsk
mkdir -p /iscratch/i/danRer3/rmsk
cp -p /cluster/data/danRer3/*/chr*.fa.out /iscratch/i/danRer3/rmsk
cp -p /cluster/data/danRer3/danRer3.2bit /iscratch/i/danRer3/
/cluster/bin/iSync
# add to the bluearc
ssh kkstore01
mkdir -p /cluster/bluearc/danRer3/nib
cp -p /cluster/data/danRer3/nib/chr*.nib /cluster/bluearc/danRer3/nib
mkdir -p /cluster/bluearc/danRer3/trfFa
foreach d (/cluster/data/danRer3/*/chr*_?{,?})
cp -p $d/$d:t.fa /cluster/bluearc/danRer3/trfFa
end
cp /cluster/data/danRer3/danRer3.2bit /cluster/bluearc/danRer3/
# ADD CONTIGS TRACK (DONE, 2005-06-16, hartera)
# make ctgPos2 (contig name, size, chrom, chromStart, chromEnd) from
# chunks (contigs) agp files.
ssh kkstore01
mkdir -p /cluster/data/danRer3/bed/ctgPos2
cd /cluster/data/danRer3/bed/ctgPos2
# ctgPos2 .sql .as .c and .h files exist - see makeDanRer1.doc
foreach c (`cat /cluster/data/danRer3/chrom.lst`)
awk 'BEGIN {OFS="\t"} \
{if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \
/cluster/data/danRer3/$c/agps/chr${c}.chunks.agp >> ctgPos2.tab
end
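# quick check (not in the original log): every row should have the six
# tab-separated fields expected by the ctgPos2 table
awk -F'\t' 'NF != 6' ctgPos2.tab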
ssh hgwdev
cd /cluster/data/danRer3/bed/ctgPos2
hgsql danRer3 < ~/kent/src/hg/lib/ctgPos2.sql
echo "load data local infile 'ctgPos2.tab' into table ctgPos2" \
| hgsql danRer3
# create trackDb.ra entry and html page for ctgPos2 track.
# Changed termRegEx for ctgPos2 in trackDb.ra so that it handles
# contigs named "Zv5_scaffold*". (2006-04-19, hartera)
# CREATE gc5Base WIGGLE TRACK (DONE, 2005-06-16, hartera)
# FIX LINK FOR WIB FILES TO POINT TO danRer3 ON store11 (2005-07-25, hartera)
ssh kkstore01
mkdir -p /cluster/data/danRer3/bed/gc5Base
cd /cluster/data/danRer3/bed/gc5Base
# The number of bases that hgGcPercent claimed it measured is calculated,
# which is not necessarily always 5 if it ran into gaps, and then the
# division by 10.0 scales down the numbers from hgGcPercent to the range
# [0-100]. wigEncode now replaces wigAsciiToBinary and the previous
# processing step between these two programs. The result file is *.wig.
# Each value represents the measurement over five bases beginning with
# <position>. wigEncode also calculates the zoomed set of data.
# Uses the 2bit file in /cluster/data/danRer3 as sequence input.
nice hgGcPercent -wigOut -doGaps -file=stdout -win=5 danRer3 \
/cluster/data/danRer3 | \
wigEncode stdin gc5Base.wig gc5Base.wib
# load the .wig file back on hgwdev:
ssh hgwdev
cd /cluster/data/danRer3/bed/gc5Base
hgLoadWiggle -pathPrefix=/gbdb/danRer3/wib/gc5Base \
danRer3 gc5Base gc5Base.wig
# and symlink the .wib file into /gbdb
# fix link as danRer3 is now in store 11 (2005-07-25, hartera)
rm -r /gbdb/danRer3/wib/gc5Base
mkdir -p /gbdb/danRer3/wib/gc5Base
ln -s `pwd`/gc5Base.wib /gbdb/danRer3/wib/gc5Base
# MAKE 10.OOC, 11.OOC FILE FOR BLAT (DONE, 2005-06-17, hartera)
# Use -repMatch=512 (based on size -- for human we use 1024, and the
# zebrafish genome is ~50% of the size of the human genome).
ssh kkr1u00
mkdir /cluster/data/danRer3/bed/ooc
cd /cluster/data/danRer3/bed/ooc
mkdir -p /cluster/bluearc/danRer3
ls -1 /cluster/data/danRer3/nib/chr*.nib > nib.lst
blat nib.lst /dev/null /dev/null -tileSize=11 \
-makeOoc=/cluster/bluearc/danRer3/danRer3_11.ooc -repMatch=512
# Wrote 50575 overused 11-mers to /cluster/bluearc/danRer3/11.ooc
# For 10.ooc, repMatch = 4096 for human, so use 2048
blat nib.lst /dev/null /dev/null -tileSize=10 \
-makeOoc=/cluster/bluearc/danRer3/danRer3_10.ooc -repMatch=2048
# Wrote 12574 overused 10-mers to /cluster/bluearc/danRer3/10.ooc
# keep copies of ooc files in this directory and copy to iscratch
cp /cluster/bluearc/danRer3/*.ooc .
cp -p /cluster/bluearc/danRer3/*.ooc /iscratch/i/danRer3/
/cluster/bin/iSync
# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR danRer3 (DONE, 2005-07-20, kuhn)
# hgcentraltest is now on hgwdev
ssh hgwdev
# DNA port is "0", trans prot port is "1"
echo 'insert into blatServers values("danRer3", "blat2", "17778", "1", "0"); insert into blatServers values("danRer3", "blat2", "17779", "0", "1");' \
| hgsql hgcentraltest
# this enables blat and isPcr, isPcr is enabled by loading blat server
# with tilesize=5 (ask for this when request blat servers from
# cluster admin).
# if you need to delete those entries
echo 'delete from blatServers where db="danRer3";' \
| hgsql hgcentraltest
# to check the entries:
echo 'select * from blatServers where db="danRer3";' \
| hgsql hgcentraltest
# AFFYMETRIX ZEBRAFISH GENOME ARRAY CHIP (DONE, 2005-07-22, hartera)
# REMAKE THIS TRACK USING chrUn AND chrNA SCAFFOLDS (DONE, 2005-08-19, hartera)
# UPDATED (2006-09-27) - see separate section, UPDATE AFFY ZEBRAFISH TRACK.
# array chip sequences already downloaded for danRer1
ssh hgwdev
cd /projects/compbio/data/microarray/affyZebrafish
mkdir /cluster/bluearc/affy
cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
/cluster/bluearc/affy/
# Set up cluster job to align Zebrafish consensus sequences to danRer3
ssh kkr1u00
mkdir -p /cluster/data/danRer3/bed/affyZebrafish.2005-08-19
ln -s /cluster/data/danRer3/bed/affyZebrafish.2005-08-19 \
/cluster/data/danRer3/bed/affyZebrafish
cd /cluster/data/danRer3/bed/affyZebrafish
mkdir -p /iscratch/i/affy
cp /cluster/bluearc/affy/Zebrafish_consensus.fa /iscratch/i/affy
/cluster/bin/iSync
# the kilokluster is down, so run on the pitakluster
ssh pk
cd /cluster/data/danRer3/bed/affyZebrafish
ls -1 /cluster/bluearc/affy/Zebrafish_consensus.fa > affy.lst
ls -1 /cluster/bluearc/danRer3/trfFa/chr[0-9M]*.fa > genome.lst
# for output:
mkdir -p /san/sanvol1/danRer3/affy/pslChrom
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/danRer3/danRer3_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/danRer3/affy/pslChrom/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.lst affy.lst template.sub para.spec
para create para.spec
para try, check, push ... etc.
# para time
# Completed: 208 of 208 jobs
# CPU time in finished jobs: 1355s 22.59m 0.38h 0.02d 0.000 y
# IO & Wait Time: 9988s 166.46m 2.77h 0.12d 0.000 y
# Average job time: 55s 0.91m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 74s 1.23m 0.02h 0.00d
# Submission to last job: 217s 3.62m 0.06h 0.00d
# then run the 2bit file of scaffolds
ssh pk
cd /cluster/data/danRer3/bed/affyZebrafish
mkdir scaffoldsNAandUnRun
cd scaffoldsNAandUnRun
ls -1 /cluster/bluearc/affy/Zebrafish_consensus.fa > affy.lst
foreach f (/cluster/bluearc/scratch/danRer3/scaffoldsSoftMask/*.fa)
ls -1 $f >> scafs.lst
end
mkdir -p /san/sanvol1/danRer3/affy/pslScaffoldsNAandUn
cat << '_EOF_' > template2.sub
#LOOP
/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/cluster/bluearc/danRer3/danRer3_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/danRer3/affy/pslScaffoldsNAandUn/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 scafs.lst affy.lst template2.sub para.spec
para create para.spec
para try, check, push ... etc.
# para time
# Completed: 14941 of 14941 jobs
# CPU time in finished jobs: 27574s 459.57m 7.66h 0.32d 0.001 y
# IO & Wait Time: 47642s 794.03m 13.23h 0.55d 0.002 y
# Average job time: 5s 0.08m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 35s 0.58m 0.01h 0.00d
# Submission to last job: 339s 5.65m 0.09h 0.00d
# need to do pslSort and lift up for each separate run
cd /san/sanvol1/danRer3/affy
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyZebrafish.psl
# only use alignments that have at least
# 95% identity in aligned region.
# do not use minCover since a lot of sequence is in Un, NA and Finished
# so genes may be split up so good to see all alignments
# first do the chr1-25 and chrM alignments
pslSort dirs raw.psl tmp pslChrom
pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
# Processed 27408 alignments
pslSort dirs rawNAandUn.psl tmp pslScaffoldsNAandUn
pslReps -minAli=0.95 -nearTop=0.005 rawNAandUn.psl scafNAandUn.psl /dev/null
# Processed 9888 alignments
# lift up chrom contigs to chrom level
liftUp affyZfishChroms.psl \
/cluster/data/danRer3/jkStuff/liftAll.lft warn contig.psl
liftUp affyZfishScafsNAandUn.psl \
/cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
warn scafNAandUn.psl
# sort and merge these files
mkdir psl
cp affyZfish* ./psl/
pslSort dirs affyZebrafish.psl tmp1 psl
# rsync these psl files
rsync -a --progress /san/sanvol1/danRer3/affy/*.psl \
/cluster/data/danRer3/bed/affyZebrafish/
ssh kkstore02
cd /cluster/data/danRer3/bed/affyZebrafish
# shorten names in psl file
sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.tmp
mv affyZebrafish.psl.tmp affyZebrafish.psl
pslCheck affyZebrafish.psl
# psl is good
# load track into database
ssh hgwdev
cd /cluster/data/danRer3/bed/affyZebrafish
hgLoadPsl danRer3 affyZebrafish.psl
# Add consensus sequences for Zebrafish chip
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/affyProbes
ln -s \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
/gbdb/hgFixed/affyProbes
hgLoadSeq -abbr=Zebrafish: danRer3 \
/gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
# Clean up
rm batch.bak contig.psl raw.psl
# moved affyZebrafish.html description and trackDb.ra track entry and
# search for Affy Zebrafish track to
# ~/kent/src/hg/makeDb/trackDb/zebrafish since it is common to all
# danRer assemblies.
# LIFT FILES FROM SCAFFOLDS TO chrUn AND chrNA (DONE, 2005-07-27, hartera)
ssh kkstore02
mkdir -p /cluster/data/danRer3/liftSupertoChrom
cd /cluster/data/danRer3/liftSupertoChrom
# lift files are already created when scaffoldFaToAgp was run for chrUn.fa
# and chrNA.fa. These need to be edited since the last 500 Ns were removed
# from the agp files, making the sequence 184125739 bp instead of
# 184126239 bp for chrUn and 253521007 bp instead of 253521507 bp for
# chrNA; chrUn also needs changing to chrNA in the chrNA lift file.
cp /cluster/data/danRer3/Un/tmp/chrUn.lft .
cp /cluster/data/danRer3/NA/tmp/chrNA.lft .
# edit to remove last lines of each file first
# then use perl to change co-ordinates
perl -pi.bak -e 's/184126239/184125739/' chrUn.lft
perl -pi.bak -e 's/253521507/253521007/' chrNA.lft
perl -pi.bak -e 's/chrUn/chrNA/' chrNA.lft
cat *.lft >> liftNAandUnScaffoldsToChrom.lft
# clean up
rm *.bak
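# sanity check on the edited lift files (not in the original log): for each
# row, offset + scaffold size should never exceed the new chrom size
awk '$1 + $3 > $5' chrNA.lft chrUn.lft
# no output expected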
# ENSEMBL GENES (DONE, 2005-07-29, hartera)
ssh hgwdev
mkdir -p /cluster/data/danRer3/bed/ensembl
cd /cluster/data/danRer3/bed/ensembl
# Get the Ensembl gene data from
# http://www.ensembl.org/Multi/martview
# Follow this sequence through the pages: (NOTE: this interface has changed
# a little since danRer2)
# Page 1) Select the Ensembl dataset (v32 here) and the
# Danio_rerio choice (ZFISH5 here). Hit next. 22877 entries total.
# Ensembl 35 now (2005-11-23) and this is the same as for the version 32
# downloaded as above. Ensembl 36 (Dec 2005) is the same as for 32 for
# Zebrafish. Ensembl 38 (April 2006) Protein Coding genes is the same
# as for Ensembl 32. (Select Gene type as protein_coding on page 2).
# Page 2) Then hit next.
# Page 3) Choose the "Structures" Attribute Page from the pulldown menu
# at the top. Make sure that under the GENE section, the Ensembl
# Attributes checked include the Ensembl Gene ID and Ensembl
# Transcript ID. Choose GTF as the output. Choose gzip compression.
# Hit export. Save as ensemblGene35.gtf.gz
# the Ensembl gene predictions are mapped to chromosomes except for
# chrNA and chrUn. Use lift files for scaffolds to these chroms.
# get chrUn and chrNA Ensembl records
ssh kkstore02
cd /cluster/data/danRer3/bed/ensembl
gunzip ensemblGene.gtf.gz
awk '$1 ~ /^Zv5_NA[0-9]+/ || $1 ~ /^Zv5_scaffold[0-9]+/' ensemblGene.gtf \
> ensemblGenechrUns.gtf
# get records for all other chroms
awk '$1 ~ /^[0-9]+/' ensemblGene.gtf > ensemblGenechroms.gtf
wc -l *.gtf
# 513421 ensemblGenechroms.gtf
# 125319 ensemblGenechrUns.gtf
# 638740 ensemblGene.gtf
# total lines of the files made equal the original file line count so ok
liftUp -type=.gtf ensemblGenechrUns.lifted \
/cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
warn ensemblGenechrUns.gtf
# Got 29880 lifts in
# /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft
sed -e "s/^/chr/" ensemblGenechroms.gtf > ensGene.gtf
cat ensemblGenechrUns.lifted >> ensGene.gtf
# check file sizes -ok and some of the lifted co-ordinates
# there were some erroneous lines with "1;" or "2;" - 8 lines total
# Notified Ensembl and they fixed it so downloaded file again
# and reloaded into database
# Also remove the suffix that denotes the transcript version number.
# This is not in the ensGtp or ensPep tables.
perl -pi.bak -e 's/\.[0-9]+//g' ensGene.gtf
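# verify (not in the original log) that no versioned IDs remain, e.g.
# ENSDART transcript accessions with a .N suffix:
grep -c 'ENSDART[0-9]*\.[0-9]' ensGene.gtf
# should report 0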
# load into database
ssh hgwdev
cd /cluster/data/danRer3/bed/ensembl
hgsql -e 'drop table ensGene;' danRer3
/cluster/bin/i386/ldHgGene danRer3 ensGene ensGene.gtf
# Read 32143 transcripts in 638732 lines in 1 files
# 32143 groups 27 seqs 1 sources 4 feature types
# 32143 gene predictions
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# Page 3) Choose the "Features" box. In "Ensembl Attributes", check
# Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Choose Text, tab-separated as the output format and gzip compression.
# Result name: ensGtp.
gunzip ensGtp.tsv.gz
# edit to remove first header line
hgsql danRer3 < ~/kent/src/hg/lib/ensGtp.sql
# remove header line from ensGtp.tsv
echo "load data local infile 'ensGtp.tsv' into table ensGtp" \
| hgsql -N danRer3
# Get the ensembl peptide sequences from
# http://www.ensembl.org/Multi/martview
# Choose Danio Rerio as the organism
# Follow this sequence through the pages:
# Page 1) Choose the Ensembl Genes choice. Hit next.
# Page 2) Then hit next.
# Page 3) Choose "Sequences" from the Attributes pulldown menu at the top.
# Page 4) Choose Peptide as type of sequence to export and select
# Ensembl Gene ID from Gene Attributes and
# Ensembl Transcript ID and Ensembl Peptide Stable ID from
# Transcript Attributes as the output,
# choose text/fasta and gzip compression,
# name the file ensemblPep.fa.gz and then hit export.
gunzip ensemblPep.fa.gz
hgPepPred danRer3 ensembl ensemblPep.fa
# added code to hgc.c so that the link to the Ensembl Protein
# is also displayed on the description page.
# FOR MGC GENES:
# - wait one day for nightly build to align and load them into the db
# - rebuild trackDb
# SPLIT UP ZEBRAFISH MASKED SEQUENCE FROM chrUn and chrNA INTO SCAFFOLDS
# (DONE, 2005-08-04, hartera)
# ADD SOFT-MASKED SCAFFOLDS TO ISERVERS FOR CLUSTER RUNS
# (DONE, 2005-08-15, hartera) AND TO BLUEARC (DONE, 2005-08-19)
ssh kkstore02
cd /cluster/data/danRer3
# for chrUn and chrNA, get masked sequence for soft and hard-masked
foreach c (Un NA)
cd $c
mkdir scaffoldsSoftMask scaffoldsHardMask
awk 'BEGIN {FS="\t"}{if ($5 != "N") \
print "faFrag -mixed chr'${c}'.fa",$2-1, $3, $6".fa";}' chr${c}.agp \
>> ./scaffoldsSoftMask/faFragSoftMask.csh
awk 'BEGIN {FS="\t"}{if ($5 != "N") \
print "faFrag -mixed chr'${c}'.fa.masked",$2-1, $3, $6".fa.masked";}' \
chr${c}.agp >> ./scaffoldsHardMask/faFragHardMask.csh
cd ..
end
# change permissions on the scripts that get the sequences
foreach d (Un NA)
chmod +x $d/scaffoldsSoftMask/faFragSoftMask.csh
chmod +x $d/scaffoldsHardMask/faFragHardMask.csh
end
cat << '_EOF_' > jkStuff/getMaskedScaffolds.csh
#!/bin/csh
foreach c (Un NA)
set dir=/cluster/data/danRer3
echo "Processing $c"
cd $dir/$c/scaffoldsSoftMask
cp ../chr${c}.fa .
echo "Getting soft-masked sequences ..."
nice faFragSoftMask.csh >& faFrag.log
echo "Getting hard-masked sequences ..."
cd $dir/$c/scaffoldsHardMask
cp ../chr${c}.fa.masked .
nice faFragHardMask.csh >& faFrag.log
end
'_EOF_'
chmod +x jkStuff/getMaskedScaffolds.csh
nice ./jkStuff/getMaskedScaffolds.csh &
# check a few sequences that they are correct
# add name of scaffold to sequence fasta and cat together
foreach c (Un NA)
set dir = /cluster/data/danRer3
foreach d (scaffoldsSoftMask scaffoldsHardMask)
cd $dir/$c/$d
foreach f (Zv5*)
if ($d == "scaffoldsHardMask") then
set b=$f:r
set g=$b:r
set sc=scaffoldMasked${c}.fa
else
set g=$f:r
set sc=scaffold${c}.fa
endif
perl -pi.bak -e "s/>chr[0-9A-Za-z\-\:]+/>$g/" $f
cat $f >> $sc
rm *.bak
end
cp scaffold* $dir/$c/
end
end
# check sizes of final FASTA file with all sequences. check a few
# sequence files to see that they are correct - ok
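# e.g. (not in the original log) the scaffold count in each combined fasta
# should equal the number of non-gap lines in the corresponding chrom agp:
cd /cluster/data/danRer3
grep -c '>' NA/scaffoldNA.fa
awk '$5 != "N"' NA/chrNA.agp | wc -l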
# Add soft-masked scaffolds to the iservers for cluster runs
# (2005-08-15, hartera)
ssh kkr1u00
mkdir -p /iscratch/i/danRer3/scaffoldsSoftMask
cd /cluster/data/danRer3
foreach c (NA Un)
foreach f (/cluster/data/danRer3/$c/scaffoldsSoftMask/Zv5_*.fa)
cp -p $f /iscratch/i/danRer3/scaffoldsSoftMask
end
end
/cluster/bin/iSync
# Add soft-masked scaffolds to the bluearc for cluster runs
# (2005-08-19, hartera)
ssh kkr1u00
cd /cluster/data/danRer3/
mkdir -p /cluster/bluearc/scratch/danRer3/scaffoldsSoftMask
foreach c (NA Un)
foreach f (/cluster/data/danRer3/$c/scaffoldsSoftMask/Zv5_*.fa)
rsync -a --progress $f \
/cluster/bluearc/scratch/danRer3/scaffoldsSoftMask/
end
end
# MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2005-08-05, hartera)
ssh kkstore02
cd /cluster/data/danRer3
#- Build the .zip files
cat << '_EOF_' > jkStuff/gzipAll.csh
rm -rf gzip
mkdir gzip
# chrom AGP's
tar cvzf gzip/chromAgp.tar.gz [0-9A-Z]*/chr*.agp
# chrom RepeatMasker out files
tar cvzf gzip/chromOut.tar.gz */chr*.fa.out
# soft masked chrom fasta
tar cvzf gzip/chromFa.tar.gz */chr*.fa
# soft masked chrNA and chrUn scaffolds
tar cvzf gzip/scaffoldUnsFa.tar.gz NA/scaffoldNA.fa \
Un/scaffoldUn.fa
# hard masked chrom fasta
tar cvzf gzip/chromFaMasked.tar.gz */chr*.fa.masked
# hard masked chrNA and chrUn scaffolds
tar cvzf gzip/scaffoldUnsFaMasked.tar.gz \
NA/scaffoldMaskedNA.fa \
Un/scaffoldMaskedUn.fa
# chrom TRF output files
cd bed/simpleRepeat
tar cvzf ../../gzip/chromTrf.tar.gz trfMaskChrom/chr*.bed
cd ../..
# get GenBank native mRNAs
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=danRer3 -native GenBank mrna \
/cluster/data/danRer3/gzip/mrna.fa
# get GenBank xeno mRNAs
./bin/i386/gbGetSeqs -db=danRer3 -xeno GenBank mrna \
/cluster/data/danRer3/gzip/xenoMrna.fa
# get native RefSeq mRNAs
./bin/i386/gbGetSeqs -db=danRer3 -native refseq mrna \
/cluster/data/danRer3/gzip/refMrna.fa
# get native GenBank ESTs
./bin/i386/gbGetSeqs -db=danRer3 -native GenBank est \
/cluster/data/danRer3/gzip/est.fa
cd /cluster/data/danRer3/gzip
# gzip GenBank native and xeno mRNAs, native ESTs and RefSeq mRNAs
gzip mrna.fa
gzip xenoMrna.fa
gzip refMrna.fa
gzip est.fa
'_EOF_'
# << this line makes emacs coloring happy
chmod +x ./jkStuff/gzipAll.csh
csh ./jkStuff/gzipAll.csh |& tee ./jkStuff/gzipAll.log
#- Look at gzipAll.log to make sure all file lists look reasonable.
# Make upstream files and Copy the .zip files to
# hgwdev:/usr/local/apache/...
ssh hgwdev
cd /cluster/data/danRer3/gzip
# make upstream files for zebrafish RefSeq
featureBits danRer3 refGene:upstream:1000 -fa=upstream1000.fa
gzip upstream1000.fa
featureBits danRer3 refGene:upstream:2000 -fa=upstream2000.fa
gzip upstream2000.fa
set gp = /usr/local/apache/htdocs/goldenPath/danRer3
mkdir -p $gp/bigZips
cp -p *.gz $gp/bigZips
mkdir -p $gp/chromosomes
foreach f (../*/chr*.fa)
cp $f $gp/chromosomes
end
foreach c (NA Un)
cd /cluster/data/danRer3/$c
cp scaffold${c}.fa $gp/chromosomes
end
cd $gp/bigZips
md5sum *.gz > md5sum.txt
cd $gp/chromosomes
# gzip the chromosome and scaffold FASTAs individually
foreach f (*.fa)
gzip $f
end
md5sum *.gz > md5sum.txt
# Take a look at bigZips/* and chromosomes/*
# copy README.txt's from danRer2 and update
# MAKE NIB FILES AND 2BIT FILE FOR SOFT MASKED chrUn AND chrNA SCAFFOLDS
# (DONE, 2005-08-06, hartera)
# ADD chrUn AND chrNA SCAFFOLDS 2BIT FILE TO BLUEARC (DONE, 2005-08-19, hartera)
ssh kkstore02
cd /cluster/data/danRer3
mkdir scaffoldsNAandUnNib
# Build nib files, using the soft masking in the fa
foreach c (NA Un)
echo "Processing $c"
foreach f ($c/scaffoldsSoftMask/Zv5*.fa)
faToNib -softMask $f scaffoldsNAandUnNib/$f:t:r.nib
end
end
# check correct number of nib files in directory: 14941
# there are 14676 chrNA scaffolds and 265 chrUn scaffolds
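# e.g. (not in the original log):
ls scaffoldsNAandUnNib | wc -l
# should report 14941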
# copy chromosome 1-25 and chrNA and chrUn scaffolds nibs to a directory
# on iscratch and iSync for use in cluster runs
ssh kkr1u00
mkdir -p /iscratch/i/danRer3/chromandScafNib
cp -p /cluster/data/danRer3/nib/chr[0-9]*.nib \
/iscratch/i/danRer3/chromandScafNib
foreach f (/cluster/data/danRer3/scaffoldsNAandUnNib/Zv5*.nib)
cp -p $f /iscratch/i/danRer3/chromandScafNib
end
ssh kkstore02
# make a 2 bit file of all the scaffolds for chrNA and chrUn
# for blastz cluster runs
cd /cluster/data/danRer3/
cat NA/scaffoldNA.fa Un/scaffoldUn.fa > danRer3NAandUnScaffolds.fa
grep '>' danRer3NAandUnScaffolds.fa | wc -l
# 14941
faToTwoBit danRer3NAandUnScaffolds.fa danRer3NAandUnScaf.2bit
ssh kkr1u00
mkdir -p /iscratch/i/danRer3/NAandUnScafs
cp /cluster/data/danRer3/danRer3NAandUnScaf.2bit \
/iscratch/i/danRer3/NAandUnScafs
/cluster/bin/iSync
# get sizes of scaffolds for the .len file used by blastz
ssh kolossus
mkdir -p /panasas/store/danRer3/NAandUnScafSizes
cd /cluster/data/danRer3
cat << '_EOF_' > jkStuff/getNAandUnScafSizes.csh
#!/bin/csh -fe
foreach c (NA Un)
set sizeDir=/panasas/store/danRer3/NAandUnScafSizes
cd /cluster/data/danRer3/$c/scaffoldsSoftMask
foreach f (Zv5*.fa)
set g=$f:r
faSize -detailed=on $f >> $sizeDir/NAandUnScafs.sizes
end
end
'_EOF_'
chmod +x jkStuff/getNAandUnScafSizes.csh
nice jkStuff/getNAandUnScafSizes.csh >& size.log &
# took about 1 minute
wc -l /panasas/store/danRer3/NAandUnScafSizes/NAandUnScafs.sizes
# 14941 /panasas/store/danRer3/NAandUnScafSizes/NAandUnScafs.sizes
# so correct number of scaffolds
cp /panasas/store/danRer3/NAandUnScafSizes/NAandUnScafs.sizes \
/cluster/data/danRer3
# add 2 bit to bluearc for cluster runs (2005-08-19, hartera)
ssh kkr1u00
mkdir -p /cluster/bluearc/scratch/danRer3
cp /cluster/data/danRer3/danRer3NAandUnScaf.2bit \
/cluster/bluearc/scratch/danRer3/
# BLASTZ SWAP FOR MOUSE (mm6) (DONE, 2005-08-10, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# REMAKE AXTNET AND COPY TO DOWNLOADS. REMAKE MAFNET (DONE, 2005-08-17, hartera)
# DROPPED THE CHAIN AND NET TABLES FROM HGWDEV AS THERE WERE 3 SETS OF
# MOUSE ALIGNMENTS: mm6, mm7 and mm8 (DONE, 2006-03-28, hartera)
ssh kkr1u00
# blastz requires lineage-specific repeats
# Treat all repeats as lineage-specific
# if not done already, get lineage-specific repeats
mkdir -p /iscratch/i/mm6/linSpecRep.notInZebrafish
foreach f (/panasas/store/mm6/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/mm6/linSpecRep.notInZebrafish/$f:t:r:r.out.spec
end
mkdir -p /iscratch/i/danRer3/linSpecRep.notInMouse
foreach f (/iscratch/i/danRer3/rmsk/chr*.fa.out)
cp -p $f /iscratch/i/danRer3/linSpecRep.notInMouse/$f:t:r:r.out.spec
end
/cluster/bin/iSync
# NOTE: the "mouse/human/etc." lineage-specific repeat files are now in
# /san/sanvol1/scratch/danRer3/linSpecRep.notInOthers
# however, the files for chrNA and chrUn were missing, so I'm
# adding them here. (2005-12-19 kate)
ssh kkstore02
cd /cluster/data/danRer3
cp -p Un/chrUn.fa.out \
/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers/chrUn.out.spec
cp -p NA/chrNA.fa.out \
/san/sanvol1/scratch/danRer3/linSpecRep.notInOthers/chrNA.out.spec
# do swap of mm6 vs. danRer3 chain and net alignments to
# create danRer3 vs. mm6. see makeMm6.doc for details.
ssh kk
cd /cluster/data/mm6/bed/blastz.danRer3
mkdir -p /panasas/store/danRer3vsmm6Out
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
-stop download -blastzOutRoot /panasas/store/danRer3vsmm6Out \
-swap -chainMinScore=5000 >& doSwap.log &
# Start: Aug 10 16:30
# Finish: Aug 10 16:54
# Blastz parameters are as for mm6 vs. danRer3 - see makeMm6.doc
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=6000
# BLASTZ_K=2200
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# BLASTZ_ABRIDGE_REPEATS=1
    # do cleanup step and specify a different file server since the panasas
    # store can not be accessed from kkstore02.
nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
-continue cleanup -fileServer eieio \
-blastzOutRoot /panasas/store/danRer3vsmm6Out \
-swap -chainMinScore=5000 >& doSwapCleanUp.log &
# make html files and trackDb.ra entry for chain and net tracks.
# check README.txt for downloads.
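    # for reference, chain/net trackDb.ra entries take roughly this shape
    # (a sketch only - labels, grouping and settings here are illustrative,
    # not the exact entries committed):
    # track chainMm6
    # shortLabel Mouse Chain
    # longLabel Mouse (mm6) Chained Alignments
    # group compGeno
    # visibility hide
    # spectrum on
    # type chain mm6
    # otherDb mm6
    #
    # track netMm6
    # shortLabel Mouse Net
    # longLabel Mouse (mm6) Alignment Net
    # group compGeno
    # visibility hide
    # type netAlign mm6 chainMm6
    # otherDb mm6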
# featureBits -chrom=chr1 danRer3 refGene:cds chainMm6Link -enrichment
# refGene:cds 0.688%, chainMm6Link 8.193%, both 0.543%, cover 78.94%,
# enrich 9.64x
# featureBits -chrom=chr1 danRer2 refGene:cds chainMm5Link -enrichment
# refGene:cds 0.642%, chainMm5Link 4.499%, both 0.492%, cover 76.60%,
# enrich 17.02x
# featureBits -chrom=chr2 danRer3 refGene:cds chainMm6Link -enrichment
# refGene:cds 0.705%, chainMm6Link 8.219%, both 0.557%, cover 79.04%,
# enrich 9.62x
# featureBits -chrom=chr2 danRer2 refGene:cds chainMm5Link -enrichment
# refGene:cds 0.739%, chainMm5Link 4.539%, both 0.579%, cover 78.37%,
# enrich 17.26x
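    # (in featureBits output, enrich = cover / track coverage, e.g. for chr1
    # above: 78.94% / 8.193% = 9.64x)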
    # looks good. Although enrichment is lower than for danRer2 and mm5, there
    # are more chains in the score <10000 range for danRer3 than for danRer2,
    # but this does not account for all the extra chains in danRer3 over
    # danRer2. Maybe there are more high-scoring alignments in the chrUn and
    # chrNA chains because scaffolds were used for those alignments.
    # danRer3 has extra sequence compared to danRer2 overall, although danRer3
    # chr2 is 48.2 Mb and danRer2 chr2 is 52 Mb, so in this case the chrom is
    # smaller.
# featureBits -chrom=chrNA danRer3 refGene:cds chainMm6Link -enrichment
# refGene:cds 0.449%, chainMm6Link 10.952%, both 0.350%, cover 77.94%,
# enrich 7.12x
# featureBits -chrom=chrNA danRer2 refGene:cds chainMm5Link -enrichment
# refGene:cds 0.499%, chainMm5Link 4.176%, both 0.372%, cover 74.60%,
# enrich 17.86x
# netToAxt was processing nets incorrectly so remake these with
# new version of netToAxt and transfer to downloads dir.
ssh kkstore02
cd /cluster/data/danRer3/bed/blastz.mm6.swap
rm -r axtNet
# Make axtNet for download: one .axt per danRer3 seq.
# remake noClass.net
# Make nets("noClass", i.e. without rmsk/class stats which are added later):
cd axtChain
chainPreNet danRer3.mm6.all.chain.gz /cluster/data/mm6/bed/blastz.danRer3/S2.len /cluster/data/mm6/bed/blastz.danRer3/S1.len stdout \
| chainNet stdin -minSpace=1 /cluster/data/mm6/bed/blastz.danRer3/S2.len /cluster/data/mm6/bed/blastz.danRer3/S1.len stdout /dev/null \
| netSyntenic stdin noClass.net
# create net for each chrom again
netSplit noClass.net net
# also split up chains again
mkdir chain
zcat danRer3.mm6.all.chain.gz | chainSplit chain stdin
ssh hgwdev
cd /cluster/data/danRer3/bed/blastz.mm6.swap
mkdir axtNet
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
/cluster/bluearc/danRer3/nib /panasas/store/mm6/nib stdout \
| axtSort stdin stdout \
| gzip -c > axtNet/$f:t:r.danRer3.mm6.net.axt.gz
end
# cleanup
ssh kkstore02
cd /cluster/data/danRer3/bed/blastz.mm6.swap/axtChain
rm noClass.net
rm -r net
rm -r chain
# remake mafNet from the new axtNet
cd /cluster/data/danRer3/bed/blastz.mm6.swap
rm -r mafNet
# Make mafNet for multiz: one .maf per danRer3 seq.
mkdir mafNet
foreach f (axtNet/*.danRer3.mm6.net.axt.gz)
axtToMaf -tPrefix=danRer3. -qPrefix=mm6. $f \
/cluster/data/mm6/bed/blastz.danRer3/S2.len /cluster/data/mm6/bed/blastz.danRer3/S1.len \
stdout \
| gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end
# copy the new axtNet files to downloads and replace old ones
ssh hgwdev
rm -r /usr/local/apache/htdocs/goldenPath/danRer3/vsMm6/axtNet
cd /usr/local/apache/htdocs/goldenPath/danRer3/vsMm6
mkdir -p /usr/local/apache/htdocs/goldenPath/danRer3/vsMm6/axtNet
ln -s /cluster/data/danRer3/bed/blastz.mm6.swap/axtNet/*.axt.gz axtNet/
# remake md5sum.txt
rm md5sum.txt
md5sum *.gz */*.gz > md5sum.txt
# Dropped mouse mm6 chain and net tables from hgwdev as there were 3 sets
# of mouse alignments for danRer3: mm6, mm7 and mm8 (hartera, 2006-03-29)
hgsql -e 'drop table netMm6;' danRer3
foreach c (`cat /cluster/data/danRer3/chrom.lst`)
hgsql -e "drop table chr${c}_chainMm6;" danRer3
hgsql -e "drop table chr${c}_chainMm6Link;" danRer3
end
# BLASTZ FOR FUGU (fr1) (DONE, 2005-08-18, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# RECREATE DOWNLOADS AS THE FUGU DOWNLOADS DIRECTORY HAS BEEN DELETED
# (DONE, 2005-11-17, hartera)
ssh kk
mkdir /cluster/data/danRer3/bed/blastz.fr1.2005-08-13
cd /cluster/data/danRer3/bed
ln -s blastz.fr1.2005-08-13 blastz.fr1
    # use parameters for fr1 in makeDanRer2.doc. Using scaffolds makes this run
    # slower, so it is best to have the scaffolds in the query. Use the HoxD55.q
    # matrix as Fugu is quite distant from zebrafish. Blastz can use
    # lineage-specific repeats, but none are defined for these two species.
cat << '_EOF_' > DEF
# zebrafish (danRer3) vs. Fugu (fr1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET - zebrafish (danRer3)
# soft-masked chroms, 1-25 and M
SEQ1_DIR=/iscratch/i/danRer3/chromNib
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_IN_CONTIGS=0
# 10 MB chunk for target
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY - Fugu (fr1)
# soft-masked scaffolds in 2bit format
SEQ2_DIR=/iscratch/i/fr1/UnScaffolds/fr1UnScaffolds.2bit
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
# 10 Mbase for query
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/danRer3/bed/blastz.fr1
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
cp /cluster/data/danRer3/chrom.sizes ./S1.len
# make S2.len for fr1 scaffolds
twoBitInfo /cluster/data/fr1/fr1UnScaffolds.2bit ./S2.len
wc -l *.len
# 28 S1.len
# 20379 S2.len
# make output directory
mkdir -p /cluster/bluearc/danRer3vsfr1Out
# do blastz and create chains for fr1 scaffolds on danRer3 chr1-25 and chrM
# chickenHumanTuned.gap scoring matrix is now used by default
# by axtChain.
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
-blastzOutRoot /cluster/bluearc/danRer3vsfr1Out -chainMinScore=5000 \
-stop chainMerge >& do.log &
# Start: Aug 13 10:48
# Finish: Aug 13 13:35
# then run the danRer3 NA and Un scaffolds against fugu scaffolds
mkdir NAandUnScaffolds
cd NAandUnScaffolds
cat << '_EOF_' > DEF
# zebrafish (danRer3) vs. Fugu (fr1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0
# TARGET - zebrafish (danRer3)
# soft-masked scaffolds for chrNA and chrUn in 2 bit format
SEQ1_DIR=/iscratch/i/danRer3/NAandUnScafs/danRer3NAandUnScaf.2bit
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_IN_CONTIGS=0
# 10 MB chunk for target
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY - Fugu (fr1)
# soft-masked scaffolds in 2bit format
SEQ2_DIR=/iscratch/i/fr1/UnScaffolds/fr1UnScaffolds.2bit
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
# 10 Mbase for query
SEQ2_CHUNK=10000000
SEQ2_LAP=0
BASE=/cluster/data/danRer3/bed/blastz.fr1/NAandUnScaffolds
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
twoBitInfo /cluster/data/danRer3/danRer3NAandUnScaf.2bit ./S1.len
# make S2.len for fr1 scaffolds
twoBitInfo /cluster/data/fr1/fr1UnScaffolds.2bit ./S2.len
wc -l *.len
# 14941 S1.len
# 20379 S2.len
# make output directory
mkdir -p /cluster/bluearc/danRer3vsfr1Out/NAandUnScaffolds
# do blastz and create chains for fr1 scaffolds on danRer3
# chrNA and chrUn scaffolds
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
-blastzOutRoot /cluster/bluearc/danRer3vsfr1Out/NAandUnScaffolds \
-chainMinScore=5000 -stop chainMerge >& do.log &
# Start: Aug 13 14:05
# Finish: Aug 14 20:58
    # The log file says it is finished. chainSplit was not run because SEQ1
    # has 100 or more sequences. Need to do liftUp before running chainSplit.
cd /cluster/data/danRer3/bed/blastz.fr1/NAandUnScaffolds/axtChain/run
# Lifting up chains:
# need to lift these chains up to chrom level for Fugu for chrom run and
# for danRer3 and Fugu for the NA and Un scaffolds run.
# first for Fugu in the danRer3 chrom run
ssh kkstore02
cd /cluster/data/danRer3/bed/blastz.fr1/axtChain
mkdir liftedChain
foreach f (chain/*.chain)
set c=$f:t:r
echo $c
liftUp -chainQ liftedChain/${c}.lifted.chain \
/cluster/data/fr1/Un/lift/ordered.lft warn $f
end
# lift up for danRer3 scaffolds run.
ssh kkstore02
cd /cluster/data/danRer3/bed/blastz.fr1/NAandUnScaffolds/axtChain
    # first lift the Fugu fr1 query; there are no split chains here because
    # the target had 100 or more sequences.
zcat danRer3.fr1.all.chain.gz | liftUp -chainQ danRer3.fr1.liftedQall.chain \
/cluster/data/fr1/Un/lift/ordered.lft warn stdin
# then liftUp target coords for danRer3
liftUp danRer3.fr1.liftedQandTall.chain \
/cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
warn danRer3.fr1.liftedQall.chain
# gzip lifted danRer3.fr1 chain file
gzip danRer3.fr1.liftedQandTall.chain
# merge the chains from the danRer3 chrom run and the danRer3
# NA and Un scaffolds run. chains are sorted by score and IDs are uniqued.
cd /cluster/data/danRer3/bed/blastz.fr1/axtChain
mv danRer3.fr1.all.chain.gz danRer3.fr1.chroms.chain.gz
set blastz=/cluster/data/danRer3/bed/blastz.fr1
# copy over lifted chains for danRer3 scaffolds vs fr1
cp $blastz/NAandUnScaffolds/axtChain/danRer3.fr1.liftedQandTall.chain.gz \
./liftedChain
gunzip ./liftedChain/*.gz
nice chainMergeSort liftedChain/*.chain \
| nice gzip -c > danRer3.fr1.all.chain.gz
# then split up into chains again
mv chain chromChain
mkdir chain
nice zcat danRer3.fr1.all.chain.gz | chainSplit chain stdin
# then pick up the doBlastzChainNet.pl script at the net step.
ssh kkstore02
cd /cluster/data/danRer3/bed/blastz.fr1
cp DEF DEF.chroms
    # edit DEF file to use the nib files for all of danRer3 and the
    # nib file for the chrUn of Fugu fr1. Since all the coords have now
    # been lifted to chrom level, these are what is needed.
# SEQ1_DIR=/iscratch/i/danRer3/nib
# SEQ2_DIR=/cluster/bluearc/fugu/fr1/chromNib
# use kkr1u00 for computationally intensive steps as kolossus is down.
# need to create new S2.len for whole chrUn for Fugu
mv S2.len S2.scaffolds.len
cp /cluster/data/fr1/chrom.sizes S2.len
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
-blastzOutRoot /cluster/bluearc/danRer3vsfr1Out -chainMinScore=5000 \
-workhorse kkr1u00 -continue net >& doNet.log &
# crashed at cleanup step when trying to access kkstore02
# The authenticity of host 'kkstore02 (128.114.50.155)' can't be
# established. Re-run from this step.
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
-blastzOutRoot /cluster/bluearc/danRer3vsfr1Out -chainMinScore=5000 \
-workhorse kkr1u00 -continue cleanup >& doNet2.log &
# netToAxt was processing nets incorrectly so remake these with
# new version of netToAxt.
# and transfer to downloads dir.
ssh kkstore02
cd /cluster/data/danRer3/bed/blastz.fr1
rm -r axtNet
# Make axtNet for download: one .axt per danRer3 seq.
# remake noClass.net
# Make nets("noClass", i.e. without rmsk/class stats which are added later):
cd axtChain
chainPreNet danRer3.fr1.all.chain.gz \
/cluster/data/danRer3/bed/blastz.fr1/S1.len /cluster/data/danRer3/bed/blastz.fr1/S2.len stdout \
| chainNet stdin -minSpace=1 /cluster/data/danRer3/bed/blastz.fr1/S1.len \
/cluster/data/danRer3/bed/blastz.fr1/S2.len stdout /dev/null \
| netSyntenic stdin noClass.net
# create net for each chrom again
netSplit noClass.net net
# also split up chains again
mkdir chain
zcat danRer3.fr1.all.chain.gz | chainSplit chain stdin
ssh hgwdev
cd /cluster/data/danRer3/bed/blastz.fr1
mkdir axtNet
foreach f (axtChain/net/*.net)
netToAxt $f axtChain/chain/$f:t:r.chain \
/cluster/bluearc/danRer3/nib /cluster/bluearc/fugu/fr1/chromNib stdout \
| axtSort stdin stdout \
| gzip -c > axtNet/$f:t:r.danRer3.fr1.net.axt.gz
end
# cleanup
ssh kkstore02
cd /cluster/data/danRer3/bed/blastz.fr1/axtChain
rm noClass.net
rm -r net
rm -r chain
# remake mafNet from the new axtNet
cd /cluster/data/danRer3/bed/blastz.fr1
rm -r mafNet
mkdir mafNet
foreach f (axtNet/*.danRer3.fr1.net.axt.gz)
axtToMaf -tPrefix=danRer3. -qPrefix=fr1. $f \
/cluster/data/danRer3/bed/blastz.fr1/S1.len /cluster/data/danRer3/bed/blastz.fr1/S2.len \
stdout \
| gzip -c > mafNet/$f:t:r:r:r:r:r.maf.gz
end
# copy the new axtNet files to downloads and replace old ones
ssh hgwdev
rm -r /usr/local/apache/htdocs/goldenPath/danRer3/vsFr1/axtNet
cd /usr/local/apache/htdocs/goldenPath/danRer3/vsFr1
mkdir -p /usr/local/apache/htdocs/goldenPath/danRer3/vsFr1/axtNet
ln -s /cluster/data/danRer3/bed/blastz.fr1/axtNet/*.axt.gz axtNet/
# remake md5sum.txt
rm md5sum.txt
md5sum *.gz */*.gz > md5sum.txt
# Check README in downloads section and add a note about how the
# unordered chroms were split up into scaffolds.
# Add trackDb entry for chain and net tracks to
# trackDb/zebrafish/danRer3/trackDb.ra
# Do swap to get danRer3 chains on Fugu, fr1 - see makeFr1.doc
# featureBits -chrom=chr2 danRer3 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.705%, chainFr1Link 8.960%, both 0.645%, cover 91.53%,
# enrich 10.22x
# featureBits -chrom=chr2 danRer2 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.739%, chainFr1Link 4.537%, both 0.620%, cover 83.90%,
# enrich 18.49x
# featureBits -chrom=chrNA danRer3 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.449%, chainFr1Link 7.129%, both 0.399%, cover 88.78%,
# enrich 12.45x
# featureBits -chrom=chrNA danRer2 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.499%, chainFr1Link 3.901%, both 0.409%, cover 81.90%,
# enrich 20.99x
# Run directory files are already on /cluster/data. Remake downloads
# for fugu alignments since these have been removed from
# the downloads directory. (hartera, 2005-11-17)
ssh hgwdev
# remake downloads using doBlastzChainNet.pl script
cd /cluster/data/danRer3/bed/blastz.fr1
nice /cluster/bin/scripts/doBlastzChainNet.pl \
-continue download -stop download `pwd`/DEF >& doDownload.log &
# Check README in downloads section and add a note about how the
# unordered chroms were split up into scaffolds.
# VEGA
# get transcripts in transcripts_coords from e-mail from Mario Caccamo
# at Sanger 06/16/05.
# also README for Vega
ssh kkstore01
mkdir -p /cluster/data/danRer3/bed/vegaGene
cd /cluster/data/danRer3/bed/vegaGene
# AUTO UPDATE GENBANK MRNA AND EST AND MGC GENES RUN (DONE, 2005-08-22, markd)
# align with revised genbank process
cd ~kent/src/hg/makeDb/genbank
cvs update -d etc
# edit etc/genbank.conf to add danRer3, had to run on pk, due to kk
# being down. Set temporary locations for server files
# danRer3 (zebrafish)
# Lift file partitions unplaced sequence pseudo-chroms (disabled)
danRer3.serverGenome = /cluster/data/danRer3/danRer3.2bit
##danRer3.clusterGenome = /iscratch/i/danRer3/danRer3.2bit
##danRer3.ooc = /iscratch/i/danRer3/danRer3_11.ooc
danRer3.clusterGenome = /san/sanvol1/scratch/danRer3/danRer3.2bit
danRer3.ooc = /san/sanvol1/scratch/danRer3/danRer3_11.ooc
##danRer3.align.unplacedChroms = chrNA chrUn
##danRer3.lift = /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft
danRer3.lift = no
danRer3.downloadDir = danRer3
danRer3.mgcTables.default = full
danRer3.mgcTables.mgc = all
# update /cluster/data/genbank/
make etc-update
ssh kkstore02
cd /cluster/data/genbank
nice bin/gbAlignStep -initial danRer3 &
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
nice ./bin/gbDbLoadStep -drop -initialLoad danRer3&
# enable daily alignment and update of hgwdev
cd ~kent/src/makeDb/genbank
cvs update -d etc
# add danRer3 to:
etc/align.dbs
etc/hgwdev.dbs
cvs commit
make etc-update
# TIGR GENE INDEX (DONE, 2005-08-24, hartera)
# Data from Razvan Sultana (rsultana@jimmy.harvard.edu or rsultana@tigr.org)
# Includes data for chr1-25 and chrM, NOT chrNA and chrUn. Asked for these
# on scaffolds and not on the virtual chroms - harder to generate.
ssh kkstore02
mkdir -p /cluster/data/danRer3/bed/tigr
cd /cluster/data/danRer3/bed/tigr
wget --timestamping \
ftp://ftp.tigr.org/pub/data/tgi/Danio_rerio/TGI_track_danRer3_chr1-25.tgz
tar xvzf TGI*.tgz
# this is data for just chr1-25 and chrM. Data for NA and Un are to follow.
ls chr1_*
# chr1_drosophTCs chr1_g_gallusTCs chr1_mouseTCs chr1_zfishTCs
# chr1_elegansTCs chr1_humanTCs chr1_ratTCs
# so species are fly, chicken, mouse, zebrafish, C. elegans, human and rat
foreach f (*g_gallus*)
set f1 = `echo $f | sed -e 's/g_gallus/chicken/g'`
mv $f $f1
end
foreach f (*drosoph*)
set f1 = `echo $f | sed -e 's/drosoph/Dmelano/g'`
mv $f $f1
end
foreach o (Dmelano chicken elegans human mouse rat zfish)
echo $o
setenv O $o
foreach f (chr*_$o*s)
tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
end
end
ssh hgwdev
cd /cluster/data/danRer3/bed/tigr
hgsql danRer3 -e "drop table tigrGeneIndex"
nice ldHgGene -exon=TC danRer3 tigrGeneIndex *.gff
# Read 75388 transcripts in 288032 lines in 182 files
# 75388 groups 26 seqs 1 sources 1 feature types
# 75388 gene predictions
checkTableCoords danRer3 tigrGeneIndex
/cluster/bin/scripts/runGeneCheck /cluster/data/danRer3/bed/tigr
# no CDS in these gene predictions so fix this:
hgsql danRer3 -e "update tigrGeneIndex set cdsStart = txStart;"
hgsql danRer3 -e "update tigrGeneIndex set cdsEnd = txEnd;"
# compress all files
gzip chr*
# MAKE Human Proteins track (DONE 2005-09-21 braney)
ssh kkstore02
mkdir -p /cluster/data/danRer3/blastDb
cd /cluster/data/danRer3/blastDb
cut -f 1 ../chrom.sizes | sed "s/chr//" | sed "/NA/d" | sed "/Un/d" > chrom.list
for i in `cat chrom.list`; do ls -1 ../$i/*/*.fa . ; done | sed -n "/.*_.*_.*_.*/p" > list
ln -s `cat list` .
for i in *.fa
do
/projects/compbio/bin/i686/formatdb -i $i -p F
done
rm *.log *.fa list
cd ..
for i in `cat blastDb/chrom.list`; do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft
rm blastDb/chrom.list
mkdir /cluster/data/danRer3/scaffoldBlastDb
cd /cluster/data/danRer3/scaffoldBlastDb
cat ../Un/scaffoldsSoftMask/*.fa ../NA/scaffoldsSoftMask/*.fa | faSplit sequence stdin 500 scaf
for i in *.fa
do
/projects/compbio/bin/i686/formatdb -i $i -p F
done
rm *.log *.fa
mkdir -p /san/sanvol1/scratch/danRer3/comboBlastDb
cd /cluster/data/danRer3/blastDb
for i in nhr nin nsq; do cp *.$i /san/sanvol1/scratch/danRer3/comboBlastDb; done
cd /cluster/data/danRer3/scaffoldBlastDb
for i in nhr nin nsq; do cp *.$i /san/sanvol1/scratch/danRer3/comboBlastDb; done
mkdir -p /cluster/data/danRer3/bed/tblastn.hg17KG
cd /cluster/data/danRer3/bed/tblastn.hg17KG
echo /san/sanvol1/scratch/danRer3/comboBlastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
# we want around 250000 jobs
calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\)
# 37365/(250000/3539) = 528.938940
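    # i.e. lines per chunk = pslLines * queryFiles / targetJobs
    # = 37365 * 3539 / 250000 ~= 529, hence the split -l 529 below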
mkdir -p /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa
split -l 529 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa/kg
ln -s /cluster/bluearc/danRer2/bed/tblastn.hg17KG/kgfa kgfa
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/danRer2/bed/tblastn.hg17KG/blastOut
ln -s /cluster/bluearc/danRer2/bed/tblastn.hg17KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
    # try successively stricter e-values until a blastall run succeeds
    # (each failed run is retried at the next, stricter cutoff)
    for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
    # lift target coords: blastDb pieces -> contigs, then contigs -> chroms
    liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/subChr.lft carry $f.2
    liftUp -nosort -type=".psl" -nohead $f.4 ../../jkStuff/liftAll.lft carry $f.3
    # lift the query (protein) side using the hg17 known genes protein lift
    liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.4
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
ssh kk
cd /cluster/data/danRer3/bed/tblastn.hg17KG
para create blastSpec
para push
# Completed: 203170 of 203170 jobs
# CPU time in finished jobs: 17875092s 297918.20m 4965.30h 206.89d 0.567 y
# IO & Wait Time: 4092508s 68208.46m 1136.81h 47.37d 0.130 y
# Average job time: 108s 1.80m 0.03h 0.00d
# Longest finished job: 1778s 29.63m 0.49h 0.02d
# Submission to last job: 64970s 1082.83m 18.05h 0.75d
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
para create chainSpec
para push
# Completed: 71 of 71 jobs
# CPU time in finished jobs: 89115s 1485.25m 24.75h 1.03d 0.003 y
# IO & Wait Time: 35631s 593.85m 9.90h 0.41d 0.001 y
# Average job time: 1757s 29.28m 0.49h 0.02d
# Longest finished job: 15587s 259.78m 4.33h 0.18d
# Submission to last job: 23380s 389.67m 6.49h 0.27d
ssh kkstore02
cd /cluster/data/danRer3/bed/tblastn.hg17KG/blastOut
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
liftUp -nohead -type=.psl stdout /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft carry u.*.psl m60* | \
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > /cluster/data/danRer3/bed/tblastn.hg17KG/blastHg17KG.psl
ssh hgwdev
cd /cluster/data/danRer3/bed/tblastn.hg17KG
hgLoadPsl danRer3 blastHg17KG.psl
# 21063005 bases of 1630323462 (1.292%) in intersection
# back to kkstore02
rm -rf blastOut
# End tblastn
# BACENDS TRACK (DONE, 2005-09-28, hartera)
# Track display is very slow on large regions. Split all_bacends table by
# chromosome (DONE, 2006-04-19, hartera)
# REDO BACENDS FOR PAIRS, SINGLES, BAD PAIRS AND ALL BACENDS TABLES
# (see separate section on REDO BACENDS, 2006-05-01 - 2006-05-08, hartera)
ssh kkstore01
# BAC ends sequence files provided by Mario Caccamo at Sanger
# mc2@sanger.ac.uk
mkdir -p /cluster/data/danRer3/bed/bacends
cd /cluster/data/danRer3/bed/bacends
wget --timestamp ftp://ftp.sanger.ac.uk/pub/mc2/zf_bacends.fa.gz
wget --timestamp ftp://ftp.sanger.ac.uk/pub/mc2/DH_bacends.fa.gz
wget --timestamp ftp://ftp.sanger.ac.uk/pub/mc2/bacend_placement.txt.gz
gunzip *.gz
# DH_bacends.fa are from the new library from a doubled haploid zebrafish
# zf_bacends.fa are from the existing libraries used in danRer2 and danRer1
# Several reads are present for some of the BAC ends and these have
# names like p1kaSP6 or q1kaT7 for duplicated reads and p1kSP6w or q1kT7w
# for multiple reads. In the trace repository, the most recent sequence
# is stored and the 'a' or 'w' is dropped from the name.
# for the DH_bacends.fa from the CHORI73 library, the names are
# experiment file name trace_name
# ======================== ================
# CHORI73_139g06.p1kSP6 CHORI73_139G6SP6
# CHORI73_165b21.q1kT7 CHORI73_165B21T7
# The trace name is that stored in the trace archive with leading zeros
# dropped and ".p1k" or ".q1k" and lower case changed to upper.
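    # purely to illustrate that mapping (the real conversion is done further
    # below with tr and perl; this one-liner was not part of the build):
    # echo CHORI73_139g06.p1kSP6 | tr '[a-z]' '[A-Z]' \
    #   | sed -e 's/\.\(P1K\|Q1K\)//' -e 's/\(CHORI73_[0-9]*[A-Z]\)0*/\1/'
    # CHORI73_139G6SP6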
ssh kkstore02
cd /cluster/data/danRer3/bed/bacends
# check list of prefixes in zf_bacends.fa
grep '>' zf_bacends.fa > zf.names
perl -pi.bak -e 's/>//' zf.names
perl -pi.bak -e 's/^([A-Za-z]+)[0-9]+.+/$1/' zf.names
sort -u zf.names
# bZ
# zC
# zK
# zKp
# in DH_bacends.fa, all are CHORI73_
# For DH_bacends.fa, need to clean up, change names to Trace archive
# format as above. Then choose most recent sequence, those that are bad
# with lots of Ns should be removed at the alignment stage as they will
# not pass the Blat or pslReps criteria.
# cat zf_bacends.fa DH_bacends.fa >> Zv5Bacends.fa
# faSize Zv5Bacends.fa
# 680121953 bases (11160014 N's 668961939 real 668961939 upper 0 lower)
# in 729101 sequences in 1 files
# Total size: mean 932.8 sd 242.6 min 26 (CHORI73_189m04.p1kSP6)
# max 5717 (CHORI73_255a17.q1kT7) median 882
# N count: mean 15.3 sd 75.7
# U count: mean 917.5 sd 242.2
# L count: mean 0.0 sd 0.0
wc -l *.fa
# 6412741 DH_bacends.fa
# 14700258 Zv5Bacends.fa
# 8287517 zf_bacends.fa
grep '>' DH_bacends.fa | wc -l
# 304252
grep '>' zf_bacends.fa | wc -l
# 424849
# for DH_bacends.fa there are replicate reads. If duplicate plates
# have been made (i.e. read names like ..p1kaSP6 or ..q1kaT7) or plates
# have been sequenced multiple times (i.e. read names like ..p1kSP6w or
# ..q1kT7w), the Sanger trace repository has the most recent read and
# dropped the 'a' or 'w' from the trace name.
# some are not in the repository. They had bad quality reads with a lot
# of Ns or runs of the same base. These should be dropped in the
# alignment filtering.
# now download sequence files from Sanger ftp site - these are the
# ones from the Sanger sequence repository
ssh kkstore02
mkdir -p /cluster/data/danRer3/bed/bacends/seqs
cd /cluster/data/danRer3/bed/bacends/seqs
# get contents of ftp directory
wget --timestamp \
ftp://ftp.ensembl.org/pub/traces/danio_rerio/fasta/
# from index.html, grep lines with cloneEnd
grep "cloneEnd" index.html > cloneEnds
awk 'BEGIN {FS="\""} {print "wget --timestamp",$2;}' cloneEnds \
> getCloneEnds.csh
chmod +x getCloneEnds.csh
cat getCloneEnds.csh
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1025270298.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1025273988.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1025278580.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1035416745.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1035417824.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1040215846.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/mpgeb-zfish-cloneEnd-1048006071.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1114727127.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115222417.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115226483.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115230498.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115234585.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115238038.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-CHORI-73-cloneEnd-1115240957.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-cloneEnd-1039514906.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-cloneEnd-1039603426.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-cloneEnd-1039604741.fasta.gz
wget --timestamp ftp://ftp.ensembl.org:21/pub/traces/danio_rerio/fasta/sanger-zfish-cloneEnd-1040231265.fasta.gz
nice getCloneEnds.csh >& bac.log &
# check log to see everything downloaded ok and then remove
rm bac.log index.html
# unzip files
gunzip *.gz
# cat together CHORI73 FASTA files
cat sanger-zfish-CHORI*.fasta > CHORI73_bacends.fa
grep '>' CHORI73_bacends.fa > CHORI73.names
perl -pi.bak -e 's/>//' CHORI73.names
sort CHORI73.names | uniq > CHORI73.names.sort
wc -l CHORI73.names.sort
# 265235 CHORI73.names.sort
cat sanger-zfish-cloneEnd*.fasta > zfish_bacends.fa
cat mpgeb-zfish-cloneEnd*.fasta > zfishmpgeb_bacends.fa
grep '>' zfish_bacends.fa | wc -l
# 164302
grep '>' zfishmpgeb_bacends.fa | wc -l
# 264633
cp CHORI73.names.sort /cluster/data/danRer3/bed/bacends/
    # compared this list to the sequence names for zf_bacends.fa; there are
    # more sequences in zf_bacends.fa - checked and some are in the trace
    # repository and some are not.
    # for CHORI73 there are 394 extra sequences in the downloaded file
    # and over 7000 extra in the original file sent by Mario. Just use the
    # original file here as bad sequences will probably be filtered out at
    # the alignment stage. Get the list of clones for which there are more
    # than 2 end sequences; some ends have multiple reads.
cd /cluster/data/danRer3/bed/bacends
# look at file of CHORI73_ sequences sent by Mario at Sanger:
grep '>' DH_bacends.fa > DH.names
perl -pi.bak -e 's/>//' DH.names
perl -pi.bak -e 's/(CHORI73_[0-9a-z]+)\.[a-z0-9]+.+/$1/' DH.names
sort DH.names | uniq -c | sort -nr > DH.names.counts
awk '{if ($1 > 2) print $2;}' DH.names.counts > DH.names.morethan2
# translate to upper case and remove leading zeros
cat DH.names.morethan2 | tr '[a-z]' '[A-Z]' > DH.names.morethan2.upper
# remove leading 0
perl -pi.bak -e 's/(CHORI73_[0-9]+[A-Z])0([0-9]+)/$1$2/' \
DH.names.morethan2.upper
sort DH.names.morethan2.upper | uniq > DH.names.morethan2.upper.sort
wc -l *.sort
# 265235 CHORI73.names.sort
# 6020 DH.names.morethan2.upper.sort
comm -12 CHORI73.names.sort DH.names.morethan2.upper.sort | wc
# 5299
# so 721 are not in this list so they are probably not in the repository
# but align these anyway.
# for those that are then use the versions in CHORI73.names
comm -12 CHORI73.names.sort DH.names.morethan2.upper.sort \
> CHORI73.names.touse
comm -13 CHORI73.names.sort DH.names.morethan2.upper.sort \
> DHmorethan2.DHonly
awk '{if ($1 <= 2) print $2;}' DH.names.counts > DH.names.2orless
# this is list of sequences to get from DH_bacends.fa
# need to back translate the list in DHmorethan2.DHonly
cat DHmorethan2.DHonly | tr '[A-Z]' '[a-z]' > DHtmp
sed -e 's/chori/CHORI/' DHtmp > DHmorethan2.DHonly.format
# need to put leading zeros back and "." at the end to help
# pattern matching with grep.
cat << '_EOF_' > addZeros.pl
#!/usr/bin/perl -w
use strict;
my ($file);
$file = $ARGV[0];
open (FILE, $file) || die "Can not open $file: $!\n";
while (<FILE>)
{
chomp;
my ($l,$id);
$l = $_;
if ($l =~ /^CHORI73_[0-9]+[a-z][0-9]{2,}/)
{
print "$l\\.\n";
}
elsif($l =~ /^(CHORI73_[0-9]+[a-z])([0-9]{1})/)
{
$id = $1 . "0" . $2 . "\\.";
print "$id\n";
}
}
close FILE;
'_EOF_'
chmod +x addZeros.pl
perl addZeros.pl DHmorethan2.DHonly.format > DHmorethan2.DHonly.format2
wc -l DHmorethan2.DHonly*
# 721 DHmorethan2.DHonly
# 721 DHmorethan2.DHonly.format
# 721 DHmorethan2.DHonly.format2
# need to get full sequence names
grep '>' DH_bacends.fa > DHBacs.fullnames
perl -pi.bak -e 's/>//' DHBacs.fullnames
perl -pi.bak -e 's/(CHORI73_[0-9a-z]+\.[a-z0-9A-Z]+) bases.+/$1/' \
DHBacs.fullnames
grep -f DHmorethan2.DHonly.format2 DHBacs.fullnames \
> DHmorethan2.DHonly.fullnames
wc -l DHmorethan2.DHonly.fullnames
# 2352 DHmorethan2.DHonly.fullnames
sort DHmorethan2.DHonly.fullnames > DHmorethan2.DHonly.fullnames.sort
# do for those with less than 2 sequences to get the full names
cat << '_EOF_' > getFullNames.pl
#!/usr/bin/perl -w
use strict;
my ($file, $patterns, %idsHash);
$file = $ARGV[0];
$patterns = $ARGV[1];
open (FILE, $file) || die "Can not open $file: $!\n";
open (PATTERNS, $patterns) || die "Can not open $patterns: $!\n";
while (<FILE>)
{
chomp;
my ($l, $pref, $dir);
$l = $_;
if ($l =~ /^(CHORI73_[0-9a-z]+)\./)
{
$pref = $1;
push(@{$idsHash{$pref}}, $l);
}
}
close FILE;
while (<PATTERNS>)
{
my ($line, @ids, $i);
chomp;
$line = $_;
if (exists($idsHash{$line}))
{
@ids = @{$idsHash{$line}};
foreach $i (@ids)
{
print "$i\n";
}
}
}
close PATTERNS;
'_EOF_'
chmod +x getFullNames.pl
perl getFullNames.pl DHBacs.fullnames DH.names.2orless \
> DH.fullnames.2orless
# do the same for CHORI73.names.touse to get full names
awk '{print $1"SP6"}' CHORI73.names.touse > CHORI73.namesSP6.touse
awk '{print $1"T7"}' CHORI73.names.touse > CHORI73.namesT7.touse
cat CHORI73.namesSP6.touse CHORI73.namesT7.touse \
> CHORI73.namesSP6andT7.touse
wc -l CHORI73.names*
# 265235 CHORI73.names.sort
# 10598 CHORI73.namesSP6andT7.touse
# 5299 CHORI73.namesSP6.touse
# 5299 CHORI73.namesT7.touse
# 5299 CHORI73.names.touse
grep '>' CHORI73_bacends.fa > CHORI73.fullnames
perl -pi.bak -e 's/>//' CHORI73.fullnames
grep -f CHORI73.namesSP6andT7.touse CHORI73.fullnames \
> CHORI73.fullnames.touse
# so get all the sequence records together in one file
ssh kkstore02
cd /cluster/data/danRer3/bed/bacends
mkdir bacSeqs
    # get all sequences from DH_bacends.fa where the clone has 2 or fewer
    # end reads. This might include cases where there are duplicate reads for
    # one end only, but these will go into the singles track anyway.
faSomeRecords DH_bacends.fa DH.fullnames.2orless ./bacSeqs/DHBacs.2orless.fa
# get all sequences with more than 2 sequences for that clone but
# with no sequence in the new downloaded BAC ends sequence file that
# has only one sequence for each BAC end.
faSomeRecords DH_bacends.fa DHmorethan2.DHonly.fullnames.sort \
./bacSeqs/DHBacs.2ormore.orig.fa
    # get all sequences for BAC ends where there are more than 2 reads for
    # the ends of one clone, so there are replicate reads for at least one end.
# use the sequence in the downloaded CHORI73 set of clone ends for these.
faSomeRecords CHORI73_bacends.fa CHORI73.fullnames.touse \
./bacSeqs/CHORI73.fromDH.morethan2.fa
cd bacSeqs
# translate to upper case and remove leading zeros
cat DHBacs.2orless.fa | tr '[a-z]' '[A-Z]' > DHBacs.2orless.format.fa
cat DHBacs.2ormore.orig.fa | tr '[a-z]' '[A-Z]' \
> DHBacs.2ormore.orig.format.fa
# remove leading 0 and just use name as FASTA header
# need to leave in a or w as in p1kaSP6 or q1kaT7 or p1kSP6w or q1kT7w
# these will distinguish replicate reads from the same sequence and will
# be removed later when the best alignment is selected.
perl -pi.bak -e \
's/(CHORI73_[0-9]+[A-Z]{1})0?([0-9]+)\.(P1K|Q1K)(ASP6|SP6|SP6W|AT7|T7|T7W) BASES.+/$1$2$4/' \
DHBacs*format.fa
cat CHORI73.*.fa DHBacs*.format.fa > CHORI73BACends.fa
grep '>' CHORI73BACends.fa | wc -l
# 295722
# then combine these with the zf_bacends.fa from Sanger which contain
# the rest of the BAC end sequences.
cat ../zf_bacends.fa CHORI73BACends.fa > Zv5BACends.fa
grep '>' Zv5BACends.fa | wc -l
# 720571
faSize Zv5BACends.fa
# 674252474 bases (10674972 N's 663577502 real 663577502 upper 0 lower) in
# 720571 sequences in 1 files Total size: mean 935.7 sd 239.8
# min 26 (CHORI73_189M4SP6) max 5403 (zC259G13.zb) median 882
# N count: mean 14.8 sd 72.4
# U count: mean 920.9 sd 239.6
# L count: mean 0.0 sd 0.0
# check Zv5BACends.fa has unique sequence names
grep '>' Zv5BACends.fa | sed 's/>//' > names
sort names | uniq -c | sort -nr > names.count
# all unique names so cleanup
rm names names.count *.bak
# Now the BAC end sequences file has been made, align the sequences
# to danRer3 using Blat.
ssh pk
# problems running these on kk using input from bluearc - slowed down
# kkstore02 with heavy load. So move everything to the san as it
# scales better than the bluearc especially from the pk. run directory
# is on san also.
cd /cluster/data/danRer3/bed/bacends/bacSeqs
# first split up bacends sequence and add to directory on the san
mkdir -p /san/sanvol1/scratch/danRer3/bacends/Zv5bacends
# split up sequence for cluster runs
faSplit sequence Zv5BACends.fa 20 \
/san/sanvol1/scratch/danRer3/bacends/Zv5bacends/bacends
# get all the chrom contig files onto the san
mkdir -p /san/sanvol1/scratch/danRer3/trfFaChroms
rsync -a --progress /cluster/bluearc/danRer3/trfFa/chr[0-9M]*.fa \
/san/sanvol1/scratch/danRer3/trfFaChroms/
cd /cluster/data/danRer3/bed/bacends
mkdir -p /san/sanvol1/scratch/danRer3/bacends/chromsRun
ln -s /san/sanvol1/scratch/danRer3/bacends/chromsRun
# make directory for output, do not have output going to /cluster/data dir
# as it is very large.
mkdir -p /san/sanvol1/scratch/danRer3/bacends/chromsPsl
ln -s /san/sanvol1/scratch/danRer3/bacends/chromsPsl
# also copy over the 11.ooc file for danRer3 if not there already
cp -p /cluster/bluearc/danRer3/danRer3_11.ooc \
/san/sanvol1/scratch/danRer3/
# make input file lists
cd /cluster/data/danRer3/bed/bacends/chromsRun
ls -1S /san/sanvol1/scratch/danRer3/bacends/Zv5bacends/*.fa > bacends.lst
# do blat just for chr1-25 and chrM
ls -1S /san/sanvol1/scratch/danRer3/trfFaChroms/*.fa > seqs.lst
# 64 bit blat used for pk. This version of blat recently had a bug fix
# so should give the same result as i386 blat on kk. use absolute path for
# output dir rather than symlink as that would increase I/O.
# use Blat parameters as for mm5 and hg17
cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/danRer3/danRer3_11.ooc {check out line+ /san/sanvol1/scratch/danRer3/bacends/chromsPsl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
/cluster/bin/i386/gensub2 seqs.lst bacends.lst template jobList
/cluster/bin/i386/para create jobList
/cluster/bin/i386/para try, check, push, check, ...
# /cluster/bin/i386/para time
# Completed: 4160 of 4160 jobs
# CPU time in finished jobs: 746878s 12447.96m 207.47h 8.64d 0.024 y
# IO & Wait Time: 11166s 186.11m 3.10h 0.13d 0.000 y
# Average job time: 182s 3.04m 0.05h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 915s 15.25m 0.25h 0.01d
# Submission to last job: 5100s 85.00m 1.42h 0.06d
# run jobs to do blat of NA and Un scaffolds vs BAC end sequences
ssh pk
# copy scaffolds to the san
mkdir -p /san/sanvol1/scratch/danRer3/scaffoldsSoftMask
foreach f (/cluster/bluearc/scratch/danRer3/scaffoldsSoftMask/Zv5_*.fa)
rsync -a --progress $f /san/sanvol1/scratch/danRer3/scaffoldsSoftMask/
end
cd /cluster/data/danRer3/bed/bacends
mkdir -p /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnRun
ln -s /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnRun
# make directory for output, do not have output going to /cluster/data dir
# as it is very large.
mkdir -p /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnPsl
ln -s /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnPsl
# make input file lists
cd /cluster/data/danRer3/bed/bacends/scaffoldsNAandUnRun
ls -1S /san/sanvol1/scratch/danRer3/bacends/Zv5bacends/*.fa > bacends.lst
# do blat just for NA and Un scaffolds
foreach f (/san/sanvol1/scratch/danRer3/scaffoldsSoftMask/Zv5_*.fa)
echo $f >> scafs.lst
end
# 64 bit blat used for pk. This version of blat recently had a bug fix
# so should give the same result as i386 blat on kk. use absolute path for
    # output dir rather than symlink as that would increase I/O.
    # use Blat parameters as for mm5 and hg17
cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/danRer3/danRer3_11.ooc {check out line+ /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnPsl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
/cluster/bin/i386/gensub2 scafs.lst bacends.lst template jobList
/cluster/bin/i386/para create jobList
/cluster/bin/i386/para try, check, push, check, ...
# para time
# Completed: 298820 of 298820 jobs
# CPU time in finished jobs: 1232495s 20541.58m 342.36h 14.26d 0.039 y
# IO & Wait Time: 923511s 15391.85m 256.53h 10.69d 0.029 y
# Average job time: 7s 0.12m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1008s 16.80m 0.28h 0.01d
# Submission to last job: 37494s 624.90m 10.41h 0.43d
ssh kolossus
cd /cluster/data/danRer3/bed/bacends
# need to sort psl files, filter and liftUp
# first do the chr1-25 and chrM alignments
nice pslSort dirs rawChroms.psl tmp chromsPsl >& chromSort.log
    # Time taken: 2 hours 42 minutes
pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
rawChroms.psl bacEndsChroms.psl /dev/null >& pslRepsChroms.log
# Took 19 minutes
# then lift up NA and Un scaffolds to chrom level
nice pslSort dirs rawNAandUn.psl tmp scaffoldsNAandUnPsl \
>& scafsNAandUnSort.log
# took 1 hour 50 minutes
pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
rawNAandUn.psl bacNAandUnScafs.psl /dev/null >& pslRepsNAandUn.log
# took 18 minutes
# lift results:
liftUp bacEnds.liftedChroms.psl /cluster/data/danRer3/jkStuff/liftAll.lft \
warn bacEndsChroms.psl
liftUp bacEnds.liftedNAandUn.psl \
/cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
warn bacNAandUnScafs.psl
# sort and merge these files
mkdir liftedPsl
mv *.lifted*.psl ./liftedPsl/
nice pslSort dirs bacEnds.psl tmp1 liftedPsl >& pslSortAll.log
# Took 4 minutes
pslCheck bacEnds.psl >& pslCheck.log
# there are 520 BAC ends with overlapping block errors - 1385 alignments
# use pslReps parameters used for mm6
pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons raw.psl \
bacEnds.psl /dev/null
# those for hg17
pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
raw.psl bacEnds2.psl /dev/null
# see how many align in each case
awk '{print $10;}' bacEnds.psl | sort | uniq -c \
| sort -nr > bacEnds.qNames.sort
awk '{print $10;}' bacEnds2.psl | sort | uniq -c \
| sort -nr > bacEnds2.qNames.sort
wc -l bacEnds*qNames.sort
# 549086 bacEnds2.qNames.sort
# 519773 bacEnds.qNames.sort
grep '>' Zv5Bacends.fa | wc -l
# 729101
# so 71% of sequences aligned in bacEnds.psl
# and 75% of sequences aligned in bacEnds2.psl
# use textHistogram to look at number of alignments
# bacEnds.psl has 374002 with only 1 alignment
# bacEnds2.psl has 362364 with only 1 alignment
# bacEnds.psl - most alignments for 1 sequence is 515,
# for bacEnds2.psl - most alignments for 1 sequence is 1272
# when these are split up into bacEndPairs, bacEndPairsBad and
# bacEndSingles, the number of alignments per sequence is reduced
# so use bacEnds2.psl
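    # (the textHistogram runs were along these lines; the exact flags used
    # were not recorded:)
    # textHistogram -col=1 bacEnds.qNames.sort
    # textHistogram -col=1 bacEnds2.qNames.sort
    # (column 1 of the uniq -c output is the alignment count per sequence)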
# Process BAC end alignments
ssh kkstore02
mkdir -p /cluster/data/danRer3/bed/bacends/pairs
mkdir -p /cluster/data/danRer3/bed/bacends/bacends.1
# Downloaded BAC ends accessions from SRS
# Go to http://srs.sanger.ac.uk
# Go to "Select Databanks" tab and check DBGSS
# Go to "Query Form" tab
# Select Organism as field and enter "Danio*" as search term
# Select AllText as field and enter "*Sanger*" as search term
    # Select AllText as field and enter "T7|SP6" as search term
    # Select a view
    # Download as BACEndAccs.txt to the bacends.1 directory
cd /cluster/data/danRer3/bed/bacends/bacends.1
cp /cluster/data/danRer2/bed/ZonLab/bacends/bacends.1/getBacEndInfo.pl .
# get lists of SP6 and T7 accessions and merge lists
awk 'BEGIN {FS="\t"}{OFS="\t"} {if ($7 ~ /SP6/) print $3"SP6",$4}' \
BACEndAccs.txt > BACEndSP6.accs
awk 'BEGIN {FS="\t"}{OFS="\t"} {if ($7 ~ /T7/) print $3"T7",$4}' \
BACEndAccs.txt > BACEndT7.accs
cat BACEndSP6.accs BACEndT7.accs > BACEndExtNames.accs
# change external names to internal names
cat << '_EOF_' > extToIntNames.pl
#!/usr/bin/perl -w
use strict;
my @clonePrefixes = ("CH211-", "ch211-", "DKEY-", "DKEYP-", "RP71-", "BUSM1-", "CH73-", "CHORI-");
my %cloneHash = qw {
CH211- zC
DKEY- zK
DKEYP- zKp
RP71- bZ
BUSM1- dZ
CH73- CHORI73_
};
while (<STDIN>)
{
my ($l, $c, $intPref);
$l = $_;
foreach $c (@clonePrefixes)
{
if ($l =~ /$c/)
{
# get internal name
if (exists($cloneHash{$c}))
{
$intPref = $cloneHash{$c};
$l =~ s/$c/$intPref/;
print $l;
}
}
}
}
'_EOF_'
chmod +x extToIntNames.pl
perl extToIntNames.pl < BACEndExtNames.accs > BACEnd_accessions.txt
# get BAC clone accessions from Genbank. They can be obtained from EMBL
# through SRS but harder to separate the BAC end accessions from the
# BAC clone accessions:
# go to http://www.ncbi.nlm.nih.gov
# 1) select "Nucleotide" as the search database.
# 2) Search string: Danio rerio[ORGN] AND clone[TITL] NOT survey[TITL]
# Those sequences with "genomic survey" in the title appear to be
# BAC clone end accessions. Here, we want only BAC clone accessions.
# 3) There are 628991 sequences (2005-09-19). Select File from Send To
# pulldown menu and name file "BACClones.gbAccs.txt".
# create script to parse out clone ID and the accession:
cat << '_EOF_' > getAccsandIdsFromGb.pl
#!/usr/bin/perl -w
use strict;
my @clonePrefixes = ("CH211-", "ch211-", "DKEY-", "DKEYP-", "RP71-", "BUSM1-", "CH73-", "CHORI-");
my %cloneHash = qw {
CH211- zC
DKEY- zK
DKEYP- zKp
RP71- bZ
BUSM1- dZ
CH73- CHORI73_
};
my $found = "FALSE";
my $acc = "";
my $id = "";
while (<STDIN>)
{
my ($l, @f, $intId, $extPref, $intPref);
$intPref = "";
$extPref = "";
chomp;
$l = $_;
if ($l =~ /^[0-9]+:\s+([A-Z]+[0-9]{3,})/)
{
$acc = "";
$acc = $1;
$found = "FALSE";
}
elsif ($l =~ /clone/)
{
$id = "";
# check for clone name in this line
foreach my $p (@clonePrefixes)
{
if ($l =~ /clone:?\s?($p[0-9-A-Za-z]+)/)
{
$id = $1;
# translate to upper case
$id =~ tr/a-z/A-Z/;
$extPref = $p;
$found = "TRUE";
}
}
}
if ($found eq "TRUE")
{
if (exists($cloneHash{$extPref}))
{
$intPref = $cloneHash{$extPref};
}
$intId = $id;
# translate this to internal ID
$intId =~ s/$extPref/$intPref/;
print "$intId\t$acc\t$id\n";
$found = "FALSE";
}
}
'_EOF_'
# chmod +x getAccsandIds.pl
# perl getAccsandIds.pl < BACClones.accs.txt > BACClonesIdsandAccs.txt
    # Took 36 minutes. This file has internal BAC clone name, accession and
    # external BAC clone name.
chmod +x getAccsandIdsFromGb.pl
# CHORI73_ is a new prefix, this is for the internal name of
# BAC clones from the CHORI73 doubled haploid library.
nice perl getAccsandIdsFromGb.pl < BACClones.gbAccs.txt \
> BACClonesIdsandAccs.txt &
# Took under 3 minutes. The output file here has internal BAC clone name,
# Genbank accession and external BAC clone name.
grep '>' ../bacSeqs/Zv5BACends.fa | sed -e 's/>//' > allBacEnds.names
# modify getBacEndInfo.pl for these sequence names so rename as
# getBacEndInfov2.pl
# need to make pairs file
perl getBacEndInfov2.pl allBacEnds.names BACEnd_accessions.txt \
> bacEnds.log
# check that all the BAC end sequence names from allBacEnds.names
# appear in either bacEndPairs.txt or bacEndSingles.txt
wc -l bacEnd*
# 159319 bacEndAccs.aliases
# 333356 bacEndPairs.txt
# 19788 bacEndSingles.txt
# bacEndAccs.aliases contains sequence read names and their
# Genbank accessions.
awk 'BEGIN {OFS="\n"} {print $1, $2;}' bacEndPairs.txt \
| sed -e 's/,/\n/g' > bacPrs.names
awk '{print $1;}' bacEndSingles.txt | sed -e 's/,/\n/g' > bacSingles.names
cat bacPrs.names bacSingles.names | sort > bacEnds.names.sort
sort allBacEnds.names > allBacEnds.names.sort
wc -l *.sort
# 720571 allBacEnds.names.sort
# 720571 bacEnds.names.sort
# so all the BAC ends from the FASTA file have been accounted for either
# as pairs or singles.
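    # an extra check (not in the original run): the sorted name lists should
    # be identical, not just the same length, so this should print 0
    comm -3 allBacEnds.names.sort bacEnds.names.sort | wc -l
    # 0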
# process BAC end alignments
cd /cluster/data/danRer3/bed/bacends/pairs
set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1
# try different parameters
/cluster/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \
-max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \
-mismatch -verbose ../bacEnds.psl $bacDir/bacEndPairs.txt \
all_bacends bacEnds
wc -l bacEnds.*
# 426 bacEnds.long
# 14875 bacEnds.mismatch
# 229139 bacEnds.orphan
# 164778 bacEnds.pairs
# 0 bacEnds.short
# 100 bacEnds.slop
# 409318 total
    # there is less slop (190), more pairs (90967) and orphans (229139),
    # and less mismatch (18083) and long (980) than for danRer2.
    # The BAC insert size should be 100-200 kb but, since the assembly is not
    # complete, there are misassemblies so the distance between pairs could be
    # larger. If -max=200000 -slopval=10000 -hardMax=500000 is used, then
    # there are 18377 bacEnds.long, 250243 bacEnds.orphan and 131209
    # bacEnds.pairs, and over 3000 fewer alignments just drop out
    # (413243 total)
# try -max=300000 -slopval=10000 -hardMax=500000
# wc -l bacEnds.*
# 3343 bacEnds.long
# 11731 bacEnds.mismatch
# 243500 bacEnds.orphan
# 154981 bacEnds.pairs
# 0 bacEnds.short
# 509 bacEnds.slop
# 414064 total
# try -min=25000 -max=350000 -slopval=10000 -hardMax=500000 as for human
# wc -l bacEnds.*
# 1725 bacEnds.long
# 12081 bacEnds.mismatch
# 242235 bacEnds.orphan
# 156444 bacEnds.pairs
# 616 bacEnds.short
# 1017 bacEnds.slop
# 414118 total
# this would be good to use but for direct comparison between danRer2
# and danRer3, it would be good to use the same parameters as before
# so stick with those above:
# -min=2000 -max=650000 -slopval=10000 -hardMax=800000
# create header required by "rdb" tools
# NOTE: there are overlapping BAC clone ends for danRer3. Some of these
# are only a few kb apart (from beginning of one to end of the other)
# so use stricter pslPairs parameters as for human and mouse.
ssh kkstore02
mkdir /cluster/data/danRer3/bed/bacends/pairs
cd /cluster/data/danRer3/bed/bacends/pairs
set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1
/cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose ../bacEnds.psl \
$bacDir/bacEndPairs.txt all_bacends bacEnds
wc -l bacEnds.*
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes'\
> ../header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header
# make pairs bed file
cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairs.bed
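    # (row, sorttbl and headchg are the local "rdb" table utilities: row
    # keeps rows where the named column passes the test, here score >= 300,
    # sorttbl sorts on the named columns, and headchg -del strips the
    # 2-line header added above)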
# also need to process bacEndSingles.txt into a database table
# for singles in bacEndSingles.txt, create a dummy file where they
# are given zJA11B12T7 as dummy sequence pair. If the single is a forward
# sequence, put the dummy sequence in the second column, if the single is
# a reverse sequence put in first column. use a perl script to do this.
cd /cluster/data/danRer3/bed/bacends
set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1
mkdir singles
cd singles
cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/formatSingles.pl .
perl formatSingles.pl $bacDir/bacEndSingles.txt > \
$bacDir/bacEndSingles.format
# then run pslPairs on this formatted file
/cluster/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=2000 \
-max=650000 -slopval=10000 -hardMax=800000 -slop -short -long -orphan \
-mismatch -verbose ../bacEnds.psl $bacDir/bacEndSingles.format \
all_bacends bacEnds
wc -l bacEnds.*
# 0 bacEnds.long
# 0 bacEnds.mismatch
# 11439 bacEnds.orphan
# 0 bacEnds.pairs
# 0 bacEnds.short
# 0 bacEnds.slop
# there are 11439 orphans here and 229139 from pair analysis so
# a total of 240578 orphans
cat bacEnds.orphan ../pairs/bacEnds.orphan > bacEnds.singles
wc -l bacEnds.singles
# 240578 bacEnds.singles
# make singles bed file
cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndSingles.bed
cp bacEndSingles.bed ../pairs
cd ../pairs
# all slop, short, long, mismatch and orphan pairs go into bacEndPairsBad
# since orphans are already in bacEndSingles, do not add these
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
bacEnds.orphan | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairsBad.bed
# add bacEndSingles.bed to bacEnds.load.psl - must not add pair orphans
# twice so create a bed file of bacEndPairsBadNoOrphans.bed without orphans
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
| row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairsBadNoOrphans.bed
# use extractPslLoad later to get all_bacends.psl for database
    # There are rows where the alignments were the same but the lfNames are
# different. This is due to the presence of multiple reads for the
# same BAC end sequence. Sometimes they are slightly different lengths
# so the alignments are a little different. It would be good to
# consolidate all of these. Firstly, the identical rows were merged into
# one with a list of all the lfNames corresponding to that alignment.
ssh kkstore02
#echo "create database bacsDr3_rah;" | hgsql danRer3
cd /cluster/data/danRer3/bed/bacends/pairs
#hgLoadBed bacsDr3_rah bacEndPairs bacEndPairs.bed \
# -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql -notItemRgb
# Loaded 163174 elements of size 11
# create a bacEndSingles table like bacEndPairs if not created already
# hgLoadBed bacsDr3_rah bacEndSingles bacEndSingles.bed \
# -sqlTable=../singles/bacEndSingles.sql -notItemRgb
# Loaded 212775 elements of size 11
# NOTE - this track isn't pushed to RR, just used for assembly QA
# Use bacEndPairsBadNoOrphans.bed as orphans are in the singles bed file
# hgLoadBed bacsDr3_rah bacEndPairsBad bacEndPairsBadNoOrphans.bed \
# -sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql -notItemRgb
# Loaded 15169 elements of size 11
# Need to consolidate similar rows for bacEndPairs and bacEndSingles - same
# name, different lfNames and same alignments.
mkdir -p /cluster/data/danRer3/bed/bacends/duplicates
cd /cluster/data/danRer3/bed/bacends/duplicates
mkdir -p /cluster/bluearc/danRer3/bacends/duplicates/overlapRun
cd /cluster/data/danRer3/bed/bacends/duplicates
ln -s /cluster/bluearc/danRer3/bacends/duplicates/overlapRun
# write program to do this for linked feature series (lfs) which
# is the type of data structure used for BAC ends.
# Need a bed file sorted by chrom and chromStart
cd overlapRun
foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
sort -k1,2 /cluster/data/danRer3/bed/bacends/pairs/${f}.bed > ${f}.lfs
end
wc -l *.lfs
# 15169 bacEndPairsBadNoOrphans.lfs
# 163174 bacEndPairs.lfs
# 212775 bacEndSingles.lfs
    # remove replicate rows where the names match and the fraction of the
    # region (chromEnd - chromStart) that overlaps is greater than or equal
    # to 0.999.
ssh kolossus
cd /cluster/data/danRer3/bed/bacends/duplicates/overlapRun
foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
echo "Processing $f"
nohup nice /cluster/home/hartera/bin/i386/lfsOverlap ${f}.lfs \
${f}.bed -name -minOverlap=1.0 -notBlocks
end
# Started: Tue Sep 27 21:51 Finished: Sep 28 06:29
ssh kkstore02
cd /cluster/data/danRer3/bed/bacends/duplicates/overlapRun
# check the numbers of lines are correct
foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
awk 'BEGIN {OFS="\t"} {print $1,$2,$3,$4,$5}' ${f}.lfs \
| sort | uniq -c | sort -nr > ${f}.uniqCount
end
wc -l *
# 163116 bacEndPairs.bed
# 163174 bacEndPairs.lfs
# 163116 bacEndPairs.uniqCount
# 15163 bacEndPairsBad.bed
# 15169 bacEndPairsBad.lfs
# 15163 bacEndPairsBad.uniqCount
# 212754 bacEndSingles.bed
# 212775 bacEndSingles.lfs
# 212754 bacEndSingles.uniqCount
# numbers of lines after uniqueing by coords, name and score is the
# same as that after using lfsOverlap to remove these lines so correct.
cd /cluster/data/danRer3/bed/bacends/duplicates
mv ./overlapRun/* .
rm -r overlapRun /cluster/bluearc/danRer3/bacends/duplicates/overlapRun
# Use perl script to choose 2 BAC ends to represent each BAC clone.
    # Since there is often more than one read for each BAC end in this set,
    # 2 reads were chosen for each BAC pair, or 1 for the singles, based on
    # the reads with the largest aligned region (using lfSizes).
# copy perl script over that was used for danRer2
cp /cluster/data/danRer2/bed/ZonLab/bacends/duplicates/pickLfNames.pl \
pickLfNamesv2.pl
# edit so that regular expression for matching BAC end names is the
# same as that used in ../bacends.1/getBacEndInfov2.pl
# need to sort by chrom, chromStart
foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
sort -k1 -k2 -k3 ${f}.bed > ${f}Sort.bed
end
# run perl script: input bed file, pairs or singles, name of output file
perl pickLfNamesv2.pl bacEndPairsSort.bed pairs pairs2lfNames.bed
mv error.log log.pairs
# log.pairs lists the 18 cases where alignments for a BAC clone use
# a different pair of sequence reads for the ends than the previous
# alignment for ends for that BAC clone. These were all checked and in
# each case, the extra alignments are almost identical or overlap for
# the most part so it does not matter if the extra alignments are
# removed.
# run script for singles:
perl pickLfNamesv2.pl bacEndSinglesSort.bed singles singles1lfName.bed
mv error.log log.singles
# log.singles has 34 cases where alignments for a BAC clone use
# different sequence reads for either the T7 or SP6 BAC end.
# singles may include both BAC ends for a clone in the case
# where they aligned to different chromosomes or a long way apart on
    # the same chromosome (orphans). Mostly those that have a different read
    # align to an almost identical or largely overlapping region.
# some sequences appear to be different: CH211-98J20 - zC98J20.yb and
# zC98J20.ya do not align to each other. DKEYP-107B4 - zKp107B4.ya looks
# like it has low complexity sequence, this is discarded and zKp107B4.yb
# is kept. zKp107B4.za and zKp107B4.zb only align in the first ~ 59bp.
# zKp107B4.zb is kept in this case. DKEYP-114B4 - zKp114B4.za: 15-61 bp
# on zKp114B4.za align to 11-58 bp on zKp114B4.zb. zKp114B4.za is kept.
# In these cases, the 2 sequences align to different regions.
perl pickLfNamesv2.pl bacEndPairsBadNoOrphansSort.bed pairs \
badPairs2lfNames.bed
mv error.log log.badPairs
# only 3 alignments have a different pair of ends to other alignments
# but alignment region is almost the same in each case.
# for each of these new bed files, checks were made that there are
# only 2 BAC ends per alignment for pairs and 1 for singles.
# For each pair, there should only be 2 ends which can appear either
# way round depending on the orientation and there should be 1 end for
# the beginning (suffix T7, t7 or z) and one end for the end
# (suffix SP6, sp6 or y) for each BAC clone. These can appear as e.g.
# either zK7B23T7,zK7B23SP6 or zK7B23SP6,zK7B23T7 for the opposite
# orientation. For singles, there should be a single BAC end for each
# alignment and for each BAC clone, a sequence for either or both types
# of ends may appear e.g. zK153P14SP6 and zK153P14T7 appear in separate
# alignments.
# Finally overlaps in BAC clone names were checked. All BAC clones
# represented in each of the pairs, badPairs and singles bed files are
# unique to that file. Between all three bed files, 300323 BAC clones
# have alignments. 512886 clone ends are aligned in these three bed files.
# NOTE: using sort and uniq on hgwdev produces tab-delimited output.
# After merging rows with the same BAC name, the scores in the bed
# files are now wrong. Scores should be 1000 if there is 1 row for
# that name, else 1500/(number of rows) for that sequence name, as
# calculated by pslPairs.
# Correct the scores.
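# As a hedged sketch of that scoring rule only (the real correction
# below is done by correctScores2.pl, which also keeps the fractional
# scores - see the 1500/7 note further down; fixScores.awk is a
# hypothetical name):
cat << '_EOF_' > fixScores.awk
BEGIN { FS = OFS = "\t" }
# pass 1: count rows per sequence name ($4)
NR == FNR { n[$4]++; next }
# pass 2: score ($5) is 1000 for unique names, else 1500/count
{ $5 = (n[$4] == 1) ? 1000 : 1500 / n[$4]; print }
'_EOF_'
# e.g. awk -f fixScores.awk in.bed in.bed > rescored.bed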
mkdir -p /cluster/data/danRer3/bed/bacends/scores
cd /cluster/data/danRer3/bed/bacends/scores
# copy over the correctScores2.pl and checkScores.pl scripts from
# danRer2 and edit both scripts so that the hits file is split on
# space, not on tabs
cp /cluster/data/danRer2/bed/ZonLab/bacends/scores/correctScores2.pl .
cp /cluster/data/danRer2/bed/ZonLab/bacends/scores/checkScores.pl .
awk '{print $4}' ../duplicates/pairs2lfNames.bed \
| sort | uniq -c > pairs.hits
perl correctScores2.pl ../duplicates/pairs2lfNames.bed pairs.hits noBin \
> bacEndPairsGoodScores.bed
# same for singles
awk '{print $4}' ../duplicates/singles1lfName.bed \
| sort | uniq -c > singles.hits
perl correctScores2.pl ../duplicates/singles1lfName.bed singles.hits \
noBin > bacEndSinglesGoodScores.bed
# and for badPairs
awk '{print $4}' ../duplicates/badPairs2lfNames.bed \
| sort | uniq -c > badPairs.hits
perl correctScores2.pl ../duplicates/badPairs2lfNames.bed badPairs.hits \
noBin > bacEndPairsBadGoodScores.bed
# check that the scores are now correct
awk '{print $4, $5}' bacEndPairsGoodScores.bed \
| sort | uniq -c > pairs.count
perl checkScores.pl < pairs.count
# all the BAC clones should be in good.txt and none in bad.txt
# wc -l should give the same number of lines in good.txt as in pairs.hits
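# (checkScores.pl itself is not reproduced in this doc; a minimal
# sketch of the check on the "count name score" lines, tolerating the
# fractional 1500/n scores noted below, might be:
# awk '{ok = ($1==1 && $3==1000) || ($1>1 && int($1*$3+0.5)==1500); print $2 > (ok ? "good.txt" : "bad.txt")}' pairs.count
# )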
# repeat for other bed files
awk '{print $4, $5}' bacEndPairsBadGoodScores.bed \
| sort | uniq -c > badPairs.count
perl checkScores.pl < badPairs.count
awk '{print $4, $5}' bacEndSinglesGoodScores.bed \
| sort | uniq -c > singles.count
perl checkScores.pl < singles.count
# for the singles, 6 ended up in bad.txt because their scores
# were 214.285714285714, which is correct for 7 alignments (1500/7);
# rounding the score caused the discrepancy.
ssh hgwdev
cd /cluster/data/danRer3/bed/bacends/scores
# copy over table definition from danRer2
cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/bacEndSingles.sql \
../singles/
# Now load database tables:
hgLoadBed danRer3 bacEndPairs bacEndPairsGoodScores.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql -notItemRgb
# Loaded 163098 elements of size 11
hgLoadBed danRer3 bacEndSingles bacEndSinglesGoodScores.bed \
-sqlTable=../singles/bacEndSingles.sql -notItemRgb
# Loaded 212720 elements of size 11
# 212720 record(s), 0 row(s) skipped, 50 warning(s) loading bed.tab
# the cause of the warnings is unknown, but all of the bed file loaded
# and the number of warnings is small, so ignore them
hgLoadBed danRer3 bacEndPairsBad bacEndPairsBadGoodScores.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql -notItemRgb
# Loaded 15160 elements of size 11
# load BAC end sequences into the seq table so alignments may be viewed
# symlink the Zv5BACends.fa sequences into /gbdb
mkdir -p /gbdb/danRer3/bacends
ln -s /cluster/data/danRer3/bed/bacends/bacSeqs/Zv5BACends.fa \
/gbdb/danRer3/bacends/Zv5BACends.fa
hgLoadSeq danRer3 /gbdb/danRer3/bacends/Zv5BACends.fa
# create file for loading all_bacends table
ssh kkstore02
cd /cluster/data/danRer3/bed/bacends/scores
# for all_bacends table, just load the alignments for those sequences
# represented in the bacEndPairs, bacEndSingles and bacEndPairsBad tables
# bacEnds.load.psl is the file of alignments
# get all the names of sequences
foreach f (*GoodScores.bed)
echo $f
awk '{print $11;}' $f >> allBacEnds.names
end
wc -l allBacEnds.names
# 390978 allBacEnds.names
# this is the total number of lines in the *GoodScores.bed files
perl -pi.bak -e 's/,/\n/g' allBacEnds.names
sort allBacEnds.names | uniq > allBacEnds.names.uniq
wc -l allBacEnds.names.uniq
# 512886 allBacEnds.names.uniq
# get alignments for just the BAC ends that are in the database tables
# make bacEnds.load.psl
cd /cluster/data/danRer3/bed/bacends/scores
extractPslLoad -noBin ../bacEnds.psl bacEndPairsGoodScores.bed \
bacEndPairsBadGoodScores.bed bacEndSinglesGoodScores.bed | \
sorttbl tname tstart | headchg -del > bacEnds.load.psl
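# (sorttbl and headchg are older in-house tools; once the psLayout
# header is dropped, a plain coreutils sort on tName/tStart - psl
# columns 14 and 16 - gives the same ordering: sort -k14,14 -k16,16n)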
# check that alignments are present for all BAC ends in
# allBacEnds.names.uniq
awk '{print $10}' bacEnds.load.psl | sort | uniq > bacEnds.names
comm -12 bacEnds.names allBacEnds.names.uniq | wc -l
# 512886
ssh hgwdev
cd /cluster/data/danRer3/bed/bacends/scores
# load all_bacends table
hgLoadPsl danRer3 -table=all_bacends bacEnds.load.psl
# load of all_bacends did not go as planned: 7584708 record(s),
# 0 row(s) skipped, 526 warning(s) loading psl.tab
# (hartera, 2006-04-19)
# Display is very slow for BAC ends on large regions. Try splitting
# all_bacends by chromosome.
ssh hgwdev
mkdir /cluster/data/danRer3/bed/bacends/all_bacends
cd /cluster/data/danRer3/bed/bacends/all_bacends
foreach c (`cat /cluster/data/danRer3/chrom.lst`)
echo "Processing $c ..."
awk '{if ($14 == "'chr${c}'") print;}' \
/cluster/data/danRer3/bed/bacends/scores/bacEnds.load.psl \
> chr${c}.bacEnds.load.psl
end
# rename old table
hgsql -e 'alter table all_bacends rename allBacendsOld;' danRer3
# load new tables
foreach c (`cat /cluster/data/danRer3/chrom.lst`)
hgLoadPsl danRer3 -table=chr${c}_all_bacends chr${c}.bacEnds.load.psl
end
# There are still warnings on loading; most (510) are for chrUn.
# Splitting by chromosome improves display performance a lot.
# The chrom-parsing code is confused by the double underscores in the
# chrN_all_bacends tables so change the names to chrN_allBacends
foreach c (`cat /cluster/data/danRer3/chrom.lst`)
hgsql -e "alter table chr${c}_all_bacends rename chr${c}_allBacends;" \
danRer3
end
# Then add correct table name to each of the bacEnd* tables
foreach t (bacEndPairs bacEndPairsBad bacEndSingles)
hgsql -e "update $t set pslTable = 'allBacends';" danRer3
end
# corrected termRegex for some bacCloneXRef searches in trackDb.ra so
# that they work correctly (bacPairsIntName, bacSinglesIntName,
# bacPairsSangerSts and bacSinglesSangerSts). (2006-04-19, hartera)
# CREATE BAC CLONES ALIAS AND CROSS-REFERENCE TABLES
# (bacEndAlias, bacCloneAlias and bacCloneXRef) (DONE, 2005-10-06, hartera)
# RECREATE TABLES AFTER REMAKING THE SINGLES AND PAIRS TABLES
# (see REDO BACENDS SECTION) (DONE, 2006-06-08, hartera)
# DUPLICATE ROWS IN TABLES SO REMOVE AND RELOAD (DONE, 2006-08-04, hartera)
# Process data and create bacEndAlias table
ssh kkstore02
cd /cluster/data/danRer3/bed/bacends/bacends.1
# make bacEndAlias table with Genbank accessions for ends
# need to run getBacEndInfo.pl for the BAC end names in the
# BAC tables.
# in the pairs directory, there is the allBacEnds.names.uniq file
# so use this.
# Already made the bacEndAccs.aliases file with getBacEndInfov2.pl
# This has none of the BAC ends whose names end in ASP6 or AT7 as
# these are all from the CHORI73 library and they do not have BAC end
# accessions in Genbank at the moment. This contains accessions for
# all BAC ends even those without alignments.
hgsql danRer3 < $HOME/kent/src/hg/lib/bacEndAlias.sql
echo "load data local infile 'bacEndAccs.aliases' into table \
bacEndAlias" | hgsql danRer3
ssh kkstore02
# get the latest versions of the clonemarkers, contig names and markers
# files from Sanger
mkdir -p /cluster/data/danRer3/bed/bacends/cloneandStsAliases
cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
wget --timestamp \
ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/README
wget --timestamp \
ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/clonemarkers.27.07.05.txt
wget --timestamp \
ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/ctgnames.27.07.05.txt
wget --timestamp \
ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/markers.27.07.05.txt
wc -l *27.07.05.txt
# 29885 clonemarkers.27.07.05.txt
# 167858 ctgnames.27.07.05.txt
# 12250 markers.27.07.05.txt
# Recreate tables as bacEndPairs, bacEndSingles, bacEndPairsBad and
# chrN_allBacends tables have changed (2006-06-08, hartera)
# get list of BAC end names, lfNames
cp /cluster/data/danRer3/bed/bacends/scoresAndCoords/allBacEnds.names.uniq .
# get list of BAC clone names
foreach f (bacEndPairs bacEndPairsBad bacEndSingles)
awk '{print $4}' \
/cluster/data/danRer3/bed/bacends/scoresAndCoords/${f}GoodScores.bed >> bacs.names
end
sort -u bacs.names > bacs.names.uniq
wc -l *.uniq
# 512321 allBacEnds.names.uniq
# 300290 bacs.names.uniq
# from psl file
awk '{print $10;}' ../bacEnds.psl > bacEndsPsl.names
# edit to remove first few lines with no names
sort bacEndsPsl.names | uniq > bacEndsPsl.names.uniq
wc -l bacEndsPsl.names.uniq
# 545920 bacEndsPsl.names.uniq
# this is all the BAC ends that originally had alignments
# Add an alias table for BAC clones
# bacCloneAlias.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
# Add a xref table to give external clone registry names, internal names
# sanger name, relationship between STS and BAC clone (method of finding
# STS), UniSTS ID, chromosome(s) to which BAC clone is mapped by BLAT,
# Genbank accession and STS primer sequences
# bacCloneXRef.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
set dir=/cluster/data/danRer3/bed/bacends/
awk 'BEGIN {OFS="\t"}{print $4, $1}' \
$dir/scoresAndCoords/bacEndPairsGoodScores.bed > bacClones.namesandchrom
awk 'BEGIN {OFS="\t"}{print $4, $1}' \
$dir/scoresAndCoords/bacEndSinglesGoodScores.bed >> bacClones.namesandchrom
sort bacClones.namesandchrom | uniq > bacClones.namesandchrom.uniq
# use a list of internal names, Genbank accessions, and BAC clone
# names: BACClonesIdsandAccs.txt.
# get list of UniSTS IDs using aliases to search alias file
# print Sanger name and UniSTS ID, using find_markers3.pl
cat << '_EOF_' > find_markers3.pl
# example:
# perl find_markers3.pl UniSTS.aliases markers.02.12.04.txt
use strict;
my $verbose = 0;
my ($a, $b, $f, $m, $s, $t, $aliases, @alias, @rest);
my $aliasFile = $ARGV[0];
my $markersFile = $ARGV[1];
open(ALIAS, $aliasFile) || die "Can not open $aliasFile\n";
open(MARKERS, $markersFile) || die "Can not open $markersFile\n";
# store aliases from aliasFile
my ($id, $al, @alsArray, %aliasHash);
while (<ALIAS>)
{
chomp;
($id, $al) = split /\t/;
@alsArray = split(/;/, $al);
foreach my $as (@alsArray)
{
push (@{$aliasHash{$as} }, $id);
}
}
close ALIAS;
while (<MARKERS>) {
my @idArray;
($f, $t, $m, $idArray[0]) = 0;
my @ids;
chomp; ($a, $b, $aliases, @rest) = split /\|/;
if ($verbose > 3) { printf "aliases $aliases \n"; }
@alias = split /;/, $aliases;
ALIAS: foreach $s (@alias) {
if ($s =~ /[\D]+/) {
if ($verbose > 5) { printf "this $s \n"; }
if (exists($aliasHash{$s}))
{
@idArray = @{$aliasHash{$s}};
}
if ($idArray[0]) {
$f = 1; $t = $s; @ids = @idArray;
if ($verbose) { printf "this $s found $m \n"; }
last ALIAS;
}
}
}
if ($f)
{
my @sNames = split(/;/, $b);
foreach my $sn (@sNames)
{
foreach my $i (@ids)
{
printf "$sn\t$i\n";
}
}
}
}
close MARKERS;
'_EOF_'
chmod +x find_markers3.pl
perl find_markers3.pl /cluster/data/ncbi/UniSTS.2005-09-29/UniSTS.aliases \
markers.27.07.05.txt > sangerandUniSTSId.txt
# No need to reformat this for zfishBacClonesandSts
# FPC contig information (i.e. FPC contig number) from ctgnames file is
# not included in the tables as these are dynamic and constantly
# changing with the assembly.
# FILE OF BAC CLONE ACCESSIONS
# http://www.ncbi.nlm.nih.gov/genome/clone/DATA/clac.out
# copy over file of BAC internal names, accessions and external names
cp /cluster/data/danRer3/bed/bacends/bacends.1/BACClonesIdsandAccs.txt .
# use zfishBacClonesandSts to create tab files for loading into
# bacCloneAlias and bacCloneXRef tables
# make output directory
rm -r /cluster/bluearc/danRer3/bacEnds/out
mkdir -p /cluster/bluearc/danRer3/bacEnds/out
# edit zfishBacClonesandSts.c to add prefixes for CHORI73 library:
# CHORI73_ for internal name, CH73- for external name
# in ctgnames.27.07.05.txt and clonemarkers.27.07.05.txt
perl -pi.bak -e 's/zH([0-9]+)/CHORI73_$1/' *.27.07.05.txt
mv ctgnames.27.07.05.txt.bak ctgnames.27.07.05.orig
mv clonemarkers.27.07.05.txt.bak clonemarkers.27.07.05.txt.orig
# no change to markers file so remove .bak file
rm markers.27.07.05.txt.bak
nice $HOME/bin/x86_64/zfishBacClonesandSts ctgnames.27.07.05.txt \
clonemarkers.27.07.05.txt markers.27.07.05.txt \
bacClones.namesandchrom.uniq BACClonesIdsandAccs.txt \
sangerandUniSTSId.txt ./out > ./out/zfishBacs.out &
# output is in /cluster/bluearc/danRer3/bacends/out so copy over
# sort alias tab file by sangerName
sort -k2 ./out/bacAlias.tab > bacAlias.sort.tab
cp ./out/bacXRef.tab .
wc -l *.tab
# 110961 bacAlias.sort.tab
# 540800 bacXRef.tab
ssh hgwdev
cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
hgsql -e 'drop table bacCloneAlias;' danRer3
hgsql -e 'drop table bacCloneXRef;' danRer3
hgLoadSqlTab danRer3 bacCloneAlias \
$HOME/kent/src/hg/lib/bacCloneAlias.sql bacAlias.sort.tab
hgLoadSqlTab danRer3 bacCloneXRef \
$HOME/kent/src/hg/lib/bacCloneXRef.sql bacXRef.tab
# edit trackDb.ra to add bacEnds tracks and searches for the bacEndPairs
# and bacEndSingles tracks as for danRer1. copy over html from danRer2
# for bacEndPairs and bacEndSingles tracks.
# Duplicate rows in the tables so remove these and reload
# (hartera, 2006-08-04)
ssh hgwdev
cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
sort bacAlias.sort.tab | uniq | sort -k2 > bacAlias.sort.tab.uniq
sort bacXRef.tab | uniq > bacXRef.tab.uniq
wc -l *.tab.uniq
# 57656 bacAlias.sort.tab.uniq
# 356453 bacXRef.tab.uniq
# Drop old tables and reload:
hgsql -e 'drop table bacCloneAlias;' danRer3
hgsql -e 'drop table bacCloneXRef;' danRer3
hgLoadSqlTab danRer3 bacCloneAlias \
$HOME/kent/src/hg/lib/bacCloneAlias.sql bacAlias.sort.tab.uniq
hgLoadSqlTab danRer3 bacCloneXRef \
$HOME/kent/src/hg/lib/bacCloneXRef.sql bacXRef.tab.uniq
# BACENDS: TESTING OF bacCloneAlias AND bacCloneXRef TABLES
# (DONE, 2005-10-06, hartera)
# REDONE AFTER REMAKING bacCloneAlias AND bacCloneXRef TABLES - both ok.
# (DONE, 2006-06-12, hartera)
# REDONE AFTER REMAKING bacCloneAlias AND bacCloneXRef TABLES
# (DONE, 2006-08-04, hartera)
# The following tests were carried out to check that all the data
# in the bacCloneAlias and bacCloneXRef tables is correct.
ssh hgwdev
cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
cp ./testTablesNew/*.pl .
rm -r testTablesNew
mkdir -p testTablesNew
cd testTablesNew
# Check that the correct aliases are associated with their Sanger STS names
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $3;}' \
../markers.27.07.05.txt > sNameandaliases
# write script to get one Sanger name and one alias on each line
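# (that script is not reproduced in this doc; a minimal equivalent
# that expands the semicolon-separated lists into one name/alias pair
# per line - the .sketch output name is hypothetical - would be:)
awk 'BEGIN {FS=OFS="\t"} {ns = split($1, n, ";"); na = split($2, a, ";"); for (i = 1; i <= ns; i++) for (j = 1; j <= na; j++) if (a[j] != "") print n[i], a[j]}' \
    sNameandaliases > sNameandaliases.sketch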
cp ../*.pl .
perl getSangerAndAlias.pl < sNameandaliases > sNameandaliases.format
sort sNameandaliases.format | uniq > sNameandaliases.sort
# get Sanger names and aliases from database
hgsql -N -e 'select sangerName, alias from bacCloneAlias;' danRer3 \
| sort | uniq > alias.db.sort
wc -l alias.db.sort
# 57656 alias.db.sort
diff sNameandaliases.sort alias.db.sort
# No difference between data file and data from database so ok
# Check Sanger STS names correspond in bacAlias and bacCloneXRef tables
# get Sanger names from alias table
hgsql -N -e 'select sangerName from bacCloneAlias;' danRer3 \
| sort | uniq > sName.alias.sort
wc -l sName.alias.sort
# 15309 sName.alias.sort
# get Sanger names from xRef table
hgsql -N -e 'select sangerName from bacCloneXRef where sangerName \
is not null;' danRer3 | sort | uniq > sName.xRef.sort
wc -l sName.xRef.sort
# 15522 sName.xRef.sort
comm -23 sName.alias.sort sName.xRef.sort
# nothing unique to alias file so all Sanger names in the alias table are
# also in the xRef table
comm -13 sName.alias.sort sName.xRef.sort > sNamexRefNotAlias
wc -l sNamexRefNotAlias
# 213 sNamexRefNotAlias
# get Sanger names from clonemarkers file
awk 'BEGIN {FS="|"}{print $2}' ../clonemarkers.27.07.05.txt | sort | uniq \
> clonemarkers.sNames.sort
# get Sanger names from markers file
awk 'BEGIN {FS="|"}{print $2}' ../markers.27.07.05.txt > markers.sNames
# remove semi-colons and sort
sed -e 's/;/\n/g' markers.sNames | sort | uniq > markers.sNames.sort
# sanger names unique to markers file
comm -13 clonemarkers.sNames.sort markers.sNames.sort
# there are none
comm -23 clonemarkers.sNames.sort markers.sNames.sort \
> sNames.clonemarkersOnly
wc -l sNames.clonemarkersOnly
# 213 sNames.clonemarkersOnly
diff sNames.clonemarkersOnly sNamexRefNotAlias
# No difference so all the extra Sanger Names in the xRef
# table are from the clonemarkers file and these have no aliases in
# the markers file so they are not in the alias table so this is all ok.
# Check that Sanger STS names and primers are associated correctly
cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases/testTablesNew
# get sanger names and primers from markers file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $4, $5;}' \
../markers.27.07.05.txt > sNameandPrimers
# use script to reformat and write with one Sanger name per line
chmod +x getSangerandPrimers.pl
perl getSangerandPrimers.pl < sNameandPrimers > sNameandPrimers.format
sort sNameandPrimers.format > sNameandPrimers.format.sort
wc -l sNameandPrim*
# 12250 sNameandPrimers
# 15309 sNameandPrimers.format
# 15309 sNameandPrimers.format.sort
# get Sanger names and primers from database
hgsql -N -e \
'select sangerName, leftPrimer, rightPrimer from bacCloneXRef \
where sangerName is not null and leftPrimer is not null and \
rightPrimer is not null;' danRer3 | sort | uniq \
> sNamesandprimers.fromdb.sort
wc -l sNamesandprimers.fromdb.sort
# 15309 sNamesandprimers.fromdb.sort
diff sNamesandprimers.fromdb.sort sNameandPrimers.format.sort
# No difference so ok.
# Check that UniSTS IDs and Sanger STS names are associated correctly
# get Sanger names and UniSTS IDs from the database
hgsql -N -e 'select sangerName, uniStsId from bacCloneXRef where \
uniStsId is not null;' danRer3 | sort | uniq > sNameUniSTS.fromdb.sort
wc -l sNameUniSTS.fromdb.sort
# 5634 sNameUniSTS.fromdb.sort
# Need to reformat the sNameUniSTS.fromdb.sort
chmod +x formatUniSts.pl
perl formatUniSts.pl < sNameUniSTS.fromdb.sort | sort \
> sNameUniSTS.fromdb.format.sort
# get Sanger names from data file and see how many UniSTS IDs there are
# for each name
awk '{print $1}' ../sangerandUniSTSId.txt | sort | uniq -c | sort -nr \
> sangerandUniSTSId.count
# the most UniSTS IDs for any one Sanger name is 3:
# 3 etID9786.21
# 3 etID9056.23
# 3 etID9042.2
# 3 etID8627.2
# 3 etID8281.9
# 3 etID11096.5
sort ../sangerandUniSTSId.txt > sangerandUniSTSId.txt.sort
diff sangerandUniSTSId.txt.sort sNameUniSTS.fromdb.format.sort \
> sangerandUniSTSIdvsdb
# No difference between data from original file and that in database so ok
# Check that chrom mappings and external BAC clone names are correct
# get extNames and chroms they map to from the database
hgsql -N -e 'select name, chroms from bacCloneXRef where \
chroms is not null;' danRer3 | sort | uniq \
> nameandchromsfromdb.sort
# reformat nameandchromsfromdb.sort
perl formatUniSts.pl < nameandchromsfromdb.sort | sort \
> nameandchromsfromdb.format.sort
# compare extNames and chroms from db to those in data file
cp ../bacClones.namesandchrom .
sort -u bacClones.namesandchrom > bacClones.namesandchrom.uniq
diff bacClones.namesandchrom.uniq nameandchromsfromdb.format.sort
# no difference - all ok
# Check Genbank accessions and internal BAC clone names
hgsql -N -e 'select intName,genbank from bacCloneXRef where \
genbank is not null;' danRer3 | sort | uniq \
> intNamesandAccs.fromdb.sort
# this should be a subset of zfish_accsMerged.txt - not all BAC clones
# listed here appear in either our BAC ends tracks or the markers files.
awk 'BEGIN {OFS="\t"} {print $1,$2}' ../BACClonesIdsandAccs.txt \
| sort -u > BACClonesIntandAccs.sort
comm -23 intNamesandAccs.fromdb.sort BACClonesIntandAccs.sort
# there is nothing in the database that is not in BACClonesIntandAccs.sort
comm -13 intNamesandAccs.fromdb.sort BACClonesIntandAccs.sort \
> onlyinzfishAccs
wc -l onlyinzfishAccs
# 86 onlyinzfishAccs
hgsql -N -e 'select intName from bacCloneXRef where genbank is null;' \
danRer3 | sort | uniq > intNamesNoAcc.fromdb.sort
awk '{print $1;}' BACClonesIntandAccs.sort > intNames.withAccs.sort
comm -12 intNamesNoAcc.fromdb.sort intNames.withAccs.sort \
> indbNoAccsandAccs.out
# none of these names are common to both, so all accessions from
# BACClonesIdsandAccs.txt are in the database for the stored internal
# names, where accessions are available.
# Test that Sanger STS names, internal names and external names are all correct
# Test that Sanger STS names and internal BAC clone names are associated correctly
# get internal names and Sanger names from data file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$2}' ../clonemarkers.27.07.05.txt \
| sort | uniq > intNameandSanger.sort
hgsql -N -e 'select intName, sangerName from bacCloneXRef \
where sangerName is not null;' danRer3 \
| sort | uniq > intNameandSanger.fromdb.sort
diff intNameandSanger.sort intNameandSanger.fromdb.sort
# No difference between data from file and that from database so ok
# Check BAC clone internal name and relationship fields
# get internal names and relationships from data file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$3}' ../clonemarkers.27.07.05.txt \
| sort | uniq > intNameandRelation.sort
# get internal names and relationships from database
hgsql -N -e 'select intName, relationship from bacCloneXRef \
where relationship != 0;' danRer3 \
| sort | uniq > intNameandrelation.fromdb.sort
# differences unique to database file
comm -13 intNameandRelation.sort intNameandrelation.fromdb.sort \
> intNameRelation.indbonly
# differences unique to data file
comm -23 intNameandRelation.sort intNameandrelation.fromdb.sort \
> intNameRelation.incloneMarkersonly
wc -l intNameRelation*
# 4650 intNameRelation.incloneMarkersonly
# 4650 intNameRelation.indbonly
awk '{print $1}' intNameRelation.indbonly > intNameRelation.indbonly.names
awk '{print $1}' intNameRelation.incloneMarkersonly \
> intNameRelation.incloneMarkersonly.names
diff intNameRelation.indbonly.names intNameRelation.incloneMarkersonly.names
# there is no difference in the internal names that have relationship
# fields. The only place these files should differ is the second
# column, which should always be 3 in the data from the database:
# all the relationship entries that were blank in the clonemarkers
# file were changed to 3 when entered into the database.
awk '{print $2}' intNameRelation.indbonly | sort | uniq
# 3 - correct, so all ok: those that are blank in clonemarkers
# are 3 in the database.
# check that those that have 0 in the database bacCloneXRef relationship
# field are not in the list from clonemarkers
# select these internal names with 0 relationship from the database
hgsql -N -e 'select intName from bacCloneXRef where relationship = 0;' \
danRer3 | sort | uniq > intNameNoRelation.fromdb.sort
# get all the internal names from the data file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.27.07.05.txt \
| sort | uniq > intNamefromCloneMarkers.sort
comm -12 intNameNoRelation.fromdb.sort intNamefromCloneMarkers.sort
# nothing in common between these two files as expected so there are
# no internal names in the db with 0 in the relationship field that
# appear in the clonemarkers file.
# Check all BAC clone internal names and external names from the
# ctgnames file are in the database
# get intName and extName from ctgnames file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2,$3}' ../ctgnames.27.07.05.txt \
| sort | uniq > intNameandextNamefromCtgNames.sort
# get intName and extName from database
hgsql -N -e 'select intName,name from bacCloneXRef;' danRer3 \
| sort | uniq > intNameandextName.fromdb.sort
wc -l intNameandextName*
# 340039 intNameandextName.fromdb.sort
# 167858 intNameandextNamefromCtgNames.sort
comm -12 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
> intandextindbAndCtgNames
wc -l intandextindbAndCtgNames
# 167858 intandextindbAndCtgNames
# there are 167858 name pairs common between the file and the database
# and this is the same number of name pairs as in the data file
diff intandextindbAndCtgNames intNameandextNamefromCtgNames.sort
# no difference between those name pairs from the data file and those that
# are common between the data file and the database so all internal and
# external names from ctgNames file are in the database
# get the list of extra ones from db
comm -23 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
> intandextNamesindbNotinCtgNames
wc -l intandextNamesindbNotinCtgNames
# 172181 intandextNamesindbNotinCtgNames
# get list of internal names from the clonemarkers file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.27.07.05.txt | sort | uniq \
> clonemarkers.intName.sort
wc -l clonemarkers.intName.sort
# 13471 clonemarkers.intName.sort
# compare these intNames to those from the database not in the ctgnames file
comm -12 clonemarkers.intName.sort intandextNamesindbNotinCtgNames
# none of these clone markers internal names are in this list so they
# must all be in the ctgnames file too. These extra internal names will be
# translations of external names found in the list of mappings of BAC clones
# to chroms.
# Check that all the BAC clone external names from the list of chromosome
# mappings and from the ctgnames file are in the database.
# get all extNames from baclones.namesandchrom.uniq and from ctgnames
awk '{print $1}' ../bacClones.namesandchrom.uniq > \
extNames.ctgnamesandbacClones
awk 'BEGIN {FS="|"} {print $3;}' ../ctgnames.27.07.05.txt \
>> extNames.ctgnamesandbacClones
wc -l extNames.ctgnamesandbacClones
# 510169 extNames.ctgnamesandbacClones
sort extNames.ctgnamesandbacClones | uniq \
> extNames.ctgnamesandbacClones.sort
wc -l extNames.ctgnamesandbacClones.sort
# 340039 extNames.ctgnamesandbacClones.sort
# get extNames from the database
hgsql -N -e 'select name from bacCloneXRef;' danRer3 | sort | uniq \
> extNames.fromdb.sort
wc -l extNames.fromdb.sort
# 340039 extNames.fromdb.sort
comm -12 extNames.fromdb.sort extNames.ctgnamesandbacClones.sort \
> extNames.fromdbandfiles
wc -l extNames.fromdbandfiles
# 340039 extNames.fromdbandfiles
# find extNames in common from data files and database
diff extNames.fromdb.sort extNames.fromdbandfiles
# no difference, all extNames from files are in db
# Check that all BAC clone internal names from the ctgnames and clonemarkers
# files are in the database
# get internal names from ctgnames and clonemarkers files
awk 'BEGIN {FS="|"} {print $2;}' ../ctgnames.27.07.05.txt \
> intNames.ctgnamesandclonemarkers
awk 'BEGIN {FS="|"} {print $1;}' ../clonemarkers.27.07.05.txt \
>> intNames.ctgnamesandclonemarkers
wc -l intNames.ctgnamesandclonemarkers
# 197743 intNames.ctgnamesandclonemarkers
sort intNames.ctgnamesandclonemarkers | uniq \
> intNames.ctgnamesandclonemarkers.sort
wc -l intNames.ctgnamesandclonemarkers.sort
# 167858 intNames.ctgnamesandclonemarkers.sort
# get internal names from database
hgsql -N -e 'select intName from bacCloneXRef;' danRer3 | sort | uniq \
> intNames.fromdb.sort
wc -l intNames.fromdb.sort
# 340039 intNames.fromdb.sort
# some of these intNames are derived from the corresponding extNames
# all of the intNames from the file should be in the db
comm -12 intNames.fromdb.sort intNames.ctgnamesandclonemarkers.sort \
> intNames.fromdbandfiles
wc -l intNames.fromdbandfiles
# 167858 intNames.fromdbandfiles
diff intNames.fromdbandfiles intNames.ctgnamesandclonemarkers.sort
# no difference, all intNames from files are in db
# Check that all translations are correct between BAC clone
# external and internal names.
# write script to get the prefixes from internal and external names
chmod +x getNamePrefixes.pl
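# (the script is not shown here; a one-line sketch of the idea, run on
# the name/intName pairs selected below - external prefixes end at the
# first "-", internal prefixes are the leading letters:
# awk '{split($1, e, "-"); match($2, /^[A-Za-z]+/); print e[1], substr($2, 1, RLENGTH)}'
# )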
hgsql -N -e 'select name, intName from bacCloneXRef;' danRer3 \
| sort | uniq > extandintNames.fromdb.sort
perl getNamePrefixes.pl < extandintNames.fromdb.sort \
> extandintNames.prefixes
sort extandintNames.prefixes | uniq > extandintNames.prefixes.uniq
# these all look good
# BUSM1 dZ
# CH211 zC
# CH211 zc
# CH73 CHORI
# CT7 bP
# DKEY zK
# DKEY zk
# DKEYP zKp
# RP71 bZ
# XX bY
# zk is an internal name prefix for the external name prefix, DKEY-.
# There is only one example where this is used (DKEY-81G7); it is in
# the ctgnames file and is in the bacCloneXRef table so that is ok.
# All data looks good in these tables now.
# BLASTZ TETRAODON (tetNig1) (DONE, 2005-10-20, hartera)
# REMADE DOWNLOADS FOR net, all.chain AND over.chain AS THEY HAD BEEN DELETED.
# MOVE ALL THE RUN FILES AND OUTPUT FROM THE SAN RUN DIRECTORY TO A DIRECTORY
# ON /cluster/data AS THIS IS MORE PERMANENT. (DONE, 2005-11-17, hartera).
# Tetraodon is quite distant from zebrafish (more distant than
# human/chicken) so use the HoxD55.q matrix for the Blastz alignments.
# Blastz can abridge lineage-specific repeats, but none are
# available between these two fish species.
ssh kkstore02
mkdir -p /cluster/data/danRer3/bed/blastz.tetNig1.2005-10-11
cd /cluster/data/danRer3/bed
ln -s blastz.tetNig1.2005-10-11 blastz.tetNig1
cd /cluster/data/danRer3/bed/blastz.tetNig1
# create a 2bit file for danRer3 with all chroms (1-25 and M) and the
# scaffolds for NA and Un if it does not exist already
cd /cluster/data/danRer3
faToTwoBit [1-9]/chr*.fa [12][0-9]/chr*.fa M/chrM.fa \
Un/scaffoldUn.fa NA/scaffoldNA.fa danRer3ChrUnNAScafs.2bit
ssh hgwdev
# move the 2 bit file for danRer3 to the san if not there already
mkdir -p /san/sanvol1/scratch/danRer3/
mv /cluster/data/danRer3/danRer3ChrUnNAScafs.2bit \
/san/sanvol1/scratch/danRer3/
# also copy over the danRer3 2 bit file for all chroms and the
# lift file for NA and Un scaffolds to chrNA and chrUn.
cp /cluster/data/danRer3/danRer3.2bit /san/sanvol1/scratch/danRer3/
cp /cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
/san/sanvol1/scratch/danRer3/
# also copy over tetraodon sequences to the san; see makeTetNig1.doc
# for making tetNig1ChrContigsRandomScafs.2bit
mkdir -p /san/sanvol1/scratch/tetNig1/contigs
cp /cluster/bluearc/tetNig1/contigs/tetNig1ChrContigsRandomScafs.2bit \
    /san/sanvol1/scratch/tetNig1/contigs/
# make output and run directories
mkdir -p /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun
mkdir -p /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsOut
cd /cluster/data/danRer3/bed/blastz.tetNig1
ln -s /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun
ln -s /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsOut
# use tetraodon sequence in contigs for dynamic masking - see below.
# dynamic masking: M=50. Once a base has been hit at least 50 times,
# it is masked out.
# Blastz danRer3 chroms and scaffolds vs tetNig1 ordered chrom contigs and
# scaffolds from random chromosomes. lift up the tetNig1 contigs to chrom
# level. Then make the chains and then liftUp all the scaffolds to chrom
# level before sorting and merging chains and then netting.
# get all contigs from mapped ordered chroms and make 2bit file
# see makeTetNig1.doc
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
cat << '_EOF_' > DEF
# zebrafish (danRer3) vs. tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
BLASTZ_M=50
BLASTZ_H=2500
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0
# TARGET - zebrafish (danRer3) soft-masked chr1-25 and chrM and scaffolds
SEQ1_DIR=/san/sanvol1/scratch/danRer3/danRer3.2bit
SEQ1_CTGDIR=/san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit
SEQ1_LIFT=/san/sanvol1/scratch/danRer3/liftNAandUnScaffoldsToChrom.lft
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
# 0.5 Mb chunk for target with 5 kb overlap
SEQ1_CHUNK=500000
SEQ1_LAP=5000
# QUERY - Tetraodon (tetNig1)
# soft-masked 500 kb contigs for chroms, scaffolds for randoms
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/contigs/tetNig1ChrContigsRandomScafs.2bit
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=1000000000
SEQ2_LAP=0
BASE=/san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ1_CTGLEN=$BASE/chromsUnNAScafs.sizes
SEQ2_LEN=$BASE/S2.len
TMPDIR=/scratch/tmp
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
cp /cluster/data/danRer3/chrom.sizes ./S1.len
twoBitInfo /san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit \
chromsUnNAScafs.sizes
twoBitInfo \
/san/sanvol1/scratch/tetNig1/contigs/tetNig1ChrContigsRandomScafs.2bit ./S2.len
nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk -smallClusterHub=pk -workhorse=pk -stop cat \
-blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsOut \
`pwd`/DEF >& do.log &
# PID 32339 Start: Tue Oct 11 14:55
# use Hiram's script to kill 4 empty shell commands on Thurs Oct 13th
# /cluster/bin/scripts/findEmpty.sh -r to find
# /cluster/bin/scripts/findEmpty.sh -K to kill
# Fri Oct 14 10:41
# Checking finished jobs
# crashed: 32
# running: 20
# ranOk: 3716
# failed 4 times: 32
# total jobs in batch: 3768
# check problems:
# 141 jobs crashed on host: kkr10u19.kilokluster.ucsc.edu
# Removed this machine with parasol's "remove machine", as over 9000
# jobs had crashed on it for the opossum run.
# run again with para push -retries=20
# By 16:00 on Fri Oct 14, all jobs finished but 2 failed 4 times so repush
# with para push -retries=20.
# para time
# Completed: 3768 of 3768 jobs
# CPU time in finished jobs: 12465019s 207750.32m 3462.51h 144.27d 0.395 y
# IO & Wait Time: 873594s 14559.90m 242.66h 10.11d 0.028 y
# Average job time: 3540s 59.00m 0.98h 0.04d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 19777s 329.62m 5.49h 0.23d
# Submission to last job: 264857s 4414.28m 73.57h 3.07d
ssh pk
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/run.blastz
para time > run.time
# run doBlastzChainNet.pl to continue with cat step since the script
# crashed when some of the jobs failed 4 times.
ssh hgwdev
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk -smallClusterHub=pk -workhorse=pk -continue cat -stop cat \
-blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsOut \
`pwd`/DEF >& doCat.log &
# Took about 7 minutes.
# Now need to liftUp the contigs for tetNig1 to chrom-level but
# not the scaffolds. All the scaffolds will be lifted after the
# chaining step.
ssh kolossus
# liftUp contigs for tetraodon query:
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
mv pslParts pslPartsNotLifted
mkdir /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun/liftedPsl
set dir=/san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun
# use carry for "how" as this will carry items not in liftSpec to dest
# file without translation. lift file is only for contigs not scaffolds.
# use nohead option otherwise psl header added at the top of each file.
# need to add the blastz params header
zcat ./pslPartsNotLifted/part958.lst.psl.gz | head -3 > header
# first lift to pseudo-contig level and then to chroms
foreach f (./pslPartsNotLifted/*.psl.gz)
set g=$f:r:t
zcat $f | liftUp -pslQ -nohead $dir/liftedPsl/${g}.lifted.psl \
/cluster/data/tetNig1/bed/blastzSelf/contigSeqs/500kbcontigs.lft carry stdin
liftUp -pslQ -nohead $dir/liftedPsl/${g}.lifted2.psl \
/cluster/data/tetNig1/jkStuff/liftAll.lft carry $dir/liftedPsl/${g}.lifted.psl
cat header $dir/liftedPsl/${g}.lifted2.psl > $dir/liftedPsl/${g}
rm $dir/liftedPsl/${g}.lifted*
end
# check a couple of files and see that they have the correct number of lines
# then move the contents of this directory to pslParts
mkdir $dir/pslParts
foreach f ($dir/liftedPsl/*.psl)
gzip $f
mv ${f}.gz $dir/pslParts/
end
# carry on with doBlastzChainNet.pl from the chaining step
ssh hgwdev
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
cp DEF DEF.tetraContigs
# edit DEF file so that tetNig1 now has a 2bit file of the chroms and
# scaffolds for randoms in the CTGDIR and also there is a lift file
# for the scaffolds.
cat << '_EOF_' > DEF
# zebrafish (danRer3) vs. tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
BLASTZ_M=50
BLASTZ_H=2500
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0
# TARGET - zebrafish (danRer3) soft-masked chr1-25 and chrM and scaffolds
SEQ1_DIR=/san/sanvol1/scratch/danRer3/danRer3.2bit
SEQ1_CTGDIR=/san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit
SEQ1_LIFT=/san/sanvol1/scratch/danRer3/liftNAandUnScaffoldsToChrom.lft
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
# 0.5 Mb chunk for target with 5 kb overlap
SEQ1_CHUNK=500000
SEQ1_LAP=5000
# QUERY - Tetraodon (tetNig1)
# soft-masked chroms, and scaffolds for randoms
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit
SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/chromsAndScafs/tetNig1ChromsRandomScafs.2bit
SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.lft
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=1000000000
SEQ2_LAP=0
BASE=/san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ1_CTGLEN=$BASE/chromsUnNAScafs.sizes
SEQ2_LEN=$BASE/S2.len
SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.sizes
TMPDIR=/scratch/tmp
#DEBUG=1
'_EOF_'
# if it does not exist already, make the file of sizes for the tetNig1
# chroms and scaffolds.
twoBitInfo \
/san/sanvol1/scratch/tetNig1/chromsAndScafs/tetNig1ChromsRandomScafs.2bit \
/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.sizes
# Also, need to change the sequence sizes file for tetNig1 to the
# chrom sizes and not the scaffolds and contigs sizes.
cp S2.len S2contigsAndScafs.len
cp /cluster/data/tetNig1/chrom.sizes S2.len
# then run doBlastzChainNet.pl script again
nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-fileServer=kolossus \
-continue chainRun \
-chainMinScore=5000 \
`pwd`/DEF >& doChains.log &
# Start: Fri Oct 14 17:47 Finished: Oct 14 17:57
# crashed as one job failed after 4 retries, problem is that
# part958.lst.psl.gz is not recognized as a psLayout file. It is empty
# except for parameter comment lines so it can be ignored.
ssh pk
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/axtChain/run/
para time > run.time
ssh hgwdev
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
# crashed while doing chainMerge, so add a flag to the DEF file to
# indicate that the genomes are in scaffolds and therefore there is a
# large number of chain files. Changed doBlastzChainNet.pl so that if
# this flag is seen, the chain files are concatenated, chainSort is
# used to sort the resulting chain file by score, and chainMergeSort
# is used to renumber the chain IDs so that they are unique.
# chainMergeSort expects chain files sorted by score as input.
# add this line to the DEF file: GENOME_IN_SCAFFOLDS=1
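# In outline, the scaffold-aware merge the modified script performs is
# (a sketch; the intermediate file names are illustrative):
#   find axtChain/run/chain -name "*.chain" | xargs cat \
#     | chainSort stdin sortedByScore.chain
#   chainMergeSort sortedByScore.chain | gzip -c \
#     > danRer3.tetNig1.all.chain.gz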
nice ./doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-fileServer=kolossus \
-continue chainMerge \
-chainMinScore=5000 \
`pwd`/DEF >& doChainMergeNet.log &
# Start: Wed Oct 19 12:52 Finish: Oct 19 13:13
# Add a trackDb.ra entry for chainTetNig1 and netTetNig1 and add html
# pages. Modify track descriptions to describe the process using
# scaffolds for danRer3 chrNA and chrUn and the fact that dynamic
# masking was used for the Blastz alignments Edit the README for
# the downloads to add in information about using scaffolds for Blastz
# for danRer3 chrNA and chrUn and for tetNig1 random unordered chroms,
# and how the tetNig1 genome was aligned as a file of contigs for chroms
# and scaffolds for randoms for the Blastz alignments and so that
# each danRer3 chunk was aligned with the whole of the tetraodon
# genome to take advantage of dynamic masking (M=50).
# Finally, run a doBlastzChainNet.pl swap for this to create danRer3
# chains and net tracks on tetNig1 - see makeTetNig1.doc.
# featureBits -chrom=chr2 danRer3 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.746%, chainTetNig1Link 7.167%, both 0.672%, cover 90.17%,
# enrich 12.58x
# featureBits -chrom=chr2 danRer2 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.750%, chainTetNig1Link 4.463%, both 0.621%, cover 82.84%,
# enrich 18.56x
# so better coverage for danRer3 but less enrichment than for danRer2.
# Make the download files for all.chain, over.chain and net again as these
# files have been removed. Put the files on /cluster/data rather than the
# san so that they are not moved again. (hartera, 2005-11-17)
ssh kolossus
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/axtChain
chainMergeSort ./run/chain/*.chain | nice gzip -c \
> danRer3.tetNig1.all.chain.gz
# copy over.chain file from bedOver directory to axtChain directory
cp /cluster/data/danRer3/bed/bedOver/danRer3.tetNig1.over.chain.gz \
/cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/axtChain/
# recreate net file
# make noClass.net: nets without rmsk/class stats, which are added later
chainPreNet danRer3.tetNig1.all.chain.gz ../S1.len ../S2.len \
stdout | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout \
/dev/null | netSyntenic stdin noClass.net
# memory usage 251383808, utime 562 s/100, stime 41
# create net file
ssh hgwdev
cd /cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun/axtChain
netClass -verbose=0 -noAr noClass.net danRer3 tetNig1 danRer3.tetNig1.net
# compress net file
gzip danRer3.tetNig1.net
# Move these files to /cluster/data and remake download links as the
# san is not a permanent storage space.
mv /san/sanvol1/scratch/danRer3/blastzTetNig1/chromsAndScafsRun \
/cluster/data/danRer3/bed/blastz.tetNig1/
# Then change the symlinks in the downloads directory to point to the files
# on /cluster/data
cd /usr/local/apache/htdocs/goldenPath/danRer3/vsTetNig1/axtNet
set runDir=/cluster/data/danRer3/bed/blastz.tetNig1/chromsAndScafsRun
rm *.gz
foreach f ($runDir/axtNet/*.axt.gz)
ln -s $f .
end
cd ..
rm *.gz
foreach f ($runDir/axtChain/*.gz)
ln -s $f
end
# remake the md5sum file
rm md5sum.txt
md5sum *.gz */*.gz > md5sum.txt
# Test Runs for chr2 and chrUn
cd /cluster/data/danRer3/bed/blastz.tetNig1
mkdir -p /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run
ln -s /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run
# create blastz output directory
mkdir -p /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out
ln -s /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out
mkdir /san/sanvol1/scratch/danRer3/chrUnand2
cd /san/sanvol1/scratch/danRer3/chrUnand2
cp ../nib/chr2.nib ../nib/chrUn.nib .
rsync -a --progress /cluster/bluearc/tetNig1/contigs/tetNig1Contigs.2bit \
/san/sanvol1/scratch/tetNig1/contigs/
cd /cluster/data/danRer3/bed/blastz.tetNig1/chrUnand2Run
cat << '_EOF_' > DEF
# zebrafish (danRer3) vs. tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
BLASTZ_M=50
BLASTZ_H=2500
BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0
# TARGET - zebrafish (danRer3) soft-masked chr1-25 and chrM
SEQ1_DIR=/san/sanvol1/scratch/danRer3/chrUnand2
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_IN_CONTIGS=0
# 0.5 Mb chunk for target
SEQ1_CHUNK=500000
SEQ1_LAP=500
# QUERY - Tetraodon (tetNig1)
# soft-masked 500 kb contigs for chroms, scaffolds for randoms
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/contigs/tetNig1Contigs.2bit
SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=1000000000
SEQ2_LAP=0
BASE=/san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
cp /cluster/data/danRer3/chrom.sizes ./S1.len
twoBitInfo \
/san/sanvol1/scratch/tetNig1/contigs/tetNig1Contigs.2bit ./S2.len
nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-fileServer=kolossus \
-stop cat \
-blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out \
-chainMinScore=5000 \
`pwd`/DEF >& do.log &
# PID: 4890 Start: Thu Sep 29 14:50
# ran quickly, 30 mins
# crashed as some jobs crashed and failed after 4 retries so
# push them again.
nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-fileServer=kolossus \
-continue cat \
-stop cat \
-blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out \
-chainMinScore=5000 \
`pwd`/DEF >& doCat.log &
# Took a couple of minutes
# need to lift up the contigs to chrom level for tetNig1
# liftUp contig files for tetraodon query:
# if a file is empty, liftUp gets stuck reading the commented lines,
# so make a list of files which contain alignment data and not just
# comment lines starting with # (blastz parameters)
foreach f (./pslPartsNotLifted/*.psl.gz)
zcat $f | awk '{if ($1 !~ /#/) print "'$f'";}' >> pslParts.lst
end
sort pslParts.lst | uniq > pslPartsNotEmpty.lst
cd /cluster/data/danRer3/bed/blastz.tetNig1/chrUnand2Run
mv pslParts pslPartsNotLifted
mkdir /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run/liftedPsl
set dir=/san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Run
# use carry for "how" as this will carry items not in liftSpec to dest
# file without translation. lift file is only for contigs not scaffolds.
# use nohead option otherwise psl header added at the top of each file.
# need to add the blastz params header
zcat \
./pslPartsNotLifted/chrUn.nib:chrUn:99500000-100000500.psl.gz \
| head -3 > header
# first lift to pseudo-contig level and then to chroms
foreach f (`cat pslPartsNotEmpty.lst`)
set g=$f:r:t
zcat $f | liftUp -pslQ -nohead $dir/liftedPsl/${g}.lifted.psl \
/cluster/data/tetNig1/bed/blastzSelf/contigSeqs/500kbcontigs.lft warn stdin
liftUp -pslQ -nohead $dir/liftedPsl/${g}.lifted2.psl \
/cluster/data/tetNig1/jkStuff/liftAll.lft warn $dir/liftedPsl/${g}.lifted.psl
cat header $dir/liftedPsl/${g}.lifted2.psl > $dir/liftedPsl/${g}
rm $dir/liftedPsl/${g}.lifted*
end
mv liftedPsl pslParts
# need to gzip these again
foreach f (./pslParts/*.psl)
gzip $f
end
# then carry on with chaining for these danRer3 NA and Un scaffolds
# tetNig1.2bit has full chroms for ordered chroms
# and randoms as scaffolds
cp DEF DEF.contigs
# copy over 2bit file with chroms for tetNig1 if not
# there already.
mv S2.len S2.contigs
twoBitInfo \
/san/sanvol1/scratch/tetNig1/tetNig1.2bit ./S2.len
nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-fileServer=kolossus \
-continue chainRun \
-stop net \
-blastzOutRoot /san/sanvol1/scratch/danRer3/blastzTetNig1/chrUnand2Out \
-chainMinScore=5000 \
`pwd`/DEF >& doNet.log &
# PID 1117 Start: Thu Sep 29 16:20 Finished: 16:24
# crashed: says it can't find [danRer3.tetNig1.]all.chain[.gz] but it
# is there.
nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-fileServer=kolossus \
-continue net \
-stop net \
-chainMinScore=5000 \
`pwd`/DEF >& doNet2.log &
# Took 1 minute
# TO DO: load tables
cd /cluster/data/danRer3/bed/blastz.tetNig1/chrUnand2Run/axtChain/chain
foreach f (*.chain)
set c=$f:r
hgLoadChain danRer3 ${c}_chainTetNig1NoScafs $f
end
cd /cluster/data/danRer3/bed/blastz.tetNig1/chrUnand2Run/axtChain
# add gap/repeat stats to net file using db tables
netClass -verbose=0 -noAr noClass.net danRer3 tetNig1 danRer3.tetNig1.net
# load nets
netFilter -minGap=10 danRer3.tetNig1.net \
| hgLoadNet -verbose=0 danRer3 netTetNig1NoScafs stdin
# then need to load chains and net into browser with a different name
# featureBits -chrom=chr2 danRer3 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.742%, chainTetNig1Link 7.166%, both 0.670%, cover 90.26%,
# enrich 12.60x
# featureBits -chrom=chr2 danRer3 refGene:cds chainTetNig1NoScafsLink -enrichment
# refGene:cds 0.742%, chainTetNig1NoScafsLink 7.171%, both 0.670%, cover 90.30%, enrich 12.59x
# featureBits -chrom=chrUn danRer3 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.497%, chainTetNig1Link 6.175%, both 0.441%, cover 88.68%, enrich 14.36x
# featureBits -chrom=chrUn danRer3 refGene:cds chainTetNig1NoScafsLink -enrichment
# refGene:cds 0.497%, chainTetNig1NoScafsLink 6.179%, both 0.441%, cover 88.67%, enrich 14.35x
# Rows in chainTetNig1Link:
#             tetNig1    tetNig1NoScafs
# chr2         308576            303236
# chrUn       1133922           1114061
# nets:
# featureBits -chrom=chr2 danRer3 refGene:cds netTetNig1 -enrichment
# refGene:cds 0.742%, netTetNig1 62.053%, both 0.715%, cover 96.34%, enrich 1.55x
# featureBits -chrom=chr2 danRer3 refGene:cds netTetNig1NoScafs -enrichment
# refGene:cds 0.742%, netTetNig1NoScafs 63.095%, both 0.717%, cover 96.63%, enrich 1.53x
# featureBits -chrom=chrUn danRer3 refGene:cds netTetNig1 -enrichment
# refGene:cds 0.497%, netTetNig1 48.803%, both 0.477%, cover 95.87%, enrich 1.96x
# featureBits -chrom=chrUn danRer3 refGene:cds netTetNig1NoScafs -enrichment
# refGene:cds 0.497%, netTetNig1NoScafs 49.207%, both 0.478%, cover 96.01%, enrich 1.95x
# Rows in netTetNig1:
#             tetNig1    tetNig1NoScafs
# chr2          17370             17415
# chrUn         56259             56360
# featureBits -chrom=chr2 danRer2 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.739%, chainTetNig1Link 4.463%, both 0.617%, cover 83.44%,
# enrich 18.69x
# featureBits -chrom=chr2 danRer3 refGene:cds chainNoHoxD55TetNig1Link -enrichment
# refGene:cds 0.668%, chainNoHoxD55TetNig1Link 4.815%, both 0.587%,
# cover 87.95%,enrich 18.27x
# featureBits -chrom=chr2 danRer3 refGene:cds chainHoxD55TetNig1Link -enrichment
# refGene:cds 0.668%, chainHoxD55TetNig1Link 7.846%, both 0.612%, cover 91.71%, enrich 11.69x
# HoxD55.q with mm6 parameters but H=2500:
# featureBits -chrom=chr2 danRer3 refGene:cds chainHoxD55v2TetNig1Link -enrichment
# refGene:cds 0.668%, chainHoxD55v2TetNig1Link 7.400%, both 0.601%,
# cover 90.10%,enrich 12.18x
# if H=2000 is used, one blastz job does not finish even after a day.
# Using the mm6 parameters makes little difference.
# Database Table Number of chains
# danRer2 chr2_chainTetNig1 21176
# danRer3 chr2_chainNoHoxD55TetNig1 16076
# danRer3 chr2_chainHoxD55TetNig1 23951
# danRer3 chr2_chainHoxD55v2TetNig1 21378
# also there are more lower scoring chains with HoxD55 alone than for
# no HoxD55 or using the mm6 parameters with HoxD55. However, using HoxD55
# seems to increase the number of higher scoring chains.
# BLASTZ, CHAIN AND NET FOR OPOSSUM (monDom2) (DONE, 2005-10-18, hartera)
# MOVE ALL THE RUN FILES AND OUTPUT FROM THE SAN RUN DIRECTORY TO A DIRECTORY
# ON /cluster/data AS THIS IS MORE PERMANENT. (DONE, 2005-11-17, hartera).
ssh kkstore02
mkdir -p /cluster/data/danRer3/bed/blastz.monDom2.2005-10-07
cd /cluster/data/danRer3/bed
ln -s blastz.monDom2.2005-10-07 blastz.monDom2
# create a 2 bit for danRer3 with all chroms (1-25 and M) and the
# scaffolds for NA and Un.
cd /cluster/data/danRer3
faToTwoBit [1-9]/chr*.fa [12][0-9]/chr*.fa M/chrM.fa \
Un/scaffoldUn.fa NA/scaffoldNA.fa danRer3ChrUnNAScafs.2bit
ssh hgwdev
mkdir -p /san/sanvol1/scratch/danRer3/
mv /cluster/data/danRer3/danRer3ChrUnNAScafs.2bit \
/san/sanvol1/scratch/danRer3/
# make output and run directories
mkdir -p /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsRun
mkdir -p /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsOut
cd /cluster/data/danRer3/bed/blastz.monDom2
ln -s /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsRun
ln -s /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsOut
cd chromsAndScafsRun
cat << '_EOF_' > DEF
# zebrafish (danRer3) vs opossum (monDom2)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
ALIGN=blastz-run
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0
# TARGET - zebrafish (danRer3) soft-masked chroms 1-25 and chrM, and
# scaffolds for NA and Un
SEQ1_DIR=/san/sanvol1/scratch/danRer3/danRer3.2bit
SEQ1_CTGDIR=/san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit
SEQ1_LIFT=/san/sanvol1/scratch/danRer3/liftNAandUnScaffoldsToChrom.lft
SEQ1_RMSK=
# lineage-specific repeats
# we don't have that information for these species
SEQ1_SMSK=
SEQ1_FLAG=
SEQ1_LIMIT=30
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY - Opossum (monDom2)
# soft-masked sequence in scaffolds
SEQ2_DIR=/san/sanvol1/scratch/monDom2/monDom2.2bit
SEQ2_SMSK=
SEQ2_FLAG=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsRun
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ1_CTGLEN=$BASE/chromsUnNAScafs.sizes
SEQ2_LEN=$BASE/S2.len
TMPDIR=/scratch/tmp
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
cp /cluster/data/danRer3/chrom.sizes S1.len
twoBitInfo /san/sanvol1/scratch/danRer3/danRer3ChrUnNAScafs.2bit \
chromsUnNAScafs.sizes
cp /cluster/data/monDom2/chrom.sizes S2.len
# now do the run
nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-fileServer=kolossus \
-stop cat \
-blastzOutRoot /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsOut \
-chainMinScore=5000 \
`pwd`/DEF >& do.log &
# chromsAndScafs PID 19811 Start: Fri Oct 7 15:16
# Friday Oct 14th 10:30 -
# Checking finished jobs
# crashed: 3271
# ranOk: 90399
# failed 4 times: 3271
# total jobs in batch: 93670
# more than 9000 crashed on one machine: kkr10u19.kilokluster.ucsc.edu
# so remove this machine.
# run again with para push -retries=20
# still 7 jobs crashed so repush again with para push -retries=20
# Now try using the SEQ1_LIMIT option in the DEF file to limit the
# number of sequences in a partition file to 30. Before, there would
# be a lot of small sequences in a partition file, which would take a
# long time to run.
# finished around 21:40 Fri Oct 14. Took about 7 days, maybe a little
# less, as a number of jobs crashed last night.
# carry on from the cat step to the end
ssh pk
cd /cluster/data/danRer3/bed/blastz.monDom2/chromsAndScafsRun/run.blastz
para time > run.time
# para time
# Completed: 93670 of 93670 jobs
# CPU time in finished jobs: 55738486s 928974.77m 15482.91h 645.12d 1.767 y
# IO & Wait Time: 1276213s 21270.22m 354.50h 14.77d 0.040 y
# Average job time: 609s 10.14m 0.17h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 1470s 24.50m 0.41h 0.02d
# Submission to last job: 627367s 10456.12m 174.27h 7.26d
ssh hgwdev
cd /cluster/data/danRer3/bed/blastz.monDom2/chromsAndScafsRun
nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-fileServer=kolossus \
-continue cat \
-blastzOutRoot /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsOut \
-chainMinScore=5000 \
`pwd`/DEF >& doCatChainNet.log &
# Took 13 minutes to cat then chain. 70 jobs crashed at the chaining
# step. Their input files are empty - when axtChain opens them using
# pslxFileOpenWithMeta (in psl.c) it aborts, as the file contains nothing
# but meta data and is therefore not in psLayout format. Ignore these
# crashed jobs for now and then modify psl.c to skip over such empty files.
# Next, the script crashed on the chainMergeSort step
# since there are too many chains due to opossum being scaffold-based.
# chainMergeSort opens all the files at once.
# Added a flag to the DEF file to show if an assembly is scaffold-based:
# GENOME_IN_SCAFFOLDS=1
# and then modify doBlastzChainNet.pl so that, if it sees this flag, the
# chains are merged into one file, chainSort is run to sort that file,
# and then chainMergeSort re-assigns the chain IDs so they are unique
# (chainMergeSort assumes that its input files are already sorted).
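# For reference, a minimal sketch of that merge for a scaffold-based
# assembly (illustrative paths; assumes the per-chunk chain files are
# under run/chain):
#   find ./run/chain -name "*.chain" | xargs cat > all.chain
#   chainSort all.chain all.sorted.chain
#   chainMergeSort all.sorted.chain > all.uniqueIds.chain
# chainSort orders the chains by score, which is the order that
# chainMergeSort expects; chainMergeSort then re-assigns unique IDs.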
nice ./doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-fileServer=kolossus \
-continue chainMerge \
-chainMinScore=5000 \
`pwd`/DEF >& doChainMergeNet.log &
# Start: Tue Oct 18 12:55 Finished: 15:02
# add trackDb.ra entries for monDom2 chain and net tracks and add html for
# these tracks too. Modified html pages to describe the process using
# scaffolds for chrUn and chrNA for danRer3.
# Modify the downloads README.txt to include a description of the process
# of running blastz with scaffolds for the chrUn and chrNA unordered chroms.
# Finally run the swap for this to get danRer3 chains and net tracks
# on monDom2 - see makeMonDom2.doc.
# Move the run directory files to /cluster/data and remake download links
# as the san is not a permanent storage space (hartera, 2005-11-17)
ssh hgwdev
mv /san/sanvol1/scratch/danRer3/blastzMonDom2/chromsAndScafsRun \
/cluster/data/danRer3/bed/blastz.monDom2/
# then change the symlinks in the downloads directory to point to the files
# on /cluster/data
cd /usr/local/apache/htdocs/goldenPath/danRer3/vsMonDom2/axtNet
set runDir=/cluster/data/danRer3/bed/blastz.monDom2/chromsAndScafsRun
rm *.gz
foreach f ($runDir/axtNet/*.axt.gz)
ln -s $f .
end
cd ..
rm *.gz
foreach f ($runDir/axtChain/*.gz)
ln -s $f
end
# remake the md5sum file
rm md5sum.txt
md5sum *.gz */*.gz > md5sum.txt
# RADIATION HYBRID (RH) MAP TRACK (DONE, 2005-09-06, hartera)
# Data from Leonard Zon's lab at the Childrens Hospital, Boston
# Provided by Anhua Song: asong@enders.tch.harvard.edu
# Updated data provided on 2006-02-23
ssh kkstore02
mkdir -p /cluster/data/danRer3/bed/ZonLab/rhMap
cd /cluster/data/danRer3/bed/ZonLab/rhMap
# download data from e-mail to this directory
# new sequences (2006-02-23) are available
unzip rhSequenceSubmit022306.zip
# sequences are in rhSequenceSubmit022306/rhSequenceSubmitSeq022306.txt
# primer information is in rhSequenceSubmit022306/rhSequenceSubmit022306.txt
mv rhSequenceSubmitSeq022306.txt rhMap022306.fa
mv rhSequenceSubmit022306.txt rhMapPrimers022306.txt
# first remove ^M from end of lines
dos2unix rhMap022306.fa
dos2unix rhMapPrimers022306.txt
grep '>' rhMap022306.fa | wc -l
# 11514
wc -l rhMapPrimers022306.txt
# 13438 rhMapPrimers022306.txt
grep '>' rhMap022306.fa > rhMap.names
# remove '>' from names and grab first field
perl -pi.bak -e 's/>//' rhMap.names
awk 'BEGIN {FS="|"} {print $1;}' rhMap.names | sort | uniq \
> rhMap.namesOnly.sort
awk 'BEGIN {FS="|"} {print $1;}' rhMapPrimers022306.txt | sort | uniq \
> rhMapPrimers.namesOnly.sort
wc -l *.sort
# 11514 rhMap.namesOnly.sort
# 13436 rhMapPrimers.namesOnly.sort (after removing blank line)
# There are no replicates this time for rhMap sequences but there are for
# the primers set:
awk 'BEGIN {FS="|"} {print $1;}' rhMapPrimers022306.txt | sort | uniq -c \
| sort -nr > rhMapPrimers.names.count
# The apparent replicates are just blank lines, so there are no true
# replicates. Total: 11514 sequences in rhMap, but 13436 primer sets.
# get a list of headers from the FASTA file
grep '>' rhMap022306.fa > rhMap.headers
awk 'BEGIN {FS="|"} {print $5;}' rhMap.headers | sort | uniq
# BAC_END
# EST
# GENE
# SSLP
# STS
# 5 types of sequence
awk 'BEGIN {FS="|"} {print $9;}' rhMap.headers | sort | uniq
# BACends
# Custom
# Insertion_Mutant
# Insertion_Mutants
# MGH
# NCBI
# Sanger SG
# Sequencing_Project
# ThisseClone
# Thisse_Clone
# other_zfEst
# wu_zfEst
# wz
# Insertion_Mutant = Insertion_Mutants; ThisseClone = Thisse_Clone;
# So there are 11 different sources.
awk 'BEGIN {FS="|"} {print $10;}' rhMap.headers | sort | uniq
# CHBG
# MPIEB
# There are 2 sequences with problem primers. E-mailed Peter Song about
# these and he suggested deleting those primers:
# >fb33f01.u1|5|388|5615|EST|f|cR|f|wu_zfEst|CHBG|+++33333333333333333333.|
# >zfishb-a976e04.p1c|14|16|158|STS|f|cR|f|Sequencing_Project|CHBG|A|A|
# edit rhMap022306.fa and rhMapPrimers022306.txt and delete these primers.
# need to reformat FASTA headers so they are in the format:
# NAME.SOURCE.TYPE.ORIGIN
# Insertion_Mutant=Insertion_Mutants; Thisse_Clone=ThisseClone
# so change these to have the same name. Also shorten Sanger SG to Shotgun.
# NOTE: replace the longer name first; otherwise the first substitution
# would turn "Insertion_Mutants" into "InsertMuts" and the second would
# never match.
perl -pi.bak -e 's/Insertion_Mutants/InsertMut/' rhMap022306.fa
perl -pi.bak -e 's/Insertion_Mutant/InsertMut/' rhMap022306.fa
perl -pi.bak -e 's/Sanger SG/Shotgun/' rhMap022306.fa
perl -pi.bak -e 's/ThisseClone/Thisse/' rhMap022306.fa
perl -pi.bak -e 's/Thisse_Clone/Thisse/' rhMap022306.fa
perl -pi.bak -e 's/Sequencing_Project/Seqproj/' rhMap022306.fa
# use a script to reformat the names for the FASTA headers to the format
# >NAME.SOURCE where name is the first field separated by "|" and source
# is the 9th field. The source is used to make the name unique. Some
# of these names are BAC ends that occur in the BAC ends track so there
# are name clashes in the seq table if the names are not made unique.
# Also make the name upper case as for those for the danRer1 and danRer2
# RH map.
cat << '_EOF_' > rhFix
#!/usr/bin/awk -f
#>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
/^>/ {
split(toupper($0), a, "\\|");
print a[1]"."a[9];
next;
}
/^[0-9]+ / {
$0 = $2;
}
{
print $0;
}
'_EOF_'
# << keep emacs coloring happy
chmod +x rhFix
./rhFix rhMap022306.fa > rhMap.fa
# Blat sequences vs danRer3 genome
ssh pk
mkdir -p /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun
# make output directory
mkdir -p /san/sanvol1/scratch/danRer3/rhMap/psl
cd /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun
ln -s /san/sanvol1/scratch/danRer3/rhMap/psl .
# copy input to the san
cp \
/cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/rhMap.fa \
/san/sanvol1/scratch/danRer3/rhMap/
# do the blat run to align RH map sequences to danRer3 and do separate
# runs for chroms and scaffolds from chrUn and chrNA
ls -1S /san/sanvol1/scratch/danRer3/rhMap/rhMap.fa > rhMap.lst
ls -1S /san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/chr[0-9M]*.fa \
> genome.lst
# use the individual scaffolds for chrUn and chrNA alignments
foreach f (/san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/Zv5_*.fa)
ls -1S $f >> genome.lst
end
wc -l genome.lst
# 15149 genome.lst
cp -p /cluster/data/danRer3/bed/ooc/danRer3_10.ooc \
/san/sanvol1/scratch/danRer3
# try same parameters as for BAC ends
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/blat {check in line+ $(path1)} {check in line+ $(path2)} -tileSize=10 -ooc=/san/sanvol1/scratch/danRer3/danRer3_10.ooc {check out line+ /san/sanvol1/scratch/danRer3/rhMap/psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 genome.lst rhMap.lst gsub spec
para create spec
para try, check, push, check etc.
# para time
# Completed: 15149 of 15149 jobs
# CPU time in finished jobs: 16326s 272.09m 4.53h 0.19d 0.001 y
# IO & Wait Time: 41360s 689.34m 11.49h 0.48d 0.001 y
# Average job time: 4s 0.06m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 61s 1.02m 0.02h 0.00d
# Submission to last job: 263s 4.38m 0.07h 0.00d
cd /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun
# Make & check the psl table
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create rhmap.psl
pslSort dirs raw.psl tmp psl
pslReps -nearTop=0.0001 -minAli=0.80 -minCover=0.20 raw.psl \
contig.psl /dev/null
# There are 11514 sequences in total in rhMap.fa
# Experimented with different parameters:
# little difference if STS markers BLAT parameters were used
# i.e. -ooc=11.ooc and -stepSize=5.
# For Blat parameters used above (-ooc=10.ooc and -tileSize=10), try
# different pslReps parameters using minCover=0.40 and nearTop=0.0001:
# minAli=0.96, 83%, most aligned sequence has 11 alignments.
# minAli=0.90, 88% align, most aligned seq has 11 alignments
# minAli=0.80, 88%, 10120 sequences aligned.
# at minAli=0.50, there are still 10120 sequences aligned so those that
# are not aligning must have very low sequence identity. Took a look at
# some that are not aligning e.g. 2217C, 2791C and these are not passing
# the minCover=0.40 criterion. Some sequences have Ns in them too
# e.g. ZC92E13.YBF so has a lot of short alignments that do not pass
# the minCover parameter. Lowering minCover increases the number of
# sequences aligned:
# minAli=0.80, minCover=0.20, there are 10850 (94%) of sequences aligned.
# minAli=0.90, minCover=0.20, there are 10837 (94%) of sequences aligned
# with 21 fewer alignments than for minAli=0.80.
# Most alignments for one sequence is 99, second most is 11. There are
# about 1851 sequences with more than 1 alignment (many of these
# have 2 alignments) while for minAli=0.80 and minCover=0.40, there were
# 1266 sequences with more than 1 alignment. With lower minCover, more
# sequences align, but there are more sequences with higher numbers of
# multiple alignments. At minCover=0.0, there is 1 sequence with 1353
# alignments, the second largest number of alignments for 1 sequence
# is 532, then 329 etc. So use minAli=0.80 and minCover=0.20 to get the
# most sequences aligned without having sequences aligning too many times.
# at minAli=0.80 and minCov=0.20, there are 10850 sequences aligned (94%).
# 88% of sequences were aligned for danRer2.
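# For the record, the pslReps sweep above can be scripted along these
# lines (sketch only; the flt.$p.psl names are made up, and tail -n +6
# drops the 5-line psLayout header, if present, before counting
# distinct query names in column 10):
#   for p in 0.96 0.90 0.80
#   do
#     pslReps -nearTop=0.0001 -minAli=$p -minCover=0.20 raw.psl \
#       flt.$p.psl /dev/null
#     tail -n +6 flt.$p.psl | cut -f 10 | sort -u | wc -l
#   done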
# merge together liftAll and scaffolds lift then lift psl to chrom level.
cat /cluster/data/danRer3/liftSuperToChrom/liftNAandUnScaffoldsToChrom.lft \
    /cluster/data/danRer3/jkStuff/liftAll.lft \
    > /cluster/data/danRer3/jkStuff/liftAllPlusliftScaffolds.lft
liftUp rhMap.psl \
/cluster/data/danRer3/jkStuff/liftAllPlusliftScaffolds.lft \
warn contig.psl
# Got 30168 lifts
pslCheck rhMap.psl
# psl is ok
# Load sequence alignments into database.
ssh hgwdev
cd /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun
# drop old table and reload (hartera, 2006-03-26)
echo "drop table rhMap;" | hgsql danRer3
hgLoadPsl danRer3 rhMap.psl
# cleanup
rm -r /san/sanvol1/scratch/danRer3/rhMap/psl
rm psl para.results batch batch.bak spec
rm -r err
gzip *.psl
# Copy sequences to gbdb if they are not already there.
mkdir -p /gbdb/danRer3/rhMap
ln -s \
/cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/rhMap.fa \
/gbdb/danRer3/rhMap/rhMap022306.fa
# then add sequences to database:
# reloaded (hartera, 2006-03-26)
hgLoadSeq danRer3 /gbdb/danRer3/rhMap/rhMap022306.fa
# Note: first time these sequences were loaded there was a problem
# 2215 are not loaded into database, these all
# have names with extensions like .YB, .YC etc. so remove from extFile
# and seq. Sequences with the same IDs are already in the seq table
# for the BAC ends tracks so need to make these RH map names unique.
hgsql -e 'delete from seq where extFile = 736113;' danRer3
hgsql -e 'delete from extFile where id = 736113;' danRer3
hgsql -e 'update history set errata = "Removed sequences. Error so not all sequences loaded." where ix = 23;' danRer3
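# (To avoid hard-coding the extFile id, it can be looked up first from
# the standard extFile schema (id, name, path, size), e.g.:
#   hgsql -N -e 'select id, name from extFile where name like "%rhMap%";' \
#       danRer3
# and the returned id used in the delete statements above.)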
# Check that all the headers from rhMap.headers are also in the primers
# file which seems to contain the same headers from the FASTA file
# as well as additional markers.
ssh kkstore02
cd /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306
perl -pi.bak -e 's/>//' rhMap.headers
sort rhMap.headers > rhMap.headers.sort
sort rhMapPrimers022306.txt > rhMapPrimers.sort
wc -l *.sort
# 11514 rhMap.headers.sort
# 13437 rhMapPrimers.sort
comm -12 rhMap.headers.sort rhMapPrimers.sort | wc -l
# 11514 in common
# so all FASTA headers from rhMap022306.fa are in the primers file
# Get headers again from rhMap.fa file as the names of the sources have
# been changed. Parse out information from headers to add to an rhMapInfo
# table so that this information can be displayed on the details page for
# the RH map markers.
# Fields used from the FASTA header: 1 - name, 2 - linkage group (chrom),
# 3 - position number on the RH map for that linkage group, 4 - distance
# (in cR) from the top of the linkage group (i.e. position in the entire
# RH map, ordered from LG1 to LG25), 5 - type of marker (SSLP, BAC_END,
# EST, GENE, STS), 9 - source, 10 - institute that mapped the marker,
# 11 - 5' forward primer, 12 - 3' reverse primer.
# Sort headers by linkage group and by position
grep '>' rhMap022306.fa > rhMap.headers2
# then use the rhMap.headers2 file to extract the marker information
# and to reformat the names for the FASTA headers to the format
# >NAME.SOURCE where name is the first field separated by "|" and source
# is the 9th field so that names in the rhMap and rhMapInfo tables are
# the same. The source is used to make the name unique.
cat << '_EOF_' > getRhInfo
#!/usr/bin/awk -f
#>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
/^>/ {
sub(/>/,"",$0);
split(toupper($0), a, "\\|");
print a[1]"."a[9]"\tLG"a[2]"\t"a[3]"\t"a[4]"\t"a[5]"\t"a[9]"\t"a[10]"\t"a[11]"\t"a[12];
next;
}
'_EOF_'
# << keep emacs coloring happy
chmod +x getRhInfo
./getRhInfo rhMap.headers2 > rhMapInfo.tab
# Sort headers by linkage group (LG) and by position
sort -k 2,2 -k 3,3n rhMapInfo.tab > rhMapInfoSorted.tab
wc -l rhMapInfoSorted.tab
# 11514 rhMapInfoSorted.tab
ssh hgwdev
# Create a table with RH map item information including type, source,
# origin and primer sequences.
cat << 'EOF' > ~/kent/src/hg/lib/rhMapInfo.as
table rhMapInfo
"Radiation Hybrid map information"
(
string name; "Name of Radiation Hybrid (RH) map marker"
string linkageGp; "Linkage group to which the marker was mapped"
uint position; "Position number in RH map for this linkage group"
uint distance; "Distance from the top of linkage group (cR)"
string markerType; "Type of marker"
string source; "Source of marker"
string mapSite; "Institution that mapped the marker"
string leftPrimer; "Forward primer sequence"
string rightPrimer; "Reverse primer sequence"
)
'EOF'
# << happy emacs
# create .sql, .c and .h files using autoSql
autoSql rhMapInfo.as rhMapInfo
mv rhMapInfo.h ../inc
# rhMapInfo.sql - name is the primary key
# commit rhMapInfo.as, .sql, .c and .h files to CVS.
# create and load table (Reloaded: hartera, 2006-03-26)
cd /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306
echo "drop table rhMapInfo;" | hgsql danRer3
hgsql danRer3 < ~/kent/src/hg/lib/rhMapInfo.sql
hgsql -e \
'load data local infile "rhMapInfoSorted.tab" into table rhMapInfo' danRer3
# edit danRer3/trackDb.ra to add rhMap track and the search spec.
# add and edit rhMap.html to describe the info data.
# edit ~/kent/src/hg/hgc/hgc.c so that the rhMapInfo data is displayed
# on the details page for each marker - edit doRHmap function.
# Add a rule to all.joiner to check that all names in rhMap also appear
# in rhMapInfo (a sketch of such a rule is below).
# commit these to CVS.
# Changed termRegex for rhMap search in trackDb.ra so that it works
# for all IDs. (2006-04-19, hartera)
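# For reference, the all.joiner rule has this general shape (sketch only;
# follow the syntax of the existing identifiers in all.joiner):
# identifier rhMapId
# "RH map marker names shared by rhMap and rhMapInfo"
#     danRer3.rhMap.qName
#     danRer3.rhMapInfo.name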
# SELF BLASTZ, CHAIN, NET, AXTNET, MAFNET AND DOWNLOADS
# (DONE, 2005-12-02, hartera)
ssh pk
mkdir -p /cluster/data/danRer3/bed/blastzSelf.2005-11-30
cd /cluster/data/danRer3/bed
ln -s blastzSelf.2005-11-30 blastzSelf
cd /cluster/data/danRer3/bed/blastzSelf
# make run directory on the san
mkdir -p /san/sanvol1/scratch/danRer3/blastzSelf/chromsRun
ln -s /san/sanvol1/scratch/danRer3/blastzSelf/chromsRun
# make 2 bit file of chr1-25 and chrM
cd /cluster/data/danRer3
faToTwoBit [1-9]/chr*.fa [12][0-9]/chr*.fa M/chrM.fa \
/san/sanvol1/scratch/danRer3/danRer3Chroms1to25andM.2bit
cd /cluster/data/danRer3/bed/blastzSelf/chromsRun
twoBitInfo /san/sanvol1/scratch/danRer3/danRer3Chroms1to25andM.2bit S1.len
cp S1.len S2.len
cat << '_EOF_' > DEF
# zebrafish vs zebrafish
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_L=5000
BLASTZ_H=2500
BLASTZ_M=50
BLASTZ_ABRIDGE_REPEATS=0
# TARGET: Zebrafish danRer3
SEQ1_DIR=/san/sanvol1/scratch/danRer3/danRer3Chroms1to25andM.2bit
SEQ1_IN_CONTIGS=0
SEQ1_LIMIT=30
SEQ1_CHUNK=500000
SEQ1_LAP=5000
# QUERY: Zebrafish danRer3
SEQ2_DIR=/san/sanvol1/scratch/danRer3/danRer3Chroms1to25andM.2bit
SEQ2_SELF=1
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=1800000000
SEQ2_LAP=0
BASE=/san/sanvol1/scratch/danRer3/blastzSelf/chromsRun
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
TMPDIR=/scratch/tmp
'_EOF_'
chmod +x DEF
ssh hgwdev
cd /cluster/data/danRer3/bed/blastzSelf/chromsRun
nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-fileServer=kolossus \
-chainMinScore=5000 \
-chainLinearGap=medium \
`pwd`/DEF >& do.log &
# Start: Wed Nov 30 17:07 Finish: Thur Dec 1 06:51
# Crashed at the downloads step as downloads exist from a previous run,
# so remove them:
rm -r /usr/local/apache/htdocs/goldenPath/danRer3/vsSelf
# para time (blastz)
# Completed: 2425 of 2425 jobs
# CPU time in finished jobs: 4783120s 79718.66m 1328.64h 55.36d 0.152 y
# IO & Wait Time: 108014s 1800.24m 30.00h 1.25d 0.003 y
# Average job time: 2017s 33.62m 0.56h 0.02d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 2762s 46.03m 0.77h 0.03d
# Submission to last job: 14993s 249.88m 4.16h 0.17d
# para time (axtChain)
# Completed: 26 of 26 jobs
# CPU time in finished jobs: 96405s 1606.74m 26.78h 1.12d 0.003 y
# IO & Wait Time: 731s 12.19m 0.20h 0.01d 0.000 y
# Average job time: 3736s 62.27m 1.04h 0.04d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 7405s 123.42m 2.06h 0.09d
# Submission to last job: 7411s 123.52m 2.06h 0.09d
# Carry on from downloads step.
cd /cluster/data/danRer3/bed/blastzSelf/chromsRun
nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk \
-smallClusterHub=pk \
-workhorse=pk \
-fileServer=kolossus \
-continue download \
-chainMinScore=5000 \
-chainLinearGap=medium \
`pwd`/DEF >& doDownloads.log &
# Took 2 minutes.
# check trackDb entry exists. Put html at danRer3 level of trackDb and edit
# these and the downloads README to state that chrNA and chrUn were not
# aligned for this track.
# Remove extra downloads made by script:
# Only chain track is pushed to the RR so remove the net and axtNet
# downloads, re-make md5sum.txt and edit README.txt accordingly.
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/danRer3/vsSelf
rm danRer3.danRer3.net.gz md5sum.txt
rm -r axtNet
md5sum *.gz > md5sum.txt
# Original run with loose linear gap matrix and scaffolds for chrNA and chrUn
# done 2005-10-26.
# filtering chains from above on minScore 10,000. done 2005-11-18
# Using the medium linear gap matrix for axtChain. minScore=5,000.
# done 2005-11-30.
# chainSelf - loose linearGap matrix, filtered minScore=5000
# chainSelfFilt10k - loose linearGap matrix, filtered minScore=10000
# chainSelfMedGap - medium linearGap matrix, filtered minScore=5000
# featureBits -chrom=chr1 danRer3 refGene:cds chainSelfLink -enrichment
# refGene:cds 0.743%, chainSelfLink 65.056%, both 0.560%, cover 75.29%,
# enrich 1.16x
# featureBits -chrom=chr1 danRer3 refGene:cds chainSelfFilt10kLink -enrichment
# refGene:cds 0.743%, chainSelfFilt10kLink 64.019%, both 0.554%, cover 74.54%,
# enrich 1.16x
# number of rows in tables for chr1:
# chainSelf 941416
# chainSelfFilt10k 530292
# chainSelfMedGap 997525
# chainSelfLink 9110071
# chainSelfFilt10kLink 7226815
# chainSelfMedGapLink 9149100
# featureBits -chrom=chr1 danRer3 refGene:cds chainSelfMedGapLink -enrichment
# refGene:cds 0.743%, chainSelfMedGapLink 64.525%, both 0.549%, cover 73.80%,
# enrich 1.14x
# so the medium linearGap matrix increases the number of chains by about 5%
# but coverage is little different.
# for the chains filtered with minScore=10000
# 12192577 chains out of 17592225 do not have chrNA or chrUn as query or
# target which is about 69%.
# 12192577 out of 12807964 do not have chrNA or chrUn as the query for just
# chr1-25 and chrM which is about 95%.
# so make the chains without chrNA and chrUn and using the medium linearGap
# matrix which is for species that are not so distant.
# 2005-12-02
# medium linearGap matrix for axtChain, minScore=5000 and no chrNA or chrUn.
# number of rows in tables for chr1:
# chainSelf 943482
# chainSelfLink 8707208
# featureBits -chrom=chr1 danRer3 refGene:cds chainSelfLink -enrichment
# refGene:cds 0.743%, chainSelfLink 60.876%, both 0.503%, cover 67.65%,
# enrich 1.1
# coverage dropped about 8% without chrNA and chrUn alignments so not a
# huge difference.
# BLASTZ SWAP FOR HUMAN (hg18) (DONE, 2005-12-24, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
ssh hgwdev
# Blastz requires lineage-specific repeats
# Treat all repeats as lineage-specific for all alignments except those
# involving danRer3 chrUn and chrNA where the dynamic masking
# functionality of Blastz was used. hg18 random chroms were aligned
# as contigs and danRer3 chrNA and chrUn were aligned as scaffolds -
# see zebrafish (danRer3) chain and net track section in makeHg18.doc
# for further details.
# do swap of hg18 vs. danRer3 chain and net alignments to
# create danRer3 vs. hg18 see makeHg18.doc for details.
cd /cluster/data/hg18/bed/blastz.danRer3/chromsRun
# edit DEF file and add location of danRer3 and hg18 lineage-specific
# repeats - move chrUn and chrNA lineage-specific repeats into a tmp
# directory as they were not used.
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl \
-bigClusterHub=pk -swap -chainMinScore=5000 \
-chainLinearGap loose `pwd`/DEF >& doSwap.log &
# Took about 27 minutes.
# Blastz parameters are as for hg18 vs. danRer3 - see makeHg18.doc
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=6000
# BLASTZ_K=2200
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# BLASTZ_ABRIDGE_REPEATS=1
# make html files and trackDb.ra entry for chain and net tracks.
# check README.txt for downloads.
# featureBits -chrom=chr2 danRer3 refGene:cds chainHg18Link -enrichment
# refGene:cds 0.767%, chainHg18Link 4.370%, both 0.607%, cover 79.15%,
# enrich 18.11x
# featureBits -chrom=chr2 danRer2 refGene:cds chainHg17Link -enrichment
# refGene:cds 0.769%, chainHg17Link 4.576%, both 0.605%, cover 78.69%,
# enrich 17.20x
# Similar coverage and enrichment as for danRer2 vs hg17, but the chain
# counts differ: 7057 for hg18 on danRer3, 1111 for hg17 on danRer2 (chr1).
# 5-WAY VAR_MULTIZ ALIGNMENTS (DONE, 2006-02-06, hartera)
# MAF ANNOTATION ADDED (DONE, 2006-02-06, braney)
# FINISHED MAKING TREE IMAGE FOR TRACK DESCRIPTION PAGE
# (DONE, 2006-02-07, hartera)
# Species: zebrafish(danRer3), human (hg18), mouse(mm7),
# fugu(fr1) and tetraodon(tetNig1)
# Opossum (monDom2) was dropped since there were many more alignments
# for monDom2 than monDom1 and the chains were shorter on average. The
# reason for this is unknown so they will not be included in the
# conservation track at this time.
# rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)
ssh kkstore02
mkdir -p /cluster/data/danRer3/bed/multiz5way
cd /cluster/data/danRer3/bed/multiz5way
mkdir mafLinks
# set up directories for links to mafs for each pairwise alignment
mkdir mafLinks/hg18
mkdir mafLinks/mm7
mkdir mafLinks/fr1
mkdir mafLinks/tetNig1
set dir=/cluster/data/danRer3/bed
# need to make links to all the mafNet files for pairwise blastz
# alignments for each species. Make sure files are all called chrN.maf.gz
ln -s $dir/blastz.hg18.swap/mafNet/*.maf.gz ./mafLinks/hg18
ln -s $dir/blastz.mm7.swap/mafNet/*.maf.gz ./mafLinks/mm7
ln -s $dir/blastz.fr1/mafNet/*.maf.gz ./mafLinks/fr1
ln -s $dir/blastz.tetNig1.2005-10-11/chromsAndScafsRun/mafNet/*.maf.gz \
./mafLinks/tetNig1
# copy files over to the san for the pitakluster cluster run
ssh pk
mkdir /san/sanvol1/scratch/danRer3/multiz5way
cd /san/sanvol1/scratch/danRer3/multiz5way
rsync -a --copy-links --progress \
/cluster/data/danRer3/bed/multiz5way/mafLinks/ .
# 277 Mb of data - took less than 1 minute
mkdir penn
cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/v10.5.x86_64/multiz-tba/maf_project penn
# Progressive alignment up the tree w/o stager,
# using multiz.v10 (var_multiz)
# Method: align internal subtrees (using 0 flag to var_multiz)
# Then, align these to human (using 1 flag to var_multiz)
# NOTE: must use maf_project after each multiz run, in order
# to order output. Single-cov guaranteed by use of net MAF's,
# so it is not necessary to run single_cov2.
# make output dir and run dir
cd /cluster/data/danRer3/bed/multiz5way
mkdir -p maf
mkdir -p run
cd run
# create scripts to run var_multiz on cluster
cat > oneMultiz.csh << 'EOF'
#!/bin/csh -fe
set c = $1
set db = danRer3
set multi = /scratch/tmp/$db/multiz5way.$c
set pairs = /san/sanvol1/scratch/$db/multiz5way
set penn = $pairs/penn
# special mode --
# with 1 arg, cleanup
if ($#argv == 1) then
echo "cleanup"
echo "rm -fr $multi"
rm -fr $multi
echo "rmdir --ignore-fail-on-non-empty /scratch/tmp/$db"
rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
exit
endif
# special mode --
# with 3 args, saves an alignment file
if ($#argv == 3) then
echo "cp $multi/$2/$c.maf $3"
ls -og $multi/$2/$c.maf
cp $multi/$2/$c.maf $3
exit
endif
set s1 = $2
set s2 = $3
set flag = $4
# locate input files -- in pairwise dir, or multiple dir
set d1 = $multi
set d2 = $multi
if (-d $pairs/$s1) then
set d1 = $pairs
set f1 = $d1/$s1/$c.maf.gz
set t1 = /tmp/$s1.$c.maf
zcat $f1 > $t1
else
set f1 = $d1/$s1/$c.maf
set t1 = /tmp/$s1.$c.maf
cp -p $f1 $t1
endif
if (-d $pairs/$s2) then
set d2 = $pairs
set f2 = $d2/$s2/$c.maf.gz
set t2 = /tmp/$s2.$c.maf
zcat $f2 > $t2
else
set f2 = $d2/$s2/$c.maf
set t2 = /tmp/$s2.$c.maf
cp -p $f2 $t2
endif
# write to output dir
set out = $multi/${s1}${s2}
mkdir -p $out
# check for empty input file
if (-s $t1 && -s $t2) then
echo "Aligning $f1 $f2 $flag"
$penn/multiz $t1 $t2 $flag $out/$c.unused1.maf \
$out/$c.unused2.maf > $out/$c.full.maf
cat $out/$c.full.maf $out/$c.unused1.maf $out/$c.unused2.maf > \
$out/$c.tmp.maf
echo "Ordering $c.maf"
$penn/maf_project $out/$c.tmp.maf $db.$c > $out/$c.maf
rm -f $t1 $t2
else if (-s $t1) then
cp -p $t1 $out/$c.maf
rm -f $t1
else if (-s $t2) then
cp -p $t2 $out/$c.maf
rm -f $t2
endif
'EOF'
# << keep emacs coloring happy
chmod +x oneMultiz.csh
cp -p oneMultiz.csh \
/san/sanvol1/scratch/danRer3/multiz5way/penn/oneMultiz.csh
# Create 5way.nh file of tree. This was used in the distant past for
# early versions of phastCons. Now, this is merely a convenient
# reference to the tree under construction. This is also used to draw
# a graphic tree as species5.nh, see below.
cat << '_EOF_' > /cluster/data/danRer3/bed/multiz5way/5way.nh
((hg18,mm7),((tetNig1,fr1),danRer3))
'_EOF_'
# << this line keeps emacs coloring happy
# using the tree diagram as above, arrange these alignments
# in order of the tree branches
cat > allMultiz.csh << 'EOF'
#!/bin/csh -fe
# multiple alignment steps:
set c = $1
set db = danRer3
set s = "/san/sanvol1/scratch/$db/multiz5way/penn/oneMultiz.csh"
$s $c hg18 mm7 0
$s $c tetNig1 fr1 1
$s $c tetNig1fr1 hg18mm7 1
# get final alignment file
$s $c tetNig1fr1hg18mm7 /cluster/data/$db/bed/multiz5way/maf/$c.maf
#cleanup
$s $c
'EOF'
# happy emacs
chmod +x allMultiz.csh
cat << 'EOF' > template
#LOOP
./allMultiz.csh $(root1) {check out line+ /cluster/data/danRer3/bed/multiz5way/maf/$(root1).maf}
#ENDLOOP
'EOF'
awk '{print $1}' ../../../chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
para try, para check, para push, para check ... etc
para time
# Completed: 28 of 28 jobs
#CPU time in finished jobs: 3546s 59.10m 0.98h 0.04d 0.000 y
# IO & Wait Time: 115s 1.92m 0.03h 0.00d 0.000 y
# Average job time: 131s 2.18m 0.04h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 553s 9.22m 0.15h 0.01d
# Submission to last job: 709s 11.82m 0.20h 0.01d
# do not filter mafs as this only removes a small fraction of alignments;
# better to keep them all. Check for single-column alignments (these
# just have a single base for each species in the alignment). There
# should be none of these now. Previously a gluing step was needed to
# deal with these. There are none here.
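# A quick check for single-column blocks (in a MAF "s" line, field 7 is
# the alignment text, so a length of 1 means a one-base column), e.g.:
#   awk '$1 == "s" && length($7) == 1' \
#       /cluster/data/danRer3/bed/multiz5way/maf/chr1.maf | wc -l
# should report 0.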
# Build maf annotation and load database (braney, 2006-02-06)
cd /cluster/data/danRer3/bed/multiz5way
mkdir anno
cd anno
cat ../maf/chr1.maf | awk "/^s/ {print \$2}" | sed "s/\..*$//" | sort -u > species.names
mkdir maf run
cd run
rm sizes nBeds
for i in `cat ../species.names`
do
ln -s /cluster/data/$i/chrom.sizes $i.len
ln -s /cluster/data/$i/$i.N.bed $i.bed
echo $i.bed >> nBeds
echo $i.len >> sizes
done
for i in ../../maf/*.maf
do
echo mafAddIRows -nBeds=nBeds -sizes=sizes $i /cluster/data/danRer3/danRer3.2bit ../maf/`basename $i`
done > jobs
sh -x jobs
ssh hgwdev
cd /cluster/data/danRer3/bed/multiz5way/anno/maf
cat *.maf | hgLoadMafSummary danRer3 multiz5waySummary stdin
# Dropped unused indexes (2006-05-09 kate)
# NOTE: this is not required in the future, as the loader
# has been fixed to not generate these indexes
hgsql danRer3 -e "alter table multiz5waySummary drop index chrom_2"
hgsql danRer3 -e "alter table multiz5waySummary drop index chrom_3"
mkdir /gbdb/danRer3/multiz5way
for i in *.maf
do
ln -s `pwd`/$i /gbdb/danRer3/multiz5way
done
hgLoadMaf danRer3 multiz5way
rm *.tab
cd /cluster/data/danRer3/bed/multiz5way
mkdir frames
cd frames
cp /cluster/data/mm7/bed/multiz17wayFrames/mkMafFrames .
cp /cluster/data/mm7/bed/multiz17wayFrames/Makefile .
#edit Makefile to correct species names
mkdir -p /san/sanvol1/scratch/danRer3/multiz5wayFrames/maf
for i in ../../maf/*.maf; do echo $i; cp $i /san/sanvol1/scratch/danRer3/multiz5wayFrames/maf/`basename $i`; done
make getGenes
make getFrames
make loadDb
###
# rebuild frames to get bug fix, using 1-pass maf methodology
# (2006-06-09 markd)
ssh kkstore02
cd /cluster/data/danRer3/bed/multiz5way/frames
mv mafFrames/ mafFrames.old
nice tcsh # easy way to get process niced
(zcat ../maf/*.maf.gz | time genePredToMafFrames danRer3 stdin stdout danRer3 genes/danRer3.gp.gz fr1 genes/fr1.gp.gz hg18 genes/hg18.gp.gz mm7 genes/mm7.gp.gz tetNig1 genes/tetNig1.gp.gz | gzip >multiz5way.mafFrames.gz)>&log&
ssh hgwdev
cd /cluster/data/danRer3/bed/multiz5way/frames
hgLoadMafFrames danRer3 multiz5wayFrames multiz5way.mafFrames.gz >&log&
#end of multiz5way annotation and load
# create tree image - like tree.nh but with common names
# (hartera, 2006-02-07)
ssh hgwdev
cd /cluster/data/danRer3/bed/multiz5way
cat << '_EOF_' > species5.nh
((human,mouse),((tetraodon,fugu),zebrafish))
'_EOF_'
/cluster/bin/phast/$MACHTYPE/draw_tree -b -s species5.nh > species5.ps
convert species5.ps 5way.jpg
# using GIMP, edit tree and remove whitespace
# Photoshop used to edit the image (kuhn, 2006-02-07)
cp 5way.jpg /usr/local/apache/htdocs/images/phylo/danRer3_5way.jpg
# change permissions for display
chmod +r /usr/local/apache/htdocs/images/phylo/danRer3_5way.jpg
# check for all.joiner entry for multiz5way - ok
# add trackDb.ra entry in ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer3:
# track multiz5way
# shortLabel 5-Way Conservation
# longLabel 5-Way Vertebrate Multiz Alignment & Conservation
# group compGeno
# priority 104
# visibility pack
# color 0, 10, 100
# altColor 0,90,10
# type wigMaf 0.0 1.0
# maxHeightPixels 100:40:11
# yLineOnOff Off
-# autoScaleDefault Off
+# autoScale Off
# summary multiz5waySummary
# speciesGroups vertebrate mammal
# sGroup_mammal hg18 mm7
# sGroup_vertebrate tetNig1 fr1
# add this line to trackDb entry as above for the tree image (2006-02-07):
# treeImage phylo/danRer3_5way.jpg
# PHYLO-HMM (PHASTCONS) CONSERVATION TRACK FOR 5-WAY ALIGNMENT
# (DONE, 2006-02-06, hartera)
ssh kkstore02
mkdir /cluster/data/danRer3/bed/multiz5way/cons
cd /cluster/data/danRer3/bed/multiz5way/cons
# create a starting-tree.mod based on chr5 (73 Mb), the largest chrom
# apart from chrNA and chrUn
/cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr5.maf \
--refseq ../../../5/chr5.fa --in-format MAF \
--windows 100000000,1000 --out-format SS \
--between-blocks 5000 --out-root s1
# takes about 30 seconds
/cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
--tree "((danRer3,(tetNig1,fr1)),(mm7,hg18))" \
--out-root starting-tree
# took less than 1 minute
rm s1.*ss
# Get genome-wide average GC content (for all species together,
# not just the reference genome). If you have a globally
# estimated tree model, as above, you can get this from the
# BACKGROUND line in the .mod file. E.g.,
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.307629 0.191708 0.192177 0.308486
# add up the C and G:
grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
# 0.384 is the GC content. This is used in the -gc argument below.
# If you do *not* have a global tree model and you do not know your
# GC content, you can get it directly from the MAFs with a command
# like:
# /cluster/bin/phast/$MACHTYPE/msa_view \
# --aggregate danRer3,tetNig1,fr1,mm7,hg18 -i MAF \
# -S /cluster/data/danRer3/bed/multiz5way/maf/chr*.maf > maf_summary.txt
# This gives a GC content of 0.438
# break up the genome-wide MAFs into pieces on the san filesystem
ssh kkstore02
mkdir -p /san/sanvol1/scratch/danRer3/cons/ss
cd /san/sanvol1/scratch/danRer3/cons/ss
bash
for C in `awk '{print $1}' /cluster/data/danRer3/chrom.sizes`
do
if [ -s /cluster/data/danRer3/bed/multiz5way/maf/${C}.maf ]; then
mkdir ${C}
echo msa_split $C
chrN=${C/chr/}
/cluster/bin/phast/$MACHTYPE/msa_split \
/cluster/data/danRer3/bed/multiz5way/maf/${C}.maf \
--refseq /cluster/data/danRer3/${chrN}/${C}.fa \
--in-format MAF --windows 1000000,0 --between-blocks 5000 \
--out-format SS -I 1000 --out-root ${C}/${C}
fi
done
# took about 20 minutes to run
# Create a random list of 50 1 mb regions (do not use chrNA and chrUn)
ls -1l chr*/chr*.ss | grep -v NA | grep -v Un | \
awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list
# Set up parasol directory to calculate trees on these 50 regions
ssh pk
mkdir /san/sanvol1/scratch/danRer3/cons/treeRun1
cd /san/sanvol1/scratch/danRer3/cons/treeRun1
mkdir tree log
# now set up cluster job to estimate model parameters. Parameters
# will be estimated separately for each alignment fragment then
# will be combined across fragments. Tuning this loop should come
# back to here to recalculate. Tuning target-coverage and expected-length.
# Create little script that calls phastCons with right arguments
cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set C=$1:h
mkdir -p log/${C} tree/${C}
/cluster/bin/phast/x86_64/phastCons ../ss/$1 \
/cluster/data/danRer3/bed/multiz5way/cons/starting-tree.mod \
--gc 0.438 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-length 12 --target-coverage 0.17 \
--quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
# emacs happy
chmod a+x makeTree.csh
# Make sure that the correct GC content is substituted in here. Notice
# the target coverage of 0.17. Here we are going to aim
# for 65% coverage of coding regions by conserved elements.
# Create gensub file
cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1)
#ENDLOOP
'_EOF_'
# happy emacs
# Make cluster job and run it
gensub2 ../randomSs.list single template jobList
para create jobList
para try,check,push,check etc.
# para time
# Completed: 50 of 50 jobs
# CPU time in finished jobs: 714s 11.90m 0.20h 0.01d 0.000 y
# IO & Wait Time: 132s 2.20m 0.04h 0.00d 0.000 y
# Average job time: 17s 0.28m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 26s 0.43m 0.01h 0.00d
# Submission to last job: 353s 5.88m 0.10h 0.00d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ssh kkstore02
cd /san/sanvol1/scratch/danRer3/cons/treeRun1
ls tree/chr*/*.cons.mod > cons.txt
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.txt' \
--output-average ../ave.cons.mod > cons_summary.txt
ls tree/chr*/*.noncons.mod > noncons.txt
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.txt' \
--output-average ../ave.noncons.mod > noncons_summary.txt
cd ..
cp -p ave.*.mod /cluster/data/danRer3/bed/multiz5way/cons
# measuring entropy:
# usage: consEntropy <target coverage> <expected length> \
#            ave.cons.mod ave.noncons.mod [--NH 9.78]
# (the command never finishes when the --NH argument is given, so run
# it without --NH)
# target entropy should be L_min*H=9.8 bits (between 9.5 and 10.5 is ok);
# the expected length that produces this entropy is the one
# to use for phastCons.
/cluster/bin/phast/$MACHTYPE/consEntropy 0.17 12 \
ave.cons.mod ave.noncons.mod
# -target-coverage=0.17 -expected-lengths 12
#Transition parameters:gamma=0.170000,omega=12.000000, mu=0.083333, nu=0.017068
# Relative entropy: H=0.618383 bits/site
# Expected min. length: L_min=17.978234 sites
# Expected max. length: L_max=10.983828 sites
# Phylogenetic information threshold: PIT=L_min*H=11.117434 bits
# then the above steps from creating the treeRun directory onwards were
# repeated with the target coverage and expected lengths parameters set as
# below:
# -target-coverage=0.25 -expected-lengths 12
#Transition parameters:gamma=0.250000, omega=12.000000, mu=0.083333,nu=0.027778
#Relative entropy: H=0.637721 bits/site
#Expected min. length: L_min=15.535855 sites
#Expected max. length: L_max=10.157133 sites
#Phylogenetic information threshold: PIT=L_min*H=9.907536 bits
#### !!! THESE PARAMETERS BELOW WERE THOSE THAT WERE FINALLY USED ####
# Parameters used for danRer2 6-way conservation track:
# -target-coverage=0.35 -expected-lengths 18
#Transition parameters:gamma=0.350000,omega=18.000000, mu=0.055556, nu=0.029915
# Relative entropy: H=0.592725 bits/site
# Expected min. length: L_min=16.435656 sites
# Expected max. length: L_max=12.564154 sites
# Phylogenetic information threshold: PIT=L_min*H=9.741828 bits
# need to iterate and get the right coverage and parameters
# try running phastCons below with parameters used above and check the
# coverage of coding regions by the most conserved elements
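# The consEntropy calls for the tuning iterations can be wrapped in a
# small loop, e.g. (sketch; the ave.*.mod files must be re-estimated
# with the matching parameters before each call):
#   for p in "0.17 12" "0.25 12" "0.35 18"
#   do
#     echo "== target-coverage expected-length: $p =="
#     /cluster/bin/phast/$MACHTYPE/consEntropy $p \
#       ave.cons.mod ave.noncons.mod
#   done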
# Create cluster dir to do main phastCons run
ssh pk
mkdir -p /san/sanvol1/scratch/danRer3/cons/consRun1
cd /san/sanvol1/scratch/danRer3/cons/consRun1
mkdir ppRaw bed
cp -p /san/sanvol1/scratch/danRer3/cons/ave.*.mod .
# Create script to run phastCons with right parameters
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
cat > doPhast.csh << '_EOF_'
#!/bin/csh -fe
mkdir /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ave.*.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/x86_64/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
--expected-length 18 --target-coverage 0.35 --quiet \
--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/ave.*.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
# emacs happy
chmod a+x doPhast.csh
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
cat > template << '_EOF_'
#LOOP
doPhast.csh $(root1) $(file1)
#ENDLOOP
'_EOF_'
# happy emacs
# Create parasol batch and run it
ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list
gensub2 in.list single template jobList
para create jobList
para try/check/push/etc.
# combine predictions and transform scores to be in 0-1000 interval
ssh kkstore02
cd /san/sanvol1/scratch/danRer3/cons/consRun1
# The sed's and the sort get the file names in chrom,start order
find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
| /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed /cluster/data/danRer3/bed/multiz5way
# Figure out how much is actually covered by the mostConserved data as so:
cd /cluster/data/danRer3
faSize */chr*.fa
# 1644032962 bases (48201758 N's 1595831204 real 816464533 upper
# 779366671 lower) in 28 sequences in 28 files
# The non-N size is 1595831204 bases
cd /cluster/data/danRer3/bed/multiz5way
awk '{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/1595831204\n",100.0*sum/1595831204,sum}' \
mostConserved.bed
# -target-coverage 0.35: % 3.06 = 100.0*48883581/1595831204 length=18
# -target-coverage 0.
ssh hgwdev
cd /cluster/data/danRer3/bed/multiz5way
# get an or of refGene and mgcGenes CDS regions
featureBits danRer3 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed
# 11338034 bases of 1630323462 (0.695%) in intersection
featureBits danRer3 refSeqOrMgcCds.bed mostConserved.bed -enrichment
# refSeqOrMgcCds.bed 0.695%, mostConserved.bed 2.998%, both 0.464%,
# cover 66.71%, enrich 22.25x
# so use this result for -target-coverage=0.35 -expected-lengths=18
# with entropy (PIT) value of 9.74 (aiming for around 9.8) and
# 66.7% coverage of coding regions with most conserved elements
# (aiming for about 65%)
# Load most conserved track into database
ssh hgwdev
cd /cluster/data/danRer3/bed/multiz5way
hgLoadBed danRer3 phastConsElements mostConserved.bed
# Loaded 552331 elements of size 5
featureBits danRer3 mgcGenes:cds phastConsElements -enrichment
# mgcGenes:cds 0.531%, phastConsElements 2.998%, both 0.363%,
# cover 68.39%, enrich 22.81x
featureBits danRer3 refGene:cds phastConsElements -enrichment
# refGene:cds 0.658%, phastConsElements 2.998%, both 0.440%, cover 66.82%,
# enrich 22.28x
# Create merged posterior probability file and wiggle track data files
# the sed business gets the names sorted by chromName, chromStart
# so that everything goes in numerical order into wigEncode
ssh kkstore02
cd /san/sanvol1/scratch/danRer3/cons/consRun1
find ./ppRaw -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| wigEncode stdin phastCons5way.wig phastCons5way.wib
# takes a few minutes
ls -l phastCons*
# -rw-rw-r-- 1 hartera protein 198399845 Feb 6 16:05 phastCons5way.wib
# -rw-rw-r-- 1 hartera protein 45304940 Feb 6 16:05 phastCons5way.wig
cp -p phastCons5way.wi? /cluster/data/danRer3/bed/multiz5way/cons
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/danRer3/bed/multiz5way/cons
mkdir -p /gbdb/danRer3/wib
ln -s `pwd`/phastCons5way.wib /gbdb/danRer3/wib/phastCons5way.wib
# use this if need to reload table
hgsql -e 'drop table phastCons5way;' danRer3
# load table
hgLoadWiggle danRer3 phastCons5way phastCons5way.wig
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/danRer3/bed/multiz5way/cons
bash
time hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=danRer3 phastCons5way > histogram.data 2>&1
# real 2m33.069s
# user 1m58.310s
# sys 0m16.170s
# create plot of histogram:
cat << '_EOF_' > histo.gp
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Zebrafish danRer3 Histogram phastCons5 track"
set xlabel " phastCons5 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# happy emacs
gnuplot histo.gp > histo.png
display histo.png &
# add line: wiggle phastCons5way to trackDb.ra for multiz5way to display the
# wiggle for the conservation track.
# check all.joiner for entries for phastCons5way and phastConsElements5way -ok
# copy over html for multiz and edit.
# PHASTCONS SCORES DOWNLOADABLES (DONE, 2006-02-07, hartera)
# prepare compressed copy of ascii data values for downloads
ssh kkstore02
cd /san/sanvol1/scratch/danRer3/cons/consRun1
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
mkdir -p phastCons5Scores
for D in ppRaw/chr*
do
C=${D/ppRaw\/}
out=phastCons5Scores/${C}.data.gz
echo "========================== ${C} ${D}"
find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
gzip > ${out}
done
'_EOF_'
chmod +x gzipAscii.sh
time ./gzipAscii.sh
# 192.852u 8.835s 4:04.05 82.6% 0+0k 0+0io 1pf+0w
# creates 331 Mb of data.
# copy data for downloads
ssh kkstore02
mkdir /cluster/data/danRer3/bed/multiz5way/phastCons5wayScores
cd /cluster/data/danRer3/bed/multiz5way/phastCons5wayScores
rsync -a --progress \
pk:/san/sanvol1/scratch/danRer3/cons/consRun1/phastCons5Scores/ .
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/danRer3/phastCons5wayScores
cd /usr/local/apache/htdocs/goldenPath/danRer3/phastCons5wayScores
ln -s /cluster/data/danRer3/bed/multiz5way/phastCons5wayScores/*.gz .
md5sum *.gz > md5sum.txt
# copy over and edit README.txt from the hg17 phastCons.
# MULTIZ 5-WAY DOWNLOADABLES (DONE, 2006-02-22, hartera)
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/danRer3
mkdir -p multiz5way
cd multiz5way
foreach f (/cluster/data/danRer3/bed/multiz5way/maf/*.maf)
set c = $f:r:t
echo $c
nice gzip $f
ln -s $f.gz .
end
md5sum *.gz > md5sum.txt
# copy over README and edit for this 5-way multiple alignment
##################################################################
# HGNEAR TABLES (also used by the Known Genes details page links)
# GET LATEST PROTEIN SEQUENCE FOR ALL HGNEAR SPECIES (DONE, 2006-02-10, hartera)
# For species with knownGene, use that; otherwise, download the latest
# version of the main model organism database for this species.
# Human: use knownGene proteins.
# need to get hg18 peptide sequence:
mkdir -p /cluster/data/hg18/bed/blastp
cd /cluster/data/hg18/bed/blastp
pepPredToFa hg18 knownGenePep known.faa
# Mouse: use knownGene proteins.
# already done:
# mkdir -p /cluster/data/mm7/bed/geneSorter/blastp
# cd /cluster/data/mm7/bed/geneSorter/blastp
# pepPredToFa mm7 knownGenePep known.faa
# Rat: use knownGene proteins.
# already done:
# mkdir /cluster/data/rn3/bed/blastp
# cd /cluster/data/rn3/bed/blastp
# pepPredToFa rn3 knownGenePep known.faa
# Fly: use FlyBase proteins - already done
# /cluster/data/dm2/bed/flybase4.2/flybasePep.fa
# Worm: use WormBase proteins.
mkdir -p /cluster/data/ce2/bed/blastp
cd /cluster/data/ce2/bed/blastp
# Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/
# to find out the latest version. It is WormPep 154 so use that.
wget --timestamping -O wormPep154.faa \
ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep154/wormpep154
# Yeast: use SGD proteins.
mkdir -p /cluster/data/sacCer1/bed/blastp
cd /cluster/data/sacCer1/bed/blastp
# get latest version - from Jan 26, 2006
wget -O orf_trans.fasta.jan26.gz \
ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz
# rename old version of peptide sequences
mv sgdPep.faa sgdPep.jan9.faa
zcat orf_trans.fasta.jan26.gz > sgdPep.faa
# HGNEAR PROTEIN BLAST TABLES (DONE, 2006-02-10, hartera)
# RENAME SELF BLASTP TABLE AND CHANGE CONFIG.RA FILE (DONE, 2006-04-19, hartera)
# NOTE: mmBlastTab was updated to mm8 as a result of running doHgNearBlastp.pl
# for mm8 on 2006-03-13 (see makeMm8.doc).
# RECREATE THE HGNEAR TABLES FOR RAT AND MOUSE TO UPDATE THEM
# (DONE, 2006-05-31, hartera)
# RE-MADE THE ZEBRAFISH BLASTP TABLES USING THE TRANSCRIPT ID INSTEAD OF THE
# PEPTIDE ID FOR EACH SEQUENCE - FOR ALL OTHER SPECIES THE PEPTIDE SEQUENCES
# ARE REPRESENTED BY THEIR KNOWN GENES TRANSCRIPT ID
# (DONE, 2006-07-03, hartera)
# CHANGED INDEX ON ensZfishBlastTab (DONE, 2006-11-03, hartera)
ssh hgwdev
mkdir -p /cluster/data/danRer3/bed/hgNearBlastp
cd /cluster/data/danRer3/bed/hgNearBlastp
# zebrafish vs fly table has already been created as a result of
# creating the blastp table for dm2 (see makeDm2.doc)
cat << _EOF_ > config.ra
# Latest zebrafish vs. other Gene Sorter orgs:
# human, mouse, rat, worm, yeast
# zebrafish vs fly already done (dm2)
targetGenesetPrefix ensZfish
targetDb danRer3
queryDbs hg18 mm7 rn3 ce2 sacCer1
danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa
hg18Fa /cluster/data/hg18/bed/blastp/known.faa
mm7Fa /cluster/data/mm7/bed/geneSorter/blastp/known.faa
rn3Fa /cluster/data/rn3/bed/blastp/known.faa
ce2Fa /cluster/data/ce2/bed/blastp/wormPep154.faa
sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa
buildDir /cluster/data/danRer3/bed/hgNearBlastp
scratchDir /san/sanvol1/scratch/danRer3HgNearBlastp
_EOF_
# << this line makes emacs coloring happy
nice doHgNearBlastp.pl config.ra >& do.log &
tail -f do.log
# Took about 2 hours to finish.
# The target geneset (self Blastp) should be prefixed with ensZfish
# so change the config.ra and rename the table (2006-04-19, hartera)
hgsql -e 'alter table flyBaseBlastTab rename ensZfishBlastTab;' danRer3
# Update mouse to mm8 and rat to rn4
mkdir updates
cd updates
hgsql -e 'drop table mmBlastTab;' danRer3
hgsql -e 'drop table rnBlastTab;' danRer3
cat << _EOF_ > config.ra
# Update of zebrafish vs. other Gene Sorter orgs:
# mouse mm8 and rat rn4
targetGenesetPrefix ensZfish
targetDb danRer3
queryDbs mm8 rn4
danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa
mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa
rn4Fa /cluster/data/rn4/bed/blastp/known.faa
buildDir /cluster/data/danRer3/bed/hgNearBlastp/updates
scratchDir /san/sanvol1/scratch/danRer3HgNearBlastp/updates
_EOF_
# << this line makes emacs coloring happy
nice doHgNearBlastp.pl config.ra >& do.log &
tail -f do.log
# Took about 25 minutes.
# Need to remake all the BlastTab tables using the transcript Id instead
# of the protein ID for zebrafish Ensembl Genes.
# create ensZfishBlastTab and drBlastTab tables using the Ensembl
# transcript Ids for the tables instead of the peptide Ids
# (2006-07-03, hartera)
ssh hgwdev
# create the FASTA file of Ensembl peptide sequences with transcript IDs
# there is a one to one relationship between these IDs.
cd /cluster/data/danRer3/bed/blastp
# then create a fasta file of the sequences:
pepPredToFa danRer3 ensPep ensPep.faa
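# Sanity check that the headers now carry transcript IDs rather than
# peptide IDs (illustrative; ENSDART/ENSDARP are the Ensembl zebrafish
# transcript/peptide ID prefixes):
#   grep '^>' ensPep.faa | head -3
#   grep -c '^>ENSDARP' ensPep.faa
# the second count should be 0 if no peptide IDs remain.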
mkdir /cluster/data/danRer3/bed/hgNearBlastp/updates2
cd /cluster/data/danRer3/bed/hgNearBlastp/updates2
cat << _EOF_ > config.ra
# Latest zebrafish vs. other Gene Sorter orgs:
# human, mouse, rat, fly, worm, yeast
targetGenesetPrefix ensZfish
targetDb danRer3
queryDbs hg18 mm8 rn4 dm2 ce2 sacCer1
danRer3Fa /cluster/data/danRer3/bed/blastp/ensPep.faa
hg18Fa /cluster/data/hg18/bed/blastp/known.faa
mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa
rn4Fa /cluster/data/rn4/bed/blastp/known.faa
dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa
ce2Fa /cluster/data/ce2/bed/blastp/wormPep154.faa
sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa
buildDir /cluster/data/danRer3/bed/hgNearBlastp/updates2
scratchDir /san/sanvol1/scratch/danRer3HgNearBlastp/updates2
_EOF_
# << this line makes emacs coloring happy
nice doHgNearBlastp.pl config.ra >& do.log &
tail -f do.log
# Took about 45 minutes
# update sacCer1 otherOrgs.ra to use danRer3 instead of danRer1
# for drBlastTab.
# also need to update:
# dm1, hg{15,16,17}, mm{5,6,7}, rn{2,3}
# Human (hg15 and hg16), Drosophila, mouse mm5 and rat all use danRer1.
# Human hg17 and mouse mm6 and mm7 uses danRer2.
# Update these all to use the Zv5 (danRer3) Ensembl proteins.
# Ensembl 38 (April 2006)
ssh hgwdev
cd /cluster/data/danRer3/bed/hgNearBlastp/updates2
cat << _EOF_ > config2.ra
# Latest zebrafish vs. other Gene Sorter orgs:
# human, mouse, rat, fly - older databases
targetGenesetPrefix ensZfish
targetDb danRer3
queryDbs hg17 hg16 hg15 mm7 mm6 mm5 rn3 rn2 dm1
danRer3Fa /cluster/data/danRer3/bed/blastp/ensPep.faa
hg17Fa /cluster/data/hg17/bed/blastp/known.faa
hg16Fa /cluster/data/hg16/bed/blastp/known.faa
hg15Fa /cluster/data/hg15/bed/blastp/known.faa
mm7Fa /cluster/data/mm7/bed/geneSorter/blastp/known.faa
mm6Fa /cluster/data/mm6/bed/geneSorter/blastp/known.faa
mm5Fa /cluster/data/mm5/bed/geneSorter/blastp/known.faa
rn3Fa /cluster/data/rn3/bed/blastp/known.faa
rn2Fa /cluster/data/rn2/bed/blastp/known.faa
dm1Fa /cluster/data/dm1/bed/blastp/bdgp.faa
buildDir /cluster/data/danRer3/bed/hgNearBlastp/updates2
scratchDir /san/sanvol1/scratch/danRer3HgNearBlastp/updates2
_EOF_
# << this line makes emacs coloring happy
# create BlastTab tables for all queries vs target and no self blastp
nice doHgNearBlastp.pl config2.ra -noSelf -queryOnly >& do2.log &
tail -f do2.log
# Took about 30 minutes
# Update and commit hgGeneData and hgNearData files to make sure that
# all queries and links now work for the transcript ID instead of
# peptide ID for ensZfishBlastTab and drBlastTab tables.
# Gene Sorter is very slow for danRer3. ensZfishBlastTab has an index
# on both the query and target. All the other BlastTab tables have only
# an index on the query so try dropping the index on the target.
hgsql -e 'alter table ensZfishBlastTab drop index target;' danRer3
# Gene Sorter still loads slowly.
# Index is too short. hgLoadBlastTab used to load table and index on
# query is query(12). The first 12 characters are not unique for
# the Ensembl IDs so extend to query(20).
hgsql -e 'alter table ensZfishBlastTab drop index query;' danRer3
hgsql -e 'create index query on ensZfishBlastTab (query(20));' danRer3
# Much faster now.
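# A quick way to see why query(12) was too short (illustrative):
#   hgsql -N -e 'select count(distinct query), count(distinct substring(query,1,12)), count(distinct substring(query,1,20)) from ensZfishBlastTab;' danRer3
# the 12-character prefix collapses many Ensembl transcript IDs into the
# same index entry, while a 20-character prefix keeps them distinct.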
# END OF HGNEAR STUFF
####################################################
# GENE SET BASED ON ENSEMBL GENES (PROTEIN CODING GENES)
# (in progress, 2005-11-23, hartera)
# see ENSEMBL GENES section for documentation of creation of
# the ensGene, ensGtp and ensPep tables and the track.
# compare the Ensembl and Human Proteins tracks
featureBits danRer3 refGene:cds ensGene:cds -enrichment
# refGene:cds 0.658%, ensGene:cds 1.994%, both 0.589%, cover 89.60%,
# enrich 44.94x
featureBits danRer3 refGene:cds blastHg17KG -enrichment
# refGene:cds 0.658%, blastHg17KG 1.292%, both 0.385%, cover 58.52%,
# enrich 45.30x
# little difference in enrichment and less coverage for Human Proteins so
# it seems like Ensembl is the best choice in terms of genome coverage
# and intersection with RefSeq CDS regions.
ssh kkstore02
mkdir -p /cluster/data/danRer3/bed/ensGenes
cd /cluster/data/danRer3/bed/ensGenes
# use Ensembl's BioMart to download the Ensembl Genes UniProt IDs and
# descriptions. For genes with no description, use the InterPro domain.
# Go to http://www.ensembl.org/Multi/martview
# Follow this sequence through the pages:
# Page 1) Select the Ensembl dataset (now v38 here, v36 and v37 is the
# same for Zv5 Danio rerio protein coding genes) and the Danio_rerio
# choice (ZFISH5 here).
# Hit next. 25541 entries total.
# Ensembl 37 from Feb 2006 - this dataset is the same as for the
# version 32 downloaded as above for the Ensembl Genes track.
# (Checked on 2006-03-09, hartera)
# Ensembl 38 from April 2006 - this dataset is the same as for the
# version 32 downloaded as above for the Ensembl Genes track.
# (Checked on 2006-05-31, hartera)
# Page 2) In the GENE section, select Gene type as protein_coding.
# Then hit next. There are now 22877 entries in this filtered version.
# Page 3) Choose the "Features" Attribute Page from the pulldown menu
# at the top. Make sure that under the GENE section, the Ensembl
# Attributes checked are the Ensembl Transcript ID, External Gene ID and the
# Description. Under External References, select Unified UniProt
# accession, and ZFIN Primary ID. Under the Protein section, select
# InterPro Description and InterPro ID under InterPro
# Attributes. Select text, tab-separated for output. Choose gzip
# compression. Hit export. Save as ensGeneInfo37Coding.tsv.gz. Same as for
# Ensembl v36 so update to Ensembl v37. Ensembl v38 is the same too
# so update to this version (2006-05-31, hartera). Also add External Gene
# ID for the Ensembl Attributes.
gunzip ensGeneInfo38Coding.txt.gz
# this file has some errors in it - there is a newline character in the
# middle of the descriptions for the genes with the following UniProt
# IDs: Q5TYV0, Q5SPG7, Q5SPG5, Q5RIJ2, Q5RID3. This causes the table
# to be loaded incorrectly. Edit the ensGeneInfo38Coding.txt file manually
# to remove these extra newlines.
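# A scripted alternative to the hand edit (a sketch, assuming every
# intact data row starts with an Ensembl transcript ID, "ENSDART...";
# the output name is arbitrary): slurp the file and turn any newline not
# followed by ENSDART (or end of file) into a space, rejoining the
# broken description fields.
perl -0777 -pe 's/\n(?!ENSDART|\z)/ /g' ensGeneInfo38Coding.txt \
    > ensGeneInfo38Coding.fixed.txt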
# Repeat above steps and get the Ensembl transcript ID from Ensembl
# Attributes and then get EntrezGene ID, RefSeq DNA ID, and RefSeq
# Peptide ID from the External References section. Select text,
# tab-separated for output. Choose gzip compression. Hit export. Again
# Ensembl v36 gives the same result for Danio rerio.
# Save as ensGeneInfo38Coding2.txt.gz
cd /cluster/data/danRer3/bed/ensGenes
gunzip ensGeneInfo38Coding2.txt.gz
wc -l ensGeneInfo38*
# 85607 ensGeneInfo38Coding.txt
# 32457 ensGeneInfo38Coding2.txt
# 85607 ensGeneInfo37Coding.tsv
# 33233 ensGeneInfo37Coding2.tsv
# find how many Transcripts have multiple SWISS-PROT IDs
# set FS/OFS in a BEGIN block so the first record is split on tabs too
tail +2 ensGeneInfo38Coding.txt | awk 'BEGIN {FS="\t"; OFS="\t"} \
{print $1, $2, $4}' > ensGene38UniProtandExtId.txt
tail +2 ensGeneInfo38Coding.txt | awk 'BEGIN {FS="\t"; OFS="\t"} \
{if ($2 != "") print $1, $4}' \
> ensGene38UniProt.txt
sort ensGene38UniProt.txt | uniq > ensGene38UniProt.txt.uniq
awk '{print $1}' ensGene38UniProt.txt.uniq | sort | uniq -c | sort -nr \
> ens38UniProt.count
awk '{if ($1 > 1) print $2}' ens38UniProt.count \
> ens38UniProtMorethanOne.txt
wc -l ens38UniProtMorethanOne.txt
# 2257 ens38UniProtMorethanOne.txt
awk '{if ($1 == 1) print $2}' ens38UniProt.count \
> ens38UniProtOnlyOne.txt
wc -l ens38UniProtOnlyOne.txt
# 8172
# get list of Ensembl transcripts with more than 1 UniProt ID and
# the list of UniProt IDs.
grep -f ens38UniProtMorethanOne.txt ensGene38UniProt.txt.uniq \
> ens38UniProtMorethanOne.uniProtIds
# get list of Ensembl transcripts with more than 1 UniProt ID and
# the list of UniProt IDs and external database IDs.
sort ensGene38UniProtandExtId.txt | uniq \
> ensGene38UniProtandExtId.txt.uniq
grep -f ens38UniProtMorethanOne.txt ensGene38UniProtandExtId.txt.uniq \
> ens38UniProtMorethanOne.uniProtandExtIds
# to do blastp of Ensembl Proteins vs UniProt
# (last uniProt update 2006-01-23):
ssh hgwdev
mkdir -p /cluster/data/danRer3/bed/ensGenes/blastDb
cd /cluster/data/danRer3/bed/ensGenes/blastDb
# create a table of Danio Rerio (Brachydanio rerio in UniProt)
# SWISS-PROT sequences (2006-05-31)
hgsql uniProt -e ' \
create table test.danioProt select protein.* from protein,accToTaxon \
where accToTaxon.taxon = 7955 and accToTaxon.acc = protein.acc;'
# then create a fasta file of the sequences:
pepPredToFa test danioProt danioUniProt.fa
grep '>' danioUniProt.fa | wc -l
# 14297
# then select just those UniProt IDs for the Ensembl Transcript IDs that
# have multiple UniProt IDs associated with them.
ssh kkstore02
cd /cluster/data/danRer3/bed/ensGenes/blastDb
# get list of UniProt IDs
awk '{print $2}' ../ens38UniProtMorethanOne.uniProtIds \
> ens38MultiUniProtIds.idsOnly
sort ens38MultiUniProtIds.idsOnly | uniq \
> ens38MultiUniProtIds.idsOnly.uniq
faSomeRecords danioUniProt.fa ens38MultiUniProtIds.idsOnly.uniq \
ens38DanioUniProt.fa
# 4410 UniProt IDs but 4293 in the FASTA file so 117 are missing.
grep '>' ens38DanioUniProt.fa | sort > uniProtSeq.ids
perl -pi.bak -e 's/>//' uniProtSeq.ids
comm -13 uniProtSeq.ids ens38MultiUniProtIds.idsOnly.uniq > uniProtMissing
# these sequences are missing because their UniProt IDs are
# secondary IDs. Find the primary ID.
hgsql -N -e 'select o.acc, o.val from otherAcc as o, accToTaxon as a \
where o.acc = a.acc and a.taxon = 7955;' uniProt > otherAccs.zfish.txt
wc -l otherAccs.zfish.txt
# 321 otherAccs.zfish.txt
grep -f uniProtMissing otherAccs.zfish.txt > uniProtMissing.otherAccs.txt
# found 83 of them
awk '{print $2}' uniProtMissing.otherAccs.txt | sort | uniq > otherAccsFound
comm -13 otherAccsFound uniProtMissing > stillMissing
# check list of deleted TrEMBL IDs - delac_tr.txt from Expasy site.
sort delac_tr.txt > delac_tr.sort
sort stillMissing > stillMissing.sort
comm -12 delac_tr.sort stillMissing.sort | wc
# 34. There are 34 in the stillMissing file and these are all in the
# delac_tr.txt file.
#This file lists the accession numbers of TrEMBL entries which have
#been deleted from the database. Most deletions are due to the deletion of
#the corresponding CDS in the source nucleotide sequence databases EMBL-
#Bank/DDBJ/GenBank. In addition, some entries are recognised to be Open
#Reading frames (ORFs) that have been wrongly predicted to code for
#proteins. When there is enough evidence that these hypothetical proteins
#are not real, we take the decision to remove them from TrEMBL.
# Get the sequences for otherAccsFound from danioUniProt.fa
awk '{print $1}' uniProtMissing.otherAccs.txt | sort | uniq \
> otherAccsFound.altAccs
faSomeRecords danioUniProt.fa otherAccsFound.altAccs ens38DanioOtherAccs.fa
grep '>' ens38DanioOtherAccs.fa | wc
# 73
wc -l otherAccsFound.altAccs
# 73 otherAccsFound.altAccs
cat ens38DanioUniProt.fa ens38DanioOtherAccs.fa > ens38DanioAllUniProt.fa
# create blastDb database
ssh pk
cd /cluster/data/danRer3/bed/ensGenes/blastDb
mkdir format
cd format
mv ../ens38DanioAllUniProt.fa .
/scratch/blast/formatdb -i ens38DanioAllUniProt.fa \
-t ensUniProt -n ensUniProt
# Copy database over to the san
mkdir -p /san/sanvol1/scratch/danRer3/ensGenes/blastDb
cp ensUniProt* /san/sanvol1/scratch/danRer3/ensGenes/blastDb/
ssh hgwdev
mkdir /cluster/data/danRer3/bed/ensGenes/blastp
cd /cluster/data/danRer3/bed/ensGenes/blastp
# get FASTA file of Ensembl sequences
pepPredToFa danRer3 ensPep ensPep.fa
# get list of Ensembl transcripts to use in Blastp
cp ../blastDb/stillMissing .
# need to remove the missing ones (those no longer in TrEMBL) from list
grep -v -f stillMissing ../ens38UniProtMorethanOne.uniProtIds \
> ens38UniProt.uniProtIdsforBlastp
# get final list of Ensembl Transcript Ids
awk '{print $1}' ens38UniProt.uniProtIdsforBlastp | sort | uniq \
> ens38IdsOnlyForBlastp.txt
wc -l ens38IdsOnlyForBlastp.txt
# 2252 ens38IdsOnlyForBlastp.txt
# grab the protein sequences just for these Ensembl Transcripts:
faSomeRecords ensPep.fa ens38IdsOnlyForBlastp.txt ens38ForBlastp.fa
# check that there are 2252 records
# set up the Blastp run
ssh pk
cd /cluster/data/danRer3/bed/ensGenes/blastp
# split Ensembl peptide sequences FASTA file into chunks for cluster
mkdir split
faSplit sequence ens38ForBlastp.fa 200 split/ens38
# make parasol run directory
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/csh -ef
setenv BLASTMAT /san/sanvol1/scratch/blast64/blast-2.2.11/data
/san/sanvol1/scratch/blast64/blast-2.2.11/bin/blastall \
-p blastp -d /san/sanvol1/scratch/danRer3/ensGenes/blastDb/ensUniProt \
-i $1 -o $2 -e 0.01 -m 8 -b 1000
'_EOF_'
# << keep emacs happy
chmod +x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../split/*fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try, check, push, check ... etc.
# Completed: 190 of 190 jobs
# CPU time in finished jobs: 279s 4.65m 0.08h 0.00d 0.000 y
# IO & Wait Time: 2293s 38.22m 0.64h 0.03d 0.000 y
# Average job time: 14s 0.23m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 30s 0.50m 0.01h 0.00d
# Submission to last job: 37s 0.62m 0.01h 0.00d
# Load these into a temporary database table. hgLoadBlastTab
# picks the best hit for each of the queries (Ensembl peptide).
ssh hgwdev
cd /cluster/data/danRer3/bed/ensGenes/blastp/run/out
time hgLoadBlastTab -maxPer=1 test ensUniProtBlastTab *.tab
# 0.154u 0.008s 0:00.66 22.7% 0+0k 0+0io 0pf+0w
# there were 2252 queries
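# For reference, blastall -m 8 output is 12 tab-separated columns:
# query, target, % identity, alignment length, mismatches, gap opens,
# query start, query end, target start, target end, e-value, bit score
# -- exactly the fields hgLoadBlastTab parses.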
# BLASTP OF ALL ENS PEP VS ALL DANIO UNIPROT SEQS
# Try doing Blastp again but this time using all the zebrafish UniProt
# sequences as the database and all the Ensembl peptides as queries.
# create blastDb database
ssh pk
cd /cluster/data/danRer3/bed/ensGenes/blastDb
mkdir zfishUniProt
cd zfishUniProt
cp ../danioUniProt.fa .
/san/sanvol1/scratch/blast64/blast-2.2.11/bin/formatdb \
-i danioUniProt.fa -t danioUniProt -n danioUniProt
# Copy database over to the san
mkdir -p /san/sanvol1/scratch/danRer3/ensGenes/blastDb/uniProt
cp danioUniProt* /san/sanvol1/scratch/danRer3/ensGenes/blastDb/uniProt
# split Ensembl peptide sequences FASTA file into chunks for cluster
cd /cluster/data/danRer3/bed/ensGenes/blastp
mkdir splitAll
grep '>' ensPep.fa | wc -l
# 32143
faSplit sequence ensPep.fa 8000 splitAll/ens38All
# make parasol run directory
mkdir runAll
cd runAll
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/csh -ef
setenv BLASTMAT /san/sanvol1/scratch/blast64/blast-2.2.11/data
/san/sanvol1/scratch/blast64/blast-2.2.11/bin/blastall \
-p blastp \
-d /san/sanvol1/scratch/danRer3/ensGenes/blastDb/uniProt/danioUniProt \
-i $1 -o $2 -e 0.01 -m 8 -b 1000
'_EOF_'
# << keep emacs happy
chmod +x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << keep emacs happy
# Create parasol batch
echo ../splitAll/*fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try, check, push, check ... etc.
para time
#Completed: 7609 of 7609 jobs
#CPU time in finished jobs: 11414s 190.23m 3.17h 0.13d 0.000 y
#IO & Wait Time: 401489s 6691.48m 111.52h 4.65d 0.013 y
#Average job time: 54s 0.90m 0.02h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 77s 1.28m 0.02h 0.00d
#Submission to last job: 1096s 18.27m 0.30h 0.01d
# Load these into a temporary database table. hgLoadBlastTab
# picks the best hit for each of the queries (Ensembl peptide).
ssh hgwdev
cd /cluster/data/danRer3/bed/ensGenes/blastp/runAll/out
# cat files together as argument list too long for hgLoadBlastTab
foreach t (*.tab)
cat $t >> ensAll.tab
end
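# (a one-step alternative that also avoids the argv limit, as a sketch:
# find . -name 'ens38All*.tab' -exec cat {} + > ensAll.tab )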
time hgLoadBlastTab -maxPer=1 test ensUniProtAllBlastTab ensAll.tab
# 4.168u 0.737s 0:06.03 81.0% 0+0k 0+0io 5pf+0w
# filter these and select just those with identity >= 95%
# and eValue <= 0.00001
hgsql -N -e 'select distinct(target) from ensUniProtAllBlastTab where \
identity >= 95 and eValue <= 0.00001;' test | sort > out
# get 11910 UniProt IDs mapping to Ensembl transcripts
# there are 11343 unique UniProt IDs in ensGeneInfo38Coding.txt
# load the ensGeneInfo38Coding.txt file into a table
cat << 'EOF' > ens38Zfish.sql
CREATE TABLE ens38Zfish (
transcriptId varchar(255) not null,
extDbId varchar(255) not null,
description longblob not null,
uniProt varchar(255) not null,
zfinId varchar(255) not null,
interProDesc longblob not null,
interProId varchar(255) not null
);
'EOF'
# << emacs
chmod a+r ensGeneInfo38Coding*
tail +2 ensGeneInfo38Coding.txt > ens38Coding.tab
hgLoadSqlTab test ens38Zfish ens38Zfish.sql ens38Coding.tab
hgsql -N -e 'select distinct(uniProt) from ens38Zfish;' test \
| sort > ens38Zfish.uniProt.uniq
wc -l ens38Zfish.uniProt.uniq out
# 11344 ens38Zfish.uniProt.uniq
# 9208 out
comm -12 ens38Zfish.uniProt.uniq out | wc
# 8526 in common
comm -13 ens38Zfish.uniProt.uniq out > fromBlastPOnly
comm -23 ens38Zfish.uniProt.uniq out > fromEns38Only
wc -l from*
# 682 fromBlastPOnly
# 2817 fromEns38Only
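# (comm reminder: with sorted inputs, -12 keeps lines common to both
# files, -23 keeps lines only in the first, -13 lines only in the second)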
# find out how many from fromEns38Only are on the list of deleted from
# TrEMBL IDs
comm -12 fromEns38Only ./blastDb/delac_tr.sort > deletedFromTrEMBL
comm -13 deletedFromTrEMBL fromEns38Only > fromEns38Only2
# get list of transcripts matched to a UniProt by blastP that
# have no UniProt ID assigned in ens38Zfish
hgsql -N -e 'select distinct(transcriptId) from ens38Zfish where uniProt = "";' test | sort > ens38Zfish.noUniProt
hgsql -N -e 'select distinct(query) from ensUniProtAllBlastTab where \
identity >= 95 and eValue <= 0.00001;' test | sort > queryBlast.sort
comm -12 queryBlast.sort ens38Zfish.noUniProt
# 1967 in common, i.e. Blastp-hit transcripts with no UniProt in ens38Zfish;
# the other 9943 Blastp-hit transcripts already have a UniProt ID there.
# delac_sp.txt in ./blastDb - list of deleted SWISS-PROT IDs
# as of May 30, 2006. 331 IDs.
sort blastDb/delac_sp.txt > blastDb/delac_sp.sort
# compare to list of SP IDs that are not in Blastp hits
comm -12 blastDb/delac_sp.sort fromEns38Only2
# there are none in common
# get list of Danio rerio UniProt IDs
hgsql -N -e 'select distinct(acc) from danioProt;' test | sort \
> danioProt.accs.uniq
comm -13 danioProt.accs.uniq fromEns38Only2
comm -12 danioProt.accs.uniq fromEns38Only2 > inuniProtAndfromEns38Only
hgsql -e 'create table test.ensBlastp select * from ensUniProtAllBlastTab where identity >= 95 and eValue <= 0.00001;' test
## wc -l in*Only
# 1967 inBlastpOnly
# 278 inEns38Only
# these are transcript IDs
# find the UniProt IDs for the 278 inEns38Only
cd test6/tmp
hgsql -N -e 'select distinct(query) from ensBlastp;' test | sort \
> ensBlastp.tId.sort
hgsql -N -e 'select distinct(transcriptId) from ens38Zfish where uniProt = "";' test | sort > ens38ZfishwithUniProt.tId.sort
# (note: despite its name, this file holds transcripts with NO UniProt ID)
comm -13 ensBlastp.tId.sort ens38ZfishwithUniProt.tId.sort > inEns38Only
comm -23 ensBlastp.tId.sort ens38ZfishwithUniProt.tId.sort > inBlastpOnly
wc -l in*Only
# 9943 inBlastpOnly
# 19955 inEns38Only
wc -l *.sort
# 32143 ens38Zfish.tId.sort
# 11910 ensBlastp.tId.sort
# So there are 9943 that have Blastp hits assigned and 19955 in
# Ensembl 38 that do not have Blastp hits
# find those with no description and also have no UniProt ID.
# there are 21236 and this is the same number without a description
hgsql -N -e 'select distinct(transcriptId) from ens38Zfish where description = "" and uniProt = "";' test | sort > ens38ZfishNoDesc.tid.sort
# 21236 ens38ZfishNoDesc.tid.sort
# compare this to the set of transcript IDs in Ensembl 38 Only
# and for Blastp Only
comm -12 inEns38Only ens38ZfishNoDesc.tid.sort > noBlastHitNoDesc
comm -12 inBlastpOnly ens38ZfishNoDesc.tid.sort > blastHitNoDesc
wc -l *NoDesc
# 0 blastHitNoDesc
# 19712 noBlastHitNoDesc
# then get list of transcript IDs with no description in Ensembl 38 but
# do have a Blastp hit
comm -13 inEns38Only ens38ZfishNoDesc.tid.sort > blastpHitNoDesc.tid
wc -l blastpHitNoDesc.tid
# 1524 blastpHitNoDesc.tid
# These are sequences with a Blastp hit but no description
hgsql -N -e 'select distinct(target) from ensBlastp;' test \
| sort > blastp.uniProt.sort
hgsql -N -e 'select distinct(uniProt) from ens38Zfish;' test \
| sort > ens38.uniProt.sort
wc -l *uniProt.sort
# 9208 blastp.uniProt.sort
# 11344 ens38.uniProt.sort
# there are 8526 in common
comm -13 blastp.uniProt.sort ens38.uniProt.sort > ens38Only.uniProt
comm -23 blastp.uniProt.sort ens38.uniProt.sort > blastpOnly.uniProt
wc -l *.uniProt
# 682 blastpOnly.uniProt
# 2817 ens38Only.uniProt
# there are 80 in the ens38Only.uniProt list that are deleted from TrEMBL
# there are 3 in the blastpOnly.uniProt list that are deleted from TrEMBL
# Q503U2
# Q7SY13
# Q8AW80
# Remove these from each list:
comm -23 ens38Only.uniProt ../../blastDb/delac_tr.sort > ens38Only.uniProt2
comm -23 blastpOnly.uniProt ../../blastDb/delac_tr.sort > blastpOnly.uniProt2
# some of these will be ones where there were several SWISS-PROT IDs for
# each transcript ID and only one is chosen so the others are dropped.
# find how many of these ens38Only.uniProt2 are not in danioProt.accs.uniq
comm -13 ../../danioProt.accs.uniq ens38Only.uniProt2 \
> ens38Only.uniProt.notinDanioProt
# there are 88 of these.
# find list of zebrafish accs with alternative accs in uniProt
hgsql -N -e 'select val from otherAcc as a, accToTaxon as t where a.acc = t.acc and taxon = 7955;' uniProt | sort | uniq > zfishVals.otherAccs.uniq
comm -12 ens38Only.uniProt.notinDanioProt zfishVals.otherAccs.uniq | wc
# 88 so all of these have alternate accessions.
# remove these from the list:
comm -13 ens38Only.uniProt.notinDanioProt ens38Only.uniProt2 \
> ens38Only.uniProt3
wc -l ens38Only.uniProt3
# 2649 ens38Only.uniProt3
# find number of uniProt IDs belonging to transcript IDs that have multiple
# uniProt IDs: ../../blastDb/ens38MultiUniProtIds.idsOnly.uniq is list of
# uniProt IDs for such transcripts.
comm -12 ens38Only.uniProt3 ../../blastDb/ens38MultiUniProtIds.idsOnly.uniq \
> ens38Only.multiUniProtIds
# there are 2310 of these.
comm -13 ens38Only.multiUniProtIds ens38Only.uniProt3 > ens38Only.uniProt4
# 339 of these left
grep -f ens38Only.uniProt4 ../../ensGene38UniProt.txt \
> ens38Only.uniProt4.tIdAndUpId
awk '{print $1}' ens38Only.uniProt4.tIdAndUpId | sort | uniq \
> ens38Only.uniProt4.tId.uniq
wc -l ens38Only.uniProt4.tId.uniq
# 368 ens38Only.uniProt4.tId.uniq
# Do these all have SWISS-PROT IDs by Blastp?
hgsql -N -e 'select distinct(query) from ensBlastp;' test | sort \
> ensBlastp.query.sort
comm -12 ens38Only.uniProt4.tId.uniq ensBlastp.query.sort
# 183 so remove these:
comm -23 ens38Only.uniProt4.tId.uniq ensBlastp.query.sort \
> ens38Only.uniProt4.tId.noBlastp
wc -l ens38Only.uniProt4.tId.noBlastp
# 185 ens38Only.uniProt4.tId.noBlastp
# e.g. ENSDART00000002826: this has only 91% ID to Q6DBUS (Q6NYR4 in the
# BioMart download); it is 91.7% ID to Q6DBUS in the Blastp table.
hgsql -e 'create table test.ensBlastp90 select * from ensUniProtAllBlastTab where identity >= 90 and eValue <= 0.00001;' test
hgsql -N -e 'select distinct(query) from ensBlastp;' test | sort \
> ensBlastp.tId.sort
hgsql -N -e 'select distinct(query) from ensBlastp90;' test | sort \
> ensBlastp90.tId.sort
# transcript IDs in ensBlastp90 and not in ensBlastp
comm -23 ensBlastp90.tId.sort ensBlastp.tId.sort > ensBlastp90Only.tId
wc -l ensBlastp90Only.tId
# 704 ensBlastp90Only.tId
# check these against list of ens38 with no description
comm -12 ens38ZfishNoDesc.tid.sort ensBlastp90Only.tId \
> ensBlastp90Only.noUniProtInEns38
# 416
# also check against list of ens38Only.uniProt4.tId.noBlastp
comm -12 ens38Only.uniProt4.tId.noBlastp ensBlastp90Only.tId
# 140
comm -23 ens38Only.uniProt4.tId.noBlastp ensBlastp90Only.tId \
> ens38Only.uniProt4.tId.noBlastp90
# 45 of these left
# ENSDART00000009971 has only 48% Identity to Q5DTD0. maps to Q58EF8 on
# Ensembl web page.
# Check 10 alignments with >= 95% and 10 that have >= 90% and < 95%
cd /cluster/data/danRer3/bed/ensGenes/blastp/runAll2/out
# ens38Blastp.out has the alignments in NCBI format
# 95-96% 226
# 96-97% 322
# 97-98% 526
# 98-99% 1333
# 99-100% 9503 (both inclusive)
# lower score can be due to shorter query and target
# for >= 95% identity (ensBlastp table in test db). Get BlastP results
# and check Ensembl. All Ensembl records show the UniProt ID given below
# except where noted.
# Query Target Identity qLen qAli tLen tAli E-value Score misMatch Comment
# ENSDART00000012253 Q9W6E8 99.51 609 609 609 609 0 978 3
# ENSDART00000013114 Q6NYT1 99.63 267 267 267 267 4e-143 502 1
# ENSDART00000067816 Q6NZZ8 95.78 433 433 471 460 0 838 2 query doesn't
# begin with Met, no associated UniProt ID in Ensembl
# ENSDART00000018931 Q9DG41 99.42 346 346 552 346 0 709 2 query is partial,
# doesn't begin with Met
# ENSDART00000023846 Q7ZUQ4 98.33 300 300 625 300 1e-179 624 5 query doesn't
# begin with Met
# ENSDART00000006095 Q6P2V4 99.32 443 443 443 443 0 941 3
# ENSDART00000039597 Q5G9L7 100 146 146 146 146 3e-81 295 0 100% coverage
# ENSDART00000028930 Q90442 97.53 84 81 85 81 5e-42 164 2
# ENSDART00000028255 Q8JHY2 100 63 63 63 63 2e-32 132 0
# ENSDART00000042947 Q4QRH1 95.22 1849 456 479 452 0 808 10 alignment length =
# 460 bp, Ensembl doesn't show a UniProt protein ID for this.
# Maybe there is a coverage criterion.
# >= 90% and < 95% identity from ensBlastP90 table in test db:
# There are 705 of these. 11911 have identity >= 95%
# Query Target Identity qLen qAli tLen tAli E-value Score misMatch Comment
# ENSDART00000031211 Q6R5A4 94.21 779 779 846 789 0 1266 38 (gapOpen 6)
# bases 66-846 of target is aligning. Ensembl does not have a UniProt ID
# for this transcript.
# ENSDART00000028390 Q5TKR3 90.87 241 240 243 241 1e-125 444 21 (gapOpen 1)
# ENSDART00000053312 Q5SYD9 92.64 325 325 322 322 8e-175 608 19 (gapOpen 2)
# ENSDART00000056703 Q5CZR2 91.02 323 323 323 323 7e-124 605 29 (gapOpen 0)
# Ensembl has no UniProt ID for this transcript. 91 % ID to NP_001013324.1,
# also 323 bp.
# ENSDART00000044490 Q3ZMH2 90.74 992 985 1082 994 0 1682 64 (gapOpen 7)
# Ensembl has no UniProt ID, just InterPro domains.
# ENSDART00000031487 Q5RHD6 92.81 320 320 319 319 7e-172 598 22 (gapOpen 1)
# Ensembl has no UniProt ID, just InterPro domain.
# ENSDART00000020233 Q6DHI1 91.72 298 298 299 299 6e-145 508 18 (gapOpen 2)
# ENSDART00000061435 Q6PBV8 93.72 76 76 76 76 2e-33 135 5 (gapOpen 0)
# ENSDART00000056959 Q4V9F6 94.21 433 426 440 431 0 728 18 (gapOpen 2)
# only InterPro domain given for Ensembl, no UniProt ID.
# ENSDART00000040220 Q504G5 90.12 172 172 174 172 3e-100 358 17 (gapOpen 0)
# only InterPro domain given for Ensembl, no UniProt ID.
# ENSDART00000066247 Q58EK5 90.08 767 231 485 251 3e-124 441 3 (gapOpen 3)
# only InterPro domain given for Ensembl, no UniProt ID.
# for 95% identity and above, there are only 18 proteins that have
# mismatch > 40.
# for between 90-95% then there are 62 with mismatch > 40.
# use grep -A 100 -w
# look at examples with high mismatch but identity < 95%.
# query | target | identity | aliLength | mismatch | gapOpen | qStart | qEnd | tStart | tEnd | eValue | bitScore
# ENSDART00000012435 | Q6IQX1 | 91.2 | 1932 | 163 | 5 | 2 | 1931 | 3 | 1931 | 0 | 3093
# this has a high number of mismatches but distributed throughout
# the protein and the UniProt sequence aligns to the genome with the
# same exon structure as for ENSDART00000012435.
# ENSDART00000050066 | Q7M558 | 91.69 | 3008 | 249 | 1 | 0 | 3008 | 0 | 3007 | 0 | 5543
# this is a very large protein so the mismatch is small compared to
# the protein size. has same exon structure as Ensembl protein at
# chr17:18,247,969-18,259,468. Blats to several regions - could be a
# processed pseudogene or assembly artifact.
# If identity < 95% and mismatch > 40 then size is at least around 450bp.
# ENSDART00000028708 | Q7T296 | 90.12 | 486 | 45 | 1 | 0 | 486 | 18 | 501 | 0 | 907
# The most gaps in a sequence is 9 - only 1 sequence < 95% identity and
# most have 0-2 gaps. Same for those >= 95% identity.
# ENSDART00000039735 | Q7T1C9 | 98.15 | 1406 | 12 | 9 | 0 | 1394 | 0 | 1404 | 0 | 2175
# Gaps are spread throughout the sequence and are short. Blat of this
# UniProt sequence gives the same exon structure as for the Ensembl seq.
# ENSDART00000053813 | Q7M560 | 90.07 | 2275 | 104 | 9 | 0 | 2178 | 99 | 2349 | 0 | 3966
# There are several large gaps in the first third of the sequence. The
# rest of the gaps are short. Ensembl does not have a UniProt ID for this
# transcript. Blat aligns this sequence to several places on the genome
# all in close proximity to each other. One alignment corresponds to
# an Ensembl ID, but not the one above. It does align to the region of
# ENSDART00000053813 but with a different exon structure.
# ENSDART00000044490 | Q3ZMH2 | 90.74 | 1004 | 64 | 7 | 0 | 985 | 88 | 1082 | 0 | 1682
# This has a couple of larger gaps. The UniProt sequence aligns to the
# same region as ENSDART00000044490 which has 3 extra exons. There is
# another transcript with the same exon structure.
# ENSDART00000041503 | Q3ZMH2 | 91.42 | 991 | 63 | 5 | 0 | 974 | 82 | 1068 | 0 | 1684
# This has only slightly higher identity.
# ENSDART00000025635 | Q4FE55 | 99.33 | 2545 | 6 | 7 | 0 | 2542 | 0 | 2537 | 0 | 4859
# just short gaps. This Blats to the same region of ENSDART00000025635
# and gives the same exon structure.
# could filter more using pslReps but should not filter on minAli since
# either the query or target could be partial.
# Use identity >= 90% as the cutoff and then associate the RefSeqs with
# ZFIN IDs and update the official ZFIN Gene symbols.
#
ssh hgwdev # kkstore02
cd /cluster/data/danRer3/bed/ensGenes
mkdir alignments
cd alignments
# Add a proteinID column to the ensGene table:
ssh hgwdev
cd /cluster/data/danRer3/bed/ensGenes
# Add protein ID column:
hgsql -e 'alter table ensGene add proteinID varchar(40) NOT NULL;' danRer3
# Add index to this column:
# Next step, download the ZFIN IDs and UniProt IDs
hgsql -e 'alter table ensGene add index(proteinID);' danRer3
hgsql -e 'select count(*) from ensGene;' danRer3
# 32143
hgsql -e 'update ensGene set proteinID = "";' danRer3
# ensBlastp90 is the table in the test database where proteins have
# >=90% identity to the Ensembl proteins.
hgsql -e 'select count(*) from ensGene as g, test.ensBlastp90 as p \
where g.name = p.query;' danRer3
# for >= 90% there are
# 12614
# for >=95%, there are
# 11910
# Use these UniProt IDs to fill in proteinID table.
hgsql -e 'update ensGene as g, test.ensBlastp90 as p \
set g.proteinID = p.target where g.name = p.query;' danRer3
# check that there are 12614 rows with proteinID filled.
hgsql -e 'select count(*) from ensGene where proteinID != "";' danRer3
# 12614
# once this is done, can create ensCanonical and ensIsoforms table -
# see section on "BUILD GENE SORTER TABLES".
# Add table for Ensembl 38 Ensembl Transcript IDs and RefSeq IDs
# and Entrez Gene ID.
ssh hgwdev
cd /cluster/data/danRer3/bed/ensGenes
cat << 'EOF' > ens38Zfish2.sql
CREATE TABLE ens38Zfish2 (
transcriptId varchar(255) not null,
entrezGeneId varchar(255) not null,
refSeqId varchar(255) not null,
refSeqProtId varchar(255) not null
);
'EOF'
# << emacs
tail +2 ensGeneInfo38Coding2.txt > ens38Coding2.tab
hgLoadSqlTab test ens38Zfish2 ens38Zfish2.sql ens38Coding2.tab
# 24523 lines where there is no Entrez Gene Id so these are set to 0.
hgsql -N -e 'select distinct(entrezGeneId) from ens38Zfish2;' test \
| sort > ens38Zfish2.geneId.uniq
wc -l ens38Zfish2.geneId.uniq
# 6764 ens38Zfish2.geneId.uniq
hgsql -e 'select count(distinct extDbId) from ens38Zfish;' test
# 9028
hgsql -N -e 'select distinct(extDbId) from ens38Zfish;' test \
| sort > ens38Zfish.extDbId.sort
grep -v NM ens38Zfish.extDbId.sort > ens38Zfish.extDbIdNoNM.sort
# 8982 left
grep -v BRARE ens38Zfish.extDbIdNoNM.sort \
> ens38Zfish.extDbIdNoNMandNoSP.sort
grep -v NP ens38Zfish.extDbIdNoNMandNoSP.sort \
> ens38Zfish.extDbIdNoNMNoSPNoNP.sort
wc -l ens38Zfish.extDbIdNoNMNoSPNoNP.sort
# 5284 ens38Zfish.extDbIdNoNMNoSPNoNP.sort
awk '{print $2}' ens38/ensToRefSeqvsZFIN.txt | sort | uniq \
> ensToRefSeqvsZFIN.names.uniq
# how many in common
comm -12 ens38Zfish.extDbIdNoNMNoSPNoNP.sort ensToRefSeqvsZFIN.names.uniq \
> common
wc -l common
# 4176 common
comm -23 ens38Zfish.extDbIdNoNMNoSPNoNP.sort ensToRefSeqvsZFIN.names.uniq \
> extDbIdNotfromZFINviaRefSeq
hgsql -N -e 'select mrnaAcc from refLink where locusLinkId != "";' danRer3 | sort | uniq > mrnaAcc.refLink.dr3.uniq
wc -l mrnaAcc.refLink.dr3.uniq
# 8811 mrnaAcc.refLink.dr3.uniq
comm -12 mrnaAcc.refLink.dr3.uniq ensToRefSeq.refseq | wc
# 7738
wc -l ensToRefSeq.refseq
# 7738
# merge the ens38Zfish2 table with ens38ZfishNew (created below) to build
# the cross-reference table for the Known Genes details pages. Changed the
# table name from ensGeneXRef to ensXRefZfish as there are a number of
# tables already with similar names to ensGeneXRef, which would be
# confusing.
# create a table definition for ensXRefZfish:
# (updated 2006-11-08, hartera)
cd ~/kent/src/hg/lib
cat << 'EOF' > ensXRefZfish.as
table ensXRefZfish
"Link from an Ensembl Transcript ID to other database IDs and description."
(
string ensGeneId; "Ensembl Transcript ID"
string zfinId; "ZFIN ID"
string uniProtId; "Unified UniProt protein accession"
string spDisplayId; "UniProt Display ID"
string geneId; "ZFIN Gene Symbol (formerly LocusLink) ID"
string geneSymbol; "Official ZFIN Gene Symbol"
string refSeq; "RefSeq DNA Accession"
string protAcc; "RefSeq Protein Accession"
string description; "Description"
)
'EOF'
autoSql ensXRefZfish.as ensXRefZfish
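# autoSql generates ensXRefZfish.sql (the CREATE TABLE statement) plus
# ensXRefZfish.c and ensXRefZfish.h with load/save routines for the struct.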
mv ensXRefZfish.h ../inc
# commit ensXRefZfish* files to CVS.
# add zfinId, uniProtId, spDisplayId, geneId, geneSymbol, refSeq and
# protAcc as keys. ensGeneId is already the primary key.
# description field is not long enough so it must be changed to a
# longblob.
perl -pi.bak -e 's/description varchar\(255\)/description longblob/' \
ensXRefZfish.sql
# get the gene2refseq file from NCBI to give the Entrez Gene ID
# and symbol for refSeq accessions. Taxonomy ID is 7955 for Danio rerio.
# columns in file are tax_id, GeneID, status,
# RNA nucleotide accession.version, RNA nucleotide gi,
# protein accession.version, protein gi, genomic nucleotide
# accession.version, genomic nucleotide gi, start position on the genomic
# accession, end position on the genomic accession, orientation.
# for the gene_info file, column headings are:
# tax_id, GeneID, Symbol, LocusTag, Synonyms, dbXrefs, chromosome,
# map location, description, type of gene, Symbol from nomenclature
# authority, Full name from nomenclature authority, Nomenclature status.
# DOWNLOAD LATEST versions (from Nov. 8, 2006)
ssh kkstore02
mkdir /cluster/data/danRer3/bed/ensGenes/downloads
cd /cluster/data/danRer3/bed/ensGenes/downloads
wget --timestamp ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2refseq.gz
wget --timestamp ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
gunzip gene2refseq.gz
gunzip gene_info.gz
# get records for taxon ID: 7955
awk '{if ($1 == 7955) print;}' gene2refseq > zfish.gene2refseq
wc -l zfish.gene2refseq
# 14659 zfish.gene2refseq
# 50465 zfish.gene2refseq - in March
# Most of the ones no longer in the gene2refseq file are
# PREDICTED, PROVISIONAL AND MODEL.
# 37206 MODEL
# 6278 PREDICTED
# 6174 PROVISIONAL
# 43 NA
# 13 Reviewed
# 5 REVIEWED
# 1 VALIDATED
# New sequences added:
# 7021 PROVISIONAL
# 6801 PREDICTED
# 52 NA
# 13 Reviewed
# 12 VALIDATED
# 10 INFERRED
# 5 REVIEWED
awk '{if ($1 == 7955) print;}' gene_info > zfish.gene_info
wc -l zfish.gene_info
# 38915 zfish.gene_info
# 38126 zfish.gene_info - in March
# checked that the Ensembl 38 genes for zebrafish are the same as
# for Ensembl 35 for which these files were downloaded (see above -
# updated file names to reflect v38).
# also download the file from ZFIN that gives gene Symbols, ZFIN IDs
# and RefSeq accessions. ZFIN associates more than one ZFIN ID with
# UniProt IDs but there is a one to one relationship for ZFIN IDs
# and RefSeq accessions. Therefore the RefSeq accessions can be used
# to identify a ZFIN ID and gene name and vice versa.
wget --timestamping http://zfin.org/data_transfer/Downloads/refseq.txt
# already loaded the ensGeneInfo38Coding.txt and ensGeneInfo38Coding2.txt
# files into tables so that the information can be put together.
# these are ens38Zfish and ens38Zfish2 in the test database.
# first copy the ens38Zfish table and then replace the uniProtId column
# with the best hits from the ensBlastp90 table.
ssh hgwdev
cd /cluster/data/danRer3/bed/ensGenes
sed -e 's/ens38Zfish/ens38ZfishNew/' ens38Zfish.sql > ens38ZfishNew.sql
# create table
hgsql test < ens38ZfishNew.sql
hgsql -e 'insert into ens38ZfishNew select * from ens38Zfish;' test
# Add spDisplayId column:
hgsql -e \
'alter table ens38ZfishNew add spDisplayId varchar(255) NOT NULL;' test
# add some indices
hgsql -e 'create index uniProt on ens38ZfishNew (uniProt);' test
hgsql -e 'create index query on ens38ZfishNew (transcriptId(20));' test
# first remove uniProt IDs and add those found by Blastp:
hgsql -e 'update ens38ZfishNew set uniProt = "";' test
# add displayIds from uniProt to this table
hgsql -e 'select count(*) from ens38ZfishNew as g, ensBlastp90 as p \
where g.transcriptId = p.query;' test
# 37362
hgsql -e 'update ens38ZfishNew as g, ensBlastp90 as p \
set g.uniProt = p.target where g.transcriptId = p.query;' test
# check that 37362 rows have an entry for uniProt - ok
hgsql -e 'select count(*) from ens38ZfishNew as g, uniProt.displayId as p \
where g.uniProt = p.acc;' test
# 36647
# 36647 have display IDs in UniProt
hgsql -e 'update ens38ZfishNew as g, uniProt.displayId as p \
set g.spDisplayId = p.val where g.uniProt = p.acc;' test
# check that 36647 of the rows have spDisplayId - ok.
# add new columns for ens38ZfishNew
hgsql -e \
'alter table ens38ZfishNew add entrezGeneId varchar(255) NOT NULL;' test
hgsql -e \
'alter table ens38ZfishNew add refSeqId varchar(255) NOT NULL;' test
hgsql -e \
'alter table ens38ZfishNew add refSeqProtId varchar(255) NOT NULL;' test
# merge together the tables.
hgsql -e 'update ens38ZfishNew as g, ens38Zfish2 as e \
set g.entrezGeneId = e.entrezGeneId \
where g.transcriptId = e.transcriptId;' test
hgsql -e 'update ens38ZfishNew as g, ens38Zfish2 as e \
set g.refSeqId = e.refSeqId \
where g.transcriptId = e.transcriptId;' test
hgsql -e 'update ens38ZfishNew as g, ens38Zfish2 as e \
set g.refSeqProtId = e.refSeqProtId \
where g.transcriptId = e.transcriptId;' test
cd /cluster/data/danRer3/bed/ensGenes/downloads/
hgsql -N -e 'select * from ens38ZfishNew;' test > ens38ZfishNew.txt
ssh kkstore04
cd /cluster/data/danRer3/bed/ensGenes/downloads/
# There are 308 cases where there is a RefSeq ID but no Entrez Gene ID.
# There are 1046 cases where there is an Entrez Gene ID but no RefSeq ID.
# Use the NCBI files to fill in the gaps where needed.
# get ZFIN file of ZFIN IDs, gene name and GenBank accession
# refseq.txt has ZFIN IDs, gene name and RefSeq ID.
wget --timestamping http://zfin.org/data_transfer/Downloads/gene_seq.txt
awk '{print $1, $2}' gene_seq.txt | sort | uniq > geneSeq.genes
awk '{print $1, $2}' refseq.txt | sort | uniq > refSeq.genes
comm -23 refSeq.genes geneSeq.genes > refSeqOnly
comm -13 refSeq.genes geneSeq.genes > geneSeqOnly
wc -l *SeqOnly
# 9542 geneSeqOnly
# 827 refSeqOnly
# get certain fields from each file and merge
awk 'BEGIN {FS="\t"} {OFS="\t"} {print $1, $2, $3, $4, $6;}' \
zfish.gene2refseq > zfish.gene2refseqSubset.txt
awk 'BEGIN {FS="\t"} {OFS="\t"} \
{print $2, $3, $5, $6, $9, $10, $11, $12;}' \
zfish.gene_info > zfish.gene_infoSubset.txt
# need to sort on the GeneID field (second field in refseq file and
# first field in gene_info file):
sort -n -k2 zfish.gene2refseqSubset.txt | uniq \
> zfish.gene2refseqSubset.sort
sort -n -k1 zfish.gene_infoSubset.txt | uniq > zfish.gene_infoSubset.sort
# join the two files based on the GeneID (Entrez Gene ID) which is
# the second field in refseq file and first field in gene_info file.
# Need to set the $tab variable in .tcshrc file:
# set tab = " "
join -t "$tab" -1 2 -2 1 zfish.gene2refseqSubset.sort \
zfish.gene_infoSubset.sort \
> zfish.gene2refSeqPlusInfo.txt
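# (in bash the same join can be written without the tcsh $tab variable,
# e.g. join -t $'\t' -1 2 -2 1 ... -- a sketch of the equivalent)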
# The program needs to be written to fill in these gaps for RefSeq ID,
# Entrez Gene ID and RefSeq Peptide ID. It should then look up the
# gene symbol via the ZFIN ID, using the RefSeq ID as the key.
# write a program taking ensGene38Coding.tsv and ensGene38Coding2.tsv as
# input, and also the RefSeq files to find Entrez Gene IDs and Gene
# Symbols, and give the tabbed output for loading into the ensXRefZfish
# table.
# hgEnsGeneXRef.c in ~/kent/src/hg/near/hgZfishEnsXRef
/cluster/home/hartera/bin/x86_64/hgZfishEnsXRef \
ensGeneInfo38.txt zfish.gene2refSeqPlusInfo.txt refseq.txt \
ens38XRefZfish.tab >& ens38XRefZfish.log
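# Roughly the lookup logic used to fill the gaps, as a perl sketch only
# (not the actual hgZfishEnsXRef C implementation; column positions
# assume the joined NCBI file built above: 1=GeneID, 4=RNA accession,
# 5=protein accession, and refseq.txt as 1=zfinId, 2=symbol, 3=RefSeq):
cat << '_EOF_' > xrefSketch.pl
#!/usr/bin/env perl
# build lookup hashes, then fill empty fields in the Ensembl rows
use strict; use warnings;
my (%geneId, %protAcc, %zfin, %symbol);
open(my $n, "<", "zfish.gene2refSeqPlusInfo.txt") or die $!;
while (<$n>) {
    chomp; my @f = split /\t/;
    (my $rna = $f[3]) =~ s/\..*$//;   # strip the .version suffix
    $geneId{$rna}  = $f[0];           # RefSeq DNA acc -> Entrez Gene ID
    $protAcc{$rna} = $f[4];           # RefSeq DNA acc -> RefSeq protein
}
open(my $z, "<", "refseq.txt") or die $!;
while (<$z>) {
    chomp; my @f = split /\t/;
    $zfin{$f[2]}   = $f[0];           # RefSeq acc -> ZFIN ID
    $symbol{$f[2]} = $f[1];           # RefSeq acc -> ZFIN gene symbol
}
# ...then for each Ensembl transcript row, fill an empty entrezGeneId or
# refSeqProtId from %geneId/%protAcc, and set zfinId/geneSymbol via
# %zfin/%symbol, keyed on the row's RefSeq accession.
'_EOF_'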
# load this tabbed file into ensXRefZfish table
ssh hgwdev
cd /cluster/data/danRer3/bed/ensGenes
# remove old table:
hgsql -e 'drop table ensXRefZfish;' danRer3
hgLoadSqlTab danRer3 ensXRefZfish ~/kent/src/hg/lib/ensXRefZfish.sql \
ens38XRefZfish.tab
# loaded with no problems.
# Now need to check its contents:
mkdir testing
cd testing
hgsql -N -e 'select zfinId, geneSymbol, refSeq from ensXRefZfish where \
zfinId != "" AND refSeq != "";' test > zfinIdsymbAndrefseq.txt
sort zfinIdsymbAndrefseq.txt | uniq > zfinIdsymbAndrefseq.sort
sort ../refseq.txt | uniq > refseq.sort
perl -pi.bak -e 's/\t\n/\n/' refseq.sort
comm -23 zfinIdsymbAndrefseq.sort refseq.sort | wc
comm -12 zfinIdsymbAndrefseq.sort refseq.sort | wc
cd /cluster/data/danRer3/bed/ensGenes/testProgram/tmp3
awk 'BEGIN {FS="\t"} {print $5}' ens38ZfishNew.sort | sort | uniq \
> ensFile.zfinIds.sort
# There are 7321 zfin IDs
# 7284 ZFIN IDs in table and 6499 with a RefSeq.
hgsql -N -e 'select distinct(zfinId) from ensXRefZfish where refseq = "" \
and zfinId != "" and geneSymbol = "";' test \
| sort > zfinIdwithNoRefSeqNoSymb.sort
# There are 853 with no refseq but a zfinId and no gene symbol and 690
# are unique ZFIN IDs.
# compare these to ZFIN IDs in the zfish.gene2refSeqPlusInfo.txt from
# NCBI files:
awk 'BEGIN {FS="\t"} {print $8;}' zfish.gene2refSeqPlusInfo.txt \
| sort | uniq > zfinIds.fromNcbiFile.sort
# remove first line and "ZFIN:" prefix
tail +2 zfinIds.fromNcbiFile.sort | sed -e 's/ZFIN://' \
> zfinIds.fromNcbiFile.sort2
comm -13 zfinIds.fromNcbiFile.sort2 zfinIdwithNoRefSeqNoSymb.sort | wc
# 251 of these with no symbols are not found in the NCBI file
comm -12 zfinIds.fromNcbiFile.sort2 zfinIdwithNoRefSeqNoSymb.sort \
> zfinIds.inNcibFile.noRefSeqOrSymbinXRef
awk '{print $1}' refseq.txt | sort | uniq > refseq.zfId.sort
comm -13 refseq.zfId.sort zfinIdwithNoRefSeqNoSymb.sort | wc
# 176 of these with no symbols are not found in the ZFIN RefSeq file
comm -12 refseq.zfId.sort zfinIdwithNoRefSeqNoSymb.sort \
> zfinIds.inZfinFile.noRefSeqOrSymbinXRef
# 435 are in both of these lists
wc -l *.noRefSeqOrSymbinXRef
# 439 zfinIds.inNcibFile.noRefSeqOrSymbinXRef
# 514 zfinIds.inZfinFile.noRefSeqOrSymbinXRef
# edit ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/genome.ra to give
# mySQL queries to ensGtp and ensXRefZfish to retrieve name, protein and
# description. Changed XRef table name to new name.
cat << _EOF_ > ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/genome.ra
name global
knownGene ensGene
knownGenePep ensPep
nameSql select gene from ensGtp where transcript = '%s'
descriptionSql select description from ensXRefZfish where ensGeneId = '%s'
proteinSql select uniProtId from ensXRefZfish where ensGeneId = '%s'
_EOF_
# << happy emacs
# created blastp hgNear tables by alignment of Zebrafish Ensembl peptide
# sequences to the equivalent "Known Genes" peptide sets for other species
# - see hgNear sections above. Then create an otherOrg.ra file for
# zebrafish specifying the species and databases for these organisms
# with blastp homolog tables.
cat << _EOF_ > ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/otherOrgs.ra
name human
db hg18
name mouse
db mm8
name rat
db rn4
name drosophila
db dm1
name cElegans
db ce2
name yeast
db sacCer1
_EOF_
# << this line makes emacs coloring happy
# add Zebrafish-specific section.ra file
cat << _EOF_ > ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/section.ra
name method
shortLabel Methods
longLabel Ensembl Genes Methods, Credits, and Data Use Restrictions
priority 140
_EOF_
# << this line makes emacs coloring happy
# added links to the Zebrafish links.ra file
# update links.ra so that link for Ensembl Genes is to the correct
# stable archive link for Ensembl37 (feb 2006) and change XRef
# table name to new name.
cat << _EOF_ > ~/kent/src/hg/hgGene/hgGeneData/Zebrafish/links.ra
# Zebrafish-specific link info.
# This contains info to construct the quick links.
name genome
tables ensGene
idSql select chrom,txStart+1,txEnd from ensGene where name = '%s'
name family
tables ensGene
idSql select name from ensGene where name = '%s'
name ensemblGenes
shortLabel Ensembl Genes
tables ensGene
idSql select name from ensGene where name = '%s'
url http://feb2006.archive.ensembl.org/Danio_rerio/transview?transcript=%s
priority 25
name zfin
shortLabel ZFIN
tables ensXRefZfish
idSql select zfinId from ensXRefZfish where ensGeneId = '%s'
url http://zfin.org/cgi-bin/webdriver?MIval=aa-markerview.apg&OID=%s
priority 28
name tbSchema
shortLabel Table Schema
tables ensGene
name uniProt
shortLabel UniProt
tables ensXRefZfish
idSql select uniProtId from ensXRefZfish where ensGeneId = '%s'
priority 30
name refSeq
shortLabel RefSeq
tables ensXRefZfish
idSql select refSeq from ensXRefZfish where ensGeneId = '%s'
url http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Search&db=Nucleotide&term=%s&doptcmdl=GenBank&tool=genome.ucsc.edu
priority 40
name refSeqPep
shortLabel RefSeq Peptide
tables ensXRefZfish
idSql select protAcc from ensXRefZfish where ensGeneId = '%s'
url http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Search&db=protein&term=%s&doptcmdl=GenPept&tool=genome.ucsc.edu
priority 42
name entrezGene
shortLabel Entrez Gene
tables ensXRefZfish
idSql select geneId from ensXRefZfish where ensGeneId = '%s'
url http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=gene&cmd=Retrieve&dopt=Graphics&list_uids=%s&tool=genome.ucsc.edu
priority 45
name genBank
hide
name pubMed
hide
name geneCards
hide
name stanfordSource
hide
name cgap
hide
name ensembl
hide
name aceView
hide
_EOF_
# << this line makes emacs coloring happy
# then run "make my" to visualize in one's own sandbox
cd ~/kent/src/hg/hgGene
make my
# commit *.ra files for Zebrafish to CVS.
# edited hgGene.c so that the Gene Symbol (if available) is displayed
# in the description section of the details page.
# added ensXRefZfish to ensemblTranscriptId rules in all.joiner.
# add entry to danRer3/trackDb.ra:
# track ensGene
# shortLabel Ensembl Genes
# longLabel Ensembl v37 Gene Predictions (Protein Coding Genes)
# group genes
# priority 32.8
# visibility pack
# color 150,0,0
# type genePred ensPep
# hgGene on
# STS MARKERS (in progress, 2005-10-13, hartera)
# DOWNLOADED RECENTLY FROM NCBI
ssh kkstore02
mkdir -p /cluster/data/danRer3/bed/stsMarkers
cd /cluster/data/danRer3/bed/stsMarkers
# UniSTS is a unique subset of markers that are STS markers from the
# six zebrafish mapping panels: GAT, HS, LN54, MGH, MOP, T51, and also
# ZMAP which contains markers from the other panels. Among markers in
# these maps, a subset that are STSs with available primer sequences
# were imported into UniSTS. These include submitted maps and those from
# the Zebrafish Information Network (ZFIN).
############################################################################
## BLASTZ swap from mm8 alignments (DONE - 2006-02-28 - Hiram)
ssh pk
cd /cluster/data/mm8/bed/blastzDanRer3.2006-02-28
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF > swap.out 2>&1 &
time nice -n +19 featureBits danRer3 chainMm8Link
# 54831876 bases of 1630323462 (3.363%) in intersection
# SWAP CHAINS/NET RN4 (DONE 4/2/06 angie)
ssh kkstore02
mkdir /cluster/data/danRer3/bed/blastz.rn4.swap
cd /cluster/data/danRer3/bed/blastz.rn4.swap
doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.danRer3/DEF \
-workhorse kkr7u00 >& do.log & tail -f do.log
ln -s blastz.rn4.swap /cluster/data/danRer3/bed/blastz.rn4
############################################################################
## BLASTZ swap from hg17 alignments (DONE 2006-04-09 markd)
ssh pk
mkdir /cluster/data/danRer3/bed/blastz.hg17.swap
ln -s blastz.hg17.swap /cluster/data/danRer3/bed/blastz.hg17
cd /cluster/data/danRer3/bed/blastz.hg17.swap
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -stop=net \
-swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
/cluster/data/hg17/bed/blastz.danRer3/DEF >& swap.out&
# failed due to netChains: looks like previous stage was not
# successful (can't find [danRer3.hg17.]all.chain[.gz]).
#
mv swap.out swap.out.1
# rerun with -continue=net
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -continue=net -stop=net \
-swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
/cluster/data/hg17/bed/blastz.danRer3/DEF >& swap.out&
# create the net file (DONE 2006-04-09 markd)
ssh hgwdev
cd /cluster/data/danRer3/bed/blastz.hg17.swap/axtChain
nice netClass -verbose=0 -noAr noClass.net danRer3 hg17 danRer3.hg17.net
nice gzip danRer3.hg17.net
###########################################################################
# SPLIT SEQUENCE FOR LIFTOVER CHAINS FROM OTHER ASSEMBLIES
# (DONE, 2006-04-17, hartera)
# ADD TO SAN FOR PK RUNS (DONE, 2006-05-30, hartera)
# followed instructions used in makePanTro2.doc
ssh kkr1u00
cd /cluster/data/danRer3/bed
mkdir -p liftOver
cd liftOver
makeLoChain-split danRer3 /cluster/data/danRer3/nib >&! split.log &
# Took about 30 minutes.
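# (the makeLoChain pipeline runs in a fixed order -- split, align, lift,
# chain, net, load -- and each makeLoChain-* step prints the command to
# run next, as followed for danRer2 below)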
# add split10k to san for pk runs (2006-05-30, hartera)
ssh kk
rsync -a --progress /iscratch/i/danRer3/split10k \
/san/sanvol1/scratch/danRer3/
###########################################################################
# LIFTOVER CHAINS TO DANRER2 (DONE, 2006-04-25 - 2006-05-03, hartera)
# CLEANUP BLAT DIRECTORY (DONE, 2006-12-14, hartera)
# Split (using makeLoChain-split) of danRer2 is doc'ed in makeDanRer2.doc
# Do what makeLoChain-split says to do next (start blat alignment)
ssh kk
mkdir -p /cluster/data/danRer3/bed/liftOver
cd /cluster/data/danRer3/bed/liftOver
makeLoChain-align danRer3 /iscratch/i/danRer3/nib danRer2 \
/iscratch/i/danRer2/split10k \
/iscratch/i/danRer2/11.ooc >&! align.log &
# Took about 5 minutes.
# Do what its output says to do next (start cluster job)
cd /cluster/data/danRer3/bed/blat.danRer2.2006-04-25/run
para try, check, push, check, ...
para time >&! run.time
# Completed: 782 of 784 jobs
# Crashed: 2 jobs
# CPU time in finished jobs: 4324484s 72074.73m 1201.25h 50.05d 0.137 y
# IO & Wait Time: 35200s 586.67m 9.78h 0.41d 0.001 y
# Average job time: 5575s 92.92m 1.55h 0.06d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 62741s 1045.68m 17.43h 0.73d
# Submission to last job: 355469s 5924.48m 98.74h 4.11d
# 2 jobs keep crashing so try them on the pk: chrUn_chrUn and chrUn_chr20
# need to copy the danRer2 split10k over to the pk
ssh kkr1u00
mkdir -p /san/sanvol1/scratch/danRer2/split10k
rsync -a --progress /iscratch/i/danRer2/split10k/* \
/san/sanvol1/scratch/danRer2/split10k/
# copy over 11.ooc file for danRer2
cp /iscratch/i/danRer2/11.ooc /san/sanvol1/scratch/danRer2
ssh pk
cd /cluster/data/danRer3/bed/blat.danRer2.2006-04-25/run
mkdir extraRun raw
cd extraRun
grep chrUn_chrUn ../spec > spec
grep chrUn_chr20 ../spec >> spec
# change directories for spec file
perl -pi.bak -e 's#/iscratch/i#/san/sanvol1/scratch#g' spec
rm spec.bak
para create spec
para push, check etc.
para time >& run.time
# Completed: 2 of 2 jobs
# CPU time in finished jobs: 263163s 4386.05m 73.10h 3.05d 0.008 y
# IO & Wait Time: 62s 1.04m 0.02h 0.00d 0.000 y
# Average job time: 131613s 2193.54m 36.56h 1.52d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 147104s 2451.73m 40.86h 1.70d
# Submission to last job: 147104s 2451.73m 40.86h 1.70d
ssh kkr1u00
# merge all raw output:
cd /cluster/data/danRer3/bed/blat.danRer2.2006-04-25
mv ./run/raw/*.psl ./raw/
# lift alignments
cd /cluster/data/danRer3/bed/liftOver
makeLoChain-lift danRer3 danRer2 >&! lift.log &
# Took about 8 minutes to run.
# chain alignments
ssh kki
cd /cluster/data/danRer3/bed/liftOver
makeLoChain-chain danRer3 /iscratch/i/danRer3/nib \
danRer2 /iscratch/i/danRer2/nib >&! chain.log &
# Do what its output says to do next (start cluster job)
cd /cluster/data/danRer3/bed/blat.danRer2.2006-04-25/chainRun
para try, check, push, check etc. ...
para time >&! run.time
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 2751s 45.86m 0.76h 0.03d 0.000 y
# IO & Wait Time: 879s 14.64m 0.24h 0.01d 0.000 y
# Average job time: 130s 2.16m 0.04h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 598s 9.97m 0.17h 0.01d
# Submission to last job: 1520s 25.33m 0.42h 0.02d
# net alignment chains
ssh kkstore02
cd /cluster/data/danRer3/bed/liftOver
makeLoChain-net danRer3 danRer2 >&! net.log &
# Took about 24 minutes to run.
# load reference to over.chain into database table,
# and create symlinks /gbdb and download area
ssh hgwdev
cd /cluster/data/danRer3/bed/liftOver
makeLoChain-load danRer3 danRer2 >&! load.log &
# clean up
rm *.log
# test by converting a region using the "convert" link on
# the browser, and comparing to blat of the same region
# CLEANUP for LiftOver blat directory (2006-12-14, hartera)
ssh kkstore02
rm -r blat.danRer2.2006-04-25
# REDO BACENDS - bacEndPairs, bacEndSingles, bacEndBadPairs and all_bacends
# (split as chrN_allBacends) ONLY (DONE, 2006-05-01 - 2006-05-08, hartera)
# RELOADED chrN_allBacends TABLES (DONE, 2006-06-08, hartera)
# RECREATED all_bacends table WITH ONLY RELEVANT PSLS FOR THE LFS BED
# TABLES FOR PAIRS, PAIRSBAD AND SINGLES (DONE, 2006-08-04, hartera)
# NOTE: there are overlapping BAC clone ends for danRer3. Some of these
# are only a few kb apart (from beginning of one to end of the other)
# so use stricter pslPairs parameters as for human and mouse.
# These BAC Ends should be about 150-200 kb. Typically, they are
# 50 - 300 kb apart.
# NOTE: IN FUTURE, IF SPLITTING all_bacends TABLE BY CHROM AND
# RENAMING AS chrN_allBacends THEN USE allBacends INSTEAD OF
# all_bacends AS ARGUMENT TO pslPairs. THIS WILL THEN AUTOMATICALLY
# ADD THE CORRECT PSL TABLE NAME TO THE BED (LFS) TABLES
ssh kkstore02
mkdir /cluster/data/danRer3/bed/bacends/pairsNew
cd /cluster/data/danRer3/bed/bacends/pairsNew
set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1
/cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose ../bacEnds.psl \
$bacDir/bacEndPairs.txt all_bacends bacEnds
wc -l bacEnds.*
# 1725 bacEnds.long
# 12081 bacEnds.mismatch
# 242235 bacEnds.orphan
# 156444 bacEnds.pairs
# 616 bacEnds.short
# 1017 bacEnds.slop
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes'\
> ../header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header
# make pairs bed file
cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairs.bed
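# (row/sorttbl/headchg are older in-house table tools; a plain-awk sketch
# of the same filter, with score in column 5 of the headerless file:
# awk 'BEGIN {FS="\t"; OFS="\t"} $5 >= 300' bacEnds.pairs \
#     | sort -k1,1 -k2,2n > bacEndPairs.bed )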
# also need to process bacEndSingles.txt into a database table
# for singles in bacEndSingles.txt, create a dummy file where they
# are given zJA11B12T7 as dummy sequence pair. If the single is a forward
# sequence, put the dummy sequence in the second column; if the single is
# a reverse sequence, put it in the first column. Use a perl script to do this.
cd /cluster/data/danRer3/bed/bacends
set bacDir = /cluster/data/danRer3/bed/bacends/bacends.1
mkdir singlesNew
cd singlesNew
cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/formatSingles.pl .
perl formatSingles.pl $bacDir/bacEndSingles.txt > \
$bacDir/bacEndSingles.format
# then run pslPairs on this formatted file
/cluster/bin/i386/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose ../bacEnds.psl $bacDir/bacEndSingles.format \
all_bacends bacEnds
wc -l bacEnds.*
# 0 bacEnds.long
# 0 bacEnds.mismatch
# 11439 bacEnds.orphan
# 0 bacEnds.pairs
# 0 bacEnds.short
# 0 bacEnds.slop
# there are 11439 orphans here and 242235 from pair analysis so
# a total of 253674 orphans
cat bacEnds.orphan ../pairsNew/bacEnds.orphan > bacEnds.singles
wc -l bacEnds.singles
# 253674 bacEnds.singles
# make singles bed file
cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndSingles.bed
cp bacEndSingles.bed ../pairsNew
cd ../pairsNew
# all slop, short, long, mismatch and orphan pairs go into bacEndPairsBad
# since orphans are already in bacEndSingles, do not add these
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
bacEnds.orphan | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairsBad.bed
# add bacEndSingles.bed to bacEnds.load.psl - must not add pair orphans
# twice so create a bed file of bacEndPairsBadNoOrphans.bed without orphans
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
| row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairsBadNoOrphans.bed
# use extractPslLoad later to get all_bacends.psl for database
# There are rows where the alignments were the same but the lfNames are
# different. This is due to the presence of multiple reads for the
# same BAC end sequence. Sometimes they are slightly different lengths
# so the alignments are a little different. It would be good to
# consolidate all of these. Firstly, the identical rows were merged into
# one with a list of all the lfNames corresponding to that alignment.
ssh kkstore02
cd /cluster/data/danRer3/bed/bacends/pairsNew
mkdir -p /cluster/data/danRer3/bed/bacends/duplicatesNew
cd /cluster/data/danRer3/bed/bacends/duplicatesNew
mkdir -p /cluster/bluearc/danRer3/bacends/duplicatesNew/overlapRun
cd /cluster/data/danRer3/bed/bacends/duplicatesNew
ln -s /cluster/bluearc/danRer3/bacends/duplicatesNew/overlapRun
# write program to do this for linked feature series (lfs) which
# is the type of data structure used for BAC ends.
# Need a bed file sorted by chrom and chromStart
cd overlapRun
foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
sort -k1,2 /cluster/data/danRer3/bed/bacends/pairsNew/${f}.bed \
> ${f}.lfs
end
wc -l *.lfs
# 155242 bacEndPairs.lfs
# 15311 bacEndPairsBadNoOrphans.lfs
# 221821 bacEndSingles.lfs
# remove replicate rows where names match and the overlapping region
# (chromEnd - chromStart) is greater than or equal to 0.999.
ssh kolossus
cd /cluster/data/danRer3/bed/bacends/duplicatesNew/overlapRun
foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
echo "Processing $f"
nohup nice /cluster/bin/x86_64/lfsOverlap ${f}.lfs \
${f}.bed -name -minOverlap=0.999 -notBlocks
end
# Started: May 3 23:30 PID: 9199
# pairs started: May 5 18:10, PID: 13232
# Segmentation fault with bacEndSingles. This is a very large file so
# run again using the file split into two
# chr24 starts at line 109407
head -109406 bacEndSingles.lfs > bacEndSinglesPart1.lfs
tail +109407 bacEndSingles.lfs > bacEndSinglesPart2.lfs
# then try again:
foreach f (bacEndSinglesPart1 bacEndSinglesPart2)
echo "Processing $f"
nohup nice /cluster/home/hartera/bin/i386/lfsOverlap ${f}.lfs \
${f}.bed -name -minOverlap=0.999 -notBlocks
end
# merge results
cat bacEndSinglesPart*.bed > bacEndSingles.bed
ssh kkstore02
cd /cluster/data/danRer3/bed/bacends/duplicatesNew/overlapRun
# check the numbers of lines are correct
foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
awk 'BEGIN {OFS="\t"} {print $1,$2,$3,$4,$5}' ${f}.lfs \
| sort | uniq -c | sort -nr > ${f}.uniqCount
end
wc -l *
# 155164 bacEndPairs.bed
# 155242 bacEndPairs.lfs
# 155189 bacEndPairs.uniqCount
# 15293 bacEndPairsBadNoOrphans.bed
# 15311 bacEndPairsBadNoOrphans.lfs
# 15303 bacEndPairsBadNoOrphans.uniqCount
# 221771 bacEndSingles.bed
# 221821 bacEndSingles.lfs
# 221799 bacEndSingles.uniqCount
# 109390 bacEndSinglesPart1.bed
# 109406 bacEndSinglesPart1.lfs
# 112381 bacEndSinglesPart2.bed
# 112415 bacEndSinglesPart2.lfs
# different numbers for unique count since some of these alignments
# were not identical but very close to identical (>0.999 overlap)
cd /cluster/data/danRer3/bed/bacends/duplicatesNew
mv ./overlapRun/* .
rm -r overlapRun /cluster/bluearc/danRer3/bacends/duplicatesNew/overlapRun
# Use a perl script to choose 2 BAC ends to represent each BAC clone.
# Since there is often more than one read for each BAC end in this set,
# 2 were chosen for each BAC pair or 1 for the singles. This was based on
# the ones that had the largest region aligned (using lfSizes).
# copy perl script over that was used for danRer2
cp /cluster/data/danRer2/bed/ZonLab/bacends/duplicates/pickLfNames.pl \
pickLfNamesv2.pl
# edit so that regular expression for matching BAC end names is the
# same as that used in ../bacends.1/getBacEndInfov2.pl
# need to sort by chrom, chromStart
foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
sort -k1 -k2 -k3 ${f}.bed > ${f}Sort.bed
end
# run perl script: input bed file, pairs or singles, name of output file
perl pickLfNamesv2.pl bacEndPairsSort.bed pairs pairs2lfNames.bed
mv error.log log.pairs
# log.pairs is empty
perl pickLfNamesv2.pl bacEndSinglesSort.bed singles singles1lfName.bed
mv error.log log.singles
sort log.singles | uniq > log.singles.uniq
cp bacEndSinglesSort.bed bacEndSingles2Sort.bed
# log.singles has 15 cases where alignments for a BAC clone use
# different sequence reads for either the T7 or SP6 BAC end.
# singles may include both BAC ends for a clone in the case
# where they aligned to different chromosomes or a long way apart on
# the same chromosome (orphans). Mostly, those that have a different read
# align to an almost identical or largely overlapping region.
# CH211-189J23: zC189J23.ya and zC189J23.yb align to overlapping regions.
# Use zC189J23.yb as it aligns to a longer region and remove the other one.
# CH211-42D5
# some sequences appear to be different: CH211-98J20 - zC98J20.yb and
# zC98J20.ya do not align to each other. DKEYP-107B4 - zKp107B4.ya looks
# like it has low complexity sequence, this is discarded and zKp107B4.yb
# is kept. zKp107B4.za and zKp107B4.zb only align in the first ~ 59bp.
# zKp107B4.zb is kept in this case. DKEYP-114B4 - zKp114B4.za: 15-61 bp
# on zKp114B4.za align to 11-58 bp on zKp114B4.zb. zKp114B4.za is kept.
# In these cases, the 2 sequences align to different regions.
# Some sequences have overlapping alignments as one sequence is a bit
# longer than the other.
perl pickLfNamesv2.pl bacEndPairsBadNoOrphansSort.bed pairs \
badPairs2lfNames.bed
mv error.log log.badPairs
# no alignments have a different pair of ends to other alignments
# for each of these new bed files, checks were made that there are
# only 2 BAC ends per alignment for pairs and 1 for singles.
# For each pair, there should only be 2 ends which can appear either
# way round depending on the orientation and there should be 1 end for
# the beginning (suffix T7, t7 or z) and one end for the end
# (suffix SP6, sp6 or y) for each BAC clone. These can appear as e.g.
# either zK7B23T7,zK7B23SP6 or zK7B23SP6,zK7B23T7 for the opposite
# orientation. For singles, there should be a single BAC end for each
# alignment and for each BAC clone, a sequence for either or both types
# of ends may appear e.g. zK153P14SP6 and zK153P14T7 appear in separate
# alignments.
# e.g.
wc -l pairs2lfNames.bed
grep ',' pairs2lfNames.bed
# should be the same number, every line should have a comma
awk '{print $11}' pairs2lfNames.bed | sort | uniq > pairs.ends
sed -e 's/,/\n/g' pairs.ends > pairs.ends2
wc -l pairs.ends2
# should be twice the number of above, just 2 end names per line
perl -pi.bak -e \
's/.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?,?.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?/$1,$2/g' pairs.ends
sort pairs.ends | uniq > pairs.ends.uniq
# check that each of these have the correct pair type
# Finally overlaps in BAC clone names were checked. All BAC clones
# represented in each of the pairs, badPairs and singles bed files are
# unique to that file. Between all three bed files, 300323 BAC clones
# have alignments. 512886 clone ends are aligned in these three bed files.
foreach f (*.bed)
awk '{print $4}' $f | sort | uniq > ${f}.names
end
comm -12 pairs2lfNames.bed.names badPairs2lfNames.bed.names
comm -12 pairs2lfNames.bed.names singles1lfName.bed.names
comm -12 badPairs2lfNames.bed.names singles1lfName.bed.names
# None of these files should have any BAC clone names in common and
# they do not so they are ok.
# clean up:
rm *Part1.bed *Part2.bed *.names *.ends *.ends2 *Part1.lfs *Part2.lfs
rm *.uniqCount
# NOTE: using sort and uniq on hgwdev produces tab delimited output
# after merging rows with the same BAC name, the scoring is now
# wrong in the bed files.
# Scores should be 1000 if there is 1 row for that name, else
# 1500/number of rows for that sequence name - calculated by pslPairs.
# Correct the scores. The co-ordinates for the singles also need to be
# corrected.
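# As a minimal sketch (assuming correctScores2.pl implements essentially
# this rule; the hits files made below are "uniq -c" output, i.e.
# "count name", and the output name is hypothetical):
# awk 'NR == FNR {count[$2] = $1; next;} \
#      {OFS = "\t"; $5 = (count[$4] == 1) ? 1000 : 1500/count[$4]; \
#      print;}' pairs.hits pairs2lfNames.bed > rescored.bed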
mkdir -p /cluster/data/danRer3/bed/bacends/scoresAndCoords
cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
# copy over correctScores2.pl and checkScores.pl scripts from danRer2 and
# edit both scripts so that the hits file is split on spaces, not on tabs
cp \
/cluster/data/danRer2/bed/ZonLab/bacends/scoresAndCoords/correctScores2.pl .
cp \
/cluster/data/danRer2/bed/ZonLab/bacends/scoresAndCoords/checkScores.pl .
awk '{print $4}' ../duplicatesNew/pairs2lfNames.bed \
| sort | uniq -c > pairs.hits
perl correctScores2.pl ../duplicatesNew/pairs2lfNames.bed pairs.hits \
noBin > bacEndPairsGoodScores.bed
# same for singles
awk '{print $4}' ../duplicatesNew/singles1lfName.bed \
| sort | uniq -c > singles.hits
perl correctScores2.pl ../duplicatesNew/singles1lfName.bed singles.hits \
noBin > bacEndSinglesGoodScores.bed
# and for badPairs
awk '{print $4}' ../duplicatesNew/badPairs2lfNames.bed \
| sort | uniq -c > badPairs.hits
perl correctScores2.pl ../duplicatesNew/badPairs2lfNames.bed \
badPairs.hits noBin > bacEndPairsBadGoodScores.bed
# check that the scores are now correct
awk '{print $4, $5}' bacEndPairsGoodScores.bed \
| sort | uniq -c > pairs.count
perl checkScores.pl < pairs.count
# all the BAC clones should be in good.txt and none in bad.txt
# wc -l should give same number of lines in good.txt as in pairs.hits
# repeat for other bed files
awk '{print $4, $5}' bacEndPairsBadGoodScores.bed \
| sort | uniq -c > badPairs.count
perl checkScores.pl < badPairs.count
awk '{print $4, $5}' bacEndSinglesGoodScores.bed \
| sort | uniq -c > singles.count
perl checkScores.pl < singles.count
# for the singles, 7 ended up in bad.txt because their scores
# were 214.285714285714, which is correct for 7 alignments (1500/7);
# rounding the score caused the discrepancy.
# For singles, the co-ordinates in the lfs table are wrong. The
# chromStart should be the same as the lfsStart and chromEnd - chromStart
# should be the same as lfSizes. Need to correct these:
# pslPairs has added min/2 to the end or subtracted min/2 from the start
# depending on whether it is a left or a right BAC end and the
# alignment orientation. min used here was 25000.
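# e.g. (illustration): a single whose block is at 100000-100500
# (lfStart=100000, lfSize=500) was stored by pslPairs either as
# chromStart=87500, chromEnd=100500 (chromStart != lfStart, so reset
# chromStart to lfStart) or as chromStart=100000, chromEnd=113000
# (subtract 12500 from chromEnd); either way the awk below restores
# the row to 100000-100500.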
awk 'BEGIN {FS="\t"} {OFS="\t"} \
{if ($2 != $9) print $1,$9,$3,$4,$5,$6,$7,$8,$9,$10,$11; \
else print $1,$2,$3 - 12500,$4,$5,$6,$7,$8,$9,$10,$11;}' \
bacEndSinglesGoodScores.bed \
> bacEndSinglesGoodScoresAndCoords.bed
# clean up
rm error.log *.txt *.count *.hits bacEndSinglesGoodScore.bed
ssh hgwdev
cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
# copy over table definition from danRer2
cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/bacEndSingles.sql \
../singlesNew/
# Now load database tables:
hgLoadBed danRer3 bacEndPairs bacEndPairsGoodScores.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql -notItemRgb
# Loaded 155164 elements of size 11
hgLoadBed danRer3 bacEndSingles bacEndSinglesGoodScoresAndCoords.bed \
-sqlTable=../singlesNew/bacEndSingles.sql -notItemRgb
# Loaded 221754 elements of size 11
# 221754 record(s), 0 row(s) skipped, 57 warning(s) loading bed.tab
# warnings are unknown but all of bed file loaded and the number
# of warnings is small so ignore
hgLoadBed danRer3 bacEndPairsBad bacEndPairsBadGoodScores.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql -notItemRgb
# Loaded 15293 elements of size 11
# load BAC end sequences into seq table so alignments may be viewed
mkdir -p /gbdb/danRer3/bacends
ln -s /cluster/data/danRer3/bed/bacends/bacSeqs/Zv5BACends.fa \
/gbdb/danRer3/bacends/Zv5BACends.fa
hgLoadSeq danRer3 /gbdb/danRer3/bacends/Zv5BACends.fa
# create file for loading all_bacends table
ssh kkstore02
cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
# for all_bacends table, just load the alignments for those sequences
# represented in the bacEndPairs, bacEndSingles and bacEndPairsBad tables
# bacEnds.load.psl is the file of alignments
# get all the names of sequences
foreach f (*.bed)
echo $f
awk '{print $11;}' $f >> allBacEnds.names
end
wc -l allBacEnds.names
# 392211 allBacEnds.names
# this is the total number of lines in the *.bed files
perl -pi.bak -e 's/,/\n/g' allBacEnds.names
sort allBacEnds.names | uniq > allBacEnds.names.uniq
wc -l allBacEnds.names.uniq
# 512321 allBacEnds.names.uniq
# get alignments for just the BAC ends that are in the database tables
# make bacEnds.load.psl
cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
extractPslLoad -noBin ../bacEnds.psl bacEndPairsGoodScores.bed \
bacEndPairsBadGoodScores.bed bacEndSinglesGoodScoresAndCoords.bed | \
sorttbl tname tstart | headchg -del > bacEnds.load.psl
# check that alignments are present for all BAC ends in
# allBacEnds.names.uniq
awk '{print $10}' bacEnds.load.psl | sort | uniq > bacEnds.names
comm -12 bacEnds.names allBacEnds.names.uniq | wc -l
# 512321
wc -l *
# 512321 allBacEnds.names.uniq
# 512321 bacEnds.names
# Reloaded split tables. Old bacEnds.load.psl was used
# last time. (2006-06-08, hartera)
ssh hgwdev
cd /cluster/data/danRer3/bed/bacends/scoresAndCoords
# remove old all_bacends table. This was moved over from hgwbeta after
# the recent crash of hgwdevold following the power failure.
hgsql -e 'drop table all_bacends;' danRer3
# Display is very slow for BAC ends on large regions. Try splitting
# bacEnds.load.psl and load tables as chrN_allBacends. The parsing
# code is confused if there are two underscores in the table name.
foreach c (`cat /cluster/data/danRer3/chrom.lst`)
echo "Processing $c ..."
awk '{if ($14 == "'chr${c}'") print;}' \
/cluster/data/danRer3/bed/bacends/scoresAndCoords/bacEnds.load.psl \
> chr${c}.bacEnds.load.psl
end
# drop old tables
foreach c (`cat /cluster/data/danRer3/chrom.lst`)
echo $c
hgsql -e "drop table chr${c}_allBacends;" danRer3
end
# load new tables
foreach c (`cat /cluster/data/danRer3/chrom.lst`)
nice hgLoadPsl danRer3 -table=chr${c}_allBacends chr${c}.bacEnds.load.psl
end
# load of chr5_allBacends did not go as planned: 326147 record(s),
# 0 row(s) skipped, 1 warning(s) loading psl.tab
# load of chr8_allBacends did not go as planned: 212665 record(s),
# 0 row(s) skipped, 5 warning(s) loading psl.tab
# load of chr12_allBacends did not go as planned: 156947 record(s),
# 0 row(s) skipped, 1 warning(s) loading psl.tab
# load of chr15_allBacends did not go as planned: 181721 record(s),
# 0 row(s) skipped, 1 warning(s) loading psl.tab
# load of chr19_allBacends did not go as planned: 282423 record(s),
# 0 row(s) skipped, 1 warning(s) loading psl.tab
# load of chr20_allBacends did not go as planned: 315248 record(s),
# 0 row(s) skipped, 7 warning(s) loading psl.tab
# load of chrUn_allBacends did not go as planned: 1524765 record(s),
# 0 row(s) skipped, 487 warning(s) loading psl.tab
# There are still warnings on loading, most (487) are for chrUn.
# alter lfs (BED) tables so that pslTable field is "allBacends"
# instead of all_bacends (this was set by the pslPairs program).
foreach t (bacEndPairs bacEndSingles bacEndPairsBad)
hgsql -e "update $t set pslTable = 'allBacends';" danRer3
end
# This improves the performance a lot.
# corrected termRegex for some bacCloneXRef searches in trackDb.ra so
# that they work correctly (bacPairsIntName, bacSinglesIntName,
# bacPairsSangerSts and bacSinglesSangerSts). (2006-04-19, hartera)
# Remake the all_bacends table. extractPslLoad extracts psl alignments
# by name so even those that are filtered out end up in the all_bacends
# table. Wrote a program that matches BAC end psl alignments from the
# bacEnd{Pairs, PairsBad, Singles} tables by name, chrom, chromStart and
# chromEnd.
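# A rough awk sketch of that matching (the real extractPslForLfs is a
# compiled kent program; file names here are illustrative). lfs bed
# fields: $9=lfStarts, $10=lfSizes, $11=lfNames; psl fields: $10=qName,
# $14=tName, $16=tStart, $17=tEnd:
# awk 'NR == FNR {n = split($9, st, ","); split($10, sz, ","); \
#      split($11, nm, ","); \
#      for (i = 1; i <= n; i++) if (nm[i] != "") \
#      keep[nm[i] "\t" $1 "\t" st[i] "\t" st[i] + sz[i]] = 1; \
#      next;} \
#      ($10 "\t" $14 "\t" $16 "\t" $17) in keep' \
#      pairs2lfNames.bed ../bacEnds.psl > pairsMatched.psl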
ssh kkstore02
cd /cluster/data/danRer3/bed/bacends
mkdir extractPsl
cd extractPsl
# Some scores in bacEndSinglesGoodScoresAndCoords.bed are not integers
# so fix these and also for the other bacEnd files just in case.
cat << '_EOF_' > roundPslScore.pl
#!/usr/bin/perl -w
use strict;
my $file = $ARGV[0];
open(FILE, $file) || die "Can not open $file: $!\n";
while (<FILE>)
{
# round the score field (column 5) in place; substituting the rounded
# number back into the whole line could corrupt a coordinate that
# happens to match the score string
chomp(my $line = $_);
my @f = split(/\t/, $line);
$f[4] = int($f[4] + 0.5);
print join("\t", @f), "\n";
}
close(FILE);
'_EOF_'
chmod +x roundPslScore.pl
set bacDir=/cluster/data/danRer3/bed/bacends
perl roundPslScore.pl $bacDir/scoresAndCoords/bacEndPairsGoodScores.bed \
> bacEndPairsRoundScore.bed
perl roundPslScore.pl $bacDir/scoresAndCoords/bacEndPairsBadGoodScores.bed \
> bacEndPairsBadRoundScore.bed
perl roundPslScore.pl \
$bacDir/scoresAndCoords/bacEndSinglesGoodScoresAndCoords.bed \
> bacEndSinglesRoundScore.bed
nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
$bacDir/bacEnds.psl bacEndPairsRoundScore.bed bacPairs.psl
nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
$bacDir/bacEnds.psl bacEndPairsBadRoundScore.bed bacPairsBad.psl
nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
$bacDir/bacEnds.psl \
bacEndSinglesRoundScore.bed bacSingles.psl
cat bac*.psl > allBacends.load.psl
# Now load database tables:
# Do not need to reload the singles table as it is still the same; the
# scores were rounded to 214 on loading. These are the only scores that
# are floats rather than integers.
# Drop old split bacends tables and reload new one with only those psls
# relevant to alignments in the lfs tables.
ssh hgwdev
cd /cluster/data/danRer3/bed/bacends/extractPsl
foreach c (`cat /cluster/data/danRer3/chrom.lst`)
hgsql -e "drop table chr${c}_allBacends;" danRer3
end
# change the bacEnd{Pairs, PairBad, Singles} tables so that the
# pslTable is all_bacends again.
foreach b (Pairs PairsBad Singles)
hgsql -e "update bacEnd${b} set pslTable = 'all_bacends';" \
danRer3
end
# Then load the all_bacends table. There are now many fewer alignments
# than before, so they can all go in one table; the very large
# all_bacends table was previously slowing down the Browser at
# zoomed-out display levels.
wc -l allBacends.load.psl
# 549408 allBacends.load.psl
hgLoadPsl danRer3 -table=all_bacends allBacends.load.psl
hgsql -e 'select count(*) from all_bacends;' danRer3
# 549408
# Table contains the correct number of rows.
# Get all the lfNames from the bed files and check that these are all
# represented in allBacends.load.psl
ssh kkstore02
cd /cluster/data/danRer3/bed/bacends/extractPsl
foreach p (*RoundScore.bed)
awk '{print $11}' $p >> bedFiles.names
end
perl -pi.bak -e 's/,/\n/' bedFiles.names
sort bedFiles.names | uniq > bedFiles.names.uniq
# get psl file names
awk '{print $10}' allBacends.load.psl | sort | uniq > pslFile.names.uniq
wc -l *.uniq
# 512321 bedFiles.names.uniq
# 512321 pslFile.names.uniq
comm -12 bedFiles.names.uniq pslFile.names.uniq | wc -l
# 512321
# Therefore all names from BED files are in PSL file.
rm bedFiles* pslFile*
cd /cluster/data/danRer3/bed/bacends
rm -r all_bacends
rm ./scoresAndCoords/*.bacEnds.load.psl
# Duplicate rows in bacCloneXRef and bacCloneAlias tables so remove
# these, reload tables and test - see sections on
# CREATE BAC CLONES ALIAS AND CROSS-REFERENCE TABLES and
# BACENDS: TESTING OF bacCloneAlias AND bacCloneXRef TABLES
#######################################################################
# RE-DO RH MAP:
# isPcr of sequences:
# 1) Make a list of sequences from the FASTA file.
# 2) Get one record per file - just need to split on '>'.
# 3) Adapt rhFix to extract the primers, one set per file, named
# after the sequence.
# Run isPcr as a cluster job - one job per sequence and primer set.
# Get the RH map info again; spaces must be removed from the primer seqs.
cat << '_EOF_' > getRhInfo
#!/usr/bin/awk -f
#>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
/^>/ {
sub(/>/,"",$0);
sub(/\//,"_", $0);
gsub(/ /,"",$0);
split(toupper($0), a, "\\|");
print a[1]"."a[9]"\tLG"a[2]"\t"a[3]"\t"a[4]"\t"a[5]"\t"a[9]"\t"a[10]"\t"a[11]"\t"a[12];
next;
}
'_EOF_'
# << keep emacs coloring happy
chmod +x getRhInfo
getRhInfo ../../rhMap.headers2 > rhMapInfo.tab
ssh hgwdev
cd /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306
mkdir -p isPcr/primers
cd isPcr/primers
# create primers files
ssh kkstore02
cd \
/cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/primers
awk 'BEGIN {FS="\t"} {OFS="\t"} {if ($8 != "") print $1,$8,$9 \
> $1".primers.fa"}' rhMapInfo.tab
# there are 7519 primer sets which is correct.
# get list of sequences
cd ..
mkdir markerSeqs
cd markerSeqs
grep '>' ../../rhMap.fa | wc
# 11514
# get all sequences. There are 11514 total.
# use faSplit sequence 11514
# rhMap.fa is file. Need to fix that one name:
perl -pi.bak -e 's/\//_/' ../../rhMap.fa
# splits sequences up with one file per name named with sequence name
faSplit byname ../../rhMap.fa rhMap
ls | wc -l
# 11514
ssh pk
# make run dir on the san and link to isPcr dir
mkdir -p /san/sanvol1/scratch/danRer3/bacends/isPcrRun
cd /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr
ln -s /san/sanvol1/scratch/danRer3/bacends/isPcrRun .
# get list of sequences with primers
cd \
/cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/isPcrRun
awk 'BEGIN {FS="\t"} {OFS="\t"} {if ($8 != "") print $1 \
> "primerSeqs.lst"}' \
/cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/primers/rhMapInfo.tab
foreach m (`cat primerSeqs.lst`)
echo /cluster/bin/x86_64/isPcr -out=psl -minPerfect=2 -maxSize=5000 -tileSize=10 -ooc=/san/sanvol1/scratch/danRer3/danRer3_10.ooc -stepSize=5 /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/markerSeqs/${m}.fa /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/primers/${m}.primers.fa '{'check out line+ /cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr/isPcrRun/out/${m}.psl'}' >> jobList
end
para create jobList
para try, check, push, check etc. ...
# there are 654 that do not have isPcr results. Checked Z4664.MGH and
# found that the primers would not align with Blat either.
# These are listed in unmatchedPrimers. The jobs crashed even with
# maxSize=50000 and with -flipReverse.
mkdir notMatchedPrimers notMatchedSeqs
perl -pi.bak -e 's/\.fa//' unmatchedPrimers
foreach f (`cat unmatchedPrimers`)
set d=/cluster/data/danRer3/bed/ZonLab/rhMap/rhSequenceSubmit022306/isPcr
cp ${d}/primers/${f}.primers.fa ./notMatchedPrimers/
cp ${d}/markerSeqs/${f}.fa ./notMatchedSeqs
end
tar cvzf primers.tar.gz notMatchedPrimers/*primers.fa
tar cvzf markers.tar.gz notMatchedSeqs/*.fa
# sent these to Yi Zhou by e-mail and see if they can look at them.
# include the isPcr parameters.
# To extract sequence from a PSL, tName, tStart and tEnd are needed
# (fields 14, 16 and 17). Then faFrag was used to get the sequence
# from the FASTA file, e.g. as sketched below.
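# e.g. (sketch; file names and coordinates here are hypothetical):
# awk '{print $14, $16, $17}' out/${m}.psl
# faFrag chr14.fa 12345 12545 frag.fa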
############################################################################
## BLASTZ swap from panTro2 alignments (DONE 2006-05-07 markd)
ssh hgwdev64
mkdir /cluster/data/danRer3/bed/blastz.panTro2.swap
ln -s blastz.panTro2.swap /cluster/data/danRer3/bed/blastz.panTro2
cd /cluster/data/danRer3/bed/blastz.panTro2.swap
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -stop=net \
-swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
/cluster/data/panTro2/bed/blastz.danRer3/DEF >& swap.out&
# create the net files
ssh hgwdev
cd /cluster/data/danRer3/bed/blastz.panTro2.swap/axtChain
nice netClass -verbose=0 -noAr noClass.net danRer3 panTro2 danRer3.panTro2.net
###########################################################################
# LIFTOVER CHAINS TO DANRER4 (DONE, 2006-05-31 - 2006-06-06, hartera)
# CLEANUP BLAT DIRECTORY (DONE, 2006-12-14, hartera)
# Split (using makeLoChain-split) of danRer4 is doc'ed in makeDanRer4.doc
# Do what makeLoChain-split says to do next (start blat alignment)
# Use pk as runs faster than on kk. Scripts only run on kk so run manually.
ssh pk
mkdir -p /cluster/data/danRer3/bed/liftOver
cd /cluster/data/danRer3/bed/liftOver
cat << '_EOF_' > align.csh
#!/bin/csh -fe
set oldAssembly = $1
set oldNibDir = $2
set newAssembly = $3
set newSplitDir = $4
set ooc = $5
if ("$ooc" != "") then
set ooc = '-ooc='$ooc
endif
set blatDir = /cluster/data/$oldAssembly/bed/blat.$newAssembly.`date +%Y-%m-%d`
echo "Setting up blat in $blatDir"
rm -fr $blatDir
mkdir $blatDir
cd $blatDir
mkdir raw psl run
cd run
echo '#LOOP' > gsub
echo 'blat $(path1) $(path2) {check out line+ ../raw/$(root1)_$(root2).psl} ' \
'-tileSize=11 '$ooc' -minScore=100 -minIdentity=98 -fastMap' \
>> gsub
echo '#ENDLOOP' >> gsub
# target
ls -1S $oldNibDir/*.{nib,2bit} > old.lst
# query
ls -1S $newSplitDir/*.{nib,fa} > new.lst
gensub2 old.lst new.lst gsub spec
/parasol/bin/para create spec
echo ""
echo "First two lines of para spec:"
head -2 spec
echo ""
echo "DO THIS NEXT:"
echo " cd $blatDir/run"
echo " para try, check, push, check, ..."
echo ""
exit 0
'_EOF_'
# << emacs
chmod +x align.csh
align.csh danRer3 /san/sanvol1/scratch/danRer3/nib danRer4 \
/san/sanvol1/scratch/danRer4/split10k \
/san/sanvol1/scratch/danRer4/danRer4_11.ooc >&! align.log &
# Took a few seconds.
# Do what its output says to do next (start cluster job)
cd /cluster/data/danRer3/bed/blat.danRer4.2006-05-31/run
para try, check, push, check, ...
para time
# Completed: 784 of 784 jobs
# CPU time in finished jobs: 2011355s 33522.59m 558.71h 23.28d 0.064 y
# IO & Wait Time: 3926s 65.43m 1.09h 0.05d 0.000 y
# Average job time: 2571s 42.84m 0.71h 0.03d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 205412s 3423.53m 57.06h 2.38d
# Submission to last job: 219860s 3664.33m 61.07h 2.54d
ssh pk
cd /cluster/data/danRer3/bed/liftOver
cat << '_EOF_' > lift.csh
#!/bin/csh -ef
set oldAssembly = $1
set newAssembly = $2
set newLiftDir = /san/sanvol1/scratch/$newAssembly/split10k
set prefix = /cluster/data/$oldAssembly/bed/blat.$newAssembly
set blatDir = `ls -td $prefix.20* | head -1`
echo "using dir $blatDir"
if ( ! -e $blatDir/raw ) then
echo "Can't find $blatDir/raw"
exit 1
endif
if (`ls -1 $newLiftDir/*.lft | wc -l` < 1) then
echo "Can't find any .lft files in $newLiftDir"
exit 1
endif
cd $blatDir/raw
foreach chr (`awk '{print $1;}' /cluster/data/$newAssembly/chrom.sizes`)
echo $chr
liftUp -pslQ ../psl/$chr.psl $newLiftDir/$chr.lft warn chr*_$chr.psl
end
set execDir = $0:h
echo ""
echo "DO THIS NEXT:"
echo " ssh pk"
echo " $execDir/makeLoChain-chain $oldAssembly <$oldAssembly-nibdir> $newAssembly <$newAssembly-nibdir>"
echo ""
exit 0
'_EOF_'
# << emacs
chmod +x lift.csh
lift.csh danRer3 danRer4 >&! lift.log &
# makeLoChain-chain can be run on pk. chain alignments
makeLoChain-chain danRer3 /san/sanvol1/scratch/danRer3/nib \
danRer4 /san/sanvol1/scratch/danRer4/nib >&! chain.log &
cd /cluster/data/danRer3/bed/blat.danRer4.2006-05-31/chainRun
para try, check, push, check, ...
para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 3414s 56.91m 0.95h 0.04d 0.000 y
# IO & Wait Time: 3256s 54.26m 0.90h 0.04d 0.000 y
# Average job time: 238s 3.97m 0.07h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 280s 4.67m 0.08h 0.00d
# Submission to last job: 280s 4.67m 0.08h 0.00d
# net alignment chains
ssh kkstore02
cd /cluster/data/danRer3/bed/liftOver
makeLoChain-net danRer3 danRer4 >&! net.log &
# load reference to over.chain into database table,
# and create symlinks /gbdb and download area
ssh hgwdev
cd /cluster/data/danRer3/bed/liftOver
makeLoChain-load danRer3 danRer4 >&! load.log &
# clean up
rm *.log
# update md5sum.txt to include this new liftOver file
cd /usr/local/apache/htdocs/goldenPath/danRer3/liftOver
rm md5sum.txt
md5sum *.gz > md5sum.txt
# copy README.txt from another liftOver directory if it is not there already.
# test by converting a region using the "convert" link on
# the browser, and comparing to blat of the same region
# CLEANUP blat directory (2006-12-14, hartera)
ssh kkstore02
rm -r /cluster/data/danRer3/bed/blat.danRer4.2006-05-31
###########################################################################
# CREATE MICROARRAY DATA TRACK BY ADDING ZON LAB WILD TYPE MICROARRAY DATA TO
# AFFY ZEBRAFISH ALIGNMENTS (DONE, 2006-06-10, hartera)
# UPDATE ARRAY DATA TRACK AFTER PROCESSING ARRAY DATA DIFFERENTLY AND
# RELOADING INTO hgFixed (see hgFixed.txt for details).
# (DONE, 2006-10-20, hartera)
# UPDATE ARRAY DATA TRACK AFTER REPROCESSING ARRAY DATA TO ANTILOG THE LOG2
# VALUES FROM NORMALISATION TO GET THE ABSOLUTE VALUES AND
# RELOADING INTO hgFixed (see hgFixed.txt for details).
# (DONE, 2007-01-08, hartera)
# RE-ORDERED DISPLAY IN TRACK (DONE, hartera, 2007-04-09)
# Array data is for whole embryos of five wild type zebrafish strains.
# Data is in hgFixed (see hgFixed.doc) - from Len Zon's lab at Children's
# Hospital Boston. Contact: adibiase@enders.tch.harvard.edu
ssh hgwdev
mkdir /cluster/data/danRer3/bed/ZonLab/wtArray
cd /cluster/data/danRer3/bed/ZonLab/wtArray
# use AllRatio table for mapping. There are not many arrays in this
# dataset so using AllRatio will allow the selection of All Arrays
# from the track controls on the track description page. Also set up the
# Zebrafish microarrayGroups.ra so that the Medians of replicates or
# Means of replicates can also be selected for display.
# Create mapped data in zebrafishZonWT.bed.
rm zebrafishZonWT.bed
hgsql -e 'drop table affyZonWildType;' danRer3
hgMapMicroarray zebrafishZonWT.bed hgFixed.zebrafishZonWTAllRatio \
/cluster/data/danRer3/bed/affyZebrafish/affyZebrafish.psl
# Loaded 15617 rows of expression data from hgFixed.zebrafishZonWTMedian
# Mapped 14494, multiply-mapped 4102, missed 0, unmapped 1123
# Load mapped data into database:
hgLoadBed danRer3 affyZonWildType zebrafishZonWT.bed
# Loaded 18596 elements of size 15
# add trackDb.ra entry at trackDb/zebrafish level
# look at range of scores:
hgsql -N -e 'select expScores from zebrafishZonWTAllRatio;' hgFixed \
> ratioExps.out
perl -pi.bak -e 's/,/\n/g' ratioExps.out
sort ratioExps.out | uniq -c > ratioExps.uniq.count
textHistogram -binSize=0.5 -real -maxBinCount=40 -minVal=-10 \
ratioExps.out > expRatios.hist
# Most values are between -3 and +2.
# Therefore use the following trackDb entry:
# track affyZonWildType
# shortLabel Wild Type Array
# longLabel Zon Lab Expression data for Wild Type Zebrafish strains
# group regulation
# priority 80
# visibility hide
# type expRatio
# expScale 2.0
# expStep 0.2
# groupings affyZonWildTypeGroups
# The .ra file in /usr/local/apache/cgi-bin/hgCgiData/Zebrafish
# (from ~/kent/src/hg/makeDb/hgCgiData/Zebrafish in the source tree)
# which is microarrayGroups.ra defines how the array data is
# displayed and also grouped for the Medians and Means of Replicates.
# It also defines the labels for the track controls for showing
# All Arrays, Arrays Grouped By Replicate Means or
# Arrays Grouped By Replicate Medians. This is in the description field.
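# An illustrative fragment only (keys as used by the formatArray script
# below, not the actual file contents):
# name zebrafishZonWTMedians
# description Arrays Grouped By Replicate Medians
# expIds 0,1,2,...
# names AB-14-somites,AB-36-hpf,...
# groupSizes 9,...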
# RE-ORDER DISPLAY IN TRACK - (hartera, 2007-04-09)
ssh hgwdev
cd ~/kent/src/hg/makeDb/hgCgiData/Zebrafish
# 14 somites and 15 somites should come before 36 hpf
# 14-19 somites stage is 16-19h.
# from hgFixed.zebrafishZonWTAllExps
# for AB, 0-8 should go after 14,
# for TL, 16-22 should go after 24
# for TU, 25-27 should go after 32
# re-order accordingly in the config file:
cd /cluster/data/danRer4/bed/ZonLab/wtArray
cat << '_EOF_' > formatArray
#!/usr/bin/awk -f
BEGIN {FS=","; OFS=","}
/expIds/ {
sub(/expIds /,"",$0);
print "expIds "$10,$11,$12,$13,$14,$15,$1,$2,$3,$4,$5,$6,$7,$8,$9,$16,$24,$25,$17,$18,$19,$20,$21,$22,$23,$29,$30,$31,$32,$33,$26,$27,$28,$34;
next;
}
/names AB-36-hpf,AB-36-hpf 2/ {
sub(/names /,"",$0);
print "names "$10,$11,$12,$13,$14,$15,$1,$2,$3,$4,$5,$6,$7,$8,$9,$16,$24,$25,$17,$18,$19,$20,$21,$22,$23,$29,$30,$31,$32,$33,$26,$27,$28,$34;
next;
}
/names AB-36-hpf,AB-14-somites/ {
sub(/names /,"",$0);
print "names "$2,$1,$3,$5,$4,$7,$8,$6,$9;
next;
}
/groupSizes 9/ {
sub(/groupSizes /,"",$0);
print "groupSizes "$2,$1,$3,$5,$4,$7,$8,$6,$9;
next;
}
{
print $0;
}
'_EOF_'
chmod +x formatArray
formatArray ~/kent/src/hg/makeDb/hgCgiData/Zebrafish/microarrayGroups.ra \
> microarrayGroups2.ra
cp microarrayGroups2.ra \
~/kent/src/hg/makeDb/hgCgiData/Zebrafish/microarrayGroups.ra
cd ~/kent/src/hg/makeDb/hgCgiData/
make my
# after doing make, check the display on hgwdev-hartera,
# then commit to CVS once it works fine.
###########################################################################
# BUILD GENE SORTER TABLES (AKA FAMILY BROWSER)
# (DONE, 2006-06-08 - 2006-06-12, hartera)
# Zon Lab WT Affy data tables in hgFixed renamed to reflect that the data
# is log2 transformed (DONE, 2006-07-30, hartera)
# Recreate the ensToAffyZebrafish and ensToAffyZonWildType tables after
# updating the Affy Zebrafish track with different filtering used for the
# Blat alignments - see UPDATE AFFY ZEBRAFISH TRACK section. Also the
# Affy Zon Lab Wild Type Array data was updated with a different method of
# processing - see hgFixed.txt (DONE, 2006-10-25, hartera)
# Recreated the ensCanonical and ensIsoforms table after updating proteinID
# in ensGene table (DONE, 2006-11-06, hartera)
# This should be done after creating ensGene, ensGtp and ensPep tables
# for the Ensembl Genes track.
# The BlastTab tables are already built - see HGNEAR PROTEIN BLAST TABLES
# Blastp of self is ensZfishBlastTab table.
# Other blastp ortholog tables are: hgBlastTab (hg18), mmBlastTab(mm8),
# rnBlastTab (rn4), dmBlastTab (dm2), ceBlastTab (ce2),
# sacCerBlastTab (sacCer1).
ssh hgwdev
mkdir /cluster/data/danRer3/bed/geneSorter.2006-06-08
ln -s /cluster/data/danRer3/bed/geneSorter.2006-06-08 \
/cluster/data/danRer3/bed/geneSorter
cd /cluster/data/danRer3/bed/geneSorter
# Create table that maps between known genes and RefSeq
# The index is only on the first 16 characters, too short for Ensembl
# names, so hgMapToGene was manually changed to create the index with
# 20 characters on name; a local copy of the program is used.
$HOME/bin/x86_64/hgMapToGene danRer3 refGene ensGene ensToRefSeq
# hgsql -e 'select count(*) from ensToRefSeq;' danRer3
# 9707
# Create table that maps between Ensembl genes and LocusLink
# LocusLink is now called Entrez Gene.
hgsql -N -e "select mrnaAcc,locusLinkId from refLink" danRer3 > refToLl.txt
$HOME/bin/x86_64/hgMapToGene danRer3 refGene ensGene \
ensToLocusLink -lookup=refToLl.txt
# Update the following three tables after update of Affy Zebrafish and
# Affy Zon Lab Wild Type data (2006-10-25):
hgsql -e 'drop table ensToAffyZebrafish;' danRer3
hgsql -e 'drop table ensToAffyZonWildType;' danRer3
hgsql -e 'drop table zebrafishZonWTDistance;' danRer3
# Create table that maps between Ensembl genes and the Affy Zebrfish
# probeset consensus sequences.
$HOME/bin/x86_64/hgMapToGene danRer3 affyZebrafish ensGene \
ensToAffyZebrafish
# Create a table that maps between Ensembl genes and
# the Zon lab microarray expression data.
$HOME/bin/x86_64/hgMapToGene "-type=bed 12" danRer3 affyZonWildType \
ensGene ensToAffyZonWildType
# Create expression distance table.
nice hgExpDistance danRer3 hgFixed.zebrafishZonWTMedianRatio \
hgFixed.zebrafishZonWTMedianExps zebrafishZonWTDistance \
-lookup=ensToAffyZebrafish &
# Have 15617 elements in hgFixed.zebrafishZonWTMedian
# Got 8911 unique elements in hgFixed.zebrafishZonWTMedian
# Made zebrafishZonWTDistance.tab
# Loaded zebrafishZonWTDistance
# Made query index
# Took 2 minutes.
# To allow data to be viewed in Gene Sorter, add the hgNearOk=1
# to the dbDb table entry for danRer3 on hgcentraltest -
# see section on MAKE HGCENTRALTEST ENTRY FOR DANRER3.
# a protein ID field was added to ensGene before running hgClusterGenes.
# Cluster together various alt-splicing isoforms; this creates the
# ensIsoforms and ensCanonical tables (the equivalents of knownIsoforms
# and knownCanonical).
# Rebuild these after updating the ensGene table with
# protein IDs from UniProt with >= 90% identity to Ensembl proteins.
# (2006-11-06, hartera)
hgsql -e 'drop table ensIsoforms;' danRer3
hgsql -e 'drop table ensCanonical;' danRer3
hgClusterGenes danRer3 ensGene ensIsoforms ensCanonical
# Got 22877 clusters, from 32143 genes in 28 chromosomes
# There are also 22877 genes in the ensGtp table so this is in agreement.
#######################################################################
# UPDATE AFFY ZEBRAFISH TRACK USING BLAT WITHOUT -mask OPTION AND
# USING -repeats OPTION AND DIFFERENT FILTERING TO REMOVE SHORT
# ALIGNMENTS (DONE, 2006-09-27, hartera)
# With the previous version of this track, QA found a number of short
# alignments of <= 30 bp and there are a number in the <= 50bp range.
# These do not seem to be meaningful so filtering was changed to try to
# remove these alignments while retaining meaningful alignments.
# pslCDnaFilter was used with the same settings as used for the
# Genbank EST alignments for zebrafish.
# Also use -minIdentity=90 for Blat instead of -minIdentity=95, since the
# higher minIdentity causes alignments to be dropped that should not be.
# Blat's minIdentity is more severe than that of pslReps or
# pslCDnaFilter as it takes insertions and deletions into account.
# These are Jim's recommendations.
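# e.g. (rough illustration): a 100 bp probe with 95 matching bases plus
# a 5 bp insertion is 95% identical within its aligned blocks, but
# Blat's gap-aware identity drops below 95%, so -minIdentity=95 would
# discard an alignment that pslCDnaFilter -minId=0.95 would keep.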
# Remove old Affy zebrafish directories (DONE, 2006-12-13, hartera)
# Array chip sequences already downloaded for danRer1
ssh hgwdev
cd /projects/compbio/data/microarray/affyZebrafish
mkdir -p /san/sanvol1/scratch/affy
cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
/san/sanvol1/scratch/affy/
# Set up cluster job to align Zebrafish consensus sequences to danRer3
# remove old link and create new one
rm /cluster/data/danRer3/bed/affyZebrafish
mkdir -p /cluster/data/danRer3/bed/affyZebrafish.2006-09-27
ln -s /cluster/data/danRer3/bed/affyZebrafish.2006-09-27 \
/cluster/data/danRer3/bed/affyZebrafish
# Align sequences on the pitakluster. Scaffolds were aligned for NA
# and Un and lifted to chrom level afterwards. Chroms 1-25 and M
# were aligned as ~5 Mb chunks.
ssh pk
cd /cluster/data/danRer3/bed/affyZebrafish
mv /san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/scaffold*.fa \
/san/sanvol1/scratch/danRer3/
ls -1 /san/sanvol1/scratch/affy/Zebrafish_consensus.fa > affy.lst
foreach f (/san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/*.fa)
ls -1 $f >> genome.lst
end
wc -l genome.lst
# 15149 genome.lst
# for output:
mkdir -p /san/sanvol1/scratch/danRer3/affy/psl
# use -repeats option to report matches to repeat bases separately
# to other matches in the PSL output.
echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90 -ooc=/san/sanvol1/scratch/danRer3/danRer3_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/scratch/danRer3/affy/psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 genome.lst affy.lst template.sub para.spec
para create para.spec
para try, check, push ... etc.
para time
# Completed: 15149 of 15149 jobs
#CPU time in finished jobs: 34672s 577.87m 9.63h 0.40d 0.001y
#IO & Wait Time: 41580s 692.99m 11.55h 0.48d 0.001 y
#Average job time: 5s 0.08m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 145s 2.42m 0.04h 0.00d
#Submission to last job: 1400s 23.33m 0.39h 0.02d
# need to do pslSort and lift up
ssh pk
cd /san/sanvol1/scratch/danRer3/affy
# Do sort, liftUp and then best in genome filter.
# only use alignments that have at least
# 95% identity in the aligned region.
# Previously minCover was not used since a lot of sequence is in
# Un and NA, so genes may be split up and it is good to see all
# alignments. However, a number of short alignments of <= 50 bp were
# found; these are not meaningful, so minCover is needed. If it is
# increased too much, hits on poor parts of the assembly will be missed.
# use pslCDnaFilter with the same parameters as used for the zebrafish
# Genbank EST alignments.
pslSort dirs raw.psl tmp psl
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl contig.psl
# seqs aligns
# total: 14886 830753
# drop minNonRepSize: 2753 745330
# drop minIdent: 2645 38916
# drop minCover: 2472 10516
# weird over: 384 1529
# kept weird: 308 403
# drop localBest: 2559 17395
# kept: 14494 18596
# 97.3% were kept.
# There are 15502 Affy sequences originally aligned so there are now
# 93.5% remaining.
# lift up the coordinates to chrom level
#pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
# lift up chrom contigs to chrom level
cat /cluster/data/danRer3/jkStuff/liftAll.lft \
/cluster/data/danRer3/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft \
> allLift.lft
liftUp affyZebrafish.psl allLift.lft warn contig.psl
# Got 30168 lifts in allLift.lft
# Lifting contig.psl
# rsync these psl files
rsync -a --progress /san/sanvol1/scratch/danRer3/affy/*.psl \
/cluster/data/danRer3/bed/affyZebrafish/
ssh kkstore02
cd /cluster/data/danRer3/bed/affyZebrafish
# shorten names in psl file
sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.tmp
mv affyZebrafish.psl.tmp affyZebrafish.psl
pslCheck affyZebrafish.psl
# psl is good
# load track into database
ssh hgwdev
cd /cluster/data/danRer3/bed/affyZebrafish
hgsql -e 'drop table affyZebrafish;' danRer3
hgLoadPsl danRer3 affyZebrafish.psl
# Add consensus sequences for Zebrafish chip
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/affyProbes
ln -s \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
/gbdb/hgFixed/affyProbes
# these sequences were loaded previously so no need to reload.
hgLoadSeq -abbr=Zebrafish: danRer3 \
/gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
# Clean up
rm batch.bak contig.psl raw.psl
# check number of short alignments:
hgsql -e \
'select count(*) from affyZebrafish where (qEnd - qStart) <= 50;' danRer3
# 6
# for previous filtered set, there were 1195 alignments of <= 50 bp so
# this has improved.
hgsql -e 'select count(distinct(qName)) from affyZebrafish;' danRer3
# 14494
# Previously 14335 distinct affy sequences were aligned. Many of the
# short alignments may also have longer alignments to different regions
# of the genome that are good.
# CLEANUP:
# remove old Affy Zebrafish alignment directories (hartera, 2006-12-13)
ssh kkstore02
cd /cluster/data/danRer3/bed
rm -r affyZebrafish.2005-08-19
rm -r affyZebrafish.2005-09-25
#########################################################################
# NEW RH MAP SEQUENCES FOR TRACK (in progress, 2006-10-12, hartera)
# Data from Yi Zhou at Boston Children's Hospital:
# yzhou@enders.tch.harvard.edu
ssh kkstore02
mkdir /cluster/data/danRer3/bed/rhMap-2006-10-03
cd /cluster/data/danRer3/bed
ln -s rhMap-2006-10-03 rhMap
# download data files from e-mail:
# rhSequenceSubmit100306.zip and rhSequenceSubmitSeq100306.zip
unzip rhSequenceSubmit100306.zip
unzip rhSequenceSubmitSeq100306.zip
dos2unix rhSequenceSubmit100306.txt
dos2unix rhSequenceSubmitSeq100306.txt
# need to convert format of FASTA file to remove the line numbers
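# e.g. (sketch): sequence lines are numbered like "1 ACGT...", so the
# numbering could be stripped with something like:
# perl -pe 's/^\d+ //' rhSequenceSubmitSeq100306.txt > rhMap.fa
# (done properly with the rhFix script in the REMAKE RH MAP section below)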
###########################################################################
# BACENDS CLEANUP (DONE, 2006-12-13, hartera)
ssh kkstore02
cd /cluster/data/danRer3/bed/bacends
mv ./seqs/getCloneEnds.csh .
rm CHORI73.* DH.* DHBacs.fullnames DHmorethan2.*
rm bacEndsChroms.psl bacNAandUnScafs.psl
rm bacends.lst genome.lst names.psl namesPls.uniq header pslCheck.log \
raw*
rm -r /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnPsl
rm -r /cluster/data/danRer3/bed/bacends/scaffoldsNAandUnPsl
rm -r newPairs2
rm -r /san/sanvol1/scratch/danRer3/bacends/scaffoldsNAandUnRun
rm -r /cluster/data/danRer3/bed/bacends/scaffoldsNAandUnRun
rm -r singles pairs scores
rm -r ./cloneandStsAliases/tmp
rm ./cloneandStsAliases/*.bak ./cloneandStsAliases/*.tab \
./cloneandStsAliases/*.sort ./cloneandStsAliases/*.uniq
rm DH_bacends.fa
rm -r liftedPsl
# the psl directory is large, gzip the contents
cd psl
gzip *.psl
#########################################################################
## Reorder Fish organisms (DONE - 2006-12-22 - Hiram)
hgsql -h genome-testdb hgcentraltest \
-e "update dbDb set orderKey = 451 where name = 'danRer3';"
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna danRer3
###########################################################################
# REMAKE RADIATION HYBRID (RH) MAP TRACK (DONE, 2007-02-14, hartera)
# Use update of Radiation Hybrid map data from October 2006 and use method
# as documented in danRer4.txt to map these sequences to danRer3.
# Data from Yi Zhou at Boston Children's Hospital:
# yzhou@enders.tch.harvard.edu
# Latest RH map sequences and primers received on 2006-10-03 from
# Anhua (Peter) Song - asong@enders.tch.harvard.edu
ssh kkstore02
mkdir /cluster/data/danRer3/bed/ZonLab/rhMap-2006-10-03
cd /cluster/data/danRer3/bed/ZonLab
ln -s rhMap-2006-10-03 rhMap
cd rhMap
# download data files from e-mail:
# rhSequenceSubmit100306.zip and rhSequenceSubmitSeq100306.zip
unzip rhSequenceSubmit100306.zip
unzip rhSequenceSubmitSeq100306.zip
dos2unix rhSequenceSubmit100306.txt
dos2unix rhSequenceSubmitSeq100306.txt
# Sequences are in rhSequenceSubmitSeq100306.txt and primers and other
# information are in rhSequenceSubmi100306.txt
grep '>' rhSequenceSubmitSeq100306.txt | wc -l
# 11514
wc -l rhSequenceSubmit100306.txt
# 13438 rhSequenceSubmit100306.txt
grep '>' rhSequenceSubmitSeq100306.txt > rhMap.names
# remove '>' from names and grab first field
perl -pi.bak -e 's/>//' rhMap.names
awk 'BEGIN {FS="|"} {print $1;}' rhMap.names | sort | uniq \
> rhMap.namesOnly.sort
awk 'BEGIN {FS="|"} {print $1;}' rhSequenceSubmit100306.txt | sort | uniq \
> rhMapPrimers.namesOnly.sort
wc -l *.sort
# 11514 rhMap.namesOnly.sort
# 13436 rhMapPrimers.namesOnly.sort (after removing blank line)
# get a list of headers from the FASTA file
grep '>' rhSequenceSubmitSeq100306.txt > rhMap.headers
awk 'BEGIN {FS="|"} {print $5;}' rhMap.headers | sort | uniq
# BAC_END
# EST
# GENE
# SSLP
# STS
# There are 5 types of sequence here.
awk 'BEGIN {FS="|"} {print $9;}' rhMap.headers | sort | uniq
#BACends
#Custom
#Insertion_Mutant
#Insertion_Mutants
#MGH
#NCBI
#Sanger SG
#Sequencing_Project
#ThisseClone
#Thisse_Clone
#other_zfEst
#wu_zfEst
#wz
awk 'BEGIN {FS="|"} {print $10;}' rhMap.headers | sort | uniq
# CHBG
# MPIEB
# Insertion_Mutant = Insertion_Mutants; ThisseClone = Thisse_Clone;
# So there are 11 different sources.
# There are 2 sequences with problem primers. E-mailed Peter Song about
# these and he suggested deleting those primers:
# >fb33f01.u1|5|388|5615|EST|f|cR|f|wu_zfEst|CHBG|+++33333333333333333333.|
# >zfishb-a976e04.p1c|14|16|158|STS|f|cR|f|Sequencing_Project|CHBG|A|A|
# edit the sequence and primers files and delete these primers.
# need to reformat FASTA headers so they are in the format:
# NAME.SOURCE.TYPE.ORIGIN
# Insertion_Mutant=Insertion_Mutants; Thisse_Clone=ThisseClone
# so change these to have the same name. Also shorten Sanger SG to
# Shotgun.
sed -e 's/Insertion_Mutants/InsertMut/' rhSequenceSubmitSeq100306.txt \
| sed -e 's/Insertion_Mutant/InsertMut/' \
| sed -e 's/Sanger SG/Shotgun/' \
| sed -e 's/ThisseClone/Thisse/' \
| sed -e 's/Thisse_Clone/Thisse/' \
| sed -e 's/Sequencing_Project/Seqproj/' > rhMap100306.fa
# Do the same for the primers and information file:
sed -e 's/Insertion_Mutants/InsertMut/' rhSequenceSubmit100306.txt \
| sed -e 's/Insertion_Mutant/InsertMut/' \
| sed -e 's/Sanger SG/Shotgun/' \
| sed -e 's/ThisseClone/Thisse/' \
| sed -e 's/Thisse_Clone/Thisse/' \
| sed -e 's/Sequencing_Project/Seqproj/' > rhMapPrimers100306.txt
# edit these files to remove the extra newline char after the first primer
# for 1942c and then change "/" in FJ34C05.Y1/FJ56G09.Y1.WU_ZFEST to
# an underscore (2007-02-14, hartera)
perl -pi.bak -e 's/fj34c05\.y1\/fj56g09/fj34c05\.y1_fj56g09/' \
rhMap100306.fa
perl -pi.bak -e 's/fj34c05\.y1\/fj56g09/fj34c05\.y1_fj56g09/' \
rhMapPrimers100306.txt
# use a script to reformat the names for the FASTA headers to the format
# >NAME.SOURCE where name is the first field separated by "|" and source
# is the 9th field. The source is used to make the name unique. Some
# of these names are BAC ends that occur in the BAC ends track so there
# are name clashes in the seq table if the names are not made unique.
# Also make the name upper case as for those for the danRer1 and danRer2
# RH map and remove base numbering on each sequence line of FASTA file.
cat << '_EOF_' > rhFix
#!/usr/bin/awk -f
#>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
/^>/ {
split(toupper($0), a, "\\|");
print a[1]"."a[9];
next;
}
/^[0-9]+ / {
$0 = $2;
}
{
print $0;
}
'_EOF_'
# << keep emacs coloring happy
chmod +x rhFix
rhFix rhMap100306.fa > rhMap.fa
# Blat sequences vs danRer3 genome
ssh pk
mkdir -p /cluster/data/danRer3/bed/ZonLab/rhMap/blatRun
cd /cluster/data/danRer3/bed/ZonLab/rhMap
# put the rhMap sequences on the san
mkdir -p /san/sanvol1/scratch/danRer3/rhMap
cp rhMap.fa /san/sanvol1/scratch/danRer3/rhMap/
# do blat run to align RH map sequences to danRer3 and and use
# chrNA_random and chrUn_random separated into scaffolds.
cd blatRun
ls -1S /san/sanvol1/scratch/danRer3/rhMap/rhMap.fa > rhMap.lst
foreach f (/san/sanvol1/scratch/danRer3/trfFaChromsAndScafs/*.fa)
ls -1S $f >> genome.lst
end
wc -l genome.lst
# 15149 genome.lst
# for output:
mkdir -p /san/sanvol1/scratch/danRer3/rhMap/psl
# use -repeats option to report matches to repeat bases separately
# to other matches in the PSL output.
echo '#LOOP\n/cluster/bin/x86_64/blat -repeats=lower -minIdentity=80 -ooc=/san/sanvol1/scratch/danRer3/danRer3_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/scratch/danRer3/rhMap/psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 genome.lst rhMap.lst template.sub para.spec
para create para.spec
para try, check, push ... etc.
para time
# Completed: 15149 of 15149 jobs
#CPU time in finished jobs: 13684s 228.07m 3.80h 0.16d 0.000y
#IO & Wait Time: 38258s 637.63m 10.63h 0.44d 0.001 y
#Average job time: 3s 0.06m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 24s 0.40m 0.01h 0.00d
#Submission to last job: 901s 15.02m 0.25h 0.01d
# need to do pslSort and lift up
ssh pk
cd /san/sanvol1/scratch/danRer3/rhMap
# Do sort, liftUp and then best in genome filter.
pslSort dirs raw.psl tmp psl
# Use minCover to cut down pileups of sequences aligning in many
# places, but not so high that hits on poor parts of the assembly
# are lost (a lot of sequence is in Un and NA, so genes may be split
# up and it is good to see all alignments).
# Use pslCDnaFilter with the parameters determined for danRer4
# (-minId=0.85 -minCover=0.33):
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.85 -minCover=0.33 raw.psl contig.psl
# seqs aligns
# total: 11060 1767931
# drop invalid: 1 1
#drop minNonRepSize: 3047 1297013
# drop minIdent: 763 3913
# drop minCover: 4065 420022
# weird over: 288 4267
# kept weird: 130 189
# drop localBest: 2188 34092
# kept: 10447 12890
# Percent sequences aligned: 10447/11514 = 90.7%
# This is a compromise between reducing the number of sequences piling
# up and not losing all alignments for too many sequences.
awk '{print $10}' contig.psl | sort | uniq -c | sort -nr > contig.count
head contig.count
# 33 ZKP106G9.YA.BACENDS
# 21 BZ83M20.Z.BACENDS
# 12 ZK4I5.T7.BACENDS
# 10 ZC27I3.ZA.BACENDS
# 10 ZC261G9.ZAF.BACENDS
# 10 ZC261G9.ZA.BACENDS
# 8 ZK8O7.T7.BACENDS
# 8 ZC77P2.ZB.BACENDS
# 8 FJ89A05.X1.WU_ZFEST
# 8 FJ07G09.X1.WU_ZFEST
cd /cluster/data/danRer3/bed/ZonLab/rhMap
# lift up to genome level coordinates
liftUp rhMap.psl \
/cluster/data/danRer3/jkStuff/liftAllPlusliftScaffolds.lft warn \
/san/sanvol1/scratch/danRer3/rhMap/contig.psl
# Got 30168 lifts in
# /cluster/data/danRer3/jkStuff/liftAllPlusliftScaffolds.lft
pslCheck rhMap.psl
# psl looks ok
# cleanup
rm *.bak *.sort
# Load sequence alignments into the database
ssh hgwdev
cd /cluster/data/danRer3/bed/ZonLab/rhMap
# drop old table and reload final psl file
hgsql -e 'drop table rhMap;' danRer3
hgLoadPsl danRer3 rhMap.psl
# Copy sequences to gbdb if they are not already there.
mkdir -p /gbdb/danRer3/rhMap
# remove old sequences
rm /gbdb/danRer3/rhMap/rhMap022306.fa
ln -s \
/cluster/data/danRer3/bed/ZonLab/rhMap/rhMap.fa \
/gbdb/danRer3/rhMap/rhMap20061003.fa
# then add sequences to database:
# remove old ones first
hgsql -e 'select * from extFile where path like "%rhMap%";' danRer3
#| id | name | path | size |
#+--------+----------------+------------------------------------+---------+
#| 747628 | rhMap022306.fa | /gbdb/danRer3/rhMap/rhMap022306.fa | 7456861 |
#+--------+----------------+------------------------------------+---------+
hgsql -e 'select count(*) from seq where extFile = 747628;' danRer3
hgsql -e 'delete from seq where extFile = 747628;' danRer3
# delete from extFile:
hgsql -e 'delete from extFile where id = 747628;' danRer3
hgLoadSeq danRer3 /gbdb/danRer3/rhMap/rhMap20061003.fa
# loaded successfully
# Check in the Browser and see if there are many pileups
# Much reduced now on chr24. Took 10 random sequences in the pileup from
# minCover=0.20 and found that 7 of them still align
# with minCover=0.33, and 2 of those that don't also have primers that
# do not map using the hgPcr tool.
# Add trackDb entry and also an rhMap.html for trackDb/zebrafish/danRer3
# also add the search specs for hgFindSpec to trackDb.ra
# Add table of related information for the RH map details pages:
# Check that all the headers from rhMap.headers are also in the primers
# file which seems to contain the same headers from the FASTA file
# as well as additional markers.
ssh kkstore02
cd /cluster/data/danRer3/bed/ZonLab/rhMap/
# The same RH map version was used as for danRer4 so the data for the
# info table is the same as for danRer4 so copy the file over. See
# kent/src/hg/makeDb/doc/danRer4.txt for details on how this file is
# produced.
cp /cluster/data/danRer4/bed/ZonLab/rhMap/rhMapInfoWithZfinIds.tab .
# load the info table
ssh hgwdev
cd /cluster/data/danRer3/bed/ZonLab/rhMap
hgLoadSqlTab danRer3 rhMapZfishInfo ~/kent/src/hg/lib/rhMapZfishInfo.sql \
rhMapInfoWithZfinIds.tab
# add code to hgc.c to print ZFIN ID, if available, on the details page
# together with the other marker-related information.
# added track to trackDb.ra in trackDb/zebrafish/danRer3 with a URL for
# the ZFIN IDs to link to the relevant page at http://www.zfin.org
# and added an html page for the track.
#########################################################################
# BACENDS CLEANUP (DONE, 2007-03-27, hartera)
ssh kkstore02
cd /cluster/data/danRer3/bed/bacends
# 23G in bacends directory
# remove sequence file as already in bacSeqs dir
rm Zv5Bacends.fa
# du -sh psl
# 12G psl
nice rm -r psl
cd bacends.1
rm bacEndAccs.aliases bacEnds.log bacEnds.names.sort bacPrs.names bacs.log
rm ch211 intNames intNames.count out test test.pl bacEndSingles.txt
rm -r test2 bacEndAccs
rm BACClones* BACEndAccs.txt *.accs allBacEnds* bacEndSingles.names
cd ../scoresAndCoords
rm allBacEnds.names.* bacEndSinglesGoodScores.bed error.log *.tab \
singles.hits bacEnds.load.psl bacEnds.names
rm -r tmp
cd ../pairsNew
# bacEndSingles.bed is already in singlesNew
rm bacEnds.* bed.tab bacEndSingles.bed
cd ../singlesNew
rm singles.coordcheck bed.tab bacEnds.*
cd ../duplicates
# duplicatesNew is latest directory so remove everything else from
# duplicates directory
rm *
cd duplicatesNew
rm log* *.lfs
cd /cluster/data/danRer3/bed/bacends/cloneandStsAliases
rm -r tmp
rm bacClones* bacs.names log
cd /cluster/data/danRer3/bed/
du -sh bacends
# 5.0G bacends
# BAC ENDS track was remade in May 2006 (see REDO BACENDS section)
# so can remove newBacends which is an old version from 2005
du -sh newBacends
# 37G newBacends
nice rm -r newBacends
#########################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
echo danRer3 fr1 tetNig1 mm7 hg18 > /hive/data/genomes/danRer3/bed/multiz5way/species.list
# update genbank.conf, adding:
danRer3.upstreamGeneTbl = refGene
danRer3.upstreamMaf = multiz5way /hive/data/genomes/danRer3/bed/multiz5way/species.list