src/hg/makeDb/doc/danRer4.txt 1.35
1.35 2009/11/25 21:48:39 hiram
change autoScaleDefault to autoScale
Index: src/hg/makeDb/doc/danRer4.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/danRer4.txt,v
retrieving revision 1.34
retrieving revision 1.35
diff -b -B -U 1000000 -r1.34 -r1.35
--- src/hg/makeDb/doc/danRer4.txt 17 Oct 2008 01:06:31 -0000 1.34
+++ src/hg/makeDb/doc/danRer4.txt 25 Nov 2009 21:48:39 -0000 1.35
@@ -1,10285 +1,10285 @@
# for emacs: -*- mode: sh; -*-
# Danio rerio (zebrafish) from Sanger, version Zv6 (released March 2006)
# Project website:
# http://www.sanger.ac.uk/Projects/D_rerio/
# Assembly notes:
# http://www.sanger.ac.uk/Projects/D_rerio/
# ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6_assembl_information.shmtl
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-----------+-------------------------+
# | tableName | type |
# +-----------+-------------------------+
# | refGene | genePred refPep refMrna |
# | mgcGenes | genePred |
# | genscan | genePred genscanPep |
# +-----------+-------------------------+
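# For example (a sketch for a future build, not how the tables above were
# originally loaded; the file name is illustrative), hgLoadGenePred creates
# genePred tables with a bin column:
#    hgLoadGenePred danRer4 mgcGenes mgcGenes.gp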
###########################################################################
# DOWNLOAD SEQUENCE (DONE, 2006-03-29, hartera)
# CHANGED NAME OF SCAFFOLDS AGP FILE (DONE, 2006-04-13, hartera)
ssh kkstore01
mkdir /cluster/store8/danRer4
ln -s /cluster/store8/danRer4 /cluster/data
cd /cluster/data/danRer4
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/README
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6.chunks.agp
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6.scaffold.agp
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6_scaffolds.fa
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv6release/Zv6_scaffolds.stats
# keep agp file name consistent with Zv5 (hartera, 2006-04-13)
mv Zv6.scaffold.agp Zv6.scaffolds.agp
###########################################################################
# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE, 2006-03-29, hartera)
# ADDED CHUNKS AGP FILE (DONE, 2006-04-13, hartera)
ssh kkstore01
mkdir -p /cluster/data/danRer4/M
cd /cluster/data/danRer4/M
# go to http://www.ncbi.nih.gov/ and search the Nucleotide database for
# "Danio mitochondrion genome". That shows the gi number:
# 8576324 for the accession, AC024175
# Use that number in the entrez linking interface to get fasta:
wget -O chrM.fa \
'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=8576324&dopt=FASTA'
# Edit chrM.fa: make sure the header line says it is the
# Danio rerio mitochondrion complete genome, and then replace the
# header line with just ">chrM".
perl -pi.bak -e 's/>.+/>chrM/' chrM.fa
rm *.bak
# Make a "pseudo-contig" for processing chrM too:
mkdir ./chrM_1
sed -e 's/chrM/chrM_1/' ./chrM.fa > ./chrM_1/chrM_1.fa
mkdir ./lift
echo "chrM_1/chrM_1.fa.out" > ./lift/oOut.lst
echo "chrM_1" > ./lift/ordered.lst
# make sure this is tab delimited:
echo "0\tM/chrM_1\t16596\tchrM\t16596" > ./lift/ordered.lft
# create a .agp file for chrM as hgGoldGapGl and other
# programs require a .agp file so create chrM.agp
echo "chrM\t1\t16596\t1\tF\tAC024175.3\t1\t16596\t+" \
> chrM.agp
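# Note: csh echo does not expand "\t", so check these files and fix the
# tabs by hand if needed. A sketch of a way to guarantee literal tabs:
#    printf "chrM\t1\t16596\t1\tF\tAC024175.3\t1\t16596\t+\n" > chrM.agp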
# Create a chrM.chunks.agp (hartera, 2006-04-13)
mkdir -p /cluster/data/danRer4/M/agps
cd /cluster/data/danRer4/M/agps
awk 'BEGIN {OFS="\t"} \
{print $1, $2, $3, $4, $5, $6, $7, $8, $1, $7, $8}' \
../chrM.agp > chrM.chunks.agp
# make sure that all above *.agp files are tab delimited
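# a quick check (sketch): each file should report a single field count
#    awk -F'\t' '{print NF}' ../chrM.agp chrM.chunks.agp | sort | uniq -c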
###########################################################################
# CREATE LIST OF CHROMOSOMES (DONE, 2006-04-12, hartera)
# Change names of random chroms to chrNA_random and chrUn_random
# (DONE, hartera, 2006-04-21)
ssh kkstore01
cd /cluster/data/danRer4
awk '{if ($1 !~ /Zv6/) print $1;}' Zv6.scaffolds.agp \
| sort -n | uniq > chrom.lst
cp chrom.lst chrom1to25.lst
# add chrM, chrUn and chrNA
echo "M" >> chrom.lst
echo "NA" >> chrom.lst
echo "Un" >> chrom.lst
# Change names of random chroms to reflect that they are unordered
# collections of scaffolds
perl -pi.bak -e 's/NA/NA_random/' chrom.lst
perl -pi.bak -e 's/Un/Un_random/' chrom.lst
rm *.bak
###########################################################################
# MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2006-04-12, hartera)
ssh kkstore01
cd /cluster/data/danRer4
# This used to hold scripts -- better to keep them inline here
# Now it should just hold lift file(s) and
# temporary scripts made by copy-paste from this file.
mkdir /cluster/data/danRer4/jkStuff
# This is where most tracks will be built:
mkdir /cluster/data/danRer4/bed
###########################################################################
# CHECK AGP FILES AND FASTA SIZE CONSISTENCY (DONE, 2006-04-13, hartera)
#
ssh kkstore01
cd /cluster/data/danRer4
mkdir -p /cluster/data/danRer4/scaffolds
cd /cluster/data/danRer4/scaffolds
faSize detailed=on ../Zv6_scaffolds.fa > Zv6.scaffolds.sizes
# Check that these sizes correspond to the sizes in the scaffolds agp file
# use script compareSizes2.pl
cat << '_EOF_' > ../jkStuff/compareSizes2.pl
#!/usr/bin/perl -w
use strict;
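# Compare each scaffold's size from the faSize detailed output (arg 1)
# with the component end coordinate recorded in the scaffolds agp (arg 2);
# write an ok/mismatch/missing line for each scaffold to log.txt.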
my ($file, $agp);
$file = $ARGV[0];
$agp = $ARGV[1];
open(FILE, $file) || die "Can not open $file: $!\n";
open(AGP, $agp) || die "Can not open $agp: $!\n";
open(OUT, ">log.txt") || die "Can not create log.txt: $!\n";
my ($l, @f, $name, $size, %scafsHash);
while (<FILE>)
{
$l = $_;
@f = split(/\t/, $l);
$name = $f[0];
$size = $f[1];
$scafsHash{$name} = $size;
}
close FILE;
while (<AGP>)
{
my ($line, @fi, $scaf, $end);
$line = $_;
if ($line =~ /Zv/)
{
@fi = split(/\t/, $line);
$scaf = $fi[5];
$end = $fi[7];
if (exists($scafsHash{$scaf}))
{
if ($scafsHash{$scaf} == $end)
{
print OUT "$scaf - ok\n";
}
else
{
print OUT "$scaf - different size to sequence\n";
}
}
else
{
print OUT "$scaf - does not exist in list of sizes\n";
}
}
}
close AGP;
close OUT;
'_EOF_'
# << happy emacs
chmod +x ../jkStuff/compareSizes2.pl
perl /cluster/data/danRer4/jkStuff/compareSizes2.pl \
Zv6.scaffolds.sizes ../Zv6.scaffolds.agp
grep different log.txt
grep not log.txt
# these are all consistent with the sequence sizes
# check that the co-ordinates in the agp files are consistent:
# field 2 is the start position, field 3 is the end and field 8 is the size
# so check that this is consistent.
cd /cluster/data/danRer4
awk '{if ($6 ~ /^Zv6/ && (($3-$2+1) != $8)) print $6;}' Zv6.scaffolds.agp \
> Zv6.scaffolds.coordCheck
# this file is empty so they are ok. do the same for the chunks.agp file
awk '{if ($6 ~ /^Zv6/ && (($3-$2+1) != $8)) print $6;}' Zv6.chunks.agp \
> Zv6.chunks.coordCheck
# this file is empty so ok
# check that the difference between 7th and 8th fields is the same as the
# difference between 11th and 12th fields.
awk '{if ($5 != "N" && (($8 - $7) != ($12 - $11))) print $6;}' \
Zv6.chunks.agp > Zv6.chunks.coordCheck2
# these are all ok
rm Zv6.*.coord*
cat << '_EOF_' > ./jkStuff/checkSizesInAgps.pl
#!/usr/bin/perl -w
use strict;
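# Build a hash of each scaffold's chromosome end coordinate from the
# scaffolds agp (arg 1), then walk the chunks agp (arg 2) and check that
# the last chunk of each scaffold ends at the same chromosome coordinate,
# printing "ok" or "not consistent" for every scaffold.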
my ($ch, $sc, %scafsHash);
$sc = $ARGV[0]; # scaffolds agp
$ch = $ARGV[1]; # chunks or contigs agp
open(SCAFS, $sc) || die "Can not open $sc: $!\n";
open(CHUNKS, $ch) || die "Can not open $ch: $!\n";
while (<SCAFS>)
{
my ($l, @f, $name, $e);
$l = $_;
@f = split(/\t/, $l);
if ($f[5] =~ /^Zv/)
{
$name = $f[5];
$e = $f[2];
$scafsHash{$name} = $e;
}
}
close SCAFS;
my $scaf = "";
my $prev = "";
my $prevEnd = 0;
while (<CHUNKS>)
{
my ($line, @fi);
$line = $_;
@fi = split(/\t/, $line);
# if it is not a gap line
if ($fi[4] ne "N")
{
$scaf = $fi[9];
if (($scaf ne $prev) && ($prev ne ""))
{
checkCoords($prev, $prevEnd);
}
$prev = $scaf;
$prevEnd = $fi[2];
}
}
# check last entry in file
checkCoords($prev, $prevEnd);
close CHUNKS;
sub checkCoords {
my ($name, $end) = @_;
if (exists($scafsHash{$prev}))
{
if ($scafsHash{$prev} != $prevEnd)
{
my $ed = $scafsHash{$prev};
print "Scaffold $prev is not consistent between agps\n";
}
else
{
my $ed = $scafsHash{$prev};
print "Scaffold $prev - ok\n";
}
}
}
'_EOF_'
# << happy emacs
chmod +x ./jkStuff/checkSizesInAgps.pl
cd scaffolds
perl /cluster/data/danRer4/jkStuff/checkSizesInAgps.pl \
Zv6.scaffolds.agp Zv6.chunks.agp > Zv6.scafsvschunks
grep "not consistent" Zv6.scafsvschunks
# no inconsistencies were reported
wc -l Zv6.scafsvschunks
# 6653 Zv6.scafsvschunks
grep "Zv6" Zv6.scaffolds.agp | wc -l
# 6653
# so all the scaffolds were checked and were ok.
cd ..
rm -r scaffolds
###########################################################################
# SPLIT AGP FILES BY CHROMOSOME (DONE, 2006-04-13, hartera)
# GENOME FASTA FROM SANGER WAS CREATED USING SCAFFOLDS AGP
ssh kkstore01
cd /cluster/data/danRer4
# There are 2 .agp files: one for scaffolds (supercontigs on danRer1) and
# then one for chunks (contigs on danRer1) showing how they map on to
# scaffolds.
# get list of scaffolds from FASTA file and check these are in agp
grep '>' Zv6_scaffolds.fa | sed -e 's/>//' | sort | uniq > Zv6FaScafs.lst
# get list of scaffolds from agp - do not print from gap lines
awk '{if ($7 !~ /contig/) print $6;}' Zv6.scaffolds.agp \
| sort | uniq > Zv6AgpScafs.lst
diff Zv6FaScafs.lst Zv6AgpScafs.lst
# no difference so all scaffolds are in the FASTA file
# add "chr" prefix for the agp files
perl -pi -e 's/^([0-9]+)/chr$1/' ./*.agp
# for chromosomes 1 to 25, create 2 agps for each chrom, one for scaffolds
# and one for chunks:
foreach c (`cat chrom1to25.lst`)
echo "Processing $c ..."
mkdir $c
perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
./Zv6.chunks.agp \
> $c/chr$c.chunks.agp
perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
./Zv6.scaffolds.agp \
> $c/chr$c.scaffolds.agp
end
###########################################################################
# CREATE AGP FILES FOR chrNA AND chrUn (DONE, 2006-04-13, hartera)
# RECREATE AGP FILES WITH chrNA and chrUn RENAMED AS chrNA_random
# AND chrUn_random (DONE, 2006-04-21, hartera)
# NOTE: IN THIS ASSEMBLY AND IN FUTURE, NAME chrNA AND chrUn AS
# chrNA_random AND chrUn_random TO REFLECT THAT THEY ARE UNORDERED
# COLLECTIONS OF SCAFFOLDS.
ssh kkstore01
# chrNA_random consists of WGS contigs that could not be related to any
# FPC contig and the scaffolds and contigs are named Zv6_NAN in the
# first field of the agp files where the second N is a number.
cd /cluster/data/danRer4
mkdir ./NA_random
awk '{if ($1 ~ /Zv6_NA/) print;}' Zv6.chunks.agp \
> ./NA_random/NA_random.chunks.agp
awk '{if ($1 ~ /Zv6_NA/) print;}' Zv6.scaffolds.agp \
> ./NA_random/NA_random.scaffolds.agp
# change the first field to "chrNA_random" then can use agpToFa to process
perl -pi.bak -e 's/Zv6_NA[0-9]+/chrNA_random/' ./NA_random/*.agp
wc -l ./NA_random/NA_random.scaffolds.agp
# 2898 ./NA_random/NA_random.scaffolds.agp
# check files and remove backup files
# these are not sorted numerically by scaffold number
rm ./NA_random/*.bak
# then process chrUn_random - this is made from scaffolds and
# contigs where the name is Zv6_scaffoldN in the first field of the
# agp files. These scaffolds and contigs are unmapped to chromosomes
# in the agp file. chrUn_random is made up of WGS scaffolds that mapped to
# FPC contigs, but the chromosome is unknown.
mkdir ./Un_random
awk '{if ($1 ~ /Zv6_scaffold/) print;}' Zv6.chunks.agp \
> ./Un_random/Un_random.chunks.agp
awk '{if ($1 ~ /Zv6_scaffold/) print;}' Zv6.scaffolds.agp \
> ./Un_random/Un_random.scaffolds.agp
# change the first field to "chrUn_random" then can use agpToFa to process
perl -pi.bak -e 's/Zv6_scaffold[0-9]+/chrUn_random/' ./Un_random/*.agp
wc -l ./Un_random/Un_random.scaffolds.agp
# 68 ./Un_random/Un_random.scaffolds.agp
# check files and remove backup files
rm ./Un_random/*.bak
# get FASTA file of sequences for NA_random and Un_random and create agp with
# Ns between scaffolds
# from scaffolds agp, get name of scaffolds to be retrieved from the
# FASTA file to make the NA_random and Un_random chromosomes.
cd /cluster/data/danRer4
foreach c (NA_random Un_random)
awk '{print $6;}' $c/$c.scaffolds.agp > $c/chr$c.scaffolds.lst
$HOME/bin/i386/faSomeRecords /cluster/data/danRer4/Zv6_scaffolds.fa \
$c/chr$c.scaffolds.lst $c/chr$c.fa
end
# check that all scaffolds in the list are in the FASTA file for
# NA_random and Un_random.
# made a change to scaffoldFaToAgp.c so that the number of Ns to be
# inserted between scaffolds can be specified as an option.
# There are fewer and smaller random scaffolds than before so use 50,000 Ns
# between scaffolds as for the human random chromosomes.
foreach c (NA_random Un_random)
$HOME/bin/i386/scaffoldFaToAgp -scaffoldGapSize=50000 $c/chr$c.fa
mv $c/chr$c.fa $c/chr$c.scaffolds.fa
end
# change chrUn to chrNA_random for NA_random, and chrUn to chrUn_random
# for Un_random. Change D to W for NA_random and Un_random.
sed -e 's/chrUn/chrNA_random/' ./NA_random/chrNA_random.agp \
| sed -e 's/D/W/' > ./NA_random/chrNA_random.scaffolds.agp
# the scaffolds agp for chrNA_random is now sorted numerically by
# scaffold number
sed -e 's/chrUn/chrUn_random/' ./Un_random/chrUn_random.agp \
| sed -e 's/D/W/' > ./Un_random/chrUn_random.scaffolds.agp
# edit ./NA_random/chrNA_random.scaffolds.agp and
# ./Un_random/chrUn_random.scaffolds.agp and remove last line as this
# just adds an extra 50000 Ns at the
# end of the sequence.
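# (a sketch of making that edit non-interactively; sed '$d' deletes the
# final line)
#    sed '$d' NA_random/chrNA_random.scaffolds.agp > tmp.agp
#    mv tmp.agp NA_random/chrNA_random.scaffolds.agp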
rm ./NA_random/chrNA_random.agp ./Un_random/chrUn_random.agp
cat << '_EOF_' > ./jkStuff/createAgpWithGaps.pl
#!/usr/bin/perl
use strict;
# This script takes a chunks agp and inserts Ns between scaffolds for
# the chunks (contigs) agp file. Could also insert Ns between scaffolds
# for scaffolds agp.
my ($name, $prev, $st, $end, $prevEnd, $id);
my $chrom = $ARGV[0]; # chromosome name
my $numN = $ARGV[1]; # number of Ns to be inserted
my $type = $ARGV[2]; # contigs or scaffolds
$prev = "";
$st = 1;
$prevEnd = 0;
$id = 0;
while (<STDIN>)
{
my $l = $_;
my @f = split(/\t/, $l);
if ($type eq "contigs")
{
$name = $f[9];
}
else
{
$name = $f[5]
}
my $currSt = $f[1];
my $currEnd = $f[2];
my $size = $currEnd - $currSt;
$id++;
$st = $prevEnd + 1;
$end = $st + $size;
if (($prev ne "") && ($prev ne $name))
{
$st = $prevEnd + 1;
$end = ($st + $numN) - 1;
print "$chrom\t$st\t$end\t$id\tN\t$numN\tcontig\tno\n";
$prevEnd = $end;
$id++;
}
$st = $prevEnd + 1;
$end = $st + $size;
print "$chrom\t$st\t$end\t$id\t$f[4]\t$f[5]\t$f[6]\t$f[7]\t$f[8]";
if ($type eq "contigs")
{
print "\t$f[9]\t$f[10]\t$f[11]";
}
$prevEnd = $end;
$prev = $name;
}
'_EOF_'
chmod +x ./jkStuff/createAgpWithGaps.pl
cd /cluster/data/danRer4/NA_random
# for NA_random, sort the chunks.agp by contig number
perl -pi.bak -e 's/Zv6_NA//' NA_random.chunks.agp
sort -k6,6n NA_random.chunks.agp > NA_random.chunks2.agp
# then put back Zv6_NA
perl -pi.bak -e 's/([0-9]+\.[0-9]+)/Zv6_NA$1/' NA_random.chunks2.agp
mv NA_random.chunks2.agp NA_random.chunks.agp
# Un_random.chunks.agp is already sorted by scaffold number
cd /cluster/data/danRer4
foreach c (NA_random Un_random)
cd $c
perl /cluster/data/danRer4/jkStuff/createAgpWithGaps.pl \
chr${c} 50000 contigs < ${c}.chunks.agp > chr${c}.chunks.agp
cd ..
end
# check co-ordinates
# field 2 is the start position, field 3 is the end and field 8 is the size
# so check that this is consistent in scaffolds and chunks agp.
# check that the difference between 7th and 8th fields is the same as the
# difference between 11th and 12th fields for chunks agp.
cd /cluster/data/danRer4
foreach c (NA_random Un_random)
awk '{if ($6 ~ /^Zv6/ && (($3-$2+1) != $8)) print $6;}' \
$c/chr${c}.scaffolds.agp > $c/chr${c}.scaffolds.coordCheck
awk '{if ($6 ~ /^Zv6/ && (($3-$2+1) != $8)) print $6;}' \
$c/chr${c}.chunks.agp > $c/chr${c}.chunks.coordCheck
awk '{if ($5 != "N" && (($8 - $7) != ($12 - $11))) print $6;}' \
$c/chr${c}.chunks.agp > $c/chr${c}.chunks.coordCheck2
end
# check the outputs are empty
wc -l NA_random/*.coord*
wc -l Un_random/*.coord*
rm NA_random/*.coord* Un_random/*.coord*
# check that the scaffolds and chunks agp files are consistent with
# each other.
cat << '_EOF_' > ./jkStuff/checkSizesInAgps.pl
#!/usr/bin/perl -w
use strict;
my ($ch, $sc, %scafsHash);
$sc = $ARGV[0]; # scaffolds agp
$ch = $ARGV[1]; # chunks or contigs agp
open(SCAFS, $sc) || die "Can not open $sc: $!\n";
open(CHUNKS, $ch) || die "Can not open $ch: $!\n";
while (<SCAFS>)
{
my ($l, @f, $name, $e);
$l = $_;
@f = split(/\t/, $l);
if ($f[5] =~ /^Zv/)
{
$name = $f[5];
$e = $f[2];
$scafsHash{$name} = $e;
}
}
close SCAFS;
my $scaf = "";
my $prev = "";
my $prevEnd = 0;
while (<CHUNKS>)
{
my ($line, @fi);
$line = $_;
@fi = split(/\t/, $line);
# if it is not a gap line
if ($fi[4] ne "N")
{
$scaf = $fi[9];
if (($scaf ne $prev) && ($prev ne ""))
{
checkCoords($prev, $prevEnd);
}
$prev = $scaf;
$prevEnd = $fi[2];
}
}
# check last entry in file
checkCoords($prev, $prevEnd);
close CHUNKS;
sub checkCoords {
my ($name, $end) = @_;
if (exists($scafsHash{$prev}))
{
if ($scafsHash{$prev} != $prevEnd)
{
my $ed = $scafsHash{$prev};
print "Scaffold $prev is not consistent between agps\n";
}
else
{
my $ed = $scafsHash{$prev};
print "Scaffold $prev - ok\n";
}
}
}
'_EOF_'
# << happy emacs
chmod +x jkStuff/checkSizesInAgps.pl
foreach c (NA_random Un_random)
perl /cluster/data/danRer4/jkStuff/checkSizesInAgps.pl \
$c/chr${c}.scaffolds.agp $c/chr${c}.chunks.agp \
> $c/${c}.scafsvschunks
end
foreach c (NA_random Un_random)
grep "not consistent" $c/${c}.scafsvschunks
end
wc -l NA_random/NA_random.scafsvschunks
wc -l Un_random/Un_random.scafsvschunks
# no inconsistencies were reported
rm NA_random/NA_random.scafsvschunks Un_random/Un_random.scafsvschunks
# clean up
foreach c (NA_random Un_random)
rm $c/${c}.scaffolds.agp $c/${c}.chunks.agp $c/chr${c}.scaffolds.fa \
$c/chr${c}.scaffolds.lst $c/*.bak
end
###########################################################################
# BUILD CHROM-LEVEL SEQUENCE (DONE, 2006-04-13, hartera)
# REPEAT THIS FOR chrNA_random AND chrUn_random (DONE, 2006-04-21, hartera)
ssh kkstore01
cd /cluster/data/danRer4
# Ignore warnings about chrM files not existing - this chrom has
# already been processed - see mitochondrion section above.
# Sequence is already in upper case so no need to change
foreach c (`cat chrom.lst`)
echo "Processing ${c}"
$HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.scaffolds.agp chr$c \
$c/chr$c.fa ./Zv6_scaffolds.fa
echo "${c} - DONE"
end
# move scaffolds agp to be chrom agp and clean up
foreach c (`cat chrom.lst`)
cd $c
cp chr${c}.scaffolds.agp chr${c}.agp
mkdir -p agps
mv chr${c}.*.agp ./agps/
cd ..
end
# Repeat just for chrNA_random and chrUn_random (2006-04-21, hartera)
foreach c (NA_random Un_random)
echo "Processing ${c}"
$HOME/bin/i386/agpToFa -simpleMultiMixed $c/chr$c.scaffolds.agp chr$c \
$c/chr$c.fa ./Zv6_scaffolds.fa
echo "${c} - DONE"
end
# move scaffolds agp to be chrom agp and clean up
foreach c (NA_random Un_random)
cd $c
cp chr${c}.scaffolds.agp chr${c}.agp
mkdir -p agps
mv chr${c}.*.agp ./agps/
cd ..
end
##########################################################################
# CHECK CHROM AND VIRTUAL CHROM SEQUENCES (DONE, 2006-04-14, hartera)
# RE-CHECK THESE AFTER CREATING chrNA_random AND chrUn_random SEQUENCE FILES
# (DONE, 2006-04-20, hartera)
# Check that the size of each chromosome .fa file is equal to the last
# co-ordinate of the corresponding agp file.
ssh hgwdev
cd /cluster/data/danRer4
foreach c (`cat chrom.lst`)
foreach f ( $c/chr$c.agp )
set agpLen = `tail -1 $f | awk '{print $3;}'`
set h = $f:r
set g = $h:r
echo "Getting size of $g.fa"
set faLen = `faSize $g.fa | awk '{print $1;}'`
if ($agpLen == $faLen) then
echo " OK: $f length = $g length = $faLen"
else
echo "ERROR: $f length = $agpLen, but $g length = $faLen"
endif
end
end
# all are OK so the FASTA files are the expected size
###########################################################################
# CREATING DATABASE (DONE, 2006-04-14, hartera)
# Create the database.
# next machine
ssh hgwdev
echo 'create database danRer4' | hgsql ''
# if you need to delete that database: !!! WILL DELETE EVERYTHING !!!
echo 'drop database danRer4' | hgsql danRer4
# Use df to make sure there is at least 10 gig free on /var/lib/mysql:
df -h /var/lib/mysql
# Before loading data:
# Filesystem Size Used Avail Use% Mounted on
# /dev/sdc1 1.8T 1.5T 173G 90% /var/lib/mysql
###########################################################################
# CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2006-04-14, hartera)
# next machine
ssh hgwdev
# the following command copies all the data from the table
# grp in the database mm8 to the new database danRer4. Use one of the
# newest databases to copy from to make sure that the groupings are
# up to date.
echo "create table grp (PRIMARY KEY(NAME)) select * from mm8.grp" \
| hgsql danRer4
# if you need to delete that table: !!! WILL DELETE ALL grp data !!!
echo 'drop table grp;' | hgsql danRer4
###########################################################################
# MAKE HGCENTRALTEST ENTRY FOR DANRER4 (DONE, 2006-04-14, hartera)
# CHANGE DATE FORMAT ON HGCENTRALTEST ENTRY (DONE, 2006-04-21, hartera)
# Make entry into dbDb and defaultDb so test browser knows about it.
ssh hgwdev
# Add dbDb and defaultDb entries:
echo 'insert into dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk, hgPbOk, sourceName) \
values("danRer4", "March 2006", \
"/gbdb/danRer4", "Zebrafish", "chr2:15,906,734-15,926,406", 1, \
37, "Zebrafish", "Danio rerio", \
"/gbdb/danRer4/html/description.html", 0, 0, \
"Sanger Centre, Danio rerio Sequencing Project Zv6");' \
| hgsql -h genome-testdb hgcentraltest
# reformat the date (2006-04-21, hartera)
echo 'update dbDb set description = "Mar. 2006" where name = "danRer4";' \
| hgsql -h genome-testdb hgcentraltest
# Create /gbdb directory for danRer4
mkdir /gbdb/danRer4
# SET AS DEFAULT LATER WHEN READY FOR RELEASE
# set danRer4 to be the default assembly for Zebrafish
echo 'update defaultDb set name = "danRer4" \
where genome = "Zebrafish";' \
| hgsql -h genome-testdb hgcentraltest
###########################################################################
# BREAK UP SEQUENCE INTO 5MB CHUNKS AT CONTIGS/GAPS FOR CLUSTER RUNS
# (DONE, 2006-04-14, hartera)
# RE-DONE JUST FOR chrNA_random AND chrUn_random (DONE, 2006-04-20, hartera)
ssh kkstore01
cd /cluster/data/danRer4
foreach c (`cat chrom.lst`)
foreach agp ($c/chr$c.agp)
if (-e $agp) then
set fa = $c/chr$c.fa
echo splitting $agp and $fa
cp -p $agp $agp.bak
cp -p $fa $fa.bak
splitFaIntoContigs $agp $fa . -nSize=5000000
endif
end
end
# Repeat just for chrNA_random and chrUn_random (2006-04-21, hartera)
ssh kkstore01
cd /cluster/data/danRer4
foreach c (NA_random Un_random)
foreach agp ($c/chr$c.agp)
if (-e $agp) then
set fa = $c/chr$c.fa
echo splitting $agp and $fa
cp -p $agp $agp.bak
cp -p $fa $fa.bak
splitFaIntoContigs $agp $fa . -nSize=5000000
endif
end
end
###########################################################################
# MAKE LIFTALL.LFT (DONE, 2006-04-14, hartera)
# REMAKE LIFTALL.LFT WITH chrNA_random AND chrUn_random
# (DONE, 2006-04-21, hartera)
ssh kkstore01
cd /cluster/data/danRer4
rm jkStuff/liftAll.lft
foreach c (`cat chrom.lst`)
cat $c/lift/ordered.lft >> jkStuff/liftAll.lft
end
###########################################################################
# MAKE TRACKDB ENTRY FOR DANRER4 (DONE, 2006-04-14, hartera)
# Should add this later when adding gold/gap tracks. Angie created a
# temporary chromInfo table otherwise make update/alpha causes an error
# (2006-04-17)
# Make trackDb table so browser knows what tracks to expect.
ssh hgwdev
mkdir -p ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer4
cd ~/kent/src/hg/makeDb/trackDb/zebrafish
cvs add danRer4
cvs commit danRer4
cd ~/kent/src/hg/makeDb/trackDb
cvs up -d -P
# Edit that makefile to add danRer4 in all the right places and do
make update DBS=danRer4
make alpha DBS=danRer4
cvs commit -m "Added danRer4." makefile
###########################################################################
# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2006-04-14, hartera)
ssh hgwdev
mkdir /cluster/data/danRer4/html
# make a symbolic link from /gbdb/danRer4/html to /cluster/data/danRer4/html
ln -s /cluster/data/danRer4/html /gbdb/danRer4/html
# Add a description page for zebrafish
cd /cluster/data/danRer4/html
cp $HOME/kent/src/hg/makeDb/trackDb/zebrafish/danRer3/description.html .
# Edit this for zebrafish danRer4
# create a description.html page here
cd ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer4
# Add description page here too
cp /cluster/data/danRer4/html/description.html .
cvs add description.html
cvs commit -m "First draft of description page for danRer4." \
description.html
cd ~/kent/src/hg/makeDb/trackDb
make update DBS=danRer4
make alpha DBS=danRer4
###########################################################################
# SIMPLE REPEAT [TRF] TRACK (DONE, 2006-04-14, hartera)
# RE-RUN FOR chrNA AND chrUn RENAMED AS chrNA_random AND chrUn_random
# AND RELOAD THE TABLE (DONE, 2006-04-21, hartera)
# MADE A NOTE IN THE HISTORY TABLE TO EXPLAIN WHY THE simpleRepeats TABLE
# WAS RELOADED (DONE, 2006-04-22, hartera)
# TRF can be run in parallel with RepeatMasker on the file server
# since it doesn't require masked input sequence.
# Run this on the kilokluster. Need to mask contig and chromosome
# sequences so run trf using contig sequences.
# First copy over contig sequences to iscratch and then rsync to cluster.
ssh kkr1u00
rm -r /iscratch/i/danRer4/contigsNoMask
mkdir -p /iscratch/i/danRer4/contigsNoMask
cd /cluster/data/danRer4
foreach d (/cluster/data/danRer4/*/chr*_?{,?})
set ctg = $d:t
foreach f ($d/${ctg}.fa)
echo "Copyig $f ..."
cp $f /iscratch/i/danRer4/contigsNoMask/
end
end
ls /iscratch/i/danRer4/contigsNoMask/*.fa | wc -l
# 317 sequence files
# rsync to cluster machines
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/danRer4/ kkr${R}u00:/iscratch/i/danRer4/
end
ssh kki
mkdir -p /cluster/data/danRer4/bed/simpleRepeat
cd /cluster/data/danRer4/bed/simpleRepeat
mkdir trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
#
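# Stage the input sequence ($1) in a per-output directory under /tmp, run
# trfBig there writing a bed file (-bedAt), then copy the bed back to the
# requested output path ($2) and clean up the temporary directory.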
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
# << keep emacs coloring happy
chmod +x runTrf
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
# << keep emacs coloring happy
ls -1S /iscratch/i/danRer4/contigsNoMask/chr*.fa > genome.lst
gensub2 genome.lst single gsub jobList
# 317 jobs
para create jobList
para try, check, push, check etc...
para time
# Completed: 317 of 317 jobs
# CPU time in finished jobs: 25083s 418.05m 6.97h 0.29d 0.001 y
# IO & Wait Time: 933s 15.55m 0.26h 0.01d 0.000 y
# Average job time: 82s 1.37m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 2732s 45.53m 0.76h 0.03d
# Submission to last job: 4604s 76.73m 1.28h 0.05d
# Re-do only for chrNA_random and chrUn_random (2006-04-21, hartera)
ssh kki
cd /cluster/data/danRer4/bed/simpleRepeat
rm trf/chrNA*.bed
rm trf/chrUn*.bed
rm simpleRepeat.bed
mkdir -p randomsRun/trf
cd randomsRun
cp ../runTrf .
cp ../gsub .
ls -1S /iscratch/i/danRer4/contigsNoMask/chr*_random*.fa > genome.lst
gensub2 genome.lst single gsub jobList
para create jobList
# 46 jobs
para try, check, push, check etc...
para time
# Completed: 46 of 46 jobs
# CPU time in finished jobs: 1904s 31.73m 0.53h 0.02d 0.000 y
# IO & Wait Time: 103s 1.72m 0.03h 0.00d 0.000 y
# Average job time: 44s 0.73m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 241s 4.02m 0.07h 0.00d
# Submission to last job: 269s 4.48m 0.07h 0.00d
cp ./trf/*.bed /cluster/data/danRer4/bed/simpleRepeat/trf/
# lift up to chrom level
cd /cluster/data/danRer4/bed/simpleRepeat
rm simpleRepeat.bed
liftUp simpleRepeat.bed /cluster/data/danRer4/jkStuff/liftAll.lft warn \
trf/*.bed
# Reload into the database
ssh hgwdev
cd /cluster/data/danRer4/bed/simpleRepeat
hgsql -e 'drop table simpleRepeat;' danRer4
hgLoadBed danRer4 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Loaded 759659 elements of size 16
# Make a note in the history table to explain why the simpleRepeats
# table was reloaded (2006-04-22, hartera)
hgsql -e 'update history set errata = \
"Dropped table for reloading after changing names of random chroms." \
where ix = 2;' danRer4
###########################################################################
# CREATE MICROSAT TRACK (done 2006-7-5 JK)
ssh hgwdev
cd /cluster/data/danRer4/bed
mkdir microsat
cd microsat
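# keep only perfect di- and tri-nucleotide repeats from the TRF output:
# period ($5) of 2 or 3, at least 15 copies ($6), 100% match ($8) and
# no indels ($9); name each repeat as <copies>x<motif>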
awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed
/cluster/bin/i386/hgLoadBed danRer4 microsat microsat.bed
###########################################################################
# PROCESS SIMPLE REPEATS INTO MASK (DONE, 2005-06-14, hartera)
# RE-DO AFTER RENAMING RANDOM CHROMS AS chrNA_random AND chrUn_random
# (DONE, 2006-04-21, hartera)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
ssh kkstore01
cd /cluster/data/danRer4/bed/simpleRepeat
rm -r trfMask
mkdir -p trfMask
foreach f (trf/chr*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# Lift up filtered trf output to chrom coords as well:
cd /cluster/data/danRer4
rm -r ./bed/simpleRepeat/trfMaskChrom
mkdir bed/simpleRepeat/trfMaskChrom
foreach c (`cat chrom.lst`)
if (-e $c/lift/ordered.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/ordered.lst > $c/lift/oTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
endif
if (-e $c/lift/random.lst) then
perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
$c/lift/random.lst > $c/lift/rTrf.lst
liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
endif
end
###########################################################################
# GET ADDITIONAL ZEBRAFISH REPBASE LIBRARY FOR REPEATMASKER AND ADD TO
# DANIO LIBRARY FOR REPEATMASKER (DONE, 2006-04-14, hartera)
# Go to http://www.girinst.org/server/RepBase/RepBase11.02.fasta
# (03-15-2006) and download zebunc.ref.txt containing unclassified zebrafish
# repeats.
# Need username and password. Copy to /cluster/bluearc/RepeatMasker/Libraries/
ssh hgwdev
cd /cluster/bluearc/RepeatMasker/Libraries
# This is /cluster/bluearc/RepeatMasker060320/Libraries
# Do a dummy run of RepeatMasker with the -species option. This creates
# a zebrafish-specific library from the EMBL format RepBase library.
# Then the zebunc.ref unclassified repeats can be added to this library.
/cluster/bluearc/RepeatMasker/RepeatMasker -spec danio /dev/null
# RepeatMasker version development-$Id: RepeatMasker,v 1.13 2006/03/21
# This creates a specieslib in Libraries/20060315/danio
# Format the zebunc.ref library:
# Sequence is upper case, change to lower case like the specieslib
cat zebunc.ref.txt | tr '[A-Z]' '[a-z]' > zebunc.ref.format
perl -pi.bak -e 's/>dr([0-9]+)/>Dr$1#Unknown/' zebunc.ref.format
grep '>' zebunc.ref.format | wc -l
# 958
cd /cluster/bluearc/RepeatMasker/Libraries/20060315/danio
grep '>' specieslib | wc -l
# 219
mv specieslib danio.lib
cat danio.lib ../../zebunc.ref.format > specieslib
grep '>' specieslib | wc -l
# 1177
rm danio.lib
# make a copy in Libraries directory in case this directory of libraries
# is removed.
cp specieslib /cluster/bluearc/RepeatMasker/Libraries/danio.lib
###########################################################################
# SPLIT SEQUENCE FOR REPEATMASKER RUN (DONE, 2006-04-14, hartera)
# SPLIT SEQUENCE AGAIN JUST FOR chrNA_random AND chrUn_random AFTER RENAMING
# THESE RANDOM CHROMS (DONE, 2006-04-21, hartera)
ssh kkstore01
cd /cluster/data/danRer4
# break up into 500 kb sized chunks at gaps if possible
# for RepeatMasker runs
foreach c (`cat chrom.lst`)
foreach d ($c/chr${c}*_?{,?})
cd $d
echo "splitting $d"
set contig = $d:t
faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
-minGapSize=100
cd ../..
end
end
# took about 3 minutes.
# split just for chrNA_random and chrUn_random (2006-04-21, hartera)
cd /cluster/data/danRer4
foreach c (NA_random Un_random)
foreach d ($c/chr${c}*_?{,?})
cd $d
echo "splitting $d"
set contig = $d:t
faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
-minGapSize=100
cd ../..
end
end
###########################################################################
# REPEATMASKER RUN (DONE, 2006-04-21, hartera)
# Originally run 2006-04-14. There was one sequence chr16_4_10.fa that
# failed with a division by zero error. Sent this as a test case with the
# danio library to Robert Hubley who fixed the bug and sent a new
# version of ProcessRepeats. Checked this into CVS for
# /cluster/bluearc/RepeatMasker on 2006-04-19.
# When a new library is added for this version of RepeatMasker, check
# /cluster/bluearc/RepeatMasker/Libraries for a directory named by date
# (20060315 here) containing subdirectories for species on which
# RepeatMasker has already been run. RepeatMasker creates a specieslib of
# the danio repeats in that directory and, if it already exists, reuses it
# for subsequent runs on that species. Check that it contains the
# unclassified zebrafish repeats with IDs beginning with Dr; the library
# with these repeats should have been created in the section above.
# Use sequence split into 500 kb chunks.
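# A quick sanity check (sketch): the merged library should contain the
# 958 unclassified repeats added above, with headers beginning ">Dr":
#    grep -c '>Dr' /cluster/bluearc/RepeatMasker/Libraries/20060315/danio/specieslib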
ssh kkstore01
cd /cluster/data/danRer4
mkdir RMRun
# Record RM version used:
ls -l /cluster/bluearc/RepeatMasker
# lrwxrwxrwx 1 angie protein 18 Mar 20 16:50 /cluster/bluearc/RepeatMasker -> RepeatMasker060320
# March 20 2006 (open-3-1-5) version of RepeatMasker
# get RM database version
grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl \
> RMdatabase.version
# RELEASE 20060315
cd /cluster/data/danRer4
cat << '_EOF_' > jkStuff/RMZebrafish
#!/bin/csh -fe
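# Run RepeatMasker on one sequence chunk: $1 is the contig directory and
# $2 is the chunk fa file name. Stage the chunk in /tmp/danRer4/$2, mask
# it there with the danio library, then copy the .out (and .align, .tbl,
# .cat if present) back to the contig directory and clean up /tmp.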
cd $1
pushd .
/bin/mkdir -p /tmp/danRer4/$2
/bin/cp $2 /tmp/danRer4/$2/
cd /tmp/danRer4/$2
/cluster/bluearc/RepeatMasker060320/RepeatMasker -ali -s -species danio $2
popd
/bin/cp /tmp/danRer4/$2/$2.out ./
if (-e /tmp/danRer4/$2/$2.align) /bin/cp /tmp/danRer4/$2/$2.align ./
if (-e /tmp/danRer4/$2/$2.tbl) /bin/cp /tmp/danRer4/$2/$2.tbl ./
if (-e /tmp/danRer4/$2/$2.cat) /bin/cp /tmp/danRer4/$2/$2.cat ./
/bin/rm -fr /tmp/danRer4/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer4/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/danRer4
'_EOF_'
# << emacs
chmod +x jkStuff/RMZebrafish
# move old files out the way and re-run on 2006-04-19
cd /cluster/data/danRer4
mkdir RMOutOld
foreach d (*/chr*_?{,?})
set contig = $d:t
echo $contig
foreach c ($d/$contig*.fa.*)
set t=$c:t
mv $c /cluster/data/danRer4/RMOutOld/$t.bak
end
end
cp /dev/null RMRun/RMJobs
foreach c (`cat chrom.lst`)
foreach d ($c/chr${c}_?{,?})
set ctg = $d:t
foreach f ( $d/${ctg}_?{,?}.fa )
set f = $f:t
echo /cluster/data/danRer4/jkStuff/RMZebrafish \
/cluster/data/danRer4/$d $f \
'{'check out line+ /cluster/data/danRer4/$d/$f.out'}' \
>> RMRun/RMJobs
end
end
end
# Do the run again with new version of ProcessRepeats used
# for RepeatMasker.
ssh pk
cd /cluster/data/danRer4/RMRun
para create RMJobs
# 4382 jobs written to batch
para try, check, push, check ... etc.
para time
# Completed: 4382 of 4382 jobs
# CPU time in finished jobs: 11745656s 195760.94m 3262.68h 135.95d 0.372 y
# IO & Wait Time: 18953s 315.88m 5.26h 0.22d 0.001 y
# Average job time: 2685s 44.75m 0.75h 0.03d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3878s 64.63m 1.08h 0.04d
# Submission to last job: 41887s 698.12m 11.64h 0.48d
#- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
ssh kkstore01
cd /cluster/data/danRer4
foreach d (*/chr*_?{,?})
set contig = $d:t
echo $contig
liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
> /dev/null
end
#- Lift pseudo-contigs to chromosome level
foreach c (`cat chrom.lst`)
echo lifting $c
cd $c
if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
> /dev/null
endif
cd ..
end
# Re-run for just chrNA_random and chrUn_random (start on 2006-04-21)
ssh kkstore01
mkdir /cluster/data/danRer4/RMRun/randomsRun
cd /cluster/data/danRer4
cp /dev/null RMRun/randomsRun/RMJobs
foreach c (NA_random Un_random)
foreach d ($c/chr${c}_?{,?})
set ctg = $d:t
foreach f ( $d/${ctg}_?{,?}.fa )
set f = $f:t
echo /cluster/data/danRer4/jkStuff/RMZebrafish \
/cluster/data/danRer4/$d $f \
'{'check out line+ /cluster/data/danRer4/$d/$f.out'}' \
>> RMRun/randomsRun/RMJobs
end
end
end
# Do the run again for chrNA_random and chrUn_random.
ssh pk
cd /cluster/data/danRer4/RMRun/randomsRun
para create RMJobs
# 468 jobs written to batch
para try, check, push, check ... etc.
para time
# Completed: 468 of 468 jobs
# CPU time in finished jobs: 551863s 9197.71m 153.30h 6.39d 0.017 y
# IO & Wait Time: 2217s 36.96m 0.62h 0.03d 0.000 y
# Average job time: 1184s 19.73m 0.33h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3836s 63.93m 1.07h 0.04d
# Submission to last job: 9086s 151.43m 2.52h 0.11d
#- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
ssh kkstore01
cd /cluster/data/danRer4
foreach c (NA_random Un_random)
foreach d (${c}/chr*_?{,?})
set contig = $d:t
echo $contig
liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
> /dev/null
end
end
#- Lift pseudo-contigs to chromosome level
foreach c (NA_random Un_random)
echo lifting $c
cd $c
if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
> /dev/null
endif
cd ..
end
# Load tables
#- Load the .out files into the database with:
ssh hgwdev
cd /cluster/data/danRer4
hgLoadOut danRer4 */chr*.fa.out -verbose=2 >& load.log
# bad rep range [5031, 4990] line 51895 of 14/chr14.fa.out
# bad rep range [4559, 4558] line 59431 of 16/chr16.fa.out
# bad rep range [1202, 1201] line 131633 of 16/chr16.fa.out
# bad rep range [280, 252] line 93608 of 17/chr17.fa.out
# bad rep range [429, 272] line 43230 of 22/chr22.fa.out
# bad rep range [262, 261] line 167346 of 3/chr3.fa.out
# bad rep range [889, 888] line 28495 of 5/chr5.fa.out
# bad rep range [349, 348] line 113404 of 5/chr5.fa.out
# bad rep range [1133, 1132] line 200654 of 5/chr5.fa.out
# bad rep range [965, 920] line 3567 of 8/chr8.fa.out
# bad rep range [292, 291] line 6354 of NA_random/chrNA_random.fa.out
# note: 11 records dropped due to repStart > repEnd
# Not too many errors so just ignore, but send examples to Arian Smit
# and Robert Hubley.
# check coverage of repeats masked
featureBits -chrom=chr1 danRer3 rmsk
# 25822888 bases of 55500710 (46.527%) in intersection
featureBits -chrom=chr1 danRer4 rmsk
# 32880041 bases of 70589895 (46.579%) in intersection
###########################################################################
# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF AND BUILD NIB FILES
# (DONE, 2006-04-22, hartera)
# MASK PSEUDO-CONTIGS AS NOT DONE BEFORE (DONE, 2006-05-27, hartera)
ssh kkstore01
cd /cluster/data/danRer4
# Soft-mask (lower-case) the contig and chr .fa's,
# then make hard-masked versions from the soft-masked.
set trfCtg=bed/simpleRepeat/trfMask
set trfChr=bed/simpleRepeat/trfMaskChrom
# for the chromosomes:
foreach f (*/chr*.fa)
echo "repeat- and trf-masking $f"
maskOutFa -soft $f $f.out $f
set chr = $f:t:r
maskOutFa -softAdd $f $trfChr/$chr.bed $f
echo "hard-masking $f"
maskOutFa $f hard $f.masked
end
# check percent sequence masked
faSize /cluster/data/danRer4/1/chr1.fa
# 70589895 bases (904883 N's 69685012 real 36751306 upper
# 32933706 lower) in 1 sequences in 1 files
faSize /cluster/data/danRer3/1/chr1.fa
# 55805710 bases (1047706 N's 54758004 real 28887275 upper
# 25870729 lower) in 1 sequences in 1 files
# 47% of danRer4 chr1.fa is in lower case so masked
# Build nib files, using the soft masking in the fa
mkdir nib
foreach f (*/chr*.fa)
faToNib -softMask $f nib/$f:t:r.nib
end
ls ./nib/* | wc
# 28
# for the contigs (2006-05-27, hartera)
ssh kkstore04
cd /cluster/data/danRer4
set trfCtg=bed/simpleRepeat/trfMask
set trfChr=bed/simpleRepeat/trfMaskChrom
foreach c (`cat chrom.lst`)
echo "repeat- and trf-masking contigs of chr$c"
foreach d ($c/chr*_?{,?})
set ctg=$d:t
set f=$d/$ctg.fa
maskOutFa -soft $f $f.out $f
maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
maskOutFa $f hard $f.masked
end
end
###########################################################################
# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION AND CREATE 2BIT FILE
# (DONE, 2006-04-23, hartera)
# CHANGE FILENAME TO 2BIT FILE IN CHROMINFO AND REMOVE NIB DIR IN /gbdb
# (DONE, 2006-05-24, hartera)
# Make symbolic links from /gbdb/danRer4/nib to the real nibs
ssh hgwdev
cd /cluster/data/danRer4
mkdir -p /gbdb/danRer4/nib
foreach f (/cluster/data/danRer4/nib/chr*.nib)
ln -s $f /gbdb/danRer4/nib
end
# Load /gbdb/danRer4/nib paths into database and save size info
# hgNibSeq creates chromInfo table
hgNibSeq -preMadeNib danRer4 /gbdb/danRer4/nib */chr*.fa
echo "select chrom,size from chromInfo" | hgsql -N danRer4 > chrom.sizes
# take a look at chrom.sizes, should be 28 lines
wc chrom.sizes
# 28 56 422 chrom.sizes
# Make one big 2bit file as well, and make a link to it in
# /gbdb/danRer4 because hgBlat looks there:
faToTwoBit */chr*.fa danRer4.2bit
# check the 2bit file
twoBitInfo danRer4.2bit 2bit.tab
diff 2bit.tab chrom.sizes
# should be the same and they are so ok.
rm 2bit.tab
# add link to this 2bit file from gbdb danRer4 directory
ln -s /cluster/data/danRer4/danRer4.2bit /gbdb/danRer4/
# (hartera, 2006-05-24)
# change chromInfo table to have 2bit file for filename
hgsql -e 'update chromInfo set fileName = "/gbdb/danRer4/danRer4.2bit";' \
danRer4
# then remove nib directory in /gbdb/danRer4 as do not need both nibs
# and 2 bit file which is in /gbdb/danRer4.
rm -r /gbdb/danRer4/nib
###########################################################################
# MAKE GOLD AND GAP TRACKS (DONE, 2006-04-23, hartera)
ssh hgwdev
cd /cluster/data/danRer4
# the gold and gap tracks are created from the chrN.agp file and this is
# the scaffolds or supercontigs agp
hgGoldGapGl -noGl -chromLst=chrom.lst danRer4 /cluster/data/danRer4 .
# featureBits danRer4 gold
# 1626093931 bases of 1626093931 (100.000%) in intersection
# featureBits danRer3 gold
# 1630323462 bases of 1630323462 (100.000%) in intersection
# featureBits danRer4 gap
# 148566200 bases of 1626093931 (9.136%) in intersection
# featureBits danRer3 gap
# 13709500 bases of 1630323462 (0.841%) in intersection
# there are larger gaps now in chrNA and chrUn so compare just chr1
# featureBits -chrom=chr1 danRer4 gap
# 16000 bases of 70573895 (0.023%) in intersection
# featureBits -chrom=chr1 danRer3 gap
# 305000 bases of 55500710 (0.550%) in intersection
# without random or chrUn chroms:
# featureBits -noRandom danRer4 gap
# 366200 bases of 1546950119 (0.024%) in intersection
# featureBits -noRandom danRer3 gap
# 6240000 bases of 1200146216 (0.520%) in intersection
# Add trackDb.ra entries for gold and gap tracks and also create
# gap.html and gold.html pages.
###########################################################################
# PUT MASKED SEQUENCE OUT ON iSERVERS AND THE SAN FOR CLUSTER RUNS
# (DONE, 2006-04-23, hartera)
# TRFFA SEQUENCE WAS NOT MASKED SO ADD MASKED SEQUENCE TO iSERVERS AND
# THE SAN FOR CLUSTER RUNS (DONE, 2006-05-30, hartera)
ssh kkr1u00
# Chrom-level mixed nibs that have been repeat- and trf-masked:
rm -rf /iscratch/i/danRer4/nib
mkdir -p /iscratch/i/danRer4/nib
cp -p /cluster/data/danRer4/nib/chr*.nib /iscratch/i/danRer4/nib
# Pseudo-contig fa that have been repeat- and trf-masked:
# Add these pseudo-contigs that have been repeat- and trf-masked
# and rsync again. (2006-05-30, hartera)
rm -rf /iscratch/i/danRer4/trfFa
mkdir /iscratch/i/danRer4/trfFa
foreach d (/cluster/data/danRer4/*/chr*_?{,?})
cp -p $d/$d:t.fa /iscratch/i/danRer4/trfFa
end
rm -rf /iscratch/i/danRer4/rmsk
mkdir -p /iscratch/i/danRer4/rmsk
cp -p /cluster/data/danRer4/*/chr*.fa.out /iscratch/i/danRer4/rmsk
cp -p /cluster/data/danRer4/danRer4.2bit /iscratch/i/danRer4/
# rsync files - faster than using iSync
# rsync again - still can not rsync to kkr2u00 (hartera, 2006-05-30)
foreach R (2 3 4 5 6 7 8)
echo "rsync for kkr${R}u00 ..."
rsync -a --progress /iscratch/i/danRer4/ kkr${R}u00:/iscratch/i/danRer4/
end
# error rsyncing to kkr2u00:
# connect to host kkr2u00 port 22: No route to host
# then add the same sequence files to the san
ssh kkstore01
# Chrom-level mixed nibs that have been repeat- and trf-masked:
mkdir -p /san/sanvol1/scratch/danRer4/nib
rm -rf /san/sanvol1/scratch/danRer4/nib
cp -p /cluster/data/danRer4/nib/chr*.nib /san/sanvol1/scratch/danRer4/nib
cp /cluster/data/danRer4/danRer4.2bit /san/sanvol1/scratch/danRer4
# Pseudo-contig fa that have been repeat- and trf-masked:
# Add these pseudo-contigs again (2006-05-30, hartera)
ssh kkstore04
rm -rf /san/sanvol1/scratch/danRer4/trfFa
mkdir /san/sanvol1/scratch/danRer4/trfFa
foreach d (/cluster/data/danRer4/*/chr*_?{,?})
cp -p $d/$d:t.fa /san/sanvol1/scratch/danRer4/trfFa
end
###########################################################################
# ADD CONTIGS TRACK (DONE, 2006-04-23, hartera)
# make ctgPos2 (contig name, size, chrom, chromStart, chromEnd) from
# chunks (contigs) agp files.
ssh kkstore01
mkdir -p /cluster/data/danRer4/bed/ctgPos2
cd /cluster/data/danRer4/bed/ctgPos2
# ctgPos2 .sql .as .c and .h files exist - see makeDanRer1.doc
foreach c (`cat /cluster/data/danRer4/chrom.lst`)
awk 'BEGIN {OFS="\t"} \
{if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \
/cluster/data/danRer4/$c/agps/chr${c}.chunks.agp >> ctgPos2.tab
end
# load the ctgPos2 table
ssh hgwdev
cd /cluster/data/danRer4/bed/ctgPos2
# use hgLoadSqlTab as it gives more error messages than using
# "load data local infile ...".
/cluster/bin/i386/hgLoadSqlTab danRer4 ctgPos2 \
~/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab
# create trackDb.ra entry and html page for ctgPos2 track.
# add search for the track and make sure the termRegex will handle
# contigs named "Zv6_scaffoldN.N" where N is an integer and all the
# contig accessions in the *.chunks.agp files.
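# A hypothetical sketch of such a search stanza for trackDb.ra (values
# are illustrative only, not the entry actually committed):
#    searchTable ctgPos2
#    searchMethod exact
#    termRegex (Zv6_(NA|scaffold)[0-9]+(\.[0-9]+)?|[A-Z0-9]+\.[0-9]+)
#    query select chrom,chromStart,chromEnd,contig from %s where contig like '%s'
#    searchPriority 5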
###########################################################################
# CREATE gc5Base WIGGLE TRACK (DONE, 2006-04-23, hartera)
ssh kkstore01
mkdir -p /cluster/data/danRer4/bed/gc5Base
cd /cluster/data/danRer4/bed/gc5Base
nice hgGcPercent -wigOut -doGaps -file=stdout -win=5 danRer4 \
/cluster/data/danRer4 | wigEncode stdin gc5Base.wig gc5Base.wib
# Calculating gcPercent with window size 5
# Using twoBit: /cluster/data/danRer4/danRer4.2bit
# File stdout created
# Converted stdin, upper limit 100.00, lower limit 0.00
# runs for about 7 minutes
# load database with the .wig file and add .wib file to /gbdb/danRer4
ssh hgwdev
cd /cluster/data/danRer4/bed/gc5Base
mkdir /gbdb/danRer4/wib
ln -s `pwd`/gc5Base.wib /gbdb/danRer4/wib
time hgLoadWiggle -pathPrefix=/gbdb/danRer4/wib danRer4 gc5Base gc5Base.wig
# 17 second load time
# verify index is correct:
hgsql danRer4 -e "show index from gc5Base;"
# should see good numbers in Cardinality column
###########################################################################
# MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE, 2006-04-24, hartera)
# Use -repMatch=512 (based on size -- for human we use 1024, and
# the zebrafish genome is ~50% of the size of the human genome)
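# (roughly 1024 * (1.6 Gb zebrafish / ~3 Gb human) ~= 540, rounded down
# to the nearest power of two, 512)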
ssh kkr1u00
mkdir /cluster/data/danRer4/bed/ooc
cd /cluster/data/danRer4/bed/ooc
mkdir -p /san/sanvol1/scratch/danRer4
ls -1 /cluster/data/danRer4/nib/chr*.nib > nib.lst
blat nib.lst /dev/null /dev/null -tileSize=11 \
-makeOoc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc -repMatch=512
# Wrote 50424 overused 11-mers to /san/sanvol1/scratch/danRer4/danRer4_11.ooc
# For 10.ooc, repMatch = 4096 for human, so use 2048
blat nib.lst /dev/null /dev/null -tileSize=10 \
-makeOoc=/san/sanvol1/scratch/danRer4/danRer4_10.ooc -repMatch=2048
# Wrote 12231 overused 10-mers to /san/sanvol1/scratch/danRer4/danRer4_10.ooc
# keep copies of ooc files in this directory and copy to iscratch
cp /san/sanvol1/scratch/danRer4/*.ooc .
cp -p /san/sanvol1/scratch/danRer4/*.ooc /iscratch/i/danRer4/
# rsync to iServers
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/danRer4/*.ooc \
kkr${R}u00:/iscratch/i/danRer4/
end
###########################################################################
# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR danRer4 (DONE, 2006-04-27, hartera)
ssh hgwdev
# DNA port is "0", trans prot port is "1"
echo 'insert into blatServers values("danRer4", "blat17", "17788", "1", "0"); insert into blatServers values("danRer4", "blat17", "17789", "0", "1");' \
| hgsql hgcentraltest
# this enables blat and isPcr, isPcr is enabled by loading blat server
# with tilesize=5 (ask for this when request blat servers from
# cluster admin).
# if you need to delete those entries
echo 'delete from blatServers where db="danRer4";' | hgsql hgcentraltest
###########################################################################
# AFFYMETRIX ZEBRAFISH GENOME ARRAY CHIP (DONE, 2006-04-24, hartera)
# UPDATED (2006-09-28) - see separate section, UPDATE AFFY ZEBRAFISH TRACK.
# NOTE: Jim recommends that, in the future, all AFFY blat alignments should drop
# -mask=lower for blat and drop -minIdentity=95 to -minIdentity=90 as the
# higher minIdentity is causing alignments to be dropped that should not be.
# e.g. blat -fine -minIdentity=90 -ooc=11.ooc
# $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
# pslReps can be used to handle filtering at a later step. Blat's minIdentity
# seems to be more severe than that for pslReps as it takes insertions and
# deletions into account.
# CHECKED ALIGNMENTS USING MASKED TRFFA AND RESULTS ARE THE SAME
# (DONE, 2006-05-30, hartera)
# array chip sequences already downloaded for danRer1
ssh hgwdev
# need to copy sequences to the bluearc first to transfer to the iServers
cd /projects/compbio/data/microarray/affyZebrafish
mkdir -p /cluster/bluearc/affy
cp -p \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
/cluster/bluearc/affy/
# Set up cluster job to align Zebrafish consensus sequences to danRer4
ssh kkr1u00
mkdir -p /cluster/data/danRer4/bed/affyZebrafish.2006-04-24
ln -s /cluster/data/danRer4/bed/affyZebrafish.2006-04-24 \
/cluster/data/danRer4/bed/affyZebrafish
cd /cluster/data/danRer4/bed/affyZebrafish
mkdir -p /iscratch/i/affy
cp /cluster/bluearc/affy/Zebrafish_consensus.fa /iscratch/i/affy
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/affy/*.fa \
kkr${R}u00:/iscratch/i/affy/
end
# small cluster run to align sequences
ssh kki
cd /cluster/data/danRer4/bed/affyZebrafish
ls -1 /iscratch/i/affy/Zebrafish_consensus.fa > affy.lst
ls -1 /iscratch/i/danRer4/trfFa/chr[0-9M]*.fa > genome.lst
# for output:
mkdir -p psl
echo '#LOOP\n/cluster/bin/i386/blat -fine -minIdentity=90 -ooc=/iscratch/i/danRer4/danRer4_11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 genome.lst affy.lst template.sub para.spec
para create para.spec
para try, check, push, check .... etc.
# para time
# Completed: 271 of 271 jobs
# CPU time in finished jobs: 15331s 255.51m 4.26h 0.18d 0.000 y
# IO & Wait Time: 737s 12.29m 0.20h 0.01d 0.000 y
# Average job time: 59s 0.99m 0.02h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 101s 1.68m 0.03h 0.00d
# Submission to last job: 1557s 25.95m 0.43h 0.02d
# do pslSort and liftUp
ssh kkstore04
cd /cluster/data/danRer4/bed/affyZebrafish
# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyZebrafish.psl
pslSort dirs raw.psl tmp psl
# only use alignments that have at least 95% identity in aligned region.
# try minCover as now there is less sequence in chrUn and chrNA
# so less likely that genes are split up.
grep '>' /cluster/bluearc/affy/Zebrafish_consensus.fa | wc -l
# 15502
pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
# see how many sequences are aligned:
awk '{print $10;}' contig.psl > contigAligned
tail +6 contigAligned | sort | uniq -c | sort -nr > contigAligned.count
wc -l contigAligned.count
# 14819 contigAligned.count
tail +6 contig.psl | wc -l
# 21486
# 96% of sequences are aligned. The sequence with the most alignments
# aligns 177 times, then the next is 105, then 86, 85, 69, 69, 54, 54 etc.
# for danRer3, 14335 were aligned (92% aligned). The sequence with
# the most alignments aligned 96 times, then 31, 27, 22, 20, 19 times.
# also 854 sequences aligned for danRer4 that did not align for danRer3.
# 370 were aligned in danRer3 but not for danRer4.
# ALSO TRIED THESE pslReps PARAMETERS WITH minCover:
pslReps -minCover=0.30 -minAli=0.95 -nearTop=0.005 \
raw.psl contig2.psl /dev/null
# see how many sequences are aligned:
awk '{print $10;}' contig2.psl > contig2Aligned
tail +6 contig2Aligned | sort | uniq -c | sort -nr > contig2Aligned.count
wc -l contig2Aligned.count
# 14528 contig2Aligned.count
tail +6 contig2.psl | wc -l
# 18744
# danRer3 has 21196 total alignments and 14335 sequences aligned.
# 94% of sequences are aligned.
# 785 sequences were aligned for danRer4 using minCover but not for
# danRer3 after using pslReps. 592 sequences were aligned for danRer3
# but not for danRer4 using minCover after using pslReps.
# the sequence with the most alignments aligns 105 times, then 85, 69,
# 54, 50, 47, 44, 37, 36, 31, 29:
# No. of alignments Sequence Name
# 105 Zebrafish:Dr.15955.1.A1_at
# 85 Zebrafish:Dr.20178.1.A1_at
# 69 Zebrafish:Dr.885.1.S1_at
# 54 Zebrafish:Dr.15958.1.S1_at
# 50 Zebrafish:Dr.25427.1.A1_at
# 47 Zebrafish:Dr.16470.1.A1_at
# 44 Zebrafish:Dr.490.1.S1_at
# 37 Zebrafish:Dr.7806.1.A1_at
# 36 Zebrafish:Dr.19.1.A1_at
# 31 Zebrafish:Dr.2825.1.A1_at
# 29 Zebrafish:Dr.19556.1.A1_at
# aligning with the -mask=lower option doesn't make a difference to the
# number of alignments and sequences aligned.
# there are 291 extra sequences that align when minCover option is
# not used. Only 7 of these have 22 or more alignments.
# 86 Zebrafish:Dr.24316.1.S1_at
# 69 Zebrafish:Dr.14452.1.A1_at
# 39 Zebrafish:Dr.12372.1.S1_at
# 26 Zebrafish:Dr.18296.2.S1_a_at
# 23 Zebrafish:Dr.7519.1.A1_at
# 22 Zebrafish:Dr.8680.1.S1_at
# 22 Zebrafish:Dr.22175.1.S1_at
# clean up
rm contig*
# use pslReps without the minCover option as it does allow quite a lot
# more alignments and the number of total alignments/number of sequences
# aligned is still close to that for danRer3. Using nearTop=0.001 does
# decrease the number of alignments but also means that some good
# alignments are lost.
pslReps -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp affyZebrafish.psl ../../jkStuff/liftAll.lft warn contig.psl
# shorten names in psl file:
sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.tmp
mv affyZebrafish.psl.tmp affyZebrafish.psl
pslCheck affyZebrafish.psl
# co-ordinates are ok. psl is good.
# load track into database
ssh hgwdev
cd /cluster/data/danRer4/bed/affyZebrafish
hgLoadPsl danRer4 affyZebrafish.psl
# Add consensus sequences for Zebrafish chip
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/affyProbes
ln -s \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
/gbdb/hgFixed/affyProbes
hgLoadSeq -abbr=Zebrafish: danRer4 \
/gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
# Clean up
rm batch.bak contig.psl raw.psl
# trackDb.ra entry and html are already there in trackDb/zebrafish/
###########################################################################
# CREATE ZEBRAFISH AND OTHER SPECIES LINEAGE-SPECIFIC REPEATS DIRECTORY AND
# ADD CHROM SIZES FOR BLASTZ CLUSTER RUNS (DONE, 2006-04-24, hartera)
# There are no lineage-specific repeats for zebrafish and other species
# so use all repeats.
ssh pk
mkdir -p /san/sanvol1/scratch/danRer4/linSpecRep.notInOthers
foreach f (/cluster/data/danRer4/*/chr*.fa.out)
cp -p $f \
/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers/$f:t:r:r.out.spec
end
cp -p /cluster/data/danRer4/chrom.sizes \
/san/sanvol1/scratch/danRer4/
###########################################################################
# BLASTZ, CHAIN, NET, MAFNET, AXTNET AND ALIGNMENT DOWNLOADS FOR
# HUMAN (hg18) (DONE, 2006-04-24 - 2006-04-25, hartera)
# LOAD BLASTZ PSLS INTO DATABASE AND CHECK FOR HUMAN CONTAMINATION
# (DONE, 2006-05-11, hartera)
ssh pk
# Blastz uses lineage-specific repeats. There are none defined for the
# human/zebrafish pair, so all repeats are used as lineage-specific.
# There is a lineage-specific repeats directory for zebrafish (see the
# CREATE ZEBRAFISH AND OTHER SPECIES LINEAGE-SPECIFIC REPEATS DIRECTORY
# section above). The lineage-specific repeats for hg18 were already made -
# see makeHg18.doc (BLASTZ ZEBRAFISH section).
mkdir -p /cluster/data/danRer4/bed/blastz.hg18.2006-04-24
cd /cluster/data/danRer4/bed
ln -s blastz.hg18.2006-04-24 blastz.hg18
cd /cluster/data/danRer4/bed/blastz.hg18.2006-04-24
# Only 5% of the danRer4 genome is now in the random unordered chroms,
# so do not run those chroms as scaffolds - run them as virtual chroms
# and use the same parameters as for danRer2.
cat << 'EOF' > DEF
# danRer4 zebrafish target, human hg18 query
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# use parameters suggested for human-fish evolutionary distance
# recommended in doBlastzChainNet.pl help
# (previously used for hg16-fr1, danRer1-mm5)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
# TARGET: zebrafish (danRer4)
# Use all chroms, including both randoms (chrNA_random and chrUn_random)
SEQ1_DIR=/san/sanvol1/scratch/danRer4/nib
SEQ1_SMSK=/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers
SEQ1_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
# QUERY: human (hg18) - single chunk big enough to run each chrom by itself
# Use all chroms, including all randoms
SEQ2_DIR=/san/sanvol1/scratch/hg18/nib
SEQ2_LEN=/san/sanvol1/scratch/hg18/hg18Chroms.len
SEQ2_SMSK=/san/sanvol1/scratch/hg18/linSpecRep.notInOthers
SEQ2_CHUNK=300000000
SEQ2_LAP=0
BASE=/cluster/data/danRer4/bed/blastz.hg18.2006-04-24
TMPDIR=/scratch/tmp
'EOF'
# << happy emacs
chmod +x DEF
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF >& doBlastz.log &
# Start: Mon Apr 24 19:20 Stop: Tues Apr 25 05:42
# Did not finish:
# netChains: looks like previous stage was not successful
# (can't find [danRer4.hg18.]all.chain[.gz]).
# The file is actually there, so run again. To restart at the chainMerge
# step, first remove the all.chain file and the chain directory.
# NOTE: alternatively, the chain files can be left in place and the run
# continued from the net step - that works too (see the sketch below).
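# A minimal sketch of that alternative (not what was run here):
#   nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
#     -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
#     -continue net `pwd`/DEF >& net.log &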
cd /cluster/data/danRer4/bed/blastz.hg18.2006-04-24
rm ./axtChain/*.all.chain.gz
rm -r ./axtChain/chain
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue chainMerge `pwd`/DEF >& chainMerge.log &
# Took about 10 minutes.
# Check results with featureBits and compare to those
# for danRer3 and danRer2:
ssh hgwdev
featureBits danRer4 chainHg18Link
# 64196991 bases of 1626093931 (3.948%) in intersection
featureBits danRer3 chainHg18Link
# 69559338 bases of 1630323462 (4.267%) in intersection
featureBits danRer2 chainHg17Link
# 70046373 bases of 1560497282 (4.489%) in intersection
# After Genbank tracks are loaded, (hartera, 2006-04-27)
featureBits -chrom=chr1 danRer4 refGene:cds chainHg18Link -enrichment
# refGene:cds 0.732%, chainHg18Link 4.140%, both 0.558%, cover 76.19%,
# enrich 18.40x
featureBits -chrom=chr1 danRer3 refGene:cds chainHg18Link -enrichment
# refGene:cds 0.769%, chainHg18Link 4.124%, both 0.604%, cover 78.49%,
# enrich 19.03x
featureBits -chrom=chr1 danRer4 refGene:cds netHg18 -enrichment
# refGene:cds 0.732%, netHg18 31.154%, both 0.624%, cover 85.21%,
# enrich 2.73x
featureBits -chrom=chr1 danRer3 refGene:cds netHg18 -enrichment
# refGene:cds 0.774%, netHg18 35.434%, both 0.679%, cover 87.73%,
# enrich 2.48x
# Similar coverage and enrichment as for hg18 chains and net on danRer3.
# do the swap for Blastz chains over to human (hg18) and create net,
# axtNet, mafNet, liftOver and Downloads. see also makeHg18.doc for
# featureBits on these alignments.
ssh pk
cd /cluster/data/danRer4/bed/blastz.hg18.2006-04-24
nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF >& doSwap.log &
# Took about 15 minutes.
# Load Blastz results into database (DONE, 2006-05-11, hartera)
ssh kkstore04
cd /cluster/data/danRer4/bed/blastz.hg18/pslParts
# cat together Blastz for each chrom
mkdir pslChrom
foreach c (`cat /cluster/data/danRer4/chrom.lst`)
echo "Processing $c ..."
foreach p (chr${c}.nib*)
zcat $p >> ./pslChrom/chr${c}_blastzHg18.psl
end
end
# load Blastz psls into the database
ssh hgwdev
cd /cluster/data/danRer4/bed/blastz.hg18/pslParts/pslChrom
foreach f (*.psl)
/cluster/bin/i386/hgLoadPsl danRer4 $f
echo "$f Done"
end
# Then determine how much sequence has 100% identity to human with a
# stretch of at least 300 bp. Human contamination was also found in
# danRer1 and a user reported it more recently.
foreach c (`cat /cluster/data/danRer4/chrom.lst`)
echo "chr$c" >> humanContamination.txt
hgsql -e "select count(*) from chr${c}_blastzHg18 where matches >= 300 and misMatches = 0;" danRer4 >> humanContamination.txt
end
# There are 4 alignments on chr11 that fit these criteria (the same set is
# found if the threshold is lowered to regions of >= 200 bp with 100% ID).
hgsql -e \
'select * from chr11_blastzHg18 where matches >= 300 and mismatches = 0;' \
danRer4 > chr11HumanSeq
# Only 2 of these also have no query inserts, and 1 of the others has only
# a 1 base insert: those regions are of size 303, 310 and 667 bp. The region
# of 330 bp has a 45 bp insert on the query side - see below.
#bin matches misMatches repMatches nCount qNumInsert qBaseInsert tNumInsert tBaseInsert strand qName qSize qStart qEnd tName tSize tStart tEnd blockCount blockSizes qStarts tStarts
#588 303 0 0 0 0 0 0 0 - chr4 191273063 69879746 69880049 chr11 52342180 502145 502448 1 303, 121393014, 502145,
#588 330 0 0 0 1 45 0 0 - chr4 191273063 69879319 69879694 chr11 52342180 502545 502875 2 1,329, 121393369,121393415, 502545,502546,
#588 310 0 0 0 0 0 0 0 - chr4 191273063 69878956 69879266 chr11 52342180 502928 503238 1 310, 121393797, 502928,
#588 667 0 0 0 1 1 0 0 - chr4 191273063 69878268 69878936 chr11 52342180 503258 503925 2 453,214, 121394127,121394581, 503258,503711,
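# (sketch, not part of the original run) if these regions need further
# review, they can be dumped as a BED-like file; chr11HumanRegions.bed is
# just an illustrative output name.
hgsql -N -e 'select tName, tStart, tEnd, qName from chr11_blastzHg18 where matches >= 300 and misMatches = 0' danRer4 > chr11HumanRegions.bed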
###########################################################################
# BLASTZ/CHAIN/NET PREP (DONE 4/25/06 angie)
ssh kkstore04
cd /cluster/data/danRer4
cp -p danRer4.2bit /san/sanvol1/scratch/danRer4/
# Create a 2bit file for danRer4 with all chroms (1-25 and M) and the
# scaffolds for NA and Un:
awk '$1 == $6 {print $1;}' Zv6.scaffolds.agp \
| faSomeRecords Zv6_scaffolds.fa stdin stdout \
| faToTwoBit [1-9]/chr*.fa [12][0-9]/chr*.fa M/chrM.fa stdin \
/san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit
twoBitInfo /san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit \
/san/sanvol1/scratch/danRer4/chromsUnNAScafs.sizes
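# (sketch, not part of the original run) eyeball the mixed chrom+scaffold
# set: sequence count and total bases; the count should be the 26 assembled
# chroms (1-25 and M) plus the chrNA_random/chrUn_random scaffolds.
awk '{n++; sum += $2} END {print n, sum}' \
    /san/sanvol1/scratch/danRer4/chromsUnNAScafs.sizes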
# Make a lift file for scaffolds --> {chrUn, chrNA}:
mkdir /cluster/data/danRer4/liftSupertoChrom
cd /cluster/data/danRer4/liftSupertoChrom
/cluster/bin/scripts/agpToLift \
< ../NA_random/agps/chrNA_random.scaffolds.agp \
> chrNA_random.lft
/cluster/bin/scripts/agpToLift \
< ../Un_random/agps/chrUn_random.scaffolds.agp \
> chrUn_random.lft
cat chr*.lft > liftNAandUnScaffoldsToChrom.lft
cp -p liftNAandUnScaffoldsToChrom.lft /san/sanvol1/scratch/danRer4/
# Distribute on /iscratch/i too (danRer4.2bit is already there):
ssh kkr1u00
cd /iscratch/i/danRer4
cp -p /san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit .
twoBitInfo danRer4ChrUnNAScafs.2bit chromsUnNAScafs.sizes
cp -p \
/cluster/data/danRer4/liftSupertoChrom/liftNAandUnScaffoldsToChrom.lft .
iSync
###########################################################################
# BLASTZ/CHAIN/NET XENTRO2 (DONE 4/26/06 angie)
ssh kkstore04
mkdir /cluster/data/danRer4/bed/blastz.xenTro2.2006-04-25
cd /cluster/data/danRer4/bed/blastz.xenTro2.2006-04-25
cat << '_EOF_' > DEF
# zebrafish vs. frog
BLASTZ=/cluster/bin/penn/i386/blastz
# Use same params as used for danRer1-xenTro1 (see makeXenTro1.doc)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Zebrafish danRer4
SEQ1_DIR=/iscratch/i/danRer4/danRer4.2bit
SEQ1_CTGDIR=/iscratch/i/danRer4/danRer4ChrUnNAScafs.2bit
SEQ1_LIFT=/iscratch/i/danRer4/liftNAandUnScaffoldsToChrom.lft
SEQ1_LEN=/cluster/data/danRer4/chrom.sizes
SEQ1_CTGLEN=/iscratch/i/danRer4/chromsUnNAScafs.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
SEQ1_LIMIT=100
# QUERY: Frog xenTro2 - single chunk big enough to run two of the
# largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
SEQ2_LEN=/cluster/bluearc/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/cluster/data/danRer4/bed/blastz.xenTro2.2006-04-25
'_EOF_'
# << emacs
# kkstore04 can't see /iscratch so use an iServer as fileServer:
doBlastzChainNet.pl -blastzOutRoot=/cluster/bluearc/danRer4XenTro2 \
-bigClusterHub=kk -fileServer=kkr8u00 -workhorse=kkr8u00 \
-chainMinScore=5000 -chainLinearGap=loose DEF \
>& do.log & tail -f do.log
ln -s blastz.xenTro2.2006-04-25 /cluster/data/danRer4/bed/blastz.xenTro2
###########################################################################
# CREATE LIFT FILES FOR RANDOM CHROMOSOMES' SCAFFOLDS
# (DONE, 2006-04-25, hartera)
# scaffolds lift files created by scaffoldFaToAgp when agp files created
# for chrNA_random and chrUn_random. remove last line as this is an extra
# gap line that was removed from the agp.
ssh kkstore01
cd /cluster/data/danRer4
foreach c (NA_random Un_random)
mkdir -p /cluster/data/danRer4/$c/tmp
end
# NA_random doesn't have .lft and .gap files from scaffoldFaToAgp, so
# recreate them. It also had no tmp dir holding an NA_random.scaffolds.agp.
awk '{if ($1 ~ /Zv6_NA/) print;}' Zv6.scaffolds.agp \
> ./NA_random/tmp/NA_random.scaffolds.agp
# change the first field to "chrNA_random" then can use agpToFa to process
perl -pi.bak -e 's/Zv6_NA[0-9]+/chrNA_random/' ./NA_random/tmp/*.agp
wc -l ./NA_random/tmp/NA_random.scaffolds.agp
# 2898 ./NA_random/tmp/NA_random.scaffolds.agp
cd /cluster/data/danRer4
foreach c (NA_random)
awk '{print $6;}' $c/tmp/$c.scaffolds.agp > $c/tmp/chr$c.scaffolds.lst
$HOME/bin/i386/faSomeRecords /cluster/data/danRer4/Zv6_scaffolds.fa \
$c/tmp/chr$c.scaffolds.lst $c/tmp/chr$c.fa
end
cd /cluster/data/danRer4/NA_random/tmp
scaffoldFaToAgp -scaffoldGapSize=50000 chrNA_random.fa
# change chrUn to chrNA_random for NA_random, and chrUn to chrUn_random
# for Un_random. Change D to W for both NA_random and Un_random.
sed -e 's/chrUn/chrNA_random/' chrNA_random.agp \
| sed -e 's/D/W/' > chrNA_random.scaffolds.agp
mv chrNA_random.fa chrNA_random.scaffolds.fa
# also move the Un_random .lft and .gap files to Un_random/tmp
mv ./Un_random/chrUn_random.lft ./Un_random/tmp/chrUn_random.lft
mv ./Un_random/chrUn_random.gap ./Un_random/tmp/chrUn_random.gap
# for chrNA_random and chrUn_random: remove the last line, which is an extra
# gap line that was already removed from the chrN_random.agp. Add these
# scaffold lift files to liftAll.lft. The last column also needs to be
# changed so that it shows the correct total number of bases.
cd /cluster/data/danRer4
foreach c (NA_random Un_random)
head -n -1 $c/tmp/chr${c}.lft > $c/tmp/chr${c}.scaffolds.lft
perl -pi.bak -e "s/chrUn/chr${c}/" $c/tmp/chr${c}.scaffolds.lft
if ($c == "NA_random") then
perl -pi.bak -e 's/208064280/208014280/' \
$c/tmp/chrNA_random.scaffolds.lft
else
perl -pi.bak -e 's/19379532/19329532/' \
$c/tmp/chrUn_random.scaffolds.lft
endif
cat $c/tmp/chr${c}.scaffolds.lft >> ./jkStuff/liftAll.lft
rm $c/tmp/chr${c}.lft $c/tmp/chr${c}.gap *.bak
end
###########################################################################
# AUTO UPDATE GENBANK MRNA AND EST AND MGC GENES RUN
# (DONE, 2006-04-25 - 2006-04-26, hartera)
ssh hgwdev
cd ~kent/src/hg/makeDb/genbank
cvs update -d -P etc
# edit etc/genbank.conf to add danRer4 and commit this to CVS.
# danRer4 (zebrafish)
# Lift file partitions unplaced sequence pseudo-chroms
danRer4.serverGenome = /cluster/data/danRer4/danRer4.2bit
danRer4.clusterGenome = /iscratch/i/danRer4/danRer4.2bit
danRer4.ooc = /iscratch/i/danRer4/danRer4_11.ooc
danRer4.align.unplacedChroms = chrNA_random chrUn_random
danRer4.lift = /cluster/data/danRer4/jkStuff/liftAll.lft
danRer4.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
danRer4.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
danRer4.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
danRer4.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
danRer4.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
danRer4.downloadDir = danRer4
danRer4.mgcTables.default = full
danRer4.mgcTables.mgc = all
# end of section added to etc/genbank.conf
cvs commit -m "Added danRer4." etc/genbank.conf
# update /cluster/data/genbank/
make etc-update
# ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains
# danRer genome information
ssh kkstore02
cd /cluster/data/genbank
nice bin/gbAlignStep -initial danRer4 &
# Start: Tues Apr 25 12:53 Finish: Wed Apr 26 08:38
# logFile: var/build/logs/2006.04.25-12:53:39.danRer4.initalign.log
# check log file
tail -f var/build/logs/2006.04.25-12:53:39.danRer4.initalign.log
# check it has finished (last line in log file):
# kkstore02 2006.04.26-08:38:36 danRer4.initalign: finish
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
nice ./bin/gbDbLoadStep -drop -initialLoad danRer4 &
# logFile: var/dbload/hgwdev/logs/2006.04.26-15:45:19.dbload.log
# check it is finished: hgwdev 2006.04.26-17:48:07 dbload: finish
# Took about 2 hours.
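# (sketch, not part of the original run) spot-check the load before moving
# on, e.g. confirm the mRNA and RefSeq tables exist and are populated:
hgsql -e 'show tables like "all_mrna"' danRer4
hgsql -e 'select count(*) from refGene' danRer4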
###########################################################################
# SPLIT UP ZEBRAFISH MASKED SEQUENCE FROM chrUn AND chrNA INTO SCAFFOLDS
# ADD SOFT-MASKED SCAFFOLDS TO ISERVERS AND THE SAN FOR CLUSTER RUNS
# (DONE, 2006-04-27, hartera)
ssh kkstore01
cd /cluster/data/danRer4
# for chrNA_random and chrUn_random, get soft-masked sequence.
foreach c (NA_random Un_random)
cd $c
mkdir scaffoldsSoftMask
awk 'BEGIN {FS="\t"}{if ($5 != "N") \
print "faFrag -mixed chr'${c}'.fa",$2-1, $3, $6".fa";}' chr${c}.agp \
>> ./scaffoldsSoftMask/faFragSoftMask.csh
cd ..
end
# change permissions on the generated scripts that extract the sequences
foreach d (NA_random Un_random)
chmod +x $d/scaffoldsSoftMask/faFragSoftMask.csh
end
# wrapper shell script to run script to get the soft-masked scaffolds
cat << '_EOF_' > jkStuff/getMaskedScaffolds.csh
#!/bin/csh
foreach c (NA_random Un_random)
set dir=/cluster/data/danRer4
echo "Processing $c"
cd $dir/$c/scaffoldsSoftMask
cp ../chr${c}.fa .
echo "Getting soft-masked sequences ..."
nice faFragSoftMask.csh >& faFrag.log
end
'_EOF_'
chmod +x jkStuff/getMaskedScaffolds.csh
nice ./jkStuff/getMaskedScaffolds.csh &
# Took about 2.5 hours.
# check a few sequences that they are correct
# add name of scaffold to sequence fasta and cat together
foreach c (NA_random Un_random)
set dir = /cluster/data/danRer4
cd $dir/$c/scaffoldsSoftMask
foreach f (Zv*)
set g=$f:r
set sc=scaffold${c}.fa
perl -pi.bak -e "s/>chr[0-9A-Za-z\-\:_]+/>$g/" $f
cat $f >> $sc
rm *.bak
end
cp scaffold* $dir/$c/
end
grep '>' NA_random/scaffoldNA_random.fa | wc -l
# 2898
grep '>' Un_random/scaffoldUn_random.fa | wc -l
# 68
# check sizes of final FASTA file with all sequences. check a few
# sequence files to see that they are correct - ok
cd /cluster/data/danRer4
cat << '_EOF_' > ./jkStuff/checkFastaSizes.csh
#!/bin/csh -fe
set scafName=$1
set agpLen=$2
set pref=`echo $scafName | cut -c1-2`
if ($pref == "Zv") then
set g=/cluster/data/danRer4/*/scaffoldsSoftMask/${scafName}.fa
set h=$g:t
echo "Getting size of $h"
set faLen = `faSize $g | awk '{print $1;}'`
if ($agpLen == $faLen) then
echo " OK: apg length = $h length = $faLen"
else
echo "ERROR: length = $agpLen, but $h length = $faLen"
endif
endif
'_EOF_'
# << happy emacs
chmod +x ./jkStuff/checkFastaSizes.csh
# use bash since reading the piped lines in C shell splits them on whitespace
bash
for c in NA_random Un_random
do
echo "Processing $c scaffolds ...";
cat $c/chr${c}.agp | while read line;
do
scaf=`echo $line | cut -d " " -f6`;
size=`echo $line | cut -d " " -f8`;
nice ./jkStuff/checkFastaSizes.csh $scaf $size >> checkFastaSizes.log;
done
done
exit # back to C shell
grep "ERROR:" checkFastaSizes.log | wc -l
# No errors, so all FASTA files are the expected size.
# Add soft-masked scaffolds to the Iservers and the san for cluster runs
ssh kkr1u00
cd /cluster/data/danRer4
mkdir /iscratch/i/danRer4/scaffoldsSoftMask
foreach c (NA_random Un_random)
foreach f (/cluster/data/danRer4/$c/scaffoldsSoftMask/Zv*.fa)
cp -p $f /iscratch/i/danRer4/scaffoldsSoftMask
end
cp -p /cluster/data/danRer4/$c/scaffold${c}.fa /iscratch/i/danRer4
end
ls /iscratch/i/danRer4/scaffoldsSoftMask/ | wc
# 2966
# all files are there
# rsync to cluster machines
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/danRer4/ kkr${R}u00:/iscratch/i/danRer4/
end
ssh pk
mkdir -p /san/sanvol1/scratch/danRer4/scaffoldsSoftMask
foreach c (NA_random Un_random)
foreach f (/cluster/data/danRer4/$c/scaffoldsSoftMask/Zv*.fa)
rsync -a --progress $f /san/sanvol1/scratch/danRer4/scaffoldsSoftMask/
end
rsync -a --progress /cluster/data/danRer4/${c}/scaffold${c}.fa \
/san/sanvol1/scratch/danRer4/
end
foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/*.fa)
echo $f >> files.log
end
wc -l files.log
# 2966 files.log
rm files.log
# All files have transferred.
###########################################################################
## SWAP MM8 blastz result (DONE - 2006-04-28 - Hiram)
# ADD SYMBOLIC LINK TO SWAP DIR (DONE, 2006-05-04, hartera)
# RE-MAKE MM8 CHAINS AND NET SWAP WITH DANRER4 RANDOM CHROMS
# (DONE, 2006-05-24, hartera) ADDED LINK TO SWAP DIR (2006-05-27, hartera)
ssh pk
cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
# blastz parameters used in blastz alignment of danRer4 on mm8:
# BLASTZ_ABRIDGE_REPEATS=1
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=6000
# BLASTZ_K=2200
# BLASTZ_M=50
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF > swap.out 2>&1 &
ssh hgwdev
cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
time nice -n +19 featureBits danRer4 chainMm8Link \
> fb.danRer4.chainDanRer4Link 2>&1 &
cat fb.danRer4.chainDanRer4Link
# 60721886 bases of 1626093931 (3.734%) in intersection
# Add symbolic link to new swap directory (2006-05-27, hartera)
ssh kkstore04
cd /cluster/data/danRer4/bed
ln -s blastz.mm8.swap blastz.mm8
###########################################################################
# MONDOM4 BLASTZ TESTS USING LINEAGE-SPECIFIC REPEATS OR DYNAMIC MASKING
# AND SWAP (DONE, 2006-04-28, hartera)
# used no lineage specific-repeats and M=50 for dynamic masking
featureBits danRer4 chainMonDom4
# 541863023 bases of 1626093931 (33.323%) in intersection
featureBits danRer4 chainMonDom4NoDyMsk
# 534445657 bases of 1626093931 (32.867%) in intersection
featureBits monDom4 chainDanRer4
# 856404995 bases of 3501643220 (24.457%) in intersection
featureBits monDom4 chainDanRer4NoDyMsk
# 812142533 bases of 3501643220 (23.193%) in intersection
featureBits -chrom=chr1 danRer4 refGene:cds chainMonDom4Link -enrichment
# refGene:cds 0.732%, chainMonDom4Link 5.573%, both 0.550%, cover 75.20%,
# enrich 13.49x
featureBits -chrom=chr1 danRer4 refGene:cds chainMonDom4NoDyMskLink \
  -enrichment
# refGene:cds 0.732%, chainMonDom4NoDyMskLink 4.083%, both 0.550%,
# cover 75.15%, enrich 18.40x
featureBits -chrom=chr1 monDom4 refGene:cds chainDanRer4Link -enrichment
# refGene:cds 0.001%, chainDanRer4Link 2.448%, both 0.000%,
# cover 55.63%, enrich 22.73x
featureBits -chrom=chr1 monDom4 refGene:cds chainDanRer4NoDyMskLink \
  -enrichment
# refGene:cds 0.001%, chainDanRer4NoDyMskLink 1.807%, both 0.000%,
# cover 43.85%, enrich 24.27x
# There are only 36 RefSeq genes for monDom4 so results are misleading.
# Try mrna and xenoRefGene table.
# for mrna tables, not much difference:
featureBits -chrom=chr1 monDom4 mrna chainDanRer4Link -enrichment
# mrna 0.004%, chainDanRer4Link 2.448%, both 0.002%, cover 54.59%,
# enrich 22.30x
featureBits -chrom=chr1 monDom4 mrna chainDanRer4NoDyMskLink -enrichment
# mrna 0.004%, chainDanRer4NoDyMskLink 1.807%, both 0.002%,
# cover 52.67%, enrich 29.15x
featureBits -chrom=chr1 monDom4 xenoRefGene:cds chainDanRer4Link -enrichment
# xenoRefGene:cds 0.820%, chainDanRer4Link 2.448%, both 0.655%,
# cover 79.88%, enrich 32.63x
featureBits -chrom=chr1 monDom4 xenoRefGene:cds chainDanRer4NoDyMskLink \
  -enrichment
# xenoRefGene:cds 0.820%, chainDanRer4NoDyMskLink 1.807%, both 0.661%,
# cover 80.63%, enrich 44.63x
# For the nets:
featureBits -chrom=chr1 danRer4 refGene:cds netMonDom4 -enrichment
# refGene:cds 0.732%, netMonDom4 31.056%, both 0.612%,
# cover 83.58%, enrich 2.69x
featureBits -chrom=chr1 danRer4 refGene:cds netMonDom4NoDyMsk -enrichment
# refGene:cds 0.732%, netMonDom4NoDyMsk 31.002%, both 0.617%,
# cover 84.31%, enrich 2.72x
featureBits -chrom=chr1 monDom4 refGene:cds netDanRer4 -enrichment
# refGene:cds 0.001%, netDanRer4 25.224%, both 0.000%,
# cover 66.95%, enrich 2.65x
featureBits -chrom=chr1 monDom4 refGene:cds netDanRer4NoDyMsk -enrichment
# refGene:cds 0.001%, netDanRer4NoDyMsk 24.539%, both 0.000%,
# cover 49.19%, enrich 2.00x
# rows in tables for chr1
# Assembly Table Number of rows
# danRer4 chainMonDom4 36931
# danRer4 chainMonDom4Link 426659
# danRer4 chainMonDom4NoDyMsk 34363
# danRer4 chainMonDom4NoDyMskLink 361572
# monDom4 chainDanRer4 170759
# monDom4 chainDanRer4Link 2552995
# monDom4 chainDanRer4NoDyMsk 139797
# monDom4 chainDanRer4NoDyMskLink 1806858
# all chroms:
# danRer4 netMonDom4 399531
# danRer4 netMonDom4NoDyMsk 346482
# monDom4 netDanRer4 395881
# monDom4 netDanRer4NoDyMsk 321288
# Use lineage-specific repeats and no dynamic masking: this gives better
# enrichment and coverage relative to gene CDS regions, and fewer chains
# are produced.
###########################################################################
# BLASTZ, CHAIN, NET, MAFNET, AXTNET AND ALIGNMENT DOWNLOADS FOR
# OPOSSUM (monDom4) (DONE, 2006-04-28 - 2006-04-29, hartera)
ssh hgwdev
# Remove all test chain and net tables and start again
foreach c (`cat /cluster/data/danRer4/chrom.lst`)
hgsql -e "drop table chr${c}_chainMonDom4;" danRer4
hgsql -e "drop table chr${c}_chainMonDom4Link;" danRer4
hgsql -e "drop table chr${c}_chainMonDom4NoDyMsk;" danRer4
hgsql -e "drop table chr${c}_chainMonDom4NoDyMskLink;" danRer4
end
hgsql -e "drop table netMonDom4;" danRer4
hgsql -e "drop table netMonDom4NoDyMsk;" danRer4
# remove downloads
rm -r /usr/local/apache/htdocs/goldenPath/danRer4/vsMonDom4
rm \
/usr/local/apache/htdocs/goldenPath/danRer4/liftOver/danRer4ToMonDom4.over.chain.gz
rm /cluster/data/danRer4/bed/liftOver/danRer4ToMonDom4.over.chain.gz
# remove old Blastz swap
rm -r /cluster/data/danRer4/bed/blastz.monDom4.swap
# remove link to old blastz directory
rm -r /cluster/data/danRer4/bed/blastz.monDom4
# see makeMonDom4.doc for removal of test tables and download files
# and swap directory on monDom4.
ssh pk
# Blastz uses lineage-specific repeats. There are none defined for the
# opossum/zebrafish pair, so all repeats are used as lineage-specific.
# There is a lineage-specific repeats directory for zebrafish (see the
# CREATE ZEBRAFISH AND OTHER SPECIES LINEAGE-SPECIFIC REPEATS DIRECTORY
# section above). Lineage-specific repeats and nibs were made for monDom4 -
# see makeMonDom4.doc. Nib files are needed when running Blastz with
# lineage-specific repeats.
mkdir -p /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28
cd /cluster/data/danRer4/bed
ln -s blastz.monDom4.2006-04-28 blastz.monDom4
cd /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28
# Only 5% of the danRer4 genome is now in the random unordered chroms,
# so do not run those chroms as scaffolds - run them as virtual chroms.
# Use the same parameters as for danRer2, but with all repeats treated as
# lineage-specific, since monDom4 is now mapped to chroms.
cat << 'EOF' > DEF
# danRer4 zebrafish target, opossum monDom4 query
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=1
# use parameters suggested for human-fish evolutionary distance
# recommended in doBlastzChainNet.pl help.
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/san/sanvol1/scratch/blastz/HoxD55.q
# TARGET: zebrafish (danRer4)
# Use all chroms, including both randoms (chrNA_random and chrUn_random)
SEQ1_DIR=/san/sanvol1/scratch/danRer4/nib
SEQ1_SMSK=/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers
SEQ1_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes
SEQ1_CHUNK=100000000
SEQ1_LAP=10000
# QUERY: opossum (monDom4)
SEQ2_DIR=/san/sanvol1/scratch/monDom4/nib
SEQ2_LEN=/san/sanvol1/scratch/monDom4/chrom.sizes
SEQ2_SMSK=/san/sanvol1/scratch/monDom4/linSpecRep.notInOthers
SEQ2_CHUNK=50000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/danRer4/bed/blastz.monDom4.2006-04-28
TMPDIR=/scratch/tmp
'EOF'
# << happy emacs
chmod +x DEF
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF >& doBlastz.log &
# Start: Fri Apr 28 13:27 Finish: Apr 29 01:28
# Stopped after making and merging chains:
# netChains: looks like previous stage was not successful
# (can't find [danRer4.monDom4.]all.chain[.gz]).
# Start again with net step and continue:
cd /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue net `pwd`/DEF >& net.log &
# Took about 15 minutes to finish.
# Do swap to get danRer4 alignments on monDom4:
# see also makeMonDom4.doc
cd /cluster/data/danRer4/bed/blastz.monDom4.2006-04-28
nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF >& doSwap.log &
# Took about 15 minutes.
###########################################################################
# BLASTZ FOR FUGU (fr1) (DONE, 2006-04-28 - 2006-04-29, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# No lineage-specific repeats for this species pair. fr1 is in scaffolds,
# so abridging repeats would not be straightforward for this run anyway.
# There is a 2bit file of the scaffolds on the Iservers.
# Run this with dynamic masking instead.
# copy masked fr1 scaffolds 2 bit file to the san - see makeFr1.doc
# size of scaffolds FASTA file:
ssh kkr1u00
faSize /panasas/store/fr1/scaffolds/scaffoldMaskedUnFr1.fa
# 329140338 bases
ssh pk
mkdir /cluster/data/danRer4/bed/blastz.fr1.2006-04-28
cd /cluster/data/danRer4/bed
ln -s blastz.fr1.2006-04-28 blastz.fr1
cd /cluster/data/danRer4/bed/blastz.fr1.2006-04-28
# Use the parameters used for fr1 in makeDanRer2.doc. Using scaffolds makes
# this run slower, so it is best to have the scaffolds on the query side.
# Use the HoxD55.q matrix as Fugu is quite distant from zebrafish. Blastz can
# use lineage-specific repeats, but there are none for these two species, so
# use soft-masked scaffolds and dynamic masking instead.
cat << '_EOF_' > DEF
# zebrafish (danRer4) vs. Fugu (fr1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=0
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - zebrafish (danRer4)
SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.2bit
SEQ1_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes
# 0.5 Mb chunk for target with 5 kb overlap
SEQ1_LIMIT=30
SEQ1_CHUNK=500000
SEQ1_LAP=5000
# QUERY - Fugu (fr1)
SEQ2_DIR=/san/sanvol1/scratch/fr1/fr1.2bit
# soft-masked scaffolds in 2bit format
SEQ2_CTGDIR=/san/sanvol1/scratch/fr1/UnScaffolds/fr1UnScaffolds.2bit
SEQ2_LIFT=/san/sanvol1/scratch/fr1/UnScaffolds/ordered.lft
SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes
SEQ2_CTGLEN=/san/sanvol1/scratch/fr1/UnScaffolds/scaffolds.sizes
# large enough chunk to do whole genome at once
SEQ2_CHUNK=500000000
SEQ2_LAP=0
BASE=/cluster/data/danRer4/bed/blastz.fr1.2006-04-28
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF >& doBlastz.log &
# Start: Fri Apr 28 18:54 Finish: Apr 29 06:35
# Stopped after making and merging chains:
# netChains: looks like previous stage was not successful
# (can't find [danRer4.fr1.]all.chain[.gz]).
# Start again with net step and continue:
cd /cluster/data/danRer4/bed/blastz.fr1.2006-04-28
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue net `pwd`/DEF >& net.log &
# Took about an hour to finish.
# check coverage:
featureBits danRer4 chainFr1Link
# 139280554 bases of 1626093931 (8.565%) in intersection
featureBits danRer3 chainFr1Link
# 137698495 bases of 1630323462 (8.446%) in intersection
featureBits -chrom=chr1 danRer4 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.732%, chainFr1Link 8.464%, both 0.660%,
# cover 90.18%, enrich 10.66x
featureBits -chrom=chr1 danRer3 refGene:cds chainFr1Link -enrichment
# refGene:cds 0.774%, chainFr1Link 8.364%, both 0.713%,
# cover 92.09%, enrich 11.01x
featureBits -chrom=chr1 danRer4 refGene:cds netFr1 -enrichment
# refGene:cds 0.732%, netFr1 52.712%, both 0.710%,
# cover 96.97%, enrich 1.84x
featureBits -chrom=chr1 danRer3 refGene:cds netFr1 -enrichment
# refGene:cds 0.774%, netFr1 58.353%, both 0.759%,
# cover 97.95%, enrich 1.68x
# Do the Blastz swap to get danRer4 alignments on fr1
# see also makeFr1.doc for featureBits on these alignments.
ssh pk
cd /cluster/data/danRer4/bed/blastz.fr1.2006-04-28
nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF >& doSwap.log &
# Took about 30 minutes.
###########################################################################
# BLASTZ FOR TETRAODON (tetNig1) (DONE, 2006-04-29 - 2006-04-30, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# No lineage-specific repeats for this species pair.
# Tetraodon also has no species-specific repeats in the RepeatMasker
# library, so run this using dynamic masking instead, as for danRer2 and
# danRer3.
# The tetraodon 2bit file (tetNig1ChromsRandomScafs.2bit) contains the
# sequences for the assembled chroms plus the scaffolds of the random chroms.
ssh pk
mkdir /cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29
cd /cluster/data/danRer4/bed
ln -s blastz.tetNig1.2006-04-29 blastz.tetNig1
cd /cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29
# Use the parameters used for tetNig1 in makeDanRer3.doc. Using scaffolds
# makes this run slower, so it is best to have the scaffolds on the query
# side. Use the HoxD55.q matrix as tetraodon is quite distant from zebrafish.
# Blastz can use lineage-specific repeats, but there are none for these two
# species, so use soft-masked scaffolds and dynamic masking instead.
cat << '_EOF_' > DEF
# zebrafish (danRer4) vs. tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_ABRIDGE_REPEATS=0
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2500
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - zebrafish (danRer4)
SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.2bit
SEQ1_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes
# 0.5 Mb chunk for target with 5 kb overlap
SEQ1_LIMIT=30
SEQ1_CHUNK=500000
SEQ1_LAP=5000
# QUERY - Tetraodon (tetNig1)
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit
# soft-masked chroms and random scaffolds in 2bit format
SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/chromsAndScafs/tetNig1ChromsRandomScafs.2bit
SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.lft
SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.sizes
# large enough chunk to do whole genome at once
SEQ2_CHUNK=1000000000
SEQ2_LAP=0
BASE=/cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
`pwd`/DEF >& doBlastz.log &
# Start: Sat Apr 29 18:10 Finish: Apr 29 22:41
# Stopped after making and merging chains:
# netChains: looks like previous stage was not successful
# (can't find [danRer4.tetNig1.]all.chain[.gz]). However, this file
# is there so start again with net step and continue:
cd /cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29
nohup nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-continue net `pwd`/DEF >& net.log &
# Took about 20 minutes to finish.
# check coverage compared to danRer3:
featureBits danRer4 chainTetNig1Link
# 119439512 bases of 1626093931 (7.345%) in intersection
featureBits danRer3 chainTetNig1Link
# 109205244 bases of 1630323462 (6.698%) in intersection
featureBits -chrom=chr1 danRer4 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.732%, chainTetNig1Link 7.536%, both 0.645%,
# cover 88.08%, enrich 11.69x
featureBits -chrom=chr1 danRer3 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.774%, chainTetNig1Link 6.821%, both 0.692%,
# cover 89.34%, enrich 13.10x
featureBits -chrom=chr1 danRer4 refGene:cds netTetNig1 -enrichment
# refGene:cds 0.732%, netTetNig1 55.116%, both 0.705%,
# cover 96.33%, enrich 1.75x
featureBits -chrom=chr1 danRer3 refGene:cds netTetNig1 -enrichment
# refGene:cds 0.774%, netTetNig1 61.540%, both 0.753%,
# cover 97.24%, enrich 1.58x
# Similar coverage as for tetNig1 chains and nets on zebrafish danRer3.
# Do the Blastz swap to get danRer4 alignments on tetNig1
# see also makeTetNig1.doc for featureBits for these alignments on tetNig1.
ssh pk
cd /cluster/data/danRer4/bed/blastz.tetNig1.2006-04-29
nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF >& doSwap.log &
# Took about 22 minutes to run.
###########################################################################
# MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2006-05-01, hartera)
# RE-MAKE DOWNLOADS FOR AGP, SOFT AND HARD MASKED CHROMS, REPEATMASKER OUT
# BECAUSE THEY DID NOT INCLUDE NA_RANDOM AND UN_RANDOM
# (DONE, 2007-03-29, hartera)
# NOTE THAT zipAll.csh MUST BE ALTERED ACCORDINGLY IN FUTURE.
ssh kkstore01
cd /cluster/data/danRer4
#- Build the .tar.gz and *.gz files for bigZips
cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf bigZips
mkdir bigZips
tar cvzf bigZips/chromAgp.tar.gz ?{,?}/chr*.agp
tar cvzf bigZips/chromOut.tar.gz ?{,?}/chr*.fa.out
tar cvzf bigZips/chromFa.tar.gz ?{,?}/chr*.fa
tar cvzf bigZips/chromFaMasked.tar.gz ?{,?}/chr*.fa.masked
# soft masked chrNA and chrUn scaffolds
tar cvzf bigZips/scaffoldRandomsFa.tar.gz NA_random/scaffoldNA_random.fa \
Un_random/scaffoldUn_random.fa
cd bed/simpleRepeat
tar cvzf ../../bigZips/chromTrf.tar.gz trfMaskChrom/chr*.bed
cd ../..
# get GenBank native mRNAs
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=danRer4 -native GenBank mrna \
/cluster/data/danRer4/bigZips/mrna.fa
# get GenBank xeno mRNAs
./bin/i386/gbGetSeqs -db=danRer4 -xeno GenBank mrna \
/cluster/data/danRer4/bigZips/xenoMrna.fa
# get native RefSeq mRNAs
./bin/i386/gbGetSeqs -db=danRer4 -native refseq mrna \
/cluster/data/danRer4/bigZips/refMrna.fa
# get native GenBank ESTs
./bin/i386/gbGetSeqs -db=danRer4 -native GenBank est \
/cluster/data/danRer4/bigZips/est.fa
# gzip the GenBank sequences (the upstream sequence files for RefSeq are
# made later, on hgwdev).
cd /cluster/data/danRer4/bigZips
gzip *.fa
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/zipAll.csh
csh -ef ./jkStuff/zipAll.csh >& zipAll.log &
# Took about 35 minutes.
#- Look at zipAll.log to make sure all file lists look reasonable.
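# (sketch, not part of the original run) another quick check: list the
# top-level directories each tarball pulled files from, e.g. for the
# soft-masked chroms:
tar tzf bigZips/chromFa.tar.gz | awk -F/ '{print $1}' | sort -u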
# Make upstream files for zebrafish RefSeq and Copy the .gz files to
# hgwdev:/usr/local/apache/...
ssh hgwdev
cd /cluster/data/danRer4/bigZips
foreach I (1000 2000 5000)
featureBits danRer4 refGene:upstream:${I} -fa=stdout \
| gzip -c > upstream${I}.fa.gz
echo "upstream${I} done"
end
set gp = /usr/local/apache/htdocs/goldenPath/danRer4
mkdir -p $gp/bigZips
cp -p *.gz $gp/bigZips
mkdir -p $gp/chromosomes
# Add individual chromosomes and file of scaffolds for each random chrom
# to chromosomes downloads directory.
foreach f (../*/chr*.fa)
cp $f $gp/chromosomes
end
foreach c (NA_random Un_random)
cd /cluster/data/danRer4/$c
cp scaffold${c}.fa $gp/chromosomes
end
# create md5sum for bigZips
cd $gp/bigZips
md5sum *.gz > md5sum.txt
# gzip each chrom or scaffolds for chrom separately in chromosomes dir
cd $gp/chromosomes
foreach f (*.fa)
gzip $f
end
# create md5sum for chromosomes
md5sum *.gz > md5sum.txt
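# (sketch, not part of the original run) verify the checksums in both
# download directories before requesting a push:
md5sum -c md5sum.txt
(cd $gp/bigZips; md5sum -c md5sum.txt)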
# Take a look at bigZips/* and chromosomes/*
# copy README.txt's from danRer3 and update
# RE-MAKE DOWNLOADS FOR AGP, SOFT AND HARD MASKED CHROMS, REPEATMASKER OUT
# BECAUSE THEY DID NOT INCLUDE NA_RANDOM AND UN_RANDOM
# (DONE, 2007-03-29, hartera)
# NOTE THAT zipAll.csh MUST BE ALTERED ACCORDINGLY IN FUTURE.
ssh kkstore04
cd /cluster/data/danRer4
#- Rebuild the .tar.gz files (agp, RepeatMasker out, soft- and hard-masked
# chroms) for bigZips
cat << '_EOF_' > jkStuff/zip2.csh
rm -r bigZips/chromAgp.tar.gz
rm -r bigZips/chromOut.tar.gz
rm -r bigZips/chromFa.tar.gz
rm -r bigZips/chromFaMasked.tar.gz
tar cvzf bigZips/chromAgp.tar.gz ?{,?}{,_random}/chr*.agp
tar cvzf bigZips/chromOut.tar.gz ?{,?}{,_random}/chr*.fa.out
tar cvzf bigZips/chromFa.tar.gz ?{,?}{,_random}/chr*.fa
tar cvzf bigZips/chromFaMasked.tar.gz ?{,?}{,_random}/chr*.fa.masked
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/zip2.csh
csh -ef ./jkStuff/zip2.csh >& zip2.log &
# Took about 10 minutes
# Links to these files already exist from the
# /usr/local/apache/htdocs/goldenPath/danRer4/bigZips directory.
# Recreate the md5sum there to include these new files.
cd /usr/local/apache/htdocs/goldenPath/danRer4/bigZips
rm md5sum.txt
md5sum *.gz > md5sum.txt
###########################################################################
# HUMAN (hg18) PROTEINS TRACK FOR danRer4 (DONE, 2006-04-28 - 2006-05-03, hartera)
ssh kkstore01
bash # if not using bash shell already
# make Blast database for non-random chrom sequences
mkdir -p /cluster/data/danRer4/blastDb
cd /cluster/data/danRer4/blastDb
cut -f 1 ../chrom.sizes | sed "s/chr//" | sed "/NA_random/d" \
| sed "/Un_random/d" > chrom.list
for i in `cat chrom.list`;
do ls -1 ../$i/*/*.fa . ; done | sed -n "/.*_.*_.*_.*/p" > list
ln -s `cat list` .
for i in *.fa
do
/projects/compbio/bin/i686/formatdb -i $i -p F
done
rm *.log *.fa list
cd /cluster/data/danRer4
for i in `cat blastDb/chrom.list`;
do cat $i/chr*/*.lft ; done > jkStuff/subChr.lft
rm blastDb/chrom.list
# Now make Blast database for random scaffolds sequences.
mkdir /cluster/data/danRer4/scaffoldBlastDb
cd /cluster/data/danRer4/scaffoldBlastDb
# Take file of all scaffolds for NA_random and Un_random and cat together
cat ../NA_random/scaffoldNA_random.fa ../Un_random/scaffoldUn_random.fa \
> allRandomScafs.fasta
grep '>' allRandomScafs.fasta | wc -l
# 2966
faSplit sequence allRandomScafs.fasta 500 scaf
rm allRandomScafs.fasta
for i in *.fa
do
/projects/compbio/bin/i686/formatdb -i $i -p F
done
rm *.log *.fa
# combine databases for chroms and random chroms
mkdir -p /san/sanvol1/scratch/danRer4/comboBlastDb
cd /cluster/data/danRer4/blastDb
for i in nhr nin nsq;
do cp *.$i /san/sanvol1/scratch/danRer4/comboBlastDb;
done
cd /cluster/data/danRer4/scaffoldBlastDb
for i in nhr nin nsq;
do cp *.$i /san/sanvol1/scratch/danRer4/comboBlastDb;
done
mkdir -p /cluster/data/danRer4/bed/tblastn.hg18KG
cd /cluster/data/danRer4/bed/tblastn.hg18KG
echo /san/sanvol1/scratch/danRer4/comboBlastDb/*.nsq \
| xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 4377 query.lst
# we want around 250000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(250000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(250000/4377) = 643.016316
mkdir -p /cluster/bluearc/danRer4/bed/tblastn.hg18KG/kgfa
split -l 643 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl \
/cluster/bluearc/danRer4/bed/tblastn.hg18KG/kgfa/kg
ln -s /cluster/bluearc/danRer4/bed/tblastn.hg18KG/kgfa kgfa
cd kgfa
for i in *; do
nice /cluster/home/braney/bin/x86_64/pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
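# (sketch, not part of the original run) sanity-check the planned job count:
# (number of blastDb chunks) x (number of kg fasta files) should come out
# close to the ~250000 jobs targeted above.
echo $(( `wc -l < query.lst` * `wc -l < kg.lst` ))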
mkdir -p /cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
exit # back to tcsh
cd /cluster/data/danRer4/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/danRer4/jkStuff/subChr.lft carry $f.2
liftUp -nosort -type=".psl" -nohead $f.4 /cluster/data/danRer4/jkStuff/liftAll.lft carry $f.3
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.4
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
# then run the Blast cluster jobs
ssh kk
cd /cluster/data/danRer4/bed/tblastn.hg18KG
para create blastSpec
para try, check, push, check etc.
# pushed 100,000 jobs at a time so need to do para push again later
para time
# Completed: 253866 of 253866 jobs
# CPU time in finished jobs: 52410110s 873501.83m 14558.36h 606.60d 1.662 y
# IO & Wait Time: 5508786s 91813.10m 1530.22h 63.76d 0.175 y
# Average job time: 228s 3.80m 0.06h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 2162s 36.03m 0.60h 0.03d
# Submission to last job: 147825s 2463.75m 41.06h 1.71d
# Took a while as had to repush some crashed jobs.
ssh kkstore01
cd /cluster/data/danRer4/bed/tblastn.hg18KG
tcsh
mkdir chainRun
cd chainRun
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS \
/cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
ssh kk
cd /cluster/data/danRer4/bed/tblastn.hg18KG/chainRun
para create chainSpec
para try, check, push, check etc.
# Completed: 58 of 58 jobs
# CPU time in finished jobs: 759034s 12650.56m 210.84h 8.79d 0.024 y
# IO & Wait Time: 217724s 3628.74m 60.48h 2.52d 0.007 y
# Average job time: 16841s 280.68m 4.68h 0.19d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 208828s 3480.47m 58.01h 2.42d
# Submission to last job: 208891s 3481.52m 58.03h 2.42d
ssh kkstore01
cd /cluster/data/danRer4/bed/tblastn.hg18KG/blastOut
bash # if using another shell
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
liftUp -nohead -type=.psl stdout \
/cluster/data/danRer4/jkStuff/liftAll.lft carry u.*.psl m60* | \
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq \
> /cluster/data/danRer4/bed/tblastn.hg18KG/blastHg18KG.psl
pslCheck blastHg18KG.psl
# this is ok.
# load table
ssh hgwdev
cd /cluster/data/danRer4/bed/tblastn.hg18KG
hgLoadPsl danRer4 blastHg18KG.psl
# check coverage
featureBits danRer4 blastHg18KG
# 21159392 bases of 1626093931 (1.301%) in intersection
featureBits danRer3 blastHg17KG
# 21063005 bases of 1630323462 (1.292%) in intersection
featureBits -chrom=chr1 danRer4 refGene:cds blastHg18KG -enrichment
# refGene:cds 0.732%, blastHg18KG 1.333%, both 0.428%, cover 58.43%,
# enrich 43.83x
featureBits -chrom=chr1 danRer3 refGene:cds blastHg17KG -enrichment
# refGene:cds 0.774%, blastHg17KG 1.370%, both 0.450%, cover 58.05%,
# enrich 42.38x
# Similar coverage compared to refGene CDS as for hg17 proteins on danRer3.
# back to kkstore04 to clean up
ssh kkstore04
rm -rf /cluster/data/danRer4/bed/tblastn.hg18KG/blastOut
rm -rf /cluster/bluearc/danRer4/bed/tblastn.hg18KG/blastOut
# Add a trackDb.ra entry in ~/kent/src/hg/makeDb/trackDb/trackDb.ra;
# the blastHg18KG.html description page was also added there.
# The blastKGPep04 and blastKGRef04 tables are required on hg18 - these have
# been created - see makeHg18.doc. An update of hgc.c, hgTrackUi.c and
# hgTracks.c was also required - done by Brian.
###########################################################################
# MULTIZ7WAY ALIGNMENTS FOR CONSERVATION TRACK
# (DONE, 2006-05-04 - 2006-05-10, hartera)
# RE-MAKE WITH DANRER4 RANDOMS FOR MM8 AND ADDED FRAMES TABLE AND
# MULTIZ7WAY DOWNLOADS (DONE, 2006-05-28 - 2006-05-29, hartera)
# for tetNig1, fr1, xenTro2, monDom4, mm8 and hg18.
ssh kkstore04
mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-28
cd /cluster/data/danRer4/bed
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28
# copy MAFs to a cluster-friendly server
rm -r /san/sanvol1/scratch/danRer4/mafNet
mkdir /san/sanvol1/scratch/danRer4/mafNet
foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18)
echo $s
rsync -av /cluster/data/danRer4/bed/blastz.$s/mafNet/* \
/san/sanvol1/scratch/danRer4/mafNet/$s/
end
# prune the hg18 17way tree to just these 7 and update db names:
/cluster/bin/phast/tree_doctor \
--prune-all-but=mouse_mm8,human_hg18,monodelphis_monDom4,xenopus_xenTro1,tetraodon_tetNig1,fugu_fr1,zebrafish_danRer3 \
--rename="xenopus_xenTro1 -> xenopus_xenTro2 ; zebrafish_danRer3 -> zebrafish_danRer4" \
/cluster/data/hg18/bed/multiz17way/17way.nh > 7way.nh
# carefully edit so that danRer4 is first. copy first to new file
cp 7way.nh 7way_zfishFirst.nh
# /cluster/bin/phast/draw_tree 7way_zfishFirst.nh > 7way.ps
# also made the ps file for the 7way.nh and compared to make sure
# that the tree with zebrafish at the top looks correct.
/cluster/bin/phast/all_dists 7way_zfishFirst.nh > 7way.distances
grep danRer4 7way.distances | sort -k3,3n | \
awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt
cat distances.txt
# 1.4749 tetraodon_tetNig1
# 1.5154 fugu_fr1
# 1.7480 human_hg18
# 1.7782 monodelphis_monDom4
# 1.8771 xenopus_xenTro2
# 2.1058 mouse_mm8
# the order in the browser display will be by tree topology,
# not by distance, so they will be:
# danRer4
# 1.5154 fugu_fr1
# 1.4749 tetraodon_tetNig1
# 1.8771 xenopus_xenTro2
# 1.7782 monodelphis_monDom4
# 2.1058 mouse_mm8
# 1.7480 human_hg18
# create species list and stripped down tree for autoMZ
sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' \
7way_zfishFirst.nh > tree-commas.nh
sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh
sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst
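# (sketch, not part of the original run) check the stripped-down species
# list - danRer4 should come first, followed by the other six assemblies
# in tree order:
cat species.lst
# expect: danRer4 fr1 tetNig1 xenTro2 monDom4 mm8 hg18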
ssh pk
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28
mkdir maf run
cd run
# stash binaries
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
set db = danRer4
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/mafNet
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'EOF'
# << emacs
chmod +x autoMultiz.csh
cat << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/danRer4/bed/multiz7way.2006-05-28/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << emacs
awk '{print $1}' /cluster/data/danRer4/chrom.sizes > chrom.lst
gensub2 chrom.lst single spec jobList
para create jobList
para try, check, push, check etc. ...
para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 7022s 117.03m 1.95h 0.08d 0.000 y
IO & Wait Time: 142s 2.37m 0.04h 0.00d 0.000 y
Average job time: 256s 4.26m 0.07h 0.00d
Longest running job: 0s 0.00m 0.00h 0.00d
Longest finished job: 368s 6.13m 0.10h 0.00d
Submission to last job: 705s 11.75m 0.20h 0.01d
# Make .jpg for tree and install in htdocs/images/phylo/... don't forget
# to request a push of that file. The treeImage setting in trackDb.ra
# is phylo/danRer4_7way.jpg (relative to htdocs/images).
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28
cat << '_EOF_' > species7.nh
((zebrafish,(Fugu,Tetraodon)),(X. tropicalis,(opossum,(mouse,human))))
'_EOF_'
/cluster/bin/phast/draw_tree species7.nh > species7way.ps
# ask Bob to resize image for Browser track description page and convert
# to JPEG and rename as danRer4_7way.jpg
# Build maf annotation and load database
ssh kolossus
mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno
mkdir maf run
cd run
rm -f sizes nBeds
foreach db (`cat /cluster/data/danRer4/bed/multiz7way.2006-05-28/species.lst`)
ln -s /cluster/data/$db/chrom.sizes $db.len
if (! -e /cluster/data/$db/$db.N.bed) then
twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed}
endif
ln -s /cluster/data/$db/$db.N.bed $db.bed
echo $db.bed >> nBeds
echo $db.len >> sizes
end
echo date > jobs.csh
# do smaller jobs first:
foreach f (`ls -1rS ../../maf/*.maf`)
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \
/cluster/data/danRer4/danRer4.2bit ../maf/`basename $f` \
>> jobs.csh
echo "echo $f" >> jobs.csh
end
echo date >> jobs.csh
csh -efx jobs.csh >&! jobs.log &
tail -f jobs.log
# Took 27 minutes to run.
# Load anno/maf
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno/maf
mkdir -p /gbdb/danRer4/multiz7way/anno/maf
ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno/maf/*.maf \
/gbdb/danRer4/multiz7way/anno/maf
# delete old files from extFile table
hgsql -e 'delete from extFile where path like "%multiz7way/anno/maf%";' \
danRer4
cat > loadMaf.csh << 'EOF'
date
nice hgLoadMaf -pathPrefix=/gbdb/danRer4/multiz7way/anno/maf danRer4 multiz7way
date
'EOF'
# << emacs
csh -efx loadMaf.csh >&! loadMaf.log & tail -f loadMaf.log
# Took about 1 minute.
# Do the computation-intensive part of hgLoadMafSummary on a workhorse
# machine and then load on hgwdev:
ssh kkr7u00
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno/maf
cat *.maf \
| nice hgLoadMafSummary danRer4 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 -test multiz7waySummary stdin
# Created 820403 summary blocks from 4245668 components and
# 2120803 mafs from stdin
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/anno/maf
sed -e 's/mafSummary/multiz7waySummary/' ~/kent/src/hg/lib/mafSummary.sql \
> /tmp/multiz7waySummary.sql
time nice hgLoadSqlTab danRer4 multiz7waySummary \
/tmp/multiz7waySummary.sql multiz7waySummary.tab
# 0.000u 0.000s 2:05.26 0.0% 0+0k 0+0io 209pf+0w
rm *.tab /tmp/multiz7waySummary.sql
# zip mafs:
ssh kkstore04
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/maf
cat > zipMafs.csh << 'EOF'
date
foreach f (chr*.maf)
set c = $f:r
echo $c
nice gzip -c $f > $c.maf.gz
end
date
'EOF'
time csh -efx zipMafs.csh >&! zip.log
# 219.706u 1.939s 3:41.75 99.9% 0+0k 0+0io 0pf+0w
rm *.maf
# add Frames table:
mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-28/frames
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/frames
# The following is adapted from MarkD's Makefile used for mm7...
# and used in makeRn4.doc.
#------------------------------------------------------------------------
# get the genes for all genomes
# using mrna for danRer4
# using knownGene for mm8 hg18
# using mgcGenes for xenTro2
# using ensGene for fr1
# no genes for monDom4 and tetNig1
# targetDb = danRer4
# queryDbs = mm8 hg18 xenTro2 fr1 (to build frames for)
# genePreds; (must keep only the first 10 columns for knownGene)
# mRNAs with CDS. single select to get cds+psl, then split that up and
# create genePred
# using mrna table as genes: danRer4
mkdir genes
foreach queryDb (danRer4)
set tmpExt = `mktemp temp.XXXXXX`
set tmpMrnaCds = ${queryDb}.mrna-cds.${tmpExt}
set tmpMrna = ${queryDb}.mrna.${tmpExt}
set tmpCds = ${queryDb}.cds.${tmpExt}
echo $queryDb
hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
from all_mrna,gbCdnaInfo,cds \
where (all_mrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
${queryDb} > ${tmpMrnaCds}
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
stdout \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$queryDb.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
rm -f $tmpExt
end
# using knownGene for mm8 hg18
# using mgcGenes for xenTro2
# using ensGene for fr1
foreach queryDb (mm8 hg18 xenTro2 fr1)
if ($queryDb == "xenTro2") then
set geneTbl = mgcGenes
else if ($queryDb == "fr1") then
set geneTbl = ensGene
else
set geneTbl = knownGene
endif
hgsql -N -e "select * from $geneTbl" ${queryDb} | cut -f 1-10 \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$queryDb.tmp.gz
mv /scratch/tmp/$queryDb.tmp.gz genes/$queryDb.gp.gz
rm -f $tmpExt
end
#------------------------------------------------------------------------
# create frames
set clusterDir = /cluster/bluearc/danRer4/multiz7wayFrames
set multizDir = /cluster/data/danRer4/bed/multiz7way.2006-05-28
set mafDir = $multizDir/maf
set geneDir = $multizDir/frames/genes
set clusterMafDir = ${clusterDir}/maf
set clusterGeneDir = ${clusterDir}/genes
set clusterFramesDir = ${clusterDir}/mafFrames.kki
# copy mafs to cluster storage
mkdir $clusterDir
ssh -x kkstore04 "rsync -av $mafDir/*.maf.gz $clusterMafDir/"
# copy genes to cluster storage
ssh -x kkstore04 "rsync -av $geneDir/*.gp.gz $clusterGeneDir/"
# run cluster jobs
set tmpExt = `mktemp temp.XXXXXX`
set paraDir = $multizDir/frames/para.${tmpExt}
mkdir mafFrames $paraDir
rm -f $paraDir/jobList
mkdir ${clusterFramesDir}
foreach queryDb (`cat /cluster/data/danRer4/bed/multiz7way.2006-05-28/species.lst`)
mkdir ${clusterFramesDir}/${queryDb}
foreach c (`awk '{print $1;}' /cluster/data/danRer4/chrom.sizes`)
if (-e ${clusterGeneDir}/${queryDb}.gp.gz) then
echo /cluster/bin/scripts/mkMafFrames.pl ${queryDb} danRer4 \
${clusterGeneDir}/${queryDb}.gp.gz ${clusterMafDir}/$c.maf.gz \
${clusterFramesDir}/${queryDb}/$c.mafFrames \
>> $paraDir/jobList
endif
end
end
rm -f $tmpExt
ssh -x kki "cd ${paraDir} && para make jobList && para time"
# Completed: 140 of 140 jobs
# CPU time in finished jobs: 255s 4.25m 0.07h 0.00d 0.000 y
# IO & Wait Time: 360s 6.00m 0.10h 0.00d 0.000 y
# Average job time: 4s 0.07m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 8s 0.13m 0.00h 0.00d
# Submission to last job: 55s 0.92m 0.02h 0.00d
# combine results from cluster
foreach queryDb (`cat ../species.lst`)
echo $queryDb
ssh -x kolossus "cat ${clusterFramesDir}/${queryDb}/*.mafFrames | gzip -2c > ${multizDir}/frames/mafFrames/${queryDb}.mafFrames.gz"
end
#------------------------------------------------------------------------
# load the database
hgLoadMafFrames danRer4 multiz7wayFrames mafFrames/*.mafFrames.gz
#------------------------------------------------------------------------
# clean up
rm -rf ${clusterDir}
###
# rebuild frames to get bug fix, using 1-pass maf methodology
# (2006-06-09 markd)
ssh kkstore04
cd /cluster/data/danRer4/bed/multiz7way/frames
mv mafFrames/ mafFrames.old
nice tcsh # easy way to get process niced
(zcat ../maf/*.maf.gz | time genePredToMafFrames danRer4 stdin stdout danRer4 genes/danRer4.gp.gz fr1 genes/fr1.gp.gz hg18 genes/hg18.gp.gz mm8 genes/mm8.gp.gz xenTro2 genes/xenTro2.gp.gz | gzip >multiz7way.mafFrames.gz)>&log&
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way/frames
hgLoadMafFrames danRer4 multiz7wayFrames multiz7way.mafFrames.gz >&log&
# end of multiz7way frames and load
cd /cluster/data/danRer4/bed
ln -s multiz7way.2006-05-28 /cluster/data/danRer4/bed/multiz7way
# create and add the tree image for the description page
# Make .jpg for tree and install in htdocs/images/phylo/... don't forget
# to request a push of that file. The treeImage setting in trackDb.ra
# is phylo/danRer4_7way.jpg (relative to htdocs/images).
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28
cat << '_EOF_' > species7.nh
((zebrafish,(Fugu,Tetraodon)),(X. tropicalis,(opossum,(mouse,human))))
'_EOF_'
/cluster/bin/phast/draw_tree species7.nh > species7way.ps
# ask Bob to resize image for Browser track description page and convert
# to JPEG and rename as danRer4_7way.jpg
ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-28/danRer4_7way.jpg \
/usr/local/apache/htdocs/images/phylo/danRer4_7way.jpg
# change permissions for display if not already readable to all
chmod +r /usr/local/apache/htdocs/images/phylo/danRer4_7way.jpg
# check for all.joiner entry for 7-way - it is there already.
# add html and trackDb.ra entry for danRer4:
# track multiz7way
# shortLabel Conservation
# longLabel Vertebrate Multiz Alignment & Conservation
# group compGeno
# priority 104
# visibility pack
# color 0, 10, 100
# altColor 0,90,10
# type wigMaf 0.0 1.0
# maxHeightPixels 100:40:11
# wiggle phastCons7way
# pairwiseHeight 12
# spanList 1
# yLineOnOff Off
# autoScale Off
# windowingFunction mean
# summary multiz7waySummary
# frames multiz7wayFrames
# irows on
# speciesGroups vertebrate mammal
# sGroup_vertebrate fr1 tetNig1 xenTro2
# sGroup_mammal monDom4 mm8 hg18
# treeImage phylo/danRer4_7way.jpg
###########################################################################
# MAF DOWNLOADS FOR MULTIZ7WAY (DONE, 2006-05-29, hartera)
# GZIPPED UPSTREAM FILES AND ADDED TO DOWNLOADS AND RE-MADE md5sum.txt
# (DONE, 2006-06-02, hartera)
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28
mkdir mafDownloads
cd mafDownloads
# upstream mafs
cat > mafFrags.csh << 'EOF'
date
foreach i (1000 2000 5000)
echo "making upstream$i.maf"
nice featureBits danRer4 refGene:upstream:$i -fa=/dev/null -bed=up.bad
awk -F '\t' '{printf("%s\t%s\t%s\t%s\t%s\t%s\n", $1, $2, $3, substr($4, 0, 9), 0, $5)}' up.bad > up.bed
rm up.bad
nice mafFrags danRer4 multiz7way up.bed upstream$i.maf \
-orgs=../species.lst
rm up.bed
end
date
'EOF'
time csh mafFrags.csh >&! mafFrags.log & tail -f mafFrags.log
# 57.823u 105.238s 4:13.15 64.4% 0+0k 0+0io 2pf+0w
# add maf downloads for annotated mafs
ssh kkstore04
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/mafDownloads
cat > downloads.csh << 'EOF'
date
foreach f (../anno/maf/chr*.maf)
set c = $f:t:r
echo $c
nice gzip -c $f > $c.maf.gz
end
md5sum *.gz > md5sum.txt
date
'EOF'
time csh downloads.csh >&! downloads.log & tail -f downloads.log
# 446.734u 5.629s 7:38.09 98.7% 0+0k 0+0io 2pf+0w
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/danRer4/multiz7way
mkdir $dir
ln -s \
/cluster/data/danRer4/bed/multiz7way.2006-05-28/mafDownloads/{*.gz,md5sum.txt} \
$dir
cp /usr/local/apache/htdocs/goldenPath/danRer3/multiz5way/README.txt $dir
# edit README.txt
# gzip the upstream maf downloads and remake md5sum.txt
# (2006-06-02, hartera)
ssh kkstore04
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/mafDownloads
foreach f (upstream*.maf)
nice gzip -c $f > $f.gz
end
rm md5sum.txt
md5sum *.gz > md5sum.txt
ssh hgwdev
set dir = /usr/local/apache/htdocs/goldenPath/danRer4/multiz7way
rm $dir/md5sum.txt
ln -s \
/cluster/data/danRer4/bed/multiz7way.2006-05-28/mafDownloads/{upstream*.gz,md5sum.txt} $dir
###########################################################################
# PHYLO-HMM (PHASTCONS) CONSERVATION TRACK FOR 7-WAY ALIGNMENT
# (DONE, 2006-05-17 - 2006-05-24, hartera)
# REMAKE CONSERVATION TRACK USING MULTIZ 7-WAY INCLUDING DANRER4 RANDOM CHROMS
# FOR MM8 ALIGNMENTS (DONE, 2006-05-29, hartera)
ssh kkstore04
# Need unzipped maf files for this.
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/maf
foreach f (*.maf.gz)
echo $f
gunzip -c $f > $f:r
end
mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons
# create a starting-tree.mod based on chr14 (92 Mb)
# chr14 is the largest chrom apart from chrNA_random
/cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr14.maf \
--refseq ../../../14/chr14.fa --in-format MAF \
--windows 100000000,1000 --out-format SS \
--between-blocks 5000 --out-root s1
/cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
--tree "`cat ../tree-commas.nh`" \
--out-root starting-tree
# took less than a minute
rm s1.*ss
# Get genome-wide average GC content (for all species together,
# not just the reference genome). If you have a globally
# estimated tree model, as above, you can get this from the
# BACKGROUND line in the .mod file. E.g.,
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.305239 0.194225 0.194292 0.306244
# add up the C and G:
grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
# 0.389 is the GC content. This is used in the -gc argument below.
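# (e.g. from the BACKGROUND line above: 0.194225 + 0.194292 = 0.388517,
# which rounds to 0.389)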
# If you do *not* have a global tree model and you do not know your
# GC content, you can get it directly from the MAFs with a command
# like:
/cluster/bin/phast/$MACHTYPE/msa_view \
--aggregate danRer4,tetNig1,fr1,xenTro2,monDom4,mm8,hg18 -i MAF \
-S /cluster/data/danRer4/bed/multiz7way/maf/chr*.maf > maf_summary.txt
# This gives a GC content of 0.426 so use this as it is from mafs for
# the whole genome.
# break up the genome-wide MAFs into pieces on the san filesystem
ssh pk
set WINDOWS=/san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/ss
mkdir -p $WINDOWS
cd $WINDOWS
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/danRer4/bed/multiz7way.2006-05-28/maf
set WINDOWS=/san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/ss
cd $WINDOWS
set c = $1
echo $c
rm -fr $c
mkdir $c
set N = `echo $c | sed -e 's/chr//'`
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
-M /cluster/data/danRer4/$N/$c.fa \
-o SS -w 10000000,0 -I 1000 -B 5000 -r $c/$c
echo "Done" >> $c.done
'EOF'
# << emacs
chmod +x doSplit.csh
rm -f jobList
foreach c (`cat /cluster/data/danRer4/chrom.lst`)
echo "doSplit.csh chr${c} {check out line+ $WINDOWS/chr$c.done}" >> jobList
end
para create jobList
para push, check etc.
para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 831s 13.86m 0.23h 0.01d 0.000 y
# IO & Wait Time: 634s 10.56m 0.18h 0.01d 0.000 y
# Average job time: 52s 0.87m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 118s 1.97m 0.03h 0.00d
# Submission to last job: 118s 1.97m 0.03h 0.00d
# Create a random list of 50 of the 10 Mb regions (do not use chrNA and chrUn)
ls -1l chr*/chr*.ss | grep -v NA | grep -v Un | \
awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list
# Set up parasol directory to calculate trees on these 50 regions
ssh pk
set dir = /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons
mkdir -p $dir
cd $dir
# now set up cluster job to estimate model parameters. Parameters
# will be estimated separately for each alignment fragment then
# will be combined across fragments. When tuning target-coverage and
# expected-length, come back to this step and recalculate.
# Create little script that calls phastCons with right arguments
cat > makeTree.csh << 'EOF'
#!/bin/csh -fe
set C = $1:h
set treeRun = $2
set cov = $3
set len = $4
set dir = /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons
mkdir -p $dir/$treeRun/log/${C} $dir/$treeRun/tree/${C}
/cluster/bin/phast/x86_64/phastCons $dir/ss/$1 \
/cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons/starting-tree.mod \
--gc 0.426 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-length $len --target-coverage $cov \
--quiet --log $dir/$treeRun/log/$1 --estimate-trees $dir/$treeRun/tree/$1
'EOF'
# << emacs
chmod a+x makeTree.csh
# Make sure that the correct GC content is substituted in here. Notice
# the target coverage of 0.17. Here we are going to aim
# for 65% coverage of coding regions by conserved elements.
# Create gensub file
# need to add cov and len parameters
cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1) $(path2)
#ENDLOOP
'_EOF_'
# happy emacs
# Make cluster job and run it to try out a few parameters close
# to those used for danRer3 and danRer2 phastCons runs.
echo "treeRun1 0.17 12" > tree.lst
echo "treeRun2 0.32 18" >> tree.lst
echo "treeRun3 0.32 20" >> tree.lst
echo "treeRun4 0.35 18" >> tree.lst
gensub2 randomSs.list tree.lst template jobList
para create jobList
para try,check,push,check etc.
# para time
# Completed: 200 of 200 jobs
# CPU time in finished jobs: 68652s 1144.20m 19.07h 0.79d 0.002 y
# IO & Wait Time: 2521s 42.02m 0.70h 0.03d 0.000 y
# Average job time: 356s 5.93m 0.10h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 629s 10.48m 0.17h 0.01d
# Submission to last job: 2356s 39.27m 0.65h 0.03d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
set dir = /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons
foreach d ($dir/treeRun*)
cd $d
ls tree/chr*/*.cons.mod > cons.txt
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.txt' \
--output-average ave.cons.mod > cons_summary.txt
ls tree/chr*/*.noncons.mod > noncons.txt
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.txt' \
--output-average ave.noncons.mod > noncons_summary.txt
end
# measuring entropy
# usage: consEntropy <target coverage> <expected length> \
#            ave.cons.mod ave.noncons.mod [--NH 9.78]
# (it never finishes when run with the --NH argument, so leave that off)
# target entropy should be L_min*H=9.8 bits (between 9.5 and 10.5 is ok)
# the expected length that produces this entropy is the one
# to use for phastCons.
# for each treeRun, set the appropriate coverage and length
# (tree.lst format: treeRunN cov len); the perl substitution below expands
# each line into the corresponding consEntropy commands
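# for one parameter set the expanded commands look like this (treeRun2 shown
# as an example):
#   cd treeRun2
#   /cluster/bin/phast/x86_64/consEntropy 0.32 18 ave.cons.mod ave.noncons.mod
#   cd ..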
cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons
cp tree.lst entropy.csh
perl -pi.bak -e 's/^(treeRun[0-9]+)\s*([0-9\.]+)\s*([0-9]+)/echo \"Coverage = $2 Length = $3\"\ncd $1\n\/cluster\/bin\/phast\/x86_64\/consEntropy $2 $3 ave.cons.mod ave.noncons.mod\ncd \.\./' entropy.csh
chmod +x entropy.csh
entropy.csh >& entropy.out
# entropy.out
#Coverage = 0.17 Length = 12
#Transition parameters:gamma=0.170000,omega=12.000000, mu=0.083333, nu=0.017068
#Relative entropy: H=0.857449 bits/site
#Expected min. length: L_min=12.298748 sites
#Expected max. length: L_max=8.165741 sites
#Phylogenetic information threshold: PIT=L_min*H=10.545544 bits
#### !!! THESE PARAMETERS BELOW WERE THOSE THAT WERE FINALLY USED ####
# These are the same as for danRer2 and give the targeted L_min*H value.
# This is from treeRun2.
#Coverage = 0.32 Length = 18
#Transition parameters:gamma=0.320000,omega=18.000000, mu=0.055556, nu=0.026144
#Relative entropy: H=0.818130 bits/site
#Expected min. length: L_min=12.025818 sites
#Expected max. length: L_max=9.281106 sites
#Phylogenetic information threshold: PIT=L_min*H=9.838688 bits
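# (check: PIT = L_min*H = 12.025818 * 0.818130 = 9.8387 bits, as reported)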
###
#Coverage = 0.32 Length = 20
#Transition parameters:gamma=0.320000,omega=20.000000, mu=0.050000, nu=0.023529
#Relative entropy: H=0.795926 bits/site
#Expected min. length: L_min=12.724131 sites
#Expected max. length: L_max=9.927736 sites
#Phylogenetic information threshold: PIT=L_min*H=10.127467 bits
#Coverage = 0.35 Length = 18
#Transition parameters:gamma=0.350000,omega=18.000000, mu=0.055556, nu=0.029915
#Relative entropy: H=0.827604 bits/site
#Expected min. length: L_min=11.542637 sites
#Expected max. length: L_max=9.061627 sites
#Phylogenetic information threshold: PIT=L_min*H=9.552732 bits
# need to iterate and get the right coverage and parameters
# try running phastCons below with parameters used above and check the
# coverage of coding regions by the most conserved elements
# Create cluster dir to do main phastCons run
ssh pk
mkdir -p \
/san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun
cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun
cp -p ../treeRun2/ave.*.mod .
cp -p ../treeRun2/ave.*.mod \
/cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons
mkdir ppRaw bed
# Create script to run phastCons with right parameters
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
# Use the expected length and target coverage determined above and
# the corresponding average conserved and nonconserved models
cat > doPhast.csh << '_EOF_'
#!/bin/csh -fe
mkdir /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ave.*.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/x86_64/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
--expected-length 18 --target-coverage 0.32 --quiet \
--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/ave.*.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
# emacs happy
chmod a+x doPhast.csh
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
cat > template << '_EOF_'
#LOOP
doPhast.csh $(root1) $(file1)
#ENDLOOP
'_EOF_'
# happy emacs
# Create parasol batch and run it
ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list
gensub2 in.list single template jobList
para create jobList
para try/check/push/etc.
para time
# Completed: 191 of 191 jobs
# CPU time in finished jobs: 4660s 77.67m 1.29h 0.05d 0.000 y
# IO & Wait Time: 2927s 48.78m 0.81h 0.03d 0.000 y
# Average job time: 40s 0.66m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 83s 1.38m 0.02h 0.00d
# Submission to last job: 2246s 37.43m 0.62h 0.03d
# combine predictions and transform scores to be in 0-1000 interval
ssh kkstore04
cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun
# The sed's and the sort get the file names in chrom,start order
# (Hiram tricks -- split into columns on [.-/] with
# identifying x,y,z, to allow column sorting and
# restoring the filename. Warning: the sort column
# will depend on how deep you are in the dir
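# for example, a path like ./bed/chr1/chr1.1-10000000.bed (coordinates for
# illustration only) becomes " y  x bed x chr1 x chr1 y 1 z 10000000 y bed",
# so field 7 is the chrom name and field 9 is the numeric start coordinate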
find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
| /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
# ~ 1 minute
cp -p mostConserved.bed \
/cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons
# Figure out how much is actually covered by the mostConserved data like so:
cd /cluster/data/danRer4
faSize */chr*.fa
# 1774660131 bases (175779328 N's 1598880803 real 816338509 upper
# 782542294 lower) in 28 sequences in 28 files
# Total size: mean 63380719.0 sd 33877121.9 min 16596 (chrM)
# max 208014280 (chrNA_random) median 59765243
# The non-N size is 1598880803 bases
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons
awk '{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/1598880803\n",100.0*sum/1598880803,sum}' \
mostConserved.bed
-target-coverage 0.32: % 3.18 = 100.0*50871950/1598880803 length=18
# want to aim for 65% coverage of coding regions
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way/phastCons
# get an or of refGene and mgcGenes CDS regions
featureBits danRer4 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed
# 11770580 bases of 1626093931 (0.724%) in intersection
featureBits danRer4 refSeqOrMgcCds.bed mostConserved.bed -enrichment
# refSeqOrMgcCds.bed 0.724%, mostConserved.bed 3.128%, both 0.463%,
# cover 63.94%, enrich 20.44x
# for danRer3:
featureBits danRer3 refSeqOrMgcCdsDanRer3.bed \
/cluster/data/danRer3/bed/multiz5way/mostConserved.bed -enrichment
# refSeqOrMgcCdsDanRer3.bed 0.714%,
# /cluster/data/danRer3/bed/multiz5way/mostConserved.bed 2.998%,
# both 0.474%, cover 66.40%, enrich 22.14x
# so use this result for -target-coverage=0.32 -expected-lengths=18
# with L_min*H entropy (PIT) value of 9.84 (aiming for around 9.8) and
# 63.9% coverage of coding regions with most conserved elements
# (aiming for about 65%)
# Load most conserved track into database
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons
hgsql -e 'drop table phastConsElements;' danRer4
hgLoadBed danRer4 phastConsElements mostConserved.bed
# Loaded 676058 elements of size 5
featureBits danRer4 mgcGenes:cds phastConsElements -enrichment
# mgcGenes:cds 0.560%, phastConsElements 3.128%, both 0.366%,
# cover 65.36%, enrich 20.89x
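# (here "cover" = both/first = 0.366/0.560 = 65.4%, the fraction of mgcGenes
# CDS bases inside conserved elements, and "enrich" = 65.36/3.128 = 20.9x the
# genome-wide density of conserved elements)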
# Create merged posterior probability file and wiggle track data files
# the sed business gets the names sorted by chromName, chromStart
# so that everything goes in numerical order into wigEncode
ssh kkstore04
cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun
find ./ppRaw -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| wigEncode stdin phastCons7way.wig phastCons7way.wib
# takes a few minutes
ls -l phastCons*
# -rw-rw-r-- 1 hartera protein 255524779 May 29 19:49 phastCons7way.wib
# -rw-rw-r-- 1 hartera protein 61525690 May 29 19:49 phastCons7way.wig
cp -p phastCons7way.wi? /cluster/data/danRer4/bed/multiz7way/phastCons
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons
mkdir -p /gbdb/danRer4/wib
rm /gbdb/danRer4/wib/phastCons7way.wib
ln -s `pwd`/phastCons7way.wib /gbdb/danRer4/wib/phastCons7way.wib
# use this if need to reload table
hgsql -e 'drop table phastCons7way;' danRer4
# load table
hgLoadWiggle danRer4 phastCons7way phastCons7way.wig
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons
bash
time hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=danRer4 phastCons7way > histogram.data 2>&1
# real 0m30.234s
# user 0m23.721s
# sys 0m3.234s
# create plot of histogram:
cat << '_EOF_' > histo.gp
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Zebrafish danRer4 Histogram phastCons7 track"
set xlabel " phastCons7 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# happy emacs
gnuplot histo.gp > histo.png
display histo.png &
# add line: wiggle phastCons7way to trackDb.ra for multiz7way to display the
# wiggle for the conservation track.
# check all.joiner for entries for phastCons7way and phastConsElements7way -ok
# copy over html for multiz and edit.
###########################################################################
# PHASTCONS SCORES DOWNLOADABLES FOR 7WAY (DONE, 2006-05-30, hartera)
# prepare compressed copy of ascii data values for downloads
ssh kolossus
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28
mkdir phastConsDownloads
cd phastConsDownloads
cat > downloads.csh << 'EOF'
date
cd /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons/consRun/ppRaw
foreach chr (`awk '{print $1}' /cluster/data/danRer4/chrom.sizes`)
echo $chr
cat `ls -1 $chr/$chr.*.pp | sort -t\. -k2,2n` \
| nice gzip -c \
> /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastConsDownloads/$chr.gz
end
date
'EOF'
# << emacs
csh -efx downloads.csh >&! downloads.log & tail -f downloads.log
# Took ~5 minutes.
md5sum *.gz > md5sum.txt
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastConsDownloads
set dir = /usr/local/apache/htdocs/goldenPath/danRer4/phastCons7wayScores
mkdir $dir
ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastConsDownloads/{*.gz,md5sum.txt} $dir
# copy over and edit README.txt
cd $dir
cp \
/usr/local/apache/htdocs/goldenPath/danRer3/phastCons5wayScores/README.txt .
# Clean up after phastCons run.
ssh kkstore04
rm /cluster/data/danRer4/bed/multiz7way.2006-05-28/phastCons/*.tab
rm -r /san/sanvol1/scratch/danRer4/multiz7way.2006-05-28/phastCons
###########################################################################
# CREATED RECIPROCAL BEST NETS AND MAF NETS FOR ALL SPECIES WITH PAIRWISE
# ALIGNMENTS USED FOR MULTIZ MULTIPLE ALIGNMENT
# (DONE, 2006-05-12 - 2006-05-15 , hartera)
# for tetNig1, fr1, xenTro2, monDom4, mm8 and hg18.
ssh kolossus
mkdir /cluster/data/danRer4/bed/rBestRunForMultiz/
cd /cluster/data/danRer4/bed/rBestRunForMultiz
# need to re-run chainNet and keep both the first output (target-referenced,
# target-centric nets) and the second output that we usually send to
# /dev/null (query-referenced, target-centric nets).
cat > rBestNet.csh << 'EOF'
#!/bin/csh -ef
foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18)
echo "Creating Reciprocal Best Net for $s..."
set binDir=/cluster/home/hartera/bin/i386
set dir=/cluster/data/danRer4/bed/blastz.$s/axtChain
cd $dir
# Run chainNet again, this time keeping the second output:
chainPreNet danRer4.$s.all.chain.gz /cluster/data/danRer4/chrom.sizes \
/cluster/data/$s/chrom.sizes stdout \
| $binDir/chainNet stdin /cluster/data/danRer4/chrom.sizes \
/cluster/data/$s/chrom.sizes /dev/null stdout | \
netSyntenic stdin $dir/$s.danRer4_ref.net
# get the other species chains from the other species-referenced
# (but danRer4-centric) net:
chainSwap danRer4.$s.all.chain.gz $s.danRer4.all.chain
netChainSubset -verbose=0 $s.danRer4_ref.net \
$s.danRer4.all.chain stdout \
| chainSort stdin $s.danRer4_ref.subset.chain
# Net those (sorted) danRer4 chains, and keep both outputs, to get
# reciprocal best nets referenced to both species:
chainPreNet $s.danRer4_ref.subset.chain \
/cluster/data/$s/chrom.sizes /cluster/data/danRer4/chrom.sizes stdout \
| $binDir/chainNet stdin /cluster/data/$s/chrom.sizes \
/cluster/data/danRer4/chrom.sizes tmp1 tmp2
netSyntenic tmp1 $s.danRer4.rbest.net
netSyntenic tmp2 danRer4.$s.rbest.net
rm tmp1 tmp2
nice gzip *.rbest.net
end
'EOF'
chmod +x rBestNet.csh
nice rBestNet.csh >& rBestNet.log &
# Took about 11 minutes to complete.
# Then make axtNet and mafNet
cat > makeMafRBestNet.csh << 'EOF'
#!/bin/csh -ef
foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18)
echo "Creating mafs for $s ..."
set dir=/cluster/data/danRer4/bed/blastz.$s/axtChain
set seqDir=/san/sanvol1/scratch
cd $dir
# extract reciprocal best chains from the zebrafish-other species rbest.net
echo "Get reciprocal best chains for zebrafish-$s"
netChainSubset danRer4.$s.rbest.net.gz danRer4.$s.all.chain.gz \
danRer4.$s.rbest.chain
# need to make sure this is sorted and assign unique chain IDs
chainSort danRer4.$s.rbest.chain stdout | chainMergeSort stdin \
> danRer4.$s.rbest.newids.chain
# need to re-net with new ids
chainNet danRer4.$s.rbest.newids.chain /cluster/data/danRer4/chrom.sizes \
/cluster/data/$s/chrom.sizes danRer4.$s.rbest.newids.net /dev/null
# split reciprocal best chains and net
chainSplit rBestChain danRer4.$s.rbest.newids.chain
netSplit danRer4.$s.rbest.newids.net rBestNet
mkdir ../axtRBestNet
# make axtNet for reciprocal best
echo "Making axtRBestNet for $s ..."
foreach f (rBestNet/*.net)
netToAxt $f rBestChain/$f:t:r.chain \
$seqDir/danRer4/danRer4.2bit $seqDir/$s/$s.2bit stdout \
| axtSort stdin stdout \
| gzip -c > ../axtRBestNet/$f:t:r.danRer4.$s.net.axt.gz
end
# make mafNet for reciprocal best
cd ..
mkdir mafRBestNet
echo "Making mafRBestNet for $s ..."
foreach f (axtRBestNet/*.danRer4.$s.net.axt.gz)
axtToMaf -tPrefix=danRer4. -qPrefix=$s. $f \
/cluster/data/danRer4/chrom.sizes /cluster/data/$s/chrom.sizes stdout \
| gzip -c > mafRBestNet/$f:t:r:r:r:r:r.maf.gz
end
end
'EOF'
chmod +x makeMafRBestNet.csh
nice makeMafRBestNet.csh >& mafRBestNet.log &
# Took about an hour.
# NOTE: Must use chainSort and chainMergeSort to reassign unique IDs
# to the chains extracted from the rbest.net and then re-net the chains
# with the new IDs, otherwise netToAxt crashes due to duplicate chain IDs.
# Now do the multiple alignment using reciprocal best mafNets as input
# for multiz.
# Load up nets and chains from rBestChain and rBestNet
ssh hgwdev
cd /cluster/data/danRer4/bed/rBestRunForMultiz
# Nets from Reciprocal Best have no type field or repeat/gap stats so need
# to add these.
cat > loadRBest.csh << 'EOF'
#!/bin/csh -ef
foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18)
set dir=/cluster/data/danRer4/bed/blastz.$s/axtChain
if ($s == "tetNig1") then
set g = TetNig1
else if ($s == "fr1") then
set g = Fr1
else if ($s == "xenTro2") then
set g = XenTro2
else if ($s == "monDom4") then
set g = MonDom4
else if ($s == "mm8") then
set g = Mm8
else if ($s == "hg18") then
set g = Hg18
endif
# load chains
echo "Loading chains for $s ..."
cd $dir/rBestChain
foreach f (*.chain)
set c = $f:r
hgLoadChain danRer4 ${c}_chainRBest${g} $f
end
# load nets
cd $dir
echo "Loading nets for $s ..."
# add type field
netSyntenic danRer4.${s}.rbest.newids.net noClassRBest.net
# add gap/repeat stats to net file using database tables
netClass -verbose=0 -noAr noClassRBest.net danRer4 $s \
danRer4.${s}.rbest.withClass.net
netFilter -minGap=10 danRer4.${s}.rbest.withClass.net \
| hgLoadNet -verbose=0 danRer4 netRBest${g} stdin
end
'EOF'
# << emacs
chmod +x loadRBest.csh
nohup nice loadRBest.csh >& loadRBest.log &
###########################################################################
# MULTIZ7WAY ALIGNMENTS FOR CONSERVATION TRACK - USING RECIPROCAL BEST NETS
# (DONE, 2006-05-18 - 2006-05-24, hartera)
# for tetNig1, fr1, xenTro2, monDom4, mm8 and hg18.
ssh kkstore04
mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-18
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18
# copy MAFs to a cluster-friendly server
# use bluearc as the san is down
mkdir /cluster/bluearc/danRer4/mafRBestNet
foreach s (tetNig1 fr1 xenTro2 monDom4 mm8 hg18)
echo $s
rsync -av /cluster/data/danRer4/bed/blastz.$s/mafRBestNet/* \
/cluster/bluearc/danRer4/mafRBestNet/$s/
end
# prune the hg18 17way tree to just these 7 and update db names:
/cluster/bin/phast/tree_doctor \
--prune-all-but=mouse_mm8,human_hg18,monodelphis_monDom4,xenopus_xenTro1,tetraodon_tetNig1,fugu_fr1,zebrafish_danRer3 \
--rename="xenopus_xenTro1 -> xenopus_xenTro2 ; zebrafish_danRer3 -> zebrafish_danRer4" \
/cluster/data/hg18/bed/multiz17way/17way.nh > 7way.nh
# carefully edit so that danRer4 is first. copy first to new file
cp 7way.nh 7way_zfishFirst.nh
# DO THIS LATER AND CREATE FROM TREE WITHOUT DISTANCES
/cluster/bin/phast/draw_tree 7way_zfishFirst.nh > 7way.ps
# also made the ps file for the 7way.nh and compared to make sure
# that the tree with zebrafish at the top looks correct.
/cluster/bin/phast/all_dists 7way_zfishFirst.nh > 7way.distances
grep danRer4 7way.distances | sort -k3,3n | \
awk '{printf ("%.4f\t%s\n", $3, $2)}' > distances.txt
cat distances.txt
# 1.4749 tetraodon_tetNig1
# 1.5154 fugu_fr1
# 1.7480 human_hg18
# 1.7782 monodelphis_monDom4
# 1.8771 xenopus_xenTro2
# 2.1058 mouse_mm8
# the order in the browser display will be by tree topology,
# not by distance, so they will be:
# danRer4
# 1.5154 fugu_fr1
# 1.4749 tetraodon_tetNig1
# 1.8771 xenopus_xenTro2
# 1.7782 monodelphis_monDom4
# 2.1058 mouse_mm8
# 1.7480 human_hg18
# create species list and stripped down tree for autoMZ
sed -e 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//' \
7way_zfishFirst.nh > tree-commas.nh
sed -e 's/ //g; s/,/ /g' tree-commas.nh > tree.nh
sed -e 's/[()]//g; s/,/ /g' tree.nh > species.lst
cp tree-commas.nh 7way.nh
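# as a rough check, tree-commas.nh should come out looking something like
#   ((danRer4,(fr1,tetNig1)),(xenTro2,(monDom4,(mm8,hg18))))
# (matching the topology described above), tree.nh is the same with commas
# replaced by spaces, and species.lst should read:
#   danRer4 fr1 tetNig1 xenTro2 monDom4 mm8 hg18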
ssh pk
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18
mkdir maf run
cd run
# stash binaries
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
cat > autoMultiz.csh << 'EOF'
#!/bin/csh -ef
set db = danRer4
set c = $1
set maf = $2
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /cluster/bluearc/$db/mafRBestNet
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'EOF'
# << emacs
chmod +x autoMultiz.csh
cat << 'EOF' > spec
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/danRer4/bed/multiz7way.2006-05-18/maf/$(root1).maf}
#ENDLOOP
'EOF'
# << emacs
awk '{print $1}' /cluster/data/danRer4/chrom.sizes > chrom.lst
gensub2 chrom.lst single spec jobList
para create jobList
para try, check, push, check etc. ...
# Took less than 10 minutes to run
# Make .jpg for tree and install in htdocs/images/phylo/... don't forget
# to request a push of that file. The treeImage setting in trackDb.ra
# is phylo/danRer4_7way.jpg (relative to htdocs/images).
# ssh hgwdev
# DO LATER
# cd /cluster/data/danRer4/bed/multiz7way.2006-05-04
# pstopnm -stdout 7way.ps | pnmtojpeg > danRer4_7way.jpg
# ask Bob to resize image for Browser track description page.
# Build maf annotation and load database
ssh kolossus
mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno
mkdir maf run
cd run
rm -f sizes nBeds
foreach db (`cat /cluster/data/danRer4/bed/multiz7way.2006-05-18/species.lst`)
ln -s /cluster/data/$db/chrom.sizes $db.len
if (! -e /cluster/data/$db/$db.N.bed) then
twoBitInfo -nBed /cluster/data/$db/$db.{2bit,N.bed}
endif
ln -s /cluster/data/$db/$db.N.bed $db.bed
echo $db.bed >> nBeds
echo $db.len >> sizes
end
echo date > jobs.csh
# do smaller jobs first:
foreach f (`ls -1rS ../../maf/*.maf`)
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $f \
/cluster/data/danRer4/danRer4.2bit ../maf/`basename $f` \
>> jobs.csh
echo "echo $f" >> jobs.csh
end
echo date >> jobs.csh
csh -efx jobs.csh >&! jobs.log &
tail -f jobs.log
# Load anno/maf
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno/maf
mkdir -p /gbdb/danRer4/multiz7wayRBest/anno/maf
ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno/maf/*.maf \
/gbdb/danRer4/multiz7wayRBest/anno/maf
# Reload as the first load was not working correctly.
hgsql -e 'drop table multiz7wayRBest;' danRer4
hgsql -e 'delete from extFile where path like "%multiz7wayRBest%";' \
danRer4
cat > loadMaf.csh << 'EOF'
date
nice hgLoadMaf -pathPrefix=/gbdb/danRer4/multiz7wayRBest/anno/maf danRer4 multiz7wayRBest
date
'EOF'
# << emacs
csh -efx loadMaf.csh >&! loadMaf.log & tail -f loadMaf.log
# Do the computation-intensive part of hgLoadMafSummary on a workhorse
# machine and then load on hgwdev:
ssh kkr7u00
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno/maf
cat *.maf \
| nice hgLoadMafSummary danRer4 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 -test multiz7wayRBestSummary stdin
# Created 526386 summary blocks from 1972659 components and 1105457 mafs
# from stdin
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/anno/maf
sed -e 's/mafSummary/multiz7wayRBestSummary/' \
~/kent/src/hg/lib/mafSummary.sql \
> /tmp/multiz7wayRBestSummary.sql
time nice hgLoadSqlTab danRer4 multiz7wayRBestSummary \
/tmp/multiz7wayRBestSummary.sql multiz7wayRBestSummary.tab
# 0.000u 0.000s 0:07.56 0.0% 0+0k 0+0io 4pf+0w
rm *.tab /tmp/multiz7wayRBestSummary.sql
# ln -s multiz7way.2006-05-18 /cluster/data/danRer4/bed/multiz7way
# ln -s /cluster/data/danRer4/bed/multiz7way.2006-05-18/danRer4_7way.jpg \
# /usr/local/apache/htdocs/images/phylo/danRer4_7way.jpg
# change permissions for display if not already readable to all
# chmod +r /usr/local/apache/htdocs/images/phylo/danRer4_7way.jpg
# check for all.joiner entry for 7-way - it is there already.
# add trackDb.ra entry for danRer4:
###########################################################################
# PHYLO-HMM (PHASTCONS) CONSERVATION TRACK FOR 7-WAY ALIGNMENT USING MAFS
# FROM RECIPROCAL BEST NET (DONE, 2006-05-19 - 2006-05-24, hartera)
ssh kkstore04
mkdir /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
# create a starting-tree.mod based on chr14 (92 Mb)
# chr14 is the largest chrom apart from chrNA_random
/cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr14.maf \
--refseq ../../../14/chr14.fa --in-format MAF \
--windows 100000000,1000 --out-format SS \
--between-blocks 5000 --out-root s1
/cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
--tree "`cat ../tree-commas.nh`" \
--out-root starting-tree
# took less than a minute
rm s1.*ss
# Get genome-wide average GC content (for all species together,
# not just the reference genome). If you have a globally
# estimated tree model, as above, you can get this from the
# BACKGROUND line in the .mod file. E.g.,
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.309665 0.189697 0.189720 0.310918
# add up the C and G:
grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
# 0.379 is the GC content. This is used in the -gc argument below.
# If you do *not* have a global tree model and you do not know your
# GC content, you can get it directly from the MAFs with a command
# like:
/cluster/bin/phast/$MACHTYPE/msa_view \
--aggregate danRer4,tetNig1,fr1,xenTro2,monDom4,mm8,hg18 -i MAF \
-S /cluster/data/danRer4/bed/multiz7way/maf/chr*.maf > maf_summary.txt
# This gives a GC content of 0.426 so use this as it is from mafs for
# the whole genome.
# break up the genome-wide MAFs into pieces on the san filesystem
ssh pk
# should use a directory on the san but it is down and para create is
# not working on kk.
set WINDOWS=/cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/ss
mkdir -p $WINDOWS
cd $WINDOWS
cat << 'EOF' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/danRer4/bed/multiz7way.2006-05-18/maf
set WINDOWS=/cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/ss
cd $WINDOWS
set c = $1
echo $c
rm -fr $c
mkdir $c
set N = `echo $c | sed -e 's/chr//'`
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
-M /cluster/data/danRer4/$N/$c.fa \
-o SS -w 10000000,0 -I 1000 -B 5000 -r $c/$c
echo "Done" >> $c.done
'EOF'
# << emacs
chmod +x doSplit.csh
rm -f jobList
foreach c (`cat /cluster/data/danRer4/chrom.lst`)
echo "doSplit.csh chr${c} {check out line+ $WINDOWS/chr$c.done}" >> jobList
end
para create jobList
para push, check etc.
para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 847s 14.12m 0.24h 0.01d 0.000 y
# IO & Wait Time: 9741s 162.35m 2.71h 0.11d 0.000 y
# Average job time: 378s 6.30m 0.11h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 539s 8.98m 0.15h 0.01d
# Submission to last job: 581s 9.68m 0.16h 0.01d
# Create a random list of 50 of the 10 Mb regions (do not use chrNA and chrUn)
ls -1l chr*/chr*.ss | grep -v NA | grep -v Un | \
awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list
# Set up parasol directory to calculate trees on these 50 regions
ssh pk
set dir = /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons
mkdir -p $dir
cd $dir
# now set up cluster job to estimate model parameters. Parameters
# will be estimated separately for each alignment fragment then
# will be combined across fragments. When tuning target-coverage and
# expected-length, come back to this step and recalculate.
# Create little script that calls phastCons with right arguments
cat > makeTree.csh << 'EOF'
#!/bin/csh -fe
set C = $1:h
set treeRun = $2
set cov = $3
set len = $4
set dir = /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons
mkdir -p $dir/$treeRun/log/${C} $dir/$treeRun/tree/${C}
/cluster/bin/phast/x86_64/phastCons $dir/ss/$1 \
/cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons/starting-tree.mod \
--gc 0.426 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-length $len --target-coverage $cov \
--quiet --log $dir/$treeRun/log/$1 --estimate-trees $dir/$treeRun/tree/$1
'EOF'
# << emacs
chmod a+x makeTree.csh
# Make sure that the correct GC content is substituted in here. Notice
# the target coverage of 0.17. Here we are going to aim
# for 65% coverage of coding regions by conserved elements.
# Create gensub file
# need to add cov and len parameters
cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1) $(path2)
#ENDLOOP
'_EOF_'
# happy emacs
# Make cluster job and run it
echo "treeRun1 0.17 12" > tree.lst
echo "treeRun2 0.32 18" >> tree.lst
echo "treeRun3 0.32 20" >> tree.lst
echo "treeRun4 0.35 18" >> tree.lst
gensub2 randomSs.list tree.lst template jobList
para create jobList
para try,check,push,check etc.
# para time
# Completed: 200 of 200 jobs
# CPU time in finished jobs: 45500s 758.33m 12.64h 0.53d 0.001 y
# IO & Wait Time: 31478s 524.64m 8.74h 0.36d 0.001 y
# Average job time: 385s 6.41m 0.11h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 622s 10.37m 0.17h 0.01d
# Submission to last job: 821s 13.68m 0.23h 0.01d
# try again, using mkdir test2. If we aim for about 5% coverage and, for
# chr1 on hg18, netDanRer4 covers about 31% of bases, then 0.05/0.31 is
# roughly 0.16, so try a target coverage of 0.156.
# want a length of about 20 bp to bias the model towards detecting
# shorter conserved regions such as TFBSs.
cd test2
echo "treeRun5 0.156 20" > tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
cd test3
echo "treeRun6 0.156 15" > tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
cd test4
# increase coverage and compensate a bit by lowering the expected length
echo "treeRun7 0.25 8" > tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
cd test5
echo "treeRun8 0.35 12" > tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
cd test6
echo "treeRun9 0.5 20" > tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
cd test7
echo "treeRun10 0.5 24" > tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
cd test8
echo "treeRun11 0.45 22" > tree.lst
echo "treeRun12 0.5 26" >> tree.lst
echo "treeRun13 0.5 28" >> tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
cd test9
echo "treeRun14 0.45 24" > tree.lst
echo "treeRun15 0.45 20" >> tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
cd test10
echo "treeRun16 0.40 24" > tree.lst
echo "treeRun17 0.40 20" >> tree.lst
echo "treeRun18 0.42 20" >> tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
cd test11
echo "treeRun19 0.38 24" > tree.lst
echo "treeRun20 0.38 22" >> tree.lst
echo "treeRun21 0.38 20" >> tree.lst
gensub2 ../randomSs.list tree.lst template jobList
para create jobList
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
set dir = /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons
foreach d ($dir/treeRun*)
cd $d
ls tree/chr*/*.cons.mod > cons.txt
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.txt' \
--output-average ave.cons.mod > cons_summary.txt
ls tree/chr*/*.noncons.mod > noncons.txt
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.txt' \
--output-average ave.noncons.mod > noncons_summary.txt
end
# measuring entropy
# usage: consEntropy <target coverage> <expected length> \
#            ave.cons.mod ave.noncons.mod [--NH 9.78]
# (it never finishes when run with the --NH argument, so leave that off)
# target entropy should be L_min*H=9.8 bits (between 9.5 and 10.5 is ok)
# the expected length that produces this entropy is the one
# to use for phastCons.
# for each treeRun, set the appropriate coverage and length
# (tree.lst format: treeRunN cov len); the perl substitution below expands
# each line into the corresponding consEntropy commands
cd /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons
cp tree.lst entropy.csh
perl -pi.bak -e 's/^(treeRun[0-9]+)\s*([0-9\.]+)\s*([0-9]+)/echo \"Coverage = $2 Length = $3\"\ncd $1\n\/cluster\/bin\/phast\/x86_64\/consEntropy $2 $3 ave.cons.mod ave.noncons.mod\ncd \.\./' entropy.csh
chmod +x entropy.csh
entropy.csh >& entropy.out
# entropy.out
#Coverage = 0.17 Length = 12
#Transition parameters:gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
#Relative entropy: H=0.782279 bits/site
#Expected min. length: L_min=13.655129 sites
#Expected max. length: L_max=8.801144 sites
#Phylogenetic information threshold: PIT=L_min*H=10.682123 bits
#Coverage = 0.32 Length = 18
#Transition parameters:gamma=0.320000, omega=18.000000, mu=0.055556, nu=0.026144
#Relative entropy: H=0.757117 bits/site
#Expected min. length: L_min=13.055080 sites
#Expected max. length: L_max=9.912578 sites
#Phylogenetic information threshold: PIT=L_min*H=9.884225 bits
#Coverage = 0.32 Length = 20
#Transition parameters:gamma=0.320000, omega=20.000000, mu=0.050000, nu=0.023529
#Relative entropy: H=0.736191 bits/site
#Expected min. length: L_min=13.815340 sites
#Expected max. length: L_max=10.615242 sites
#Phylogenetic information threshold: PIT=L_min*H=10.170732 bits
#Coverage = 0.35 Length = 18
#Transition parameters:gamma=0.350000, omega=18.000000, mu=0.055556, nu=0.029915
#Relative entropy: H=0.768872 bits/site
#Expected min. length: L_min=12.471015 sites
#Expected max. length: L_max=9.642561 sites
#Phylogenetic information threshold: PIT=L_min*H=9.588610 bits
#Coverage = 0.156 Length = 20
#Transition parameters:gamma=0.156000, omega=20.000000, mu=0.050000, nu=0.009242
#Relative entropy: H=0.676147 bits/site
#Expected min. length: L_min=17.857722 sites
#Expected max. length: L_max=12.694666 sites
#Phylogenetic information threshold: PIT=L_min*H=12.074436 bits
#Coverage = 0.156 Length = 15
#Transition parameters:gamma=0.156000, omega=15.000000, mu=0.066667, nu=0.012322
#Relative entropy: H=0.726430 bits/site
#Expected min. length: L_min=15.713919 sites
#Coverage = 0.25 Length = 8
#Transition parameters: gamma=0.250000, omega=8.000000, mu=0.125000, nu=0.041667
#Relative entropy: H=0.950194 bits/site
#Expected min. length: L_min=8.951612 sites
#Expected max. length: L_max=5.560228 sites
#Phylogenetic information threshold: PIT=L_min*H=8.505767 bits
#Coverage = 0.5 Length = 20
#Transition parameters:gamma=0.500000, omega=20.000000, mu=0.050000, nu=0.050000
#Relative entropy: H=0.817081 bits/site
#Expected min. length: L_min=10.397809 sites
#Expected max. length: L_max=9.006386 sites
#Phylogenetic information threshold: PIT=L_min*H=8.495855 bits
# Coverage = 0.5 Length = 24
#Transition parameters:gamma=0.500000, omega=24.000000, mu=0.041667, nu=0.041667
#Relative entropy: H=0.772807 bits/site
#Expected min. length: L_min=11.706841 sites
#Expected max. length: L_max=10.170845 sites
#Phylogenetic information threshold: PIT=L_min*H=9.047124 bits
# Coverage = 0.5 Length = 26
#Transition parameters:gamma=0.500000,omega=26.000000, mu=0.038462, nu=0.038462
#Relative entropy: H=0.755159 bits/site
#Expected min. length: L_min=12.299010 sites
#Expected max. length: L_max=10.697444 sites
#Phylogenetic information threshold: PIT=L_min*H=9.287712 bits
#Coverage = 0.5 Length = 28
#Transition parameters:gamma=0.500000,omega=28.000000, mu=0.035714, nu=0.035714
#Relative entropy: H=0.739661 bits/site
#Expected min. length: L_min=12.856932 sites
#Expected max. length: L_max=11.193931 sites
#Phylogenetic information threshold: PIT=L_min*H=9.509775 bits
######## USED THESE PARAMETERS (coverage = 0.45, length = 24) ########
#Coverage = 0.45 Length = 24
#Transition parameters:gamma=0.450000, omega=24.000000, mu=0.041667, nu=0.034091
#Relative entropy: H=0.749572 bits/site
#Expected min. length: L_min=12.663020 sites
#Expected max. length: L_max=10.634682 sites
#Phylogenetic information threshold: PIT=L_min*H=9.491841 bits
#Coverage = 0.40 Length = 24
#Transition parameters:gamma=0.400000, omega=24.000000, mu=0.041667, nu=0.027778
#Relative entropy: H=0.730161 bits/site
#Expected min. length: L_min=13.607002 sites
#Expected max. length: L_max=11.092981 sites
#Phylogenetic information threshold: PIT=L_min*H=9.935307 bits
#Coverage = 0.38 Length = 20
#Transition parameters:gamma=0.380000, omega=20.000000, mu=0.050000, nu=0.030645
#Relative entropy: H=0.758676 bits/site
#Expected min. length: L_min=12.652818 sites
#Expected max. length: L_max=10.063048 sites
#Phylogenetic information threshold: PIT=L_min*H=9.599385 bits
#Coverage = 0.38 Length = 24
#Transition parameters:gamma=0.380000, omega=24.000000, mu=0.041667, nu=0.025538
#Relative entropy: H=0.723105 bits/site
#Expected min. length: L_min=13.987286 sites
#Expected max. length: L_max=11.279443 sites
#Phylogenetic information threshold: PIT=L_min*H=10.114270 bits
# Create cluster dir to do main phastCons run
ssh pk
mkdir -p \
/cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/consRun
cd /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/consRun
cp -p ../treeRun1/ave.*.mod .
cp -p ../treeRun1/ave.*.mod \
/cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
mkdir ppRaw bed
# Create script to run phastCons with right parameters
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
# Use the expected length and target coverage determined above and
# the corresponding average conserved and nonconserved models
cat > doPhast.csh << '_EOF_'
#!/bin/csh -fe
mkdir /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ave.*.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/x86_64/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
--expected-length 18 --target-coverage 0.32 --quiet \
--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/ave.*.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
# emacs happy
chmod a+x doPhast.csh
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
cat > template << '_EOF_'
#LOOP
doPhast.csh $(root1) $(file1)
#ENDLOOP
'_EOF_'
# happy emacs
# Create parasol batch and run it
ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list
gensub2 in.list single template jobList
para create jobList
para try/check/push/etc.
para time
# Completed: 191 of 191 jobs
# CPU time in finished jobs: 4421s 73.69m 1.23h 0.05d 0.000 y
# IO & Wait Time: 121036s 2017.26m 33.62h 1.40d 0.004 y
# Average job time: 657s 10.95m 0.18h 0.01d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 726s 12.10m 0.20h 0.01d
# Submission to last job: 874s 14.57m 0.24h 0.01d
# combine predictions and transform scores to be in 0-1000 interval
ssh kkstore04
cd /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/consRun
# The sed's and the sort get the file names in chrom,start order
# (Hiram tricks -- split into columns on [.-/] with
# identifying x,y,z, to allow column sorting and
# restoring the filename. Warning: the sort column
# will depend on how deep you are in the dir
find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
| /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
cp -p mostConserved.bed \
/cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
# Figure out how much is actually covered by the mostConserved data like so:
cd /cluster/data/danRer4
faSize */chr*.fa
# 1774660131 bases (175779328 N's 1598880803 real 816338509 upper
# 782542294 lower) in 28 sequences in 28 files
# Total size: mean 63380719.0 sd 33877121.9 min 16596 (chrM)
# max 208014280 (chrNA_random) median 59765243
# The non-N size is 1598880803 bases
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
awk '{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/1598880803\n",100.0*sum/1598880803,sum}' \
mostConserved.bed
-target-coverage 0.17: % 1.51 = 100.0*24186350/1598880803 length=12
-target-coverage 0.156: % 1.44 = 100.0*22973222/1598880803 length=20
-target-coverage 0.156: % 1.32 = 100.0*21177329/1598880803 length=15
-target-coverage 0.25: % 1.32 = 100.0*21104503/1598880803 length=8
-target-coverage 0.32: % 1.88 = 100.0*30014509/1598880803 length=20
-target-coverage 0.5: % 3.00 = 100.0*47931076/1598880803 length=20
-target-coverage 0.5: % 2.95 = 100.0*47170018/1598880803 length=24
-target-coverage 0.5: % 2.24 = 100.0*35801661/1598880803 length=28
-target-coverage 0.45: % 2.50 = 100.0*39965003/1598880803 length=24
-target-coverage 0.40: % 2.22 = 100.0*35436744/1598880803 length=24
-target-coverage 0.38: % 2.12 = 100.0*33911465/1598880803 length=20
-target-coverage 0.38: % 2.13 = 100.0*33986115/1598880803 length=24
# want to aim for 65% coverage of coding regions
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
# get an or of refGene and mgcGenes CDS regions
featureBits danRer4 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed
# 11753378 bases of 1626093931 (0.723%) in intersection
# featureBits danRer3 refGene:cds mgcGenes:cds -or \
# -bed=refSeqOrMgcCdsDanRer3.bed
# 11633092 bases of 1630323462 (0.714%) in intersection
featureBits danRer4 refSeqOrMgcCds.bed mostConserved.bed -enrichment
# refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.487%, both 0.332%,
# cover 45.97%, enrich 30.90x
# for length = 12 and cov = 0.17 PIT=10.7
# refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.846%, both 0.388%,
# cover 53.74%, enrich 29.12x
# for length = 20 and cov = 0.156 PIT=12.1
# refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.413%, both 0.333%,
# cover 46.04%, enrich 32.59x
# for length = 15 and cov = 0.156 PIT=11.4
# refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.302%, both 0.313%,
# cover 43.36%, enrich 33.30x
# decrease length and increase coverage to compensate
# for length = 8 and cov = 0.25 PIT=8.5, PIT is too low
# refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.298%, both 0.304%,
# cover 42.06%, enrich 32.40x
# try length = 20 and cov = 0.32 PIT=10.8
# refSeqOrMgcCds.bed 0.723%, mostConserved.bed 1.846%, both 0.388%,
# cover 53.74%, enrich 29.12x
# length = 20 and cov = 0.5 PIT=8.5
# refSeqOrMgcCds.bed 0.723%, mostConserved.bed 2.948%, both 0.459%,
# cover 63.53%, enrich 21.55x
# coverage good, need to increase the PIT value so increase the length.
# length = 24 and cov = 0.5 PIT=9.05
# refSeqOrMgcCds.bed 0.723%, mostConserved.bed 2.901%, both 0.458%,
# cover 63.35%, enrich 21.84x
# length = 28 and cov = 0.5 PIT=9.5
# refSeqOrMgcCds.bed 0.724%, mostConserved.bed 2.202%, both 0.431%,
# cover 59.57%, enrich 27.06x
# length = 24 and cov = 0.45 PIT=9.5
featureBits danRer4 refGene:cds mgcGenes:cds -or -bed=refSeqOrMgcCds.bed
# 11770580 bases of 1626093931 (0.724%) in intersection
featureBits danRer4 refSeqOrMgcCds.bed mostConserved.bed -enrichment
# refSeqOrMgcCds.bed 0.724%, mostConserved.bed 2.458%, both 0.438%,
# cover 60.57% enrich 24.64x
# length = 20 and cov = 0.38 PIT=9.6
# refSeqOrMgcCds.bed 0.724%, mostConserved.bed 2.085%, both 0.411%,
# cover 56.76%, enrich 27.22x
# length = 24 and cov = 0.38 PIT=10.1
# refSeqOrMgcCds.bed 0.724%, mostConserved.bed 2.090%, both 0.413%,
# cover 57.07%, enrich 27.30x
# use consRun14: length = 24, cov = 0.45, which gives an L_min*H entropy
# (PIT) value of 9.49 (aiming for around 9.8) and 60.6% coverage of coding
# regions with most conserved elements (aiming for about 65%)
# Load most conserved track into database
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
hgLoadBed danRer4 phastConsRBestElements mostConserved.bed
# Loaded elements of size 5
featureBits danRer4 mgcGenes:cds phastConsRBestElements -enrichment
# mgcGenes:cds 0.560%, phastConsRBestElements 2.458%, both 0.349%,
# cover 62.23%, enrich 25.32x
# Create merged posterior probability file and wiggle track data files
# the sed business gets the names sorted by chromName, chromStart
# so that everything goes in numerical order into wigEncode
ssh kkstore04
cd /cluster/bluearc/danRer4/multiz7way.2006-05-18/phastCons/consRun14
find ./ppRaw -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| wigEncode stdin phastConsRBest7way.wig phastConsRBest7way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# takes a few minutes
ls -l phastCons*
#-rw-rw-r-- 1 hartera protein 133817339 May 24 22:48 phastConsRBest7way.wib
#-rw-rw-r-- 1 hartera protein 36947021 May 24 22:48 phastConsRBest7way.wig
cp -p phastConsRBest7way.wi? \
/cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
mkdir -p /gbdb/danRer4/wib
ln -s `pwd`/phastConsRBest7way.wib /gbdb/danRer4/wib/phastConsRBest7way.wib
# use this if need to reload table
hgsql -e 'drop table phastConsRBest7way;' danRer4
# load table
hgLoadWiggle danRer4 phastConsRBest7way phastConsRBest7way.wig
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/danRer4/bed/multiz7way.2006-05-18/phastCons
bash
time hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=danRer4 phastCons7way > histogram.data 2>&1
# real 2m33.069s
# user 1m58.310s
# sys 0m16.170s
# create plot of histogram:
cat << '_EOF_' > histo.gp
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Zebrafish danRer4 Histogram phastCons7 track"
set xlabel " phastCons7 score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# happy emacs
gnuplot histo.gp > histo.png
display histo.png &
# add line: wiggle phastCons7way to trackDb.ra for multiz7way to display the
# wiggle for the conservation track.
# check all.joiner for entries for phastCons7way and phastConsElements7way -ok
# copy over html for multiz and edit.
###########################################################################
# BACENDS TRACK (DONE, 2006-08-25, hartera)
# Obtain these from the NCBI Trace archive
ssh kolossus
mkdir -p /san/sanvol1/scratch/danRer4/bacEnds/sequences
cd /cluster/data/danRer4/bed/bacEnds/
ln -s /san/sanvol1/scratch/danRer4/bacEnds/sequences .
cd sequences
# go to NCBI Trace Archive
# http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?
cat << '_EOF_' > query_tracedb
#!/usr/bin/perl -w
use strict;
use LWP::UserAgent;
use HTTP::Request::Common 'POST';
$ENV{'LANG'}='C';
$ENV{'LC_ALL'}='C';
my $query = join ' ', @ARGV;
$query = 'help' if $query =~ /^(\-h|\-\-help|\-)$/;
$query = join('', <STDIN>) if ! $query;
my $req = POST 'http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=raw', [query=>$query];
my $res = LWP::UserAgent->new->request($req, sub { print $_[0] });
die "Couldn't connect to TRACE server\n" if ! $res->is_success;
'_EOF_'
chmod +x query_tracedb
# run './query_tracedb usage' to see the help screen with usage examples
# count number of entries for zebrafish
query_tracedb "query count species_code='DANIO RERIO' AND trace_type_code = 'CLONEEND'"
# 473060
# 428904 (08-16-06)
# Therefore this is 11 files of 40000 results each.
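# (428904 / 40000 = 10.7, so 11 pages numbered 0 through 10)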
# so fetch them in pages of 40000 via query_tracedb:
cat << '_EOF_' > getZfishSeqs.csh
#!/bin/csh -fe
foreach n (0 1 2 3 4 5 6 7 8 9 10)
echo "Fetching page $n ..."
(echo -n "retrieve_tgz all 0b"; query_tracedb "query page_size 40000 page_number $n binary species_code='DANIO RERIO' AND trace_type_code = 'CLONEEND'") | query_tracedb > data${n}.tgz
end
'_EOF_'
chmod +x getZfishSeqs.csh
mkdir -p downloads
cp query_tracedb getZfishSeqs.csh ./downloads
cd downloads
nohup nice getZfishSeqs.csh >& zfishSeqs.log &
# Took 5 hours 14 minutes.
## Start: Wed May 10 09:57 Finished: 14:51
# Start: May 2 21:43 Finish: May 3 03:08
ssh kkstore04
# unzip and untar the downloads
cd /cluster/data/danRer4/bed/bacEnds/sequences/downloads
gunzip *.tgz
cat << '_EOF_' > unTarBacs.csh
#!/bin/csh -fe
foreach t (0 1 2 3 4 5 6 7 8 9 10 11)
tar xvf data${t}.tar
end
'_EOF_'
chmod +x unTarBacs.csh
nohup unTarBacs.csh >& unTarBacs.log &
foreach d (2006*)
echo "Processing $d"
nice cat ${d}/TRACEINFO.xml >> allTraceInfo.xml
end
nice catBacs.csh >& catBacs.log &
# The last archive obtained is empty so try downloading from the ftp site
# to be sure to get everything.
# get BAC end sequences from NCBI Trace archive ftp site:
ssh kkstore04
mkdir /cluster/data/danRer4/bed/bacEnds/sequences2
mkdir /cluster/bluearc/danRer4/bacEndsDownloads
cd /cluster/data/danRer4/bed/bacEnds/sequences2
ln -s /cluster/bluearc/danRer4/bacEndsDownloads
cd /cluster/data/danRer4/bed/bacEnds/sequences2/bacEndsDownloads
# get index page and ftp for the trace server
wget --timestamping \
ftp://ftp.ncbi.nih.gov/pub/TraceDB/danio_rerio/
# grab just the ftp link for each file.
grep "anc" index.html > ancillary.lst
perl -pi.bak -e 's/.+<a href=\"(ftp.+)\">[a-zA-Z]+.+/$1/' ancillary.lst
rm *.bak
# this contains just the ftp link for each file to get the ancillary
# information files.
cat << '_EOF_' > getFtpFiles.csh
#!/bin/csh -fe
set s=$1
foreach f (`cat "${s}"`)
echo $f
nice wget --timestamping $f
end
'_EOF_'
chmod +x getFtpFiles.csh
nohup nice getFtpFiles.csh ancillary.lst >& ancillary.log &
# Took about 25 minutes.
grep "fasta" index.html > otherFiles.lst
grep "mate_pairs" index.html >> otherFiles.lst
grep "xml" index.html >> otherFiles.lst
perl -pi.bak -e 's/.+<a href=\"(ftp.+)\">[a-zA-Z]+.+/$1/' otherFiles.lst
rm *.bak
mkdir otherFiles
cd otherFiles
cp ../otherFiles.lst .
# then get these files by ftp
nice ../getFtpFiles.csh otherFiles.lst >& otherFiles.log &
# Took about 6 hours and 50 minutes.
# There are 181 files as expected.
foreach f (*.gz)
nice gunzip $f
end
cd ..
cat ./otherFiles/fasta* > danRerBacEnds.fa
# Took about 20 minutes
grep '>' danRerBacEnds.fa | wc -l
# 14566448
cat ./otherFiles/xml* > danRer.xml
# Took 4 hours and 40 minutes.
# find out which have CLONEEND information in them
cat << '_EOF_' > findCloneEnds.csh
#!/bin/csh -fe
foreach f (otherFiles/xml.*)
echo $f >> cloneEndsXml.txt
grep CLONEEND $f >> cloneEndsXml.txt
end
'_EOF_'
chmod +x findCloneEnds.csh
nice findCloneEnds.csh &
# Took 1.5 hours
# CLONEEND is only in xml.danio_rerio.024 and xml.danio_rerio.033
cd /cluster/data/danRer4/bed/bacEnds/sequences2/bacEndsDownloads
cat otherFiles/xml.danio_rerio.024 otherFiles/xml.danio_rerio.033 \
> cloneEnds.xml
# cleanup xml files
rm otherFiles/xml.*
# get list of libraries:
grep "LIBRARY_ID" cloneEnds.xml | sort | uniq > libraries.xml.txt
grep "TRACE_NAME" cloneEnds.xml | wc -l
# 985980
grep "TRACE_NAME" cloneEnds.xml | sort | uniq -c > traceName.xml.count
# Hard to tell which are the BAC clone end sequences. These ftp files
# contain a mixture of sequences from different sources.
# Try downloading the sequences from Sanger instead, since not all of the
# sequences may have been submitted to NCBI yet.
ssh kkstore04
cd /cluster/data/danRer4/bed/bacEnds
mkdir -p /san/sanvol1/danRer4/bacEnds/ensemblSeqs
ln -s /san/sanvol1/danRer4/bacEnds/ensemblSeqs
cd ensemblSeqs
wget --timestamping \
ftp://ftp.ensembl.org/pub/traces/danio_rerio/fasta/
# gets index.html page
# get list of cloneEnd FASTA files
grep cloneEnd index.html > cloneEndsFile
perl -pi.bak -e 's/.+<a href=\"(ftp.+)\">[a-zA-Z]+.+/$1/' cloneEndsFile
rm *.bak
foreach f (`cat cloneEndsFile`)
echo $f
wget --timestamping $f
end
# then do the same to get the trace info xml files:
wget --timestamping \
ftp://ftp.ensembl.org/pub/traces/danio_rerio/traceinfo/
grep cloneEnd index.html > cloneEndsXmlFile
perl -pi.bak -e 's/.+<a href=\"(ftp.+)\">[a-zA-Z]+.+/$1/' cloneEndsXmlFile
rm *.bak
foreach f (`cat cloneEndsXmlFile`)
echo $f
wget --timestamping $f
end
gunzip *.gz
# check for multiple occurrences of same sequence ID
grep trace_name *.xml | sort | uniq -c | sort -nr > traceNames.count
# top of list has count of 1 so the end names are unique.
grep clone_id *.xml | sort | uniq -c | sort -nr > cloneIds.count
# top of list has count of 4. All those clone IDs that appear 3 or 4 times
# do so in the CHORI-1073 library - this is the fosmid library.
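# that can be double-checked from the filename prefixes that grep added to
# cloneIds.count (a sketch; the Sanger xml file names include the library
# name):
awk '$1 > 2 {print $2}' cloneIds.count | cut -d: -f1 | sort | uniq -c
# all counts of 3 or 4 should fall in sanger-zfish-CHORI-1073-cloneEnd*.xml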
# move CHORI-1073 out of the way
mkdir fosmids
mv sanger-zfish-CHORI-1073-cloneEnd* ./fosmids
# FASTA files have clone end names as sequence names
# concatenate the 18 fasta files
cat *.fasta > Zv6BacEnds.fa
grep '>' Zv6BacEnds.fa | wc -l
# 694170
# Zv5 had 729101 but these were not unique reads for each sequence.
faSize Zv6BacEnds.fa >& Zv6.faSize.txt
# there are 31 sequence names with no sequence.
awk '{print $10}' Zv6.faSize.txt > cloneEnds.noSeq
# remove the extra non-name lines at the end of that file, then get the
# list of FASTA files that these empty records are in:
grep -f cloneEnds.noSeq *.fasta > cloneEnds.noSeq.files
# sent this list of sequence names and files to Kerstin Howe
# at Sanger: kj2@sanger.ac.uk . Sanger said that these are just missing
# sequences due to poor quality.
# these header-only records make the FASTA file format invalid,
# so remove them from the FASTA file:
grep -v -f cloneEnds.noSeq Zv6BacEnds.fa > tmp.fa
grep '>' tmp.fa | wc -l
# 694139
mv tmp.fa Zv6BacEnds.fa
faSize Zv6BacEnds.fa
# 728424771 bases (11822219 N's 716602552 real 716602552 upper 0 lower) in
# 694139 sequences in 1 files
# Total size: mean 1049.4 sd 277.3 min 4 (zKp108D7.za) max 5403 (zC259G13.zb)
# median 982
# N count: mean 17.0 sd 42.1
# U count: mean 1032.4 sd 265.3
# L count: mean 0.0 sd 0.0
# Blat these BAC ends vs the danRer4 genome assembly. Gaps between
# scaffolds in the NA_random and Un_random chroms are 50,000 bases, so
# alignments of BAC ends across adjacent scaffolds are unlikely, but the
# scaffolds are aligned separately just in case:
ssh pk
mkdir -p /san/sanvol1/scratch/danRer4/bacEnds/sequences
cd /cluster/data/danRer4/bed/bacEnds/ensemblSeqs
cp Zv6BacEnds.fa /san/sanvol1/scratch/danRer4/bacEnds/sequences
mkdir -p /cluster/data/danRer4/bed/bacEnds/chromsRun
cd /cluster/data/danRer4/bed/bacEnds/chromsRun
ls -1S /san/sanvol1/scratch/danRer4/bacEnds/sequences/Zv6BacEnds.fa \
> bacends.lst
ls -1S /san/sanvol1/scratch/danRer4/trfFa/chr[0-9M]*.fa > seqs.lst
# create out dir
mkdir -p /san/sanvol1/scratch/danRer4/bacEnds/chromsPsl
# use Blat parameters as for mm5 and hg17
cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc {check out line+ /san/sanvol1/scratch/danRer4/bacEnds/chromsPsl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 seqs.lst bacends.lst template jobList
para create jobList
para try, check, push, check, ...
# para time
# Completed: 271 of 271 jobs
# CPU time in finished jobs: 1063126s 17718.77m 295.31h 12.30d 0.034 y
# IO & Wait Time: 2531s 42.18m 0.70h 0.03d 0.000 y
# Average job time: 3932s 65.54m 1.09h 0.05d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 9404s 156.73m 2.61h 0.11d
# Submission to last job: 9891s 164.85m 2.75h 0.11d
# Repeat for random chroms, but use separate scaffolds:
mkdir -p /cluster/data/danRer4/bed/bacEnds/randomsRun
cd /cluster/data/danRer4/bed/bacEnds/randomsRun
ls -1S /san/sanvol1/scratch/danRer4/bacEnds/sequences/Zv6BacEnds.fa \
> bacends.lst
foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/Zv6*.fa)
ls -1S $f >> seqs.lst
end
# create out dir
mkdir -p /san/sanvol1/scratch/danRer4/bacEnds/randomsPsl
# use Blat parameters as for mm5 and hg17
cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/blat $(path1) $(path2) -ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc {check out line+ /san/sanvol1/scratch/danRer4/bacEnds/randomsPsl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line keeps emacs coloring happy
gensub2 seqs.lst bacends.lst template jobList
para create jobList
para try, check, push, check, ...
# para time
# Completed: 2966 of 2966 jobs
# CPU time in finished jobs: 240259s 4004.31m 66.74h 2.78d 0.008 y
# IO & Wait Time: 84042s 1400.71m 23.35h 0.97d 0.003 y
# Average job time: 109s 1.82m 0.03h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 997s 16.62m 0.28h 0.01d
# Submission to last job: 11925s 198.75m 3.31h 0.14d
# lift chrom alignments and randoms alignments and then merge and filter.
ssh kolossus
cd /cluster/data/danRer4/bed/bacEnds/
nice pslSort dirs rawChroms.psl tmp \
/san/sanvol1/scratch/danRer4/bacEnds/chromsPsl >& chromSort.log
# Took 2 hours
# very large output so do the randoms on the san
cd /san/sanvol1/scratch/danRer4/bacEnds/
nice pslSort dirs rawRandoms.psl tmp randomsPsl >& randomsSort.log
# Took 12 minutes
# move the rawChroms.psl over to the san
mv /cluster/data/danRer4/bed/bacEnds/rawChroms.psl \
/san/sanvol1/scratch/danRer4/bacEnds/
cd /san/sanvol1/scratch/danRer4/bacEnds/
# use the same pslReps parameters as for danRer3, hg18 etc.:
pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
rawChroms.psl bacEndsChroms.psl /dev/null
# Took about 1 hour.
pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons \
rawRandoms.psl bacEndsRandoms.psl /dev/null
# Took 2 minutes.
# merge files. There is a single liftOver file that works for both the
# pseudocontigs and the scaffolds.
# remove header for bacEndsRandoms.psl
tail +6 bacEndsRandoms.psl > tmp.psl
cat bacEndsChroms.psl tmp.psl > bacEndsNoLift.psl
# liftUp file to chrom coordinates.
liftUp bacEnds.psl \
/cluster/data/danRer4/jkStuff/liftAll.lft warn bacEndsNoLift.psl
# Took 2 minutes
# REPROCESS BACENDS - see section at end (2006-10-06 - 2006-10-11, hartera)
# Now put together the pairs information:
ssh kkstore04
cd /cluster/data/danRer4/bed/bacEnds
mv /san/sanvol1/danRer4/bacEnds/bacEnds.psl .
# cat together the xml files of BAC clone end information
cat ensemblSeqs/*.xml > danRerBacEnds.xml
# get mate-pair information from xml, forward is SP6, reverse is T7
# edit getBacInfo.pl used for canFam1 and adapt for use with zebrafish
# BAC ends. Not all entries in the xml file have clone_id or trace_end
# but sometimes they have trace_direction instead of trace_end.
# correct directions:
cat << '_EOF_' > getZfishBacInfo.pl
#!/usr/bin/perl -w
use strict;
my ($file, $outFile, $name, $clone, $library, $dir);
$file = $ARGV[0];
$outFile = $ARGV[1];
open (FILE, $file) || die "Can not open $file : $!\n";
open (OUT, ">$outFile") || die "Can not create $outFile : $!\n";
open (STDERR, ">error.log") || die "Can not create error.log : $!\n";
my %cloneHash = qw {
zC CH211-
zK DKEY-
zKp DKEYP-
bZ RP71-
dZ BUSM1-
CHORI73_ CH73-
};
$name = "";
$clone = "";
$dir = "";
while (<FILE>)
{
chomp;
my $l = $_;
if ($l =~ /<trace_name>([A-Za-z0-9\_\.]+)/)
{
$name = $1;
}
elsif ($l =~ /<clone_id>([A-Z0-9]+\-[0-9A-Z]+)/)
{
$clone = $1;
}
elsif ($l =~ /<library_id>([A-Z0-9a-z\s]+\-?[0-9A-Z]*)<\/library_id>/)
{
$library = $1;
if ($library eq "Daniokey Pilot")
{
$library = "DKEYP";
}
}
elsif ($l =~ /<trace_end>(F|R)/)
{
$dir = $1;
}
elsif ($l =~ /<trace_direction>(F|R)/)
{
$dir = $1;
}
# find end of record and print out end information
if ($l =~ /^\s+<\/trace>/)
{
printInfo($name, $clone, $library, $dir);
$name = $clone = $dir = $library = "";
}
}
close FILE;
close OUT;
close STDERR;
sub printInfo {
my ($name, $clone, $lib, $d) = @_;
# if no clone name read from file then create from trace name
if ($clone eq "")
{
foreach my $c (keys(%cloneHash))
{
if ($name =~ /$c/)
{
if (exists($cloneHash{$c}))
{
my $prefix = $cloneHash{$c};
$clone = $name;
# change to clone name
$clone =~ s/$c/$prefix/;
# remove suffix
$clone =~ s/\.[a-z]+|SP6|T7//;
}
}
}
}
# convert forward or reverse direction to T7 or SP6
if ($d ne "")
{
if ($d eq "F")
{
$d = "T7";
}
elsif ($d eq "R")
{
$d = "SP6";
}
}
else
{
print STDERR "No direction for $name found\n";
}
# print clone end information
print OUT "$clone\t$name\t0\t$lib\t0\t$d\n";
}
'_EOF_'
# << for emacs
chmod +x getZfishBacInfo.pl
perl getZfishBacInfo.pl danRerBacEnds.xml bacEndInfo.txt
# check all the names are there
grep '>' ./ensemblSeqs/Zv6BacEnds.fa > names
perl -pi.bak -e 's/>//' names
sort names | uniq > names.sort
awk '{print $2}' bacEndInfo.txt | sort | uniq > bacEndInfo.names.sort
comm -13 bacEndInfo.names.sort names.sort
# no difference so all clone ends in the FASTA file are also
# in the xml file.
rm *.bak *.sort names
# create mate-pair information
cp /cluster/bin/scripts/convertBacEndPairInfo convertZfishBacEndInfo
# comment out line 43 as this removes the suffix after a . from the
# trace names. In this case, we need to keep those.
# line 43: ($acc, $ver) = split(/\./,$acc);
# NOTE: the wrong script was used here - the old, unedited one. See the
# REPROCESS section below where this is redone.
./convertZfishBacEndInfo bacEndInfo.txt
# creates pairs and singles files
# 312901 pairs and 35479 singles
# looks like pairs were made for both DKEY-32B21A and DKEY-32B21
# need to find singles that could be used in pairs.
awk '{print $2}' bacEndSingles.txt > singles.names
perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)W/$1/' singles.names
perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)A/$1/' singles.names
sort singles.names | uniq -c | sort -nr > singles.names.count
# 209 have 2 ends for the BAC clone.
# some are duplicates of the same end e.g. .ya and .yb but these
# have the same BAC clone name.
head -209 singles.names.count | awk '{print $2}' > singles.withPairs.names
awk '{print $2}' bacEndPairs.txt > pairs.names
perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)W/$1/' pairs.names
perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)A/$1/' pairs.names
mkdir -p /cluster/data/danRer4/bed/bacEnds/pairs
cd /cluster/data/danRer4/bed/bacEnds/pairs
set dir = /cluster/data/danRer4/bed/bacEnds
# use parameters from REDO of danRer3 BAC ends
/cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose $dir/bacEnds.psl $dir/bacEndPairs.txt all_bacends bacEnds
wc -l *
# 1714 bacEnds.long
# 14889 bacEnds.mismatch
# 109213 bacEnds.orphan
# 105294 bacEnds.pairs
# 347 bacEnds.short
# 782 bacEnds.slop
# create header required by "rdb" tools
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' \
> ../header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header
# edit the header file to make sure each \t becomes a real tab character
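# an alternative that avoids hand-editing the file (a sketch; printf
# interprets the \t escapes itself):
printf 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes\n' > ../header
printf '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10\n' >> ../header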
cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairs.bed
# create bad BAC ends set
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
bacEnds.orphan | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairsBad.bed
# Also create a bad BAC ends set with no orphans since orphans are
# already added to the singles track and do not want to add these orphans
# twice when extracting PSL. Use this bacEndPairsBadNoOrphans.bed
# file when extracting PSLs for adding to the all_bacends table.
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
| row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairsBadNoOrphans.bed
# To create singles set:
# also need to process bacEndSingles.txt into a database table
# for singles in bacEndSingles.txt, create a dummy file where they
# are given zJA11B12T7 as dummy sequence pair. If the single is a forward
# sequence, put the dummy sequence in the second column, if the single is
# a reverse sequence put in first column. use a perl script to do this.
cd /cluster/data/danRer4/bed/bacEnds
set bacDir = /cluster/data/danRer4/bed/bacEnds
mkdir singles
cd singles
cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/formatSingles.pl .
perl formatSingles.pl $bacDir/bacEndSingles.txt > \
$bacDir/bacEndSingles.format
# then run pslPairs on this formatted file
/cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose ../bacEnds.psl $bacDir/bacEndSingles.format \
all_bacends bacEnds
wc -l bacEnds.*
# 0 bacEnds.long
# 0 bacEnds.mismatch
# 22036 bacEnds.orphan
# 0 bacEnds.pairs
# 0 bacEnds.short
# 0 bacEnds.slop
cat bacEnds.orphan ../pairs/bacEnds.orphan > bacEnds.singles
wc -l bacEnds.singles
# 131249 bacEnds.singles
# Of these, 109213 are from pair analysis and 22036 from singles.
# For danRer3: there are 11439 orphans from singles and 242235 from
# pair analysis so a total of 253674 orphans so this has improved.
# Although for danRer3, some of these could be replicate reads for the
# same BAC clone end.
# make singles bed file
cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndSingles.bed
# check if there are any overlapping alignments that can be removed.
cd /cluster/data/danRer4/bed/bacEnds
mkdir -p duplicates/overlapRun
cd duplicates/overlapRun
sort -k1,2 /cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairs.bed \
> bacEndPairs.lfs
wc -l *.lfs
# 104546 bacEndPairs.lfs
nice /cluster/bin/x86_64/lfsOverlap bacEndPairs.lfs bacEndPairs.bed \
-name -minOverlap=0.999 -notBlocks
# Loaded 104546 elements of size 11
# only 5 lines removed
sort -k1,2 /cluster/data/danRer4/bed/bacEnds/singles/bacEndSingles.bed \
> bacEndSingles.lfs
nice /cluster/bin/x86_64/lfsOverlap bacEndSingles.lfs bacEndSingles.bed \
-name -minOverlap=0.999 -notBlocks
# Loaded 125695 elements of size 11
# No lines removed.
sort -k1,2 \
/cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairsBadNoOrphans.bed \
> bacEndPairsBadNoOrphans.lfs
wc -l *.lfs
# 17611 bacEndPairsBadNoOrphans.lfs
nice /cluster/bin/x86_64/lfsOverlap bacEndPairsBadNoOrphans.lfs \
bacEndPairsBadNoOrphans.bed -name -minOverlap=0.999 -notBlocks
# Loaded 17611 elements of size 11
# Saving 17608 records to bacEndPairsBadNoOrphans.bed
# Only 3 alignments were removed.
# Therefore there is no point in using these files. Use the original bed
# files for pairs and singles. No further processing of BED files is
# needed as they have not been changed in any way.
# Remove duplicates directory.
rm -r /cluster/data/danRer4/bed/bacEnds/duplicates
# use new extract program that extracts PSLs using name and position:
ssh kkstore04
set bacDir=/cluster/data/danRer4/bed/bacEnds
cd $bacDir/pairs
nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
$bacDir/bacEnds.psl bacEndPairs.bed bacPairs.psl
# for this, use bacEndPairsBadNoOrphans since pairs orphans are already
# included in bacEndSingles
nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
$bacDir/bacEnds.psl bacEndPairsBadNoOrphans.bed bacPairsBadNoOrphans.psl
# then for singles
cd $bacDir/singles
nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
$bacDir/bacEnds.psl bacEndSingles.bed bacSingles.psl
cd $bacDir
cat pairs/*.psl singles/bacSingles.psl > allBacends.load.psl
# try old program and compare
extractPslLoad -noBin bacEnds.psl pairs/bacEndPairs.bed \
pairs/bacEndPairsBadNoOrphans.bed singles/bacEndSingles.bed \
| sorttbl tname tstart | headchg -del > bacEnds.load.psl
wc -l *.load.psl
# 364457 allBacends.load.psl
# 4568907 bacEnds.load.psl
# Much reduced by using only BAC end alignments that are in BED files.
# load into database
ssh hgwdev
cd /cluster/data/danRer4/bed/bacEnds/pairs
hgLoadBed danRer4 bacEndPairs bacEndPairs.bed -notItemRgb \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
# Loaded 104546 elements of size 11
# note - this next track isn't pushed to RR, just used for assembly QA
hgLoadBed danRer4 bacEndPairsBad bacEndPairsBad.bed -notItemRgb \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
# Loaded 121728 elements of size 11
cd /cluster/data/danRer4/bed/bacEnds/singles
cp /cluster/data/danRer3/bed/bacends/singles/bacEndSingles.sql .
hgLoadBed danRer4 bacEndSingles bacEndSingles.bed -notItemRgb \
-sqlTable=bacEndSingles.sql
# Loaded 125695 elements of size 11
cd /cluster/data/danRer4/bed/bacEnds
hgLoadPsl danRer4 -table=all_bacends allBacends.load.psl
# All alignments were loaded into the table - no problems.
# load BAC end sequences into seq table so alignments may be viewed
# symlink to FASTA sequence file in ncbi directory
# move BAC ends to the ncbi directory
mkdir -p /cluster/data/ncbi/bacends/zebrafish/bacends.1
# remove some files
cd ensemblSeqs
rm tmp clone* index.html
cd /cluster/data/danRer4/bed/bacEnds
mv /cluster/data/danRer4/bed/bacEnds/ensemblSeqs/* \
/cluster/data/ncbi/bacends/zebrafish/bacends.1
rm -r ensemblSeqs
mkdir -p /gbdb/danRer4/bacends
ln -s /cluster/data/ncbi/bacends/zebrafish/bacends.1/Zv6BacEnds.fa \
/gbdb/danRer4/bacends/Zv6BacEnds.fa
hgLoadSeq danRer4 /gbdb/danRer4/bacends/Zv6BacEnds.fa
# check trackDb.ra entry and description
# cleanup:
ssh kkstore04
cd /cluster/data/danRer4/bed/bacEnds/
rm -r sequences
rm -r /san/sanvol1/scratch/danRer4/bacEnds/sequences
rm -r sequences2
rm changes.txt bacEnds.load.psl *.log
du -sh /cluster/data/danRer4/bed/bacEnds
# 2.4G /cluster/data/danRer4/bed/bacEnds
gzip *.psl *.txt danRerBacEnds.xml
du -sh /cluster/data/danRer4/bed/bacEnds
# 599M /cluster/data/danRer4/bed/bacEnds
# (hartera, 2006-10-02)
# NOTE: Some BAC clones have duplicate reads and these end in the
# suffixes SP6A, T7A, SP6W and T7W. There is a corresponding read name
# without the W or A suffix. The names of the BAC clones
# are also suffixed with A or W for these reads, e.g. there is a BAC
# clone called DKEY-32M8. DKEY-32M8A is the same clone sequenced with
# different reads ending in SP6A and T7A. The BAC end names are
# zK32M8SP6A and zK32M8T7A.
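# e.g. (a sketch) the underlying clone name for a read can be derived by
# mapping the library prefix and stripping the vector and duplicate-read
# suffixes:
echo "zK32M8T7A" | perl -pe 's/^zK/DKEY-/; s/(SP6|T7)[AW]?$//'
# DKEY-32M8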
# Check if there are any cases where both the version without the W or A
# suffix and the version with it are in the same track:
ssh hgwdev
cd /cluster/data/danRer4/bed/bacEnds
mkdir duplicates
cd duplicates
# found that there are some alignments in all_bacends where there
# is SP6W, SP6A, T7W, T7A suffixes for BAC ends. These are duplicate
# reads, there is a corresponding read name without the W or A suffix.
# Suffix Alignments Unique Names
# SP6W 179 153
# SP6A 254 245
# T7W 53 48
# T7A 247 238
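# counts of this form can be obtained per suffix, e.g. for SP6W
# (a sketch; qName is the BAC end read name in the psl table):
hgsql -N -e 'select count(*), count(distinct qName) from all_bacends
    where qName like "%SP6W";' danRer4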
hgsql -e 'select count(*) from bacEndPairs where lfNames like "%SP6A%";' \
danRer4
# 126 were found
hgsql -e \
'select count(distinct(name)) from bacEndPairs where lfNames like "%SP6A%";' \
danRer4
# 122 with distinct names
hgsql -N -e \
'select name, lfNames from bacEndPairs where lfNames like "%SP6A%";' \
danRer4 | sort > names.SP6A.txt
awk '{print $1}' names.SP6A.txt | sed -e 's/A$//' > names.SP6.txt
hgsql -N -e \
'select name, lfNames from bacEndPairs where lfNames not like "%SP6A%";' \
danRer4 | sort > pairs.nameswithoutA.txt
grep -w -f names.SP6.txt pairs.nameswithoutA.txt | sort | uniq \
> pairs.withAandwithout.txt
# there are 23 BAC clones in the bacEndPairs table where there are
# entries for both the clone names ending in A and that without the A.
hgsql -N -e 'select name, lfNames from bacEndSingles where (lfNames like
"%SP6A%") or (lfNames like "%SP6W%") or (lfNames like "%T7A%") or (lfNames
like "%T7W%");' danRer4 | sort | uniq > singles.names.sort
awk '{print $1}' singles.names.sort | sed -e 's/A$//' | sed -e 's/W$//' \
> names.SP6andT7.txt
wc -l names.SP6andT7.txt
# 372 names.SP6andT7.txt
sort names.SP6andT7.txt | uniq > names.SP6andT7.uniq
wc -l names.SP6andT7.uniq
# 309 names.SP6andT7.uniq
# Some may have both names ending in W and in A, or could be those
# where both the SP6 and T7 ends are present.
hgsql -N -e 'select name, lfNames from bacEndSingles;' danRer4 \
> singles.names.txt
grep -w -f names.SP6andT7.uniq singles.names.txt | sort | uniq \
> singles.withAorWandwithout.txt
wc -l singles.withAorWandwithout.txt
# 212 singles.withAorWandwithout.txt
ssh kkstore04
cd /cluster/data/danRer4/bed/bacEnds/duplicates/tmp/singles
# Check to see if any pairs can be made that do not have the same
# suffix: A, W or without. Only for cases where there is not a pair
# already.
awk '{print $2}' singles.names.sort | sort | uniq > bacEnds.namesAorW.sort
# also add the BAC ends for those with the same name but without A or W
awk '{print $2}' singles.withAorWandwithout.txt | sort | uniq \
> singles.withAorWandwithout.ends
cat bacEnds.namesAorW.sort singles.withAorWandwithout.ends \
| sort | uniq > bacEnds.namesAorWorwithout.sort
# make pairs where there is none with the same ending already. If an end
# has W and/or A suffix and/or no suffix, use just one and discard others.
# use a script to do this.
wc -l *.txt
# 93 diffSuffix.txt
# 69 sameSuffix.txt
# 212 singles.withAorWandwithout.txt
# 92 singlesEnds.txt
# changed program to do second pass using the extra ends.
# 76 diffSuffix.txt
# 78 extraEnds.txt
# 39 extraEnds2.txt
# 86 sameSuffix.txt
# 92 singlesEnds.txt
# /cluster/data/danRer4/bed/bacEnds/duplicates/tmp/singles/test2
# now check to see if any of the BACs represented by singles or pairs
# are already in the original file created.
# extraEnds2.txt are those to be removed
# diffSuffix.txt, sameSuffix.txt and singlesEnds.txt should all
# be checked against the entries in the bacEndPairs table since
# these are sequences that already passed all the criteria for
# being in the BAC end pairs track.
mkdir /cluster/data/danRer4/bed/bacEnds/duplicates/remove
cd /cluster/data/danRer4/bed/bacEnds/duplicates/remove
cp /cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairs.bed .
# those that have the same suffix will have already been paired. It is
# the ones that are different that should be put into the pairs file
# and those that are singles should go into the singles file before
# processing the BAC ends.
# first remove the 23 that are duplicated in the bacEndPairs table.
cp ../pairs.withAandwithout.txt .
# cd /cluster/data/danRer4/bed/bacEnds/duplicates/remove
awk '{print $1"A"}' pairs.withAandwithout.txt > bacsToRemove.txt
# remove these from the BAC end pairs file
grep -wv -f bacsToRemove.txt bacEndPairs.bed > bacEndPairsRemBacA.bed
wc -l *.bed
# 104546 bacEndPairs.bed
# 104523 bacEndPairsRemBacA.bed
# then find out if there are any BACs with more than one set of pairs
# in each of the lists: sameSuffix.txt and diffSuffix.txt
cp ../*Suffix.txt .
# the first column has the stem of the BAC end names without the
# SP6 or T7 part of the suffix.
awk '{print $1;}' sameSuffix.txt | sort | uniq -c | sort -nr \
> sameSuff.count
# no duplicates within the file
awk '{print $1;}' diffSuffix.txt | sort | uniq -c | sort -nr \
> diffSuff.count
# no duplicates within the file
cat sameSuffix.txt diffSuffix.txt > allSuff.txt
awk '{print $1;}' allSuff.txt | sort | uniq -c | sort -nr \
> allSuff.count
# no duplicates between files
rm *.count
# then check if any of these are represented in the pairs table:
# All of these BAC end names begin with zK, these are DKEY- BAC clones
# translate names in column 1 to BAC clone names
awk '{print $1}' allSuff.txt | sed -e 's/zK/DKEY\-/' | sort \
> allSuff.BACclones.txt
grep -w -f allSuff.BACclones.txt bacEndPairsRemBacA.bed \
> newPairsDupsInPairsBed.txt
# only one is found: DKEY-32B21: zK32B21T7,zK32B21SP6
awk '{print $4}' newPairsDupsInPairsBed.txt \
> newPairsDupsInPairsBed.name
grep "zK32B21" *.txt
# found in sameSuffix.txt so delete from this file and from allSuff.txt
grep -wv "zK32B21" sameSuffix.txt > sameSuffix2.txt
grep -wv "zK32B21" allSuff.txt > allSuff2.txt
# in this case the zK32B21T7A alignment is much better than the
# zK32B21T7 alignment, also zK32B21SP6A is better than the zK32B21SP6
# alignment therefore it should be replaced with the SP6A and T7A
# versions.
cp /cluster/data/danRer4/bed/bacEnds/singles/bacEndSingles.bed .
grep "zK32B21" bacEndSingles.bed
# then repeat this for the singles and see if any of those already
# have pairs in the bacEndPairsRemBacA.bed file.
cp ../singlesEnds.txt .
cp ../extraEnds2.txt .
# all these ends begin with "zK" so from "DKEY-" library.
# get BAC end prefixes and convert to DKEY BAC clone names.
awk '{print $1}' singlesEnds.txt | sed -e 's/zK/DKEY\-/' | sort \
> singles.BACclones.txt
grep -w -f singles.BACclones.txt bacEndPairsRemBacA.bed \
> singlesInPairsBed.txt
wc -l singlesInPairsBed.txt
# 40 singlesInPairsBed.txt
# get those names from the clone name in bacEndPairsRemBacA.bed
awk '{print $4}' singlesInPairsBed.txt | sed -e 's/DKEY\-/zK/' \
| sort | uniq > singlesDupsInPairs.txt
wc -l singlesDupsInPairs.txt
# 37 singlesDupsInPairs.txt
# All of these versions are in Genbank.
cat newPairsDupsInPairsBed.name singlesDupsInPairs.txt \
| sed -e 's/zK/DKEY\-/' > allDupsInPairs.txt
# BEST WAY FORWARD IS TO START AGAIN WITH PROCESSING THE BAC ENDS AND
# PROCESS DUPLICATES AS FOR danRer3.
##############################################################################
# REPROCESS BAC ENDS TO DEAL WITH DUPLICATES AND REDO BACENDS TRACKS
# (2006-10-06 - 2006-10-11, hartera)
# The bacEnds.psl from the first BACENDS TRACK section is used so all
# processing is the same up to that point.
# Now put together the pairs information:
ssh kkstore04
# move old bacends dir out the way
mv /cluster/data/danRer4/bed/bacEnds /cluster/data/danRer4/bed/bacEndsOld
mkdir /cluster/data/danRer4/bed/bacEnds
cd /cluster/data/danRer4/bed/bacEnds
# mv /cluster/data/danRer4/bed/bacEndsOld/bacEnds.psl .
# cat together the xml files of BAC clone end information
cat ensemblSeqs/*.xml > danRerBacEnds.xml
# get mate-pair information from xml.
# In convertBacEndInfo, forward is T7 and reverse is SP6. Use this
# convention although the first pass above used the opposite. The choice is
# arbitrary as long as the same convention is used within a library. The
# CHORI73 library has it the opposite way round to the above.
# edit getBacInfo.pl used for canFam1 and adapt for use with zebrafish
# BAC ends. Not all entries in the xml file have clone_id or trace_end
# but sometimes they have trace_direction instead of trace_end.
# correct directions:
cat << '_EOF_' > getZfishBacInfo.pl
#!/usr/bin/perl -w
use strict;
my ($file, $outFile, $name, $clone, $library, $dir);
$file = $ARGV[0];
$outFile = $ARGV[1];
open (FILE, $file) || die "Can not open $file : $!\n";
open (OUT, ">$outFile") || die "Can not create $outFile : $!\n";
open (STDERR, ">error.log") || die "Can not create error.log : $!\n";
my %cloneHash = qw {
zC CH211-
zK DKEY-
zKp DKEYP-
bZ RP71-
dZ BUSM1-
CHORI73_ CH73-
};
$name = "";
$clone = "";
$dir = "";
while (<FILE>)
{
chomp;
my $l = $_;
if ($l =~ /<trace_name>([A-Za-z0-9\_\.]+)/)
{
$name = $1;
}
elsif ($l =~ /<clone_id>([A-Z0-9]+\-[0-9A-Z]+)/)
{
$clone = $1;
}
elsif ($l =~ /<library_id>([A-Z0-9a-z\s]+\-?[0-9A-Z]*)<\/library_id>/)
{
$library = $1;
if ($library eq "Daniokey Pilot")
{
$library = "DKEYP";
}
}
elsif ($l =~ /<trace_end>(F|R)/)
{
$dir = $1;
}
elsif ($l =~ /<trace_direction>(F|R)/)
{
$dir = $1;
}
# find end of record and print out end information
if ($l =~ /^\s+<\/trace>/)
{
printInfo($name, $clone, $library, $dir);
$name = $clone = $dir = $library = "";
}
}
close FILE;
close OUT;
close STDERR;
sub printInfo {
my ($name, $clone, $lib, $d) = @_;
# if no clone name read from file then create from trace name
if ($clone eq "")
{
foreach my $c (keys(%cloneHash))
{
if ($name =~ /$c/)
{
if (exists($cloneHash{$c}))
{
my $prefix = $cloneHash{$c};
$clone = $name;
# change to clone name
$clone =~ s/$c/$prefix/;
# remove suffix
$clone =~ s/\.[a-z]+|SP6|T7//;
}
}
}
}
# convert forward or reverse direction to T7 or SP6
if ($d ne "")
{
if ($d eq "F")
{
$d = "T7";
}
elsif ($d eq "R")
{
$d = "SP6";
}
}
else
{
print STDERR "No direction for $name found\n";
}
# print clone end information
print OUT "$clone\t$name\t0\t$lib\t0\t$d\n";
}
'_EOF_'
# << for emacs
chmod +x getZfishBacInfo.pl
perl getZfishBacInfo.pl danRerBacEnds.xml bacEndInfo.txt
# check all the names are there
grep '>' ./ensemblSeqs/Zv6BacEnds.fa > names
perl -pi.bak -e 's/>//' names
sort names | uniq > names.sort
awk '{print $2}' bacEndInfo.txt | sort | uniq > bacEndInfo.names.sort
comm -13 bacEndInfo.names.sort names.sort
# no difference so all clone ends in the FASTA file are also
# in the xml file.
rm *.bak *.sort names
# create mate-pair information
# convertBacEndPairInfo does not deal with replicate names. These can
# be in a comma separated list in the pairs and singles files.
# edit the script so that it does this and parses the bacEndInfo.txt file.
cp /cluster/bin/scripts/convertBacEndPairInfo convertZfishBacEndInfo
# comment out line 43 as this removes the suffix after a . from the
# trace names. In this case, we need to keep those.
# line 43: ($acc, $ver) = split(/\./,$acc);
cat << 'EOF' > convertZfishBacEndInfo
#!/usr/local/bin/perl
# File: convertBacEndPairZfishInfo
# Date: 10/2006
# Description: Converts bacends.cl_acc_gi_len_primer format file to
# bacEnds.pair file used for creating BAC End Pairs tracks
# Usage message
if ($#ARGV < 0) {
print stderr "USAGE: convertBacEndPairInfo <cl_acc_gi_len_primer>\n";
exit(1);
}
$file = shift(@ARGV);
open(FILE, "$file") || die("Could not open $file\n");
$pair = $single = 0;
# Read in and record end info
print stderr "Reading in end info\n";
while ($line = <FILE>) {
chomp($line);
($clone, $acc, $gi, $center, $length, $end) = split('\t',$line);
# ($acc, $ver) = split(/\./,$acc);
$end =~ tr/a-z/A-Z/;
$found{$clone} = 1;
$clone{$acc} = $clone;
$printa{$acc} = 0;
$print{$clone} = 0;
$end{$acc} = $end;
if (&isForward($end)) {
# print "Adding $acc for $clone as $end \n";
$t7{$clone} .= "$acc,";
# print "The entry for $clone is $t7{$clone}\n";
} elsif (&isReverse($end)) {
$sp6{$clone} .= "$acc,";
} elsif ($end) {
print stderr "End $end for $acc / $clone\n";
}
}
close(FILE);
# Print out pairs
open(OUT, ">bacEndPairs.txt");
print stderr "Writing out pair info\n";
foreach $clone (keys %found) {
if ($t7{$clone} && $sp6{$clone}) {
print OUT "$t7{$clone}\t$sp6{$clone}\t$clone\n";
$print{$clone} = 1;
@acc = split(/\,/,$t7{$clone});
for ($i = 0; $i <= $#acc; $i++) {
$printa{$acc[$i]} = 1;
}
@acc = split(/\,/,$sp6{$clone});
for ($i = 0; $i <= $#acc; $i++) {
$printa{$acc[$i]} = 1;
}
$pair++;
}
}
close(OUT);
# Print out singletons
print stderr "Writing out singleton info\n";
open(OUT, ">bacEndSingles.txt");
%sp6Singles;
%t7Singles;
foreach $acc (keys %printa) {
$clone = $clone{$acc};
# if not printed already then add to a new hash for singles
if (!$printa{$acc}) {
if (&isForward($end{$acc})) {
$t7Singles{$clone} .= "$acc,";
}
elsif (&isReverse($end{$acc})) {
$sp6Singles{$clone} .="$acc,";
}
else {
print stderr "$acc has unknown end\n";
}
}
}
# then print out the singles:
foreach $cl (keys %t7Singles) {
print OUT "$t7Singles{$cl}\t$cl\tT7\n";
$single++;
}
foreach $cl (keys %sp6Singles) {
print OUT "$sp6Singles{$cl}\t$cl\tSP6\n";
$single++;
}
close(OUT);
print stderr "$pair pairs and $single singles\n";
sub isForward {
$end = shift(@_);
if (($end =~ /FORWARD/) || ($end =~ /^T7/) || ($end eq "F") ||
($end eq "M13-21") || ($end eq "1") || ($end =~ /^TK/) ||
($end =~ /^EC1/) || ($end =~ /^RM1/)) {
return 1;
} else {
return 0;
}
}
sub isReverse {
if (($end =~ /REVERSE/) || ($end =~ /^SP6/) || ($end eq "R") ||
($end =~ /^TJ/)) {
return 1;
} else {
return 0;
}
}
'EOF'
# remove all W and A suffixes from the end of bacEndInfo.txt clone names
cp bacEndInfo.txt bacEndInfo2.txt
perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)W/$1/' bacEndInfo2.txt
perl -pi.bak -e 's/(DKEY\-[0-9]+[A-Z][0-9]+)A/$1/' bacEndInfo2.txt
./convertZfishBacEndInfo bacEndInfo2.txt
# creates pairs and singles files
# 312850 pairs and 34935 singles
mkdir -p /cluster/data/danRer4/bed/bacEnds/pairs
cd /cluster/data/danRer4/bed/bacEnds/pairs
set dir = /cluster/data/danRer4/bed/bacEnds
# use parameters from REDO of danRer3 BAC ends
/cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan -mismatch -verbose $dir/bacEnds.psl $dir/bacEndPairs.txt all_bacends bacEnds
wc -l *
# 2724 bacEnds.long
# 22959 bacEnds.mismatch
# 179405 bacEnds.orphan
# 156241 bacEnds.pairs
# 565 bacEnds.short
# 1196 bacEnds.slop
# create header required by "rdb" tools
echo 'chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes' \
> ../header
echo '10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10' >> ../header
# edit the header file to make sure each \t becomes a real tab character
cat ../header bacEnds.pairs | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairs.bed
# create bad BAC ends set
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
bacEnds.orphan | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairsBad.bed
# Also create a bad BAC ends set with no orphans since orphans are
# already added to the singles track and do not want to add these orphans
# twice when extracting PSL. Use this bacEndPairsBadNoOrphans.bed
# file when extracting PSLs for adding to the all_bacends table.
cat ../header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
| row score ge 300 | sorttbl chr start \
| headchg -del > bacEndPairsBadNoOrphans.bed
# To create singles set:
# also need to process bacEndSingles.txt into a database table
# for singles in bacEndSingles.txt, create a dummy file where they
# are given zJA11B12T7 as dummy sequence pair. If the single is a forward
# sequence, put the dummy sequence in the second column, if the single is
# a reverse sequence put in first column. use a perl script to do this.
cd /cluster/data/danRer4/bed/bacEnds
set bacDir = /cluster/data/danRer4/bed/bacEnds
mkdir singles
cd singles
cp /cluster/data/danRer2/bed/ZonLab/bacends/singles/formatSingles.pl .
perl formatSingles.pl $bacDir/bacEndSingles.txt > \
$bacDir/bacEndSingles.format
# then run pslPairs on this formatted file
/cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
-mismatch -verbose ../bacEnds.psl $bacDir/bacEndSingles.format \
all_bacends bacEnds
wc -l bacEnds.*
# 0 bacEnds.long
# 0 bacEnds.mismatch
# 23398 bacEnds.orphan
# 0 bacEnds.pairs
# 0 bacEnds.short
# 0 bacEnds.slop
cat bacEnds.orphan ../pairs/bacEnds.orphan > bacEnds.singles
wc -l bacEnds.singles
# 202803 bacEnds.singles
# Of these, 179405 are from pair analysis and 23398 from singles.
# For danRer3: there are 11439 orphans from singles and 242235 from
# pair analysis so a total of 253674 orphans so this has improved.
# Although for danRer3, some of these could be more replicate reads for the
# same BAC clone end.
# make singles bed file
cat ../header bacEnds.singles | row score ge 300 | sorttbl chr start \
| headchg -del > bacEndSingles.bed
# check if there are any overlapping alignments that can be removed.
cd /cluster/data/danRer4/bed/bacEnds
mkdir -p duplicates/overlapRun
cd duplicates/overlapRun
sort -k1,2 /cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairs.bed \
> bacEndPairs.lfs
wc -l *.lfs
# 154732 bacEndPairs.lfs
nice /cluster/bin/x86_64/lfsOverlap bacEndPairs.lfs bacEndPairs.bed \
-name -minOverlap=0.999 -notBlocks
# Loaded 154732 elements of size 11
# Took about 2.5 hours.
wc -l bacEndPairs*
# 154634 bacEndPairs.bed
# 154732 bacEndPairs.lfs
sort -k1,2 /cluster/data/danRer4/bed/bacEnds/singles/bacEndSingles.bed \
> bacEndSingles.lfs
nice /cluster/bin/x86_64/lfsOverlap bacEndSingles.lfs bacEndSingles.bed \
-name -minOverlap=0.999 -notBlocks
# Loaded 187638 elements of size 11
# Took about 4.5 hours
sort -k1,2 \
/cluster/data/danRer4/bed/bacEnds/pairs/bacEndPairsBadNoOrphans.bed \
> bacEndPairsBadNoOrphans.lfs
wc -l *.lfs
# 27301 bacEndPairsBadNoOrphans.lfs
nice /cluster/bin/x86_64/lfsOverlap bacEndPairsBadNoOrphans.lfs \
bacEndPairsBadNoOrphans.bed -name -minOverlap=0.999 -notBlocks
# Loaded 27301 elements of size 11
# Took 5 minutes
# check the numbers of lines are correct
foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
awk 'BEGIN {OFS="\t"} {print $1,$2,$3,$4,$5}' ${f}.lfs \
| sort | uniq -c | sort -nr > ${f}.uniqCount
end
wc -l *
# 154634 bacEndPairs.bed
# 154732 bacEndPairs.lfs
# 154656 bacEndPairs.uniqCount
# 27282 bacEndPairsBadNoOrphans.bed
# 27301 bacEndPairsBadNoOrphans.lfs
# 27293 bacEndPairsBadNoOrphans.uniqCount
# 187601 bacEndSingles.bed
# 187638 bacEndSingles.lfs
# 187624 bacEndSingles.uniqCount
# different numbers for unique count since some of these alignments
# were not identical but very close to identical (>0.999 overlap)
rm *.uniqCount
cd /cluster/data/danRer4/bed/bacEnds/duplicates
mv ./overlapRun/* .
rm -r overlapRun
# copy perl script used for danRer3 to choose 2 BAC ends to represent
# each BAC clone since there are often more than one read for each BAC end
# in this set, 2 were chosen for each BAC pair or 1 for the singles. This
# was based on the ones that had the largest region aligned (using lfSizes).
cp /cluster/data/danRer3/bed/bacends/duplicatesNew/pickLfNamesv2.pl .
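# the "largest region aligned" criterion can be computed from lfSizes
# (column 10 of the lfs bed); a sketch, not part of pickLfNamesv2.pl:
awk '{n=split($10,s,","); t=0; for(i=1;i<=n;i++) t+=s[i]; print $4, $11, t}' \
    bacEndPairs.bed | sort -k1,1 | head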
# need to sort by chrom, chromStart
foreach f (bacEndPairs bacEndPairsBadNoOrphans bacEndSingles)
sort -k1 -k2 -k3 ${f}.bed > ${f}Sort.bed
end
# run perl script: input bed file, pairs or singles, name of output file
perl pickLfNamesv2.pl bacEndPairsSort.bed pairs pairs2lfNames.bed
mv error.log log.pairs
perl pickLfNamesv2.pl bacEndSinglesSort.bed singles singles1lfName.bed
mv error.log log.singles
perl pickLfNamesv2.pl bacEndPairsBadNoOrphansSort.bed pairs \
badPairs2lfNames.bed
mv error.log log.badPairs
wc -l log*
# 1 log.badPairs
# 3 log.pairs
# 13 log.singles
# In future, the choice of which set of alignments to keep could be based
# on the Blat score computed by pslScore.
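# e.g. (a sketch; assumes pslScore reports the query name in column 4 and
# the score in column 5, and writes to a hypothetical file name):
pslScore ../bacEnds.psl | sort -k4,4 -k5,5nr > bacEnds.pslScores.txt
# then keep only the top-scoring read per BAC end when choosing.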
# For badPairs, CH211-115F14 has 2 sets of pairs: zC115F14.zb,zC115F14.ya
# has a longer region between ends than for zC115F14.za,zC115F14.ya.
# so the latter was removed.
# for Pairs, CH211-74D17: the alignment with zC74D17.zb,zC74D17.yb was
# removed but there is also one with zC74D17.zb,zC74D17.yb to the same
# region that was retained so remove this one as zC74D17.zb,zC74D17.ya
# covers a longer region.
# CH211-98O15 has zC98O15.ya,zC98O15.za aligning to chr3 and
# zC98O15.yb,zC98O15.zb align to chr17. There is no similarity between
# zC98O15.ya and zC98O15.yb by bl2seq.
# CH211-98E22 has zC98E22.ya,zC98E22.za aligning to chr3 and
# zC98E22.yb,zC98E22.zb aligning to chr14. zC98E22.ya and zC98E22.yb
# has no similarity by bl2seq.
# For singles, there are 13 with alignments to more than 1 read for a
# BAC end:
# CH211-66E17: remove zC66E17.za as it has more mismatches and inserts.
# CH211-74O5: remove zC74O5.ya as is has more mismatches.
# CH211-42B4: remove zC42B4.yb as it has a shorter alignment. Not much
# difference in mismatches or inserts between this and zC42B4.ya.
# CH211-98O3: zC98O3.yb aligns to chr13 and zC98O3.ya aligns to chr16 and
# they have no similarity to each other.
# CH211-89J7: remove zC89J7.zb as it has more mismatches and inserts.
# CH211-97A18: remove zC97A18.yb has more mismatches and inserts.
# CH211-48O20: zC48O20.zb aligns to chr22 and zC48O20.za aligns twice
# to chr16. No similarity by bl2seq.
# CH211-60H17: remove zC60H17.ya as it has a more mismatches.
# CH211-189J23: remove zC189J23.yb as it has a large tNumInsert.
# CH211-124G12: remove zC124G12.za as it has more mismatches and inserts.
# CH211-60P6: remove zC60P6.ya as it has more inserts.
# CH211-42A6: remove zC42A6.za as it has more inserts.
# CH211-69K2: remove zC69K22.za as it has more inserts.
# Reported discrepancies to Mario Caccamo at Sanger (mc2@sanger.ac.uk)
# Here is his reply:
# This looks like a clone swap problem where names were associated to
# the wrong clones. All the examples you mention below are from
# projects sequenced at Max Planck (Germany).
# CH211-98O15 - the right place for this one is in chr3. This clone is
# currently assigned to ctg247 in chr3.
# CH211-98O3 - should go to chr14 (there is a problem in Zv6 most
# probably). This clone is assigned to ctg3009. The b ends are correct.
# CH211-48O20 - unfortunately this clone is not fingerprinted so I don't
# have any independent information to confirm the right placement.
# So for pairs,
# CH211-98O15: retain zC98O15.ya,zC98O15.za aligning to chr3
# CH211-98O3: retain zC98O3.yb and zC98O3.zb (should go to chr14)
# NOTE: For some singles, the lfStart does not equal the chromStart.
# Also, chromEnd - chromStart no longer equals the sum of lfSizes.
# pslPairs has added min/2 to the end or subtracted min/2 from the start
# depending on whether it is a left or a right BAC end and the
# alignment orientation. min used here was 25000.
# That is ok. This is what gives the display where the aligning block is
# shown with a line with arrows on it showing the direction.
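# e.g. (illustrative numbers only): a read aligning at chr1:100000-101000
# may be stored with chromStart 100000-12500=87500 or with chromEnd
# 101000+12500=113500, so chromEnd-chromStart exceeds the summed lfSizes
# by min/2 = 12500.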
ssh kkstore04
cd /cluster/data/danRer4/bed/bacEnds/duplicates
# create remove lists for each set of alignments.
cat << 'EOF' > pairsToRemove
zC74D17.zb,zC74D17.yb
zC98O15.yb,zC98O15.zb
zC98E22.ya,zC98E22.za
zC98E22.yb,zC98E22.zb
'EOF'
cat << 'EOF' > singlesToRemove
zC66E17.za
zC74O5.ya
zC42B4.yb
zC98O3.ya
zC89J7.zb
zC97A18.yb
zC48O20.zb
zC48O20.za
zC60H17.ya
zC189J23.yb
zC124G12.za
zC60P6.ya
zC42A6.za
'EOF'
mv pairs2lfNames.bed pairs2lfNamesOld.bed
mv singles1lfName.bed singles1lfNameOld.bed
# recreate these files removing alignments for ends in lists above
grep -wv -f pairsToRemove bacEndPairsSort.bed > pairs2lfNames.bed
grep -wv -f singlesToRemove bacEndSinglesSort.bed > singles1lfName.bed
# for each of these new bed files, checks were made that there are
# only 2 BAC ends per alignments for pairs and 1 for singles.
# For each pair, there should only be 2 ends which can appear either
# way round depending on the orientation and there should be 1 end for
# the beginning (suffix T7, t7 or z) and one end for the end
# (suffix SP6, sp6 or y) for each BAC clone. These can appear as e.g.
# either zK7B23T7,zK7B23SP6 or zK7B23SP6,zK7B23T7 for the opposite
# orientation. For singles, there should be a single BAC end for each
# alignment and for each BAC clone, a sequence for either or both types
# of ends may appear e.g. zK153P14SP6 and zK153P14T7 appear in separate
# alignments.
# e.g. check the pairs bed file:
wc -l pairs2lfNames.bed
# 154632 pairs2lfNames.bed
grep ',' pairs2lfNames.bed | wc -l
# 154632
# should be the same number, every line should have a comma
# should be twice the number of above, just 2 end names per line
awk '{print $11}' pairs2lfNames.bed | sort | uniq > pairs.ends
wc -l pairs.ends
# 147668 pairs.ends
sed -e 's/,/\n/g' pairs.ends > pairs.ends2
wc -l pairs.ends2
# 295336 pairs.ends2
# should be twice the number of above, just 2 end names per line, so this
# is correct.
perl -pi.bak -e \
's/.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?,?.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?/$1,$2/g' pairs.ends
sort pairs.ends | uniq > pairs.ends.uniq
# check that these have the right combination of ends - one forward and
# one reverse. all ok.
# repeat for badPairs and singles
# badPairs:
wc -l badPairs2lfNames.bed
# 27281 badPairs2lfNames.bed
grep ',' badPairs2lfNames.bed | wc -l
# 27281
# should be the same number, every line should have a comma
# should be twice the number of above, just 2 end names per line
awk '{print $11}' badPairs2lfNames.bed | sort | uniq > badPairs.ends
wc -l badPairs.ends
# 25795 badPairs.ends
sed -e 's/,/\n/g' badPairs.ends > badPairs.ends2
wc -l badPairs.ends2
# 51590 badPairs.ends2
# should be twice the number of above, just 2 end names per line, so this
# is correct.
perl -pi.bak -e \
's/.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?,?.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?/$1,$2/g' badPairs.ends
sort badPairs.ends | uniq > badPairs.ends.uniq
# check that these have the right combination of ends - one forward and
# one reverse. all ok.
# for singles
wc -l singles1lfName.bed
# 187587 singles1lfName.bed
grep ',' singles1lfName.bed | wc -l
# 0
# should be 0 as there should only be one BAC end name per line.
awk '{print $11}' singles1lfName.bed | sort | uniq > singles.ends
wc -l singles.ends
# 172981 singles.ends
# some singles have more than 1 alignment so appear more than once.
perl -pi.bak -e \
's/.+(SP6|T7|ya|za|Za|yb|zb|yc|zc|yt|yd|zd)[A-Za-z]?/$1/g' singles.ends
sort singles.ends | uniq > singles.ends.uniq
# check that these have the right suffixes for the BAC ends. all ok.
# clean up
rm *.bak *.ends *.ends2 *.uniq
# Finally overlaps in BAC clone names were checked. All BAC clones
# represented in each of the pairs, badPairs and singles bed files are
# unique to that file. Between all three bed files, 302606 BAC clones
# have alignments.
foreach f (pairs2lfNames.bed badPairs2lfNames.bed singles1lfName.bed)
awk '{print $4}' $f | sort | uniq > ${f}.names
end
wc -l *.names
# 25421 badPairs2lfNames.bed.names
# 147501 pairs2lfNames.bed.names
# 129684 singles1lfName.bed.names
# 302606 total
comm -12 pairs2lfNames.bed.names badPairs2lfNames.bed.names
comm -12 pairs2lfNames.bed.names singles1lfName.bed.names
comm -12 badPairs2lfNames.bed.names singles1lfName.bed.names
# None of these files should have any BAC clone names in common and
# they do not so they are ok.
# NOTE: sort and uniq on hgwdev produce tab-delimited output. After
# merging rows with the same BAC name, the scoring is now wrong in the
# bed files.
# Scores should be 1000 if there is 1 row for that name, else
# 1500/number of rows for that sequence name - calculated by pslPairs.
# Correct the scores.
ssh kkstore04
mkdir -p /cluster/data/danRer4/bed/bacEnds/scoresAndCoords
cd /cluster/data/danRer4/bed/bacEnds/scoresAndCoords
# copy over the correctScores2.pl and checkScores.pl scripts from danRer3;
# the scripts were edited so that the hits file is split on space, not on tabs
cp \
/cluster/data/danRer3/bed/bacends/scoresAndCoords/correctScores2.pl .
cp \
/cluster/data/danRer3/bed/bacends/scoresAndCoords/checkScores.pl .
awk '{print $4}' ../duplicates/pairs2lfNames.bed \
| sort | uniq -c > pairs.hits
perl correctScores2.pl ../duplicates/pairs2lfNames.bed pairs.hits \
noBin > bacEndPairsGoodScores.bed
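# for reference, the rule correctScores2.pl applies (1000 for a single
# alignment of a name, otherwise 1500/alignmentCount, matching pslPairs)
# can be sketched from the hits file alone (hypothetical output name):
awk '{score = ($1 == 1 ? 1000 : int(1500 / $1)); print $2, score}' \
    pairs.hits > pairs.rescored.txt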
# same for singles
awk '{print $4}' ../duplicates/singles1lfName.bed \
| sort | uniq -c > singles.hits
perl correctScores2.pl ../duplicates/singles1lfName.bed singles.hits \
noBin > bacEndSinglesGoodScores.bed
# and for badPairs
awk '{print $4}' ../duplicates/badPairs2lfNames.bed \
| sort | uniq -c > badPairs.hits
perl correctScores2.pl ../duplicates/badPairs2lfNames.bed \
badPairs.hits noBin > bacEndPairsBadGoodScores.bed
# check that the scores are now correct
awk '{print $4, $5}' bacEndPairsGoodScores.bed \
| sort | uniq -c > pairs.count
perl checkScores.pl < pairs.count
# all the BAC clones should be in good.txt and none in bad.txt
# wc -l should give same number of lines in good.txt as in pairs.hits
# and therefore bad.txt should be empty.
# repeat for other bed files
awk '{print $4, $5}' bacEndPairsBadGoodScores.bed \
| sort | uniq -c > badPairs.count
perl checkScores.pl < badPairs.count
awk '{print $4, $5}' bacEndSinglesGoodScores.bed \
| sort | uniq -c > singles.count
perl checkScores.pl < singles.count
# for the singles, 6 ended up in bad.txt because their scores are
# 214.285714285714 which is correct for 7 alignments. Rounding the score
# caused the discrepancy.
# round these values otherwise get a loading error when loading database:
perl -pi.bak -e 's/214\.285714285714/214/' bacEndSinglesGoodScores.bed
# clean up
rm error.log *.txt *.count *.hits
ssh hgwdev
cd /cluster/data/danRer4/bed/bacEnds/scoresAndCoords
# copy over table definition from danRer3
cp /cluster/data/danRer3/bed/bacends/singles/bacEndSingles.sql \
../singles
# Now load database tables:
hgsql -e 'drop table bacEndPairs;' danRer4
hgLoadBed danRer4 bacEndPairs bacEndPairsGoodScores.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql -notItemRgb
# Loaded 154632 elements of size 11
hgsql -e 'drop table bacEndSingles;' danRer4
hgLoadBed danRer4 bacEndSingles bacEndSinglesGoodScores.bed \
-sqlTable=../singles/bacEndSingles.sql -notItemRgb
# Loaded 187587 elements of size 11
hgsql -e 'drop table bacEndPairsBad;' danRer4
hgLoadBed danRer4 bacEndPairsBad bacEndPairsBadGoodScores.bed \
-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql -notItemRgb
# Loaded 27281 elements of size 11
# clean up
rm *.tab *.bak error.log
# The Zv6 BAC end sequences are already in /gbdb/danRer4/bacends/ and
# they have been loaded into the seq table - this is from the first section
# on BACENDS tracks. No need to repeat this here.
# The BAC end sequences were loaded into the seq table (so alignments may
# be viewed) and the FASTA file was moved to the ncbi directory and
# symlinked previously; the commands used were:
mkdir -p /gbdb/danRer4/bacends
ln -s /cluster/data/ncbi/bacends/zebrafish/bacends.1/Zv6BacEnds.fa \
/gbdb/danRer4/bacends/Zv6BacEnds.fa
hgLoadSeq danRer4 /gbdb/danRer4/bacends/Zv6BacEnds.fa
# use new extract program that extracts PSLs using name and position:
ssh kkstore04
set bacDir=/cluster/data/danRer4/bed/bacEnds
cd $bacDir/scoresAndCoords
nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
$bacDir/bacEnds.psl bacEndPairsGoodScores.bed bacPairs.psl
# for this, use bacEndPairsGoodScores.bed which was derived from
# bacEndPairsBadNoOrphans since pairs orphans are already
# included in bacEndSingles
nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
$bacDir/bacEnds.psl bacEndPairsBadGoodScores.bed bacPairsBad.psl
# then for singles
nice /cluster/home/hartera/bin/x86_64/extractPslForLfs -verbose=1 \
$bacDir/bacEnds.psl bacEndSinglesGoodScores.bed bacSingles.psl
cd $bacDir
cat $bacDir/scoresAndCoords/*.psl > allBacends.load.psl
wc -l *.load.psl
# 542725 allBacends.load.psl
# load PSL file into database
ssh hgwdev
cd /cluster/data/danRer4/bed/bacEnds/
hgsql -e 'drop table all_bacends;' danRer4
hgLoadPsl danRer4 -table=all_bacends allBacends.load.psl
# All alignments were loaded into the table - no problems.
# check trackDb.ra entry and modify description.
# Moved the searches up to the top level zebrafish trackDb.ra file
# in trackDb/zebrafish/ since the searches are common to all zebrafish
# assemblies. Deleted searches from each assembly trackDb.ra.
###########################################################################
# CREATE BAC CLONES ALIAS AND CROSS-REFERENCE TABLES
# (bacEndAlias, bacCloneAlias and bacCloneXRef)
# (DONE, 2006-09-29 - 2006-10-27, hartera)
# Process data and create bacEndAlias table
ssh kkstore04
# create a list of BAC end names and their accessions
# Downloaded BAC ends accessions from SRS
# SRS at Sanger is no longer available.
# Go to http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?CMD=search&DB=nucgss
# This is dbGSS at NCBI: GSS is Genomic Sequence Survey
# Search: Danio rerio[Organism] AND BAC
# There are 159020 entries. This is the same as for the BACEndAccs.txt
# for danRer3 in: /cluster/data/danRer3/bed/bacends/bacends.1
# getBacEndInfo.pl and extToIntNames.pl were used to create
# BACEnd_accessions.txt. Use this file from danRer3 to load the table.
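# the same count can be retrieved programmatically via NCBI E-utilities
# (a sketch, assuming the standard esearch interface and db name):
wget -q -O - 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucgss&term=Danio+rerio[Organism]+AND+BAC' \
    | grep -o '<Count>[0-9]*</Count>'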
cd /cluster/data/danRer4/bed/bacEnds
cp /cluster/data/danRer3/bed/bacends/bacends.1/BACEnd_accessions.txt .
grep '>' /cluster/data/ncbi/bacends/zebrafish/bacends.1/Zv6BacEnds.fa \
| sed -e 's/>//' > allBacEnds.names
# copy over getBacEndInfov2.pl - this produces the bacEndAccs.aliases file
cp /cluster/data/danRer3/bed/bacends/bacends.1/getBacEndInfov2.pl .
# edit to remove section that creates pairs and singles files
# and rename to getBacEndAliases.pl
cat << 'EOF' > getBacEndAliases.pl
#!/usr/bin/perl -w
use strict;
my $file = $ARGV[0]; # list of BAC end sequence read Sanger names
my $file2 = $ARGV[1]; # list of BAC ends and GenBank accessions
# translation for sequence prefixes from Sanger internal names to external names
my %cloneHash = qw {
zC CH211-
ZC CH211-
zK DKEY-
zKp DKEYP-
bZ RP71-
dZ BUSM1-
CHORI73_ CH73-
};
# need to get bacends into pairs and singles
# find duplicates also
# Get and store BAC ends and accessions
my %bacEnds;
open (BACENDS, $file2) || die "Can not open $file2: $!";
while (<BACENDS>) {
chomp;
my ($be, $a) = split(/\t/);
print "bac end $be and acc is $a\n";
$bacEnds{$be} = $a;
}
close BACENDS;
my %bacs;
my %bacAccs;
open(FILE, $file) || die "Can not open $file: $!";
open(STDERR, ">bacs.log") || die "Can not create bacs.log: $!";
open(OUT, ">direction.txt") || die "Can not create direction.txt:$!";
open(ACCS, ">bacEndAccs.aliases") || die "Can not create bacEndsAccs.aliases: $!";
while (<FILE>) {
chomp;
my $seqName = $_;
print "seqName is $seqName here\n";
$seqName =~ /^([CHORI73]*[|z|Z|b|d]?[C|K|Z|_]p?)([0-9]+[A-Z][0-9]+)\.?[pq1k]*(SP6|T7|ASP6|AT7|SP6W|T7W|y|z|Z)/;
my $prefix = $1;
my $rest = $2;
print "prefix is $prefix, rest is $rest\n";
my $dir = $3;
print STDERR "dir is $dir\n";
print OUT "$dir\n";
my $direction;
# forward or reverse direction
if (($dir =~ /SP6/) || ($dir =~ /T7/) ) {
$direction = $dir;
}
# reverse direction (as in convertZfishBacEndInfo)
elsif ($dir =~ /(sp6)/i || $dir =~ /y/i) {
$direction = "SP6";
}
# forward direction (as in convertZfishBacEndInfo)
elsif ($dir =~ /(t7)/i ||$dir =~ /z/i) {
$direction = "T7";
}
else {
print STDERR "seqName is $seqName - direction not found\n";
}
print "dir is $dir and direction is $direction\n";
my $extName = "";
my $intName = $prefix.$rest;
print "prefix is $prefix\n";
my $mid = "";
$mid = $rest;
$mid =~ s/\-//;
$mid =~ tr/a-z/A-Z/;
print "after trans, mid is $mid here\n";
if ($mid =~ /^([A-Z]*)0*([0-9]+[A-Z]+)0*([0-9]+$)/) {
print "matched mid $mid here\n";
my $new = $1.$2.$3;
$mid = $new;
print "new mid is $mid\n";
}
if (exists ($cloneHash{$prefix})) {
my $extPrefix = $cloneHash{$prefix};
$extName = $extPrefix.$mid;
print "External name is $extName\n";
}
else {
$extName = "";
}
# need to get duplicate clones, if switch to lower case and remove
# . and - and use as key to bacs hash
# add the internal and external name for BAC to hash
my $fullName = $seqName;
# my $intNameStem = $intName;
my $upDir = $dir;
$dir =~ tr/a-z/A-Z/;
# preserve prefix and change middle part of name to upper case
my $upperIntName = $prefix.$mid;
my $upperFullName = $prefix.$mid.$dir;
print "upper internal name is $upperIntName here\n";
# my $newFullName = "";
print "internal name is $intName, altered seq name is $upperIntName\n";
print "full name is now $upperFullName\n";
if (exists($bacEnds{$upperFullName})) {
my $ac = $bacEnds{$upperFullName};
print "seq is $upperFullName; acc is $ac\n";
$bacs{$upperIntName}->{$upperFullName}->{acc} = $ac;
}
push (@{$bacs{$upperIntName}->{$upperFullName}->{seqs} }, $seqName);
$bacs{$upperIntName}->{$upperFullName}->{extName} = $extName;
$bacs{$upperIntName}->{$upperFullName}->{direction} = $direction;
if (exists($bacAccs{$upperIntName}) ){
my $bacAcc = $bacAccs{$upperIntName};
print "bacacc is $bacAcc\n";
$bacs{$upperIntName}->{$upperFullName}->{bacAcc} = $bacAcc;
}
if (exists($bacEnds{$upperFullName} )) {
my $bacEndAcc = $bacEnds{$upperFullName};
print "bacendacc is $bacEndAcc\n";
$bacs{$upperIntName}->{$upperFullName}->{bacEndAcc} = $bacEndAcc;
}
}
close FILE;
# print accessions for BacEnds with BAC end aliases
my $count = 0;
print "printing accessions.\n";
foreach my $a (keys(%bacs)) {
print "$a is bac end from bacEnds hash\n";
foreach my $f (keys %{ $bacs{$a} } ) {
if (exists($bacs{$a}->{$f}->{acc} ) ) {
my $acc = $bacs{$a}->{$f}->{acc};
my @ids = @{$bacs{$a}->{$f}->{seqs} };
foreach my $i (@ids) {
$count++;
print ACCS "$i\t$count\t$acc\n";
}
}
}
}
'EOF'
chmod +x getBacEndAliases.pl
perl getBacEndAliases.pl allBacEnds.names BACEnd_accessions.txt \
> bacEnds.log
wc -l bacEndAccs.aliases
# 159370 bacEndAccs.aliases
# clean up
rm *.log direction.txt
# Only the DKEY- library clone ends have accessions in Genbank
# load this alias table and accessions for clone ends
ssh hgwdev
cd /cluster/data/danRer4/bed/bacEnds
# Carry on and process this file into the bacEndAlias table.
hgLoadSqlTab danRer4 bacEndAlias ~/kent/src/hg/lib/bacEndAlias.sql \
bacEndAccs.aliases
# Loaded successfully.
# Get the latest versions of the clonemarkers, contig names and markers
# files from Sanger: Provided by Mario Caccamo (mc2@sanger.ac.uk)
# at the Sanger Institute.
ssh kkstore04
mkdir -p /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases
cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases
# Problem with the markers file - generated incorrectly. Contacted
# Sanger to ask for new markers file on 10/12/06 and new set of files
# were put up for ftp on 10/26/06. Another problem with markers file
# was found - there was a number in the second field instead of the
# Sanger STS name, which is an ID beginning with "et" or "st". Notified
# Sanger and new files put out for ftp on 10/27/06.
wget --timestamp \
ftp://ftp.sanger.ac.uk/pub/human/zebrafish/ZFIN/README
wget --timestamp \
ftp://ftp.sanger.ac.uk/pub/mc2/webfpc_dump/clonemarkers.27.10.06.txt
wget --timestamp \
ftp://ftp.sanger.ac.uk/pub/mc2/webfpc_dump/ctgnames.27.10.06.txt
wget --timestamp \
ftp://ftp.sanger.ac.uk/pub/mc2/webfpc_dump/markers.27.10.06.txt
wc -l *27.10.06.txt
# 32612 clonemarkers.27.10.06.txt
# 168828 ctgnames.27.10.06.txt
# 12407 markers.27.10.06.txt
# get list of BAC end names, lfNames
foreach f (../scoresAndCoords/*.bed)
echo $f
awk '{print $11;}' $f >> allBacEnds.names
end
wc -l allBacEnds.names
# 369500 allBacEnds.names
# this is the total number of lines in the *.bed files
perl -pi.bak -e 's/,/\n/g' allBacEnds.names
sort allBacEnds.names | uniq > allBacEnds.names.uniq
# get list of BAC clone names
foreach f (bacEndPairs bacEndPairsBad bacEndSingles)
awk '{print $4}' \
/cluster/data/danRer4/bed/bacEnds/scoresAndCoords/${f}GoodScores.bed \
>> bacs.names
end
sort bacs.names | uniq > bacs.names.uniq
wc -l *.uniq
# 518827 allBacEnds.names.uniq
# 302606 bacs.names.uniq
# from psl file
awk '{print $10;}' ../bacEnds.psl > bacEndsPsl.names
# remove first few lines with no names
tail +6 bacEndsPsl.names | sort | uniq > bacEndsPsl.names.uniq
wc -l bacEndsPsl.names.uniq
# 549034 bacEndsPsl.names.uniq
# this is all the BAC ends that originally had alignments
# Add an alias table for BAC clones
# bacCloneAlias.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
# Add a xref table to give external clone registry names, internal names
# sanger name, relationship between STS and BAC clone (method of finding
# STS), UniSTS ID, chromosomes(s) to which BAC clone is mapped by BLAT,
# Genbank accession and STS primer sequences
# bacCloneXRef.sql is in $HOME/kent/src/hg/lib - see makeDanRer1.doc
set dir=/cluster/data/danRer4/bed/bacEnds/
awk 'BEGIN {OFS="\t"}{print $4, $1}' \
$dir/scoresAndCoords/bacEndPairsGoodScores.bed > bacClones.namesandchrom
awk 'BEGIN {OFS="\t"}{print $4, $1}' \
$dir/scoresAndCoords/bacEndSinglesGoodScores.bed >> bacClones.namesandchrom
sort bacClones.namesandchrom | uniq > bacClones.namesandchrom.uniq
wc -l bacClones.namesandchrom.uniq
# 306079 bacClones.namesandchrom.uniq
# so created a list of names and chroms for BAC clones only in pairs
# and singles, exclude bad Pairs since this track is not shown on RR.
# use a list of internal names,Genbank accessions, and BAC clone names
# use BACClonesIdsandAccs.txt.
# get list of UniSTS IDs using aliases to search alias file
# print Sanger name, alias and UniSTS ID, use find_markers3.pl
cat << '_EOF_' > find_markers3.pl
# example:
# perl find_markers3.pl UniSTS.aliases markers.27.10.06.txt
use strict;
my $verbose = 0;
my ($a, $b, $f, $m, $s, $t, $aliases, @alias, @rest);
my $aliasFile = $ARGV[0];
my $markersFile = $ARGV[1];
open(ALIAS, $aliasFile) || die "Can not open $aliasFile\n";
open(MARKERS, $markersFile) || die "Can not open $markersFile\n";
# store aliases from aliasFile
my ($id, $al, @alsArray, %aliasHash);
while (<ALIAS>)
{
chomp;
($id, $al) = split /\t/;
@alsArray = split(/;/, $al);
foreach my $as (@alsArray)
{
push (@{$aliasHash{$as} }, $id);
}
}
close ALIAS;
while (<MARKERS>) {
my @idArray;
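# reset the found flag and per-marker state before scanning this marker's aliases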
($f, $t, $m, $idArray[0]) = 0;
my @ids;
chomp; ($a, $b, $aliases, @rest) = split /\|/;
if ($verbose > 3) { printf "aliases $aliases \n"; }
@alias = split /;/, $aliases;
ALIAS: foreach $s (@alias) {
if ($s =~ /[\D]+/) {
if ($verbose > 5) { printf "this $s \n"; }
if (exists($aliasHash{$s}))
{
@idArray = @{$aliasHash{$s}};
}
if ($idArray[0]) {
$f = 1; $t = $s; @ids = @idArray;
if ($verbose) { printf "this $s found $m \n"; }
last ALIAS;
}
}
}
if ($f)
{
my @sNames = split(/;/, $b);
foreach my $sn (@sNames)
{
foreach my $i (@ids)
{
printf "$sn\t$i\n";
}
}
}
}
close MARKERS;
'_EOF_'
chmod +x find_markers3.pl
# download latest version of UniSTS (2006-10-26)
ssh kkstore02
mkdir -p /cluster/store5/sts.2006-10-26
ln -s /cluster/store5/sts.2006-10-26 /cluster/data/ncbi
cd /cluster/data/ncbi/sts.2006-10-26
wget ftp://ftp.ncbi.nih.gov/blast/db/FASTA/sts.gz
mkdir -p /cluster/store5/UniSTS.2006-10-26
ln -s /cluster/store5/UniSTS.2006-10-26 /cluster/data/ncbi
cd /cluster/data/ncbi/UniSTS.2006-10-26
wget --timestamp ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts
wget --timestamp ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
wget --timestamp -r -l1 \
ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Danio_rerio/
mv \
/cluster/data/ncbi/UniSTS.2006-10-26/ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Danio_rerio \
/cluster/data/ncbi/UniSTS.2006-10-26
rm -r /cluster/data/ncbi/UniSTS.2006-10-26/ftp.ncbi.nih.gov
# then back to danRer4 BAC ends tables:
ssh kkstore04
cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases
# change internal names in files to have CHORI73 instead of zH to
# keep names the same as those used in the BAC end tables.
perl -pi.bak -e 's/zH([0-9]+)/CHORI73_$1/' *.27.10.06.txt
perl find_markers3.pl /cluster/data/ncbi/UniSTS.2006-10-26/UniSTS.aliases \
markers.27.10.06.txt > sangerandUniSTSId.txt
# Need to sort and uniq this file since the UniSTS IDs are being
# replicated for each instance of the sanger name in field 2 of the
# markers file. In some cases the sanger name is replicated.
sort sangerandUniSTSId.txt | uniq > sangerandUniSTSId.uniq
# No need to reformat this for zfishBacClonesandSts
# FPC contig information (i.e. FPC contig number) from ctgnames file is
# not included in the tables as these are dynamic and constantly
# changing with the assembly.
# bacs.names.uniq has the list of BACS in this track
# Get accessions for BAC clones from Genbank (as for danRer3)
# go to http://www.ncbi.nlm.nih.gov
# 1) select "Nucleotide" as the search database.
# 2) Search string:
# Danio rerio[ORGN] AND clone[TITL] NOT survey[TITL]
# Requiring "BAC" in the record seems to exclude some of the BAC clones
# (as well as other types of sequence), so "BAC" was not included in the
# search term.
# Those sequences with "genomic survey" in the title appear to be
# BAC clone end accessions. Here, we want only BAC clone accessions.
# 3) There are 1148560 sequences. (2006-10-27). Select File from Send To
# pulldown menu and name file "BACClones.gbAccs.txt".
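# (The search above was run interactively through the Entrez web interface.
# For reference only, a scripted equivalent could use NCBI E-utilities; this
# was not part of the build and the record count changes over time:)
#   wget -O BACClones.esearch.xml \
#     'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucleotide&term=Danio+rerio[ORGN]+AND+clone[TITL]+NOT+survey[TITL]&usehistory=y'
#   # then fetch the records with efetch.fcgi using the returned
#   # WebEnv/query_key and save the text output as BACClones.gbAccs.txt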
# use script from danRer3 to parse out clone ID and the accession:
cat << '_EOF_' > getAccsandIdsFromGb.pl
#!/usr/bin/perl -w
use strict;
my @clonePrefixes = ("CH211-", "ch211-", "DKEY-", "DKEYP-", "RP71-", "BUSM1-", "CH73-", "CHORI-");
my %cloneHash = qw {
CH211- zC
DKEY- zK
DKEYP- zKp
RP71- bZ
BUSM1- dZ
CH73- CHORI73_
};
my $found = "FALSE";
my $acc = "";
my $id = "";
while (<STDIN>)
{
my ($l, @f, $intId, $extPref, $intPref);
$intPref = "";
$extPref = "";
chomp;
$l = $_;
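# record header lines of the saved Entrez output look like "1: AC024175";
# capture the accession from them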
if ($l =~ /^[0-9]+:\s+([A-Z]+[0-9]{3,})/)
{
$acc = "";
$acc = $1;
$found = "FALSE";
}
elsif ($l =~ /clone/)
{
$id = "";
# check for clone name in this line
foreach my $p (@clonePrefixes)
{
if ($l =~ /clone:?\s?($p[0-9-A-Za-z]+)/)
{
$id = $1;
# translate to upper case
$id =~ tr/a-z/A-Z/;
$extPref = $p;
$found = "TRUE";
}
}
}
if ($found eq "TRUE")
{
if (exists($cloneHash{$extPref}))
{
$intPref = $cloneHash{$extPref};
}
$intId = $id;
# translate this to internal ID
$intId =~ s/$extPref/$intPref/;
print "$intId\t$acc\t$id\n";
$found = "FALSE";
}
}
'_EOF_'
chmod +x getAccsandIdsFromGb.pl
nice perl getAccsandIdsFromGb.pl < BACClones.gbAccs.txt \
> BACClonesIdsandAccs.txt &
# Took about 1 minute
# compare the BAC clones for which accessions were found to those
# for danRer3:
awk '{print $3}' BACClonesIdsandAccs.txt | sort | uniq \
> clonesWithAccs.dr4
awk '{print $3}' \
/cluster/data/danRer3/bed/bacends/bacends.1/BACClonesIdsandAccs.txt \
| sort | uniq > clonesWithAccs.dr3
comm -13 clonesWithAccs.dr4 clonesWithAccs.dr3
# DKEY-188F22
# DKEY-30O13
# Checked these out for searching for each in the Nucleotide database
# at Genbank. DKEY-30O13 only has accessions for the
# end sequences. DKEY-188F22 has an accession for the BAC clone: AP007256
# For some reason this was not found by the search.
# Add this to list:
echo "zK188F22\tAP007256\tDKEY-188F22" >> BACClonesIdsandAccs.txt
# use zfishBacClonesandSts to create tab files for loading into
# bacCloneAlias and bacCloneXRef tables
# make output directory
mkdir out
# Asked Sanger for another version of the file with the Sanger sts aliases
# instead of these numbers in the second field of the markers file.
# (2006-10-26). Received new file (2006-10-27)
# Increased NUMSANGER from 5 to 40 and MAXSANGER from 50 to 60
# because there are multiple occurrences of Sanger names in the second
# field of the markers file and this can be quite a long list.
# clonemarkers file now has 0 for relationship where before it was blank.
# Change this back to blank, otherwise it is processed incorrectly.
perl -pi.bak -e 's/\|0/\|/' clonemarkers.27.10.06.txt
nice $HOME/bin/x86_64/zfishBacClonesandSts ctgnames.27.10.06.txt \
clonemarkers.27.10.06.txt markers.27.10.06.txt \
bacClones.namesandchrom.uniq BACClonesIdsandAccs.txt \
sangerandUniSTSId.uniq ./out > ./out/zfishBacs.out &
# output is in out directory so copy over
cp ./out/*.tab .
# sort alias tab file by sangerName
wc -l *.tab
# 120211 bacAlias.tab
# 507274 bacXRef.tab
# make sure there are no replicate lines:
# also sort alias tab file by sangerName
sort bacAlias.tab | uniq | sort -k2 > bacAlias.sort.tab.uniq
sort bacXRef.tab | uniq > bacXRef.tab.uniq
wc -l bac*.tab.uniq
# 58758 bacAlias.sort.tab.uniq
# 353042 bacXRef.tab.uniq
ssh hgwdev
cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases
hgsql -e 'drop table bacCloneAlias;' danRer4
hgsql -e 'drop table bacCloneXRef;' danRer4
hgLoadSqlTab danRer4 bacCloneAlias \
$HOME/kent/src/hg/lib/bacCloneAlias.sql bacAlias.sort.tab.uniq
hgLoadSqlTab danRer4 bacCloneXRef \
$HOME/kent/src/hg/lib/bacCloneXRef.sql bacXRef.tab.uniq
###########################################################################
# BACENDS: TESTING OF bacCloneAlias AND bacCloneXRef TABLES
# (DONE, 2006-10-27, hartera)
# The following tests were carried out to check that all the data
# in the bacCloneAlias and bacCloneXRef tables is correct.
ssh hgwdev
cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases
mkdir -p testTables
cd testTables
# copy scripts over from danRer3:
cp /cluster/data/danRer3/bed/bacends/cloneandStsAliases/getName*.pl .
cp /cluster/data/danRer3/bed/bacends/cloneandStsAliases/getSanger*.pl .
cp /cluster/data/danRer3/bed/bacends/cloneandStsAliases/formatUniSts.pl .
# scripts were created for danRer2 - see danRer2.txt
# Check that the correct aliases are associated with their Sanger STS names
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $3;}' \
../markers.27.10.06.txt > sNameandaliases
# use script to get one Sanger name and one alias on each line
perl getSangerAndAlias.pl < sNameandaliases > sNameandaliases.format
sort sNameandaliases.format | uniq > sNameandaliases.sort
# get Sanger names and aliases from database
hgsql -N -e 'select sangerName, alias from bacCloneAlias;' danRer4 \
| sort | uniq > alias.db.sort
wc -l alias.db.sort
# 58758 alias.db.sort
diff sNameandaliases.sort alias.db.sort
# No difference between data file and data from database so ok
# Check Sanger STS names correspond in bacAlias and bacCloneXRef tables
# get Sanger names from alias table
hgsql -N -e 'select sangerName from bacCloneAlias;' danRer4 \
| sort | uniq > sName.alias.sort
wc -l sName.alias.sort
# 15595 sName.alias.sort
# get Sanger names from xRef table
hgsql -N -e 'select sangerName from bacCloneXRef where sangerName \
is not null;' danRer4 | sort | uniq > sName.xRef.sort
wc -l sName.xRef.sort
# 15946 sName.xRef.sort
comm -23 sName.alias.sort sName.xRef.sort
# nothing unique to alias file so all Sanger names in the alias table are
# also in the xRef table
comm -13 sName.alias.sort sName.xRef.sort > sNamexRefNotAlias
wc -l sNamexRefNotAlias
# 351 sNamexRefNotAlias
awk 'BEGIN {FS="|"}{print $2}' ../clonemarkers.27.10.06.txt | sort | uniq \
> clonemarkers.sNames.sort
# get Sanger names from markers file
awk 'BEGIN {FS="|"}{print $2}' ../markers.27.10.06.txt > markers.sNames
# remove semi-colons and sort
sed -e 's/;/\n/g' markers.sNames | sort | uniq > markers.sNames.sort
# sanger names unique to markers file
comm -13 clonemarkers.sNames.sort markers.sNames.sort
# there are none
comm -23 clonemarkers.sNames.sort markers.sNames.sort \
> sNames.clonemarkersOnly
wc -l sNames.clonemarkersOnly
# 351 sNames.clonemarkersOnly
diff sNames.clonemarkersOnly sNamexRefNotAlias
# No difference so all the extra Sanger Names in the xRef
# table are from the clonemarkers file and these have no aliases in
# the markers file so they are not in the alias table so this is all ok.
# Check that Sanger STS names and primers are associated correctly
cd /cluster/data/danRer4/bed/bacEnds/cloneandStsAliases/testTables
# get sanger names and primers from markers file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2, $4, $5;}' \
../markers.27.10.06.txt > sNameandPrimers
# use script to reformat and write with one Sanger name per line
chmod +x getSangerandPrimers.pl
perl getSangerandPrimers.pl < sNameandPrimers > sNameandPrimers.format
# Need to sort and uniq due to multiple occurrences of the same
# Sanger name in some lines of the markers file.
sort sNameandPrimers.format | uniq > sNameandPrimers.format.sort
wc -l sNameandPrim*
# 12407 sNameandPrimers
# 32098 sNameandPrimers.format
# 15595 sNameandPrimers.format.sort
# get Sanger names and primers from database
hgsql -N -e \
'select sangerName, leftPrimer, rightPrimer from bacCloneXRef \
where sangerName is not null and leftPrimer is not null and \
rightPrimer is not null;' danRer4 | sort | uniq \
> sNamesandprimers.fromdb.sort
wc -l sNamesandprimers.fromdb.sort
# 15595 sNamesandprimers.fromdb.sort
diff sNamesandprimers.fromdb.sort sNameandPrimers.format.sort
# No difference so ok.
# Check that UniSTS IDs and Sanger STS names are associated correctly
# get Sanger names and UniSTS IDs from the database
hgsql -N -e 'select sangerName, uniStsId from bacCloneXRef where \
uniStsId is not null;' danRer4 | sort | uniq > sNameUniSTS.fromdb.sort
wc -l sNameUniSTS.fromdb.sort
# 5699 sNameUniSTS.fromdb.sort
# Need to reformat the sNameUniSTS.fromdb.sort
chmod +x formatUniSts.pl
perl formatUniSts.pl < sNameUniSTS.fromdb.sort | sort \
> sNameUniSTS.fromdb.format.sort
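# (formatUniSts.pl was copied from danRer3 above and is not reproduced here;
# assuming it just expands the comma-separated ID list in column 2 into one
# "name<TAB>id" pair per line, it is roughly equivalent to:
#   perl -lane 'print "$F[0]\t$_" foreach split(/,/, $F[1]);' \
#       sNameUniSTS.fromdb.sort | sort > sNameUniSTS.fromdb.format.sort )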
# get Sanger names from data file and see how many UniSTS IDs there are
# for each name
awk '{print $1}' ../sangerandUniSTSId.txt | sort | uniq -c | sort -nr \
> sangerandUniSTSId.count
# the most is 160 - this is high due to replicate occurrences of sanger
# STS names (sangerName) in the markers file. Replicates are removed
# during processing.
# 160 etID9511.14
# 132 etID8743.18
# 124 etID9682.15
# 124 etID9681.15
# 96 etID10372.18
# 84 etID8170.14
# 76 etID10495.5
# 66 etID9328.14
# 56 etID9708.3
# use uniq'd file used to create database tables.
sort ../sangerandUniSTSId.uniq > sangerandUniSTSId.txt.sort
diff sangerandUniSTSId.txt.sort sNameUniSTS.fromdb.format.sort
# No difference between data from original file and that in database so ok
# Check that chrom mappings and external BAC clone names are correct
# get extNames and chroms they map to from the database
hgsql -N -e 'select name, chroms from bacCloneXRef where \
chroms is not null;' danRer4 | sort | uniq \
> nameandchromsfromdb.sort
# reformat nameandchromsfromdb.sort
perl formatUniSts.pl < nameandchromsfromdb.sort | sort \
> nameandchromsfromdb.format.sort
# compare extNames and chroms from db to those in data file
cp ../bacClones.namesandchrom .
sort -u bacClones.namesandchrom > bacClones.namesandchrom.uniq
diff bacClones.namesandchrom.uniq nameandchromsfromdb.format.sort
# no difference - all ok
# Check Genbank accessions and internal BAC clone names
hgsql -N -e 'select intName,genbank from bacCloneXRef where \
genbank is not null;' danRer4 | sort | uniq \
> intNamesandAccs.fromdb.sort
# this should be a subset of BACClonesIdsandAccs.txt - not all BAC clones
# listed there appear in either our BAC ends tracks or the markers files.
awk 'BEGIN {OFS="\t"} {print $1,$2}' ../BACClonesIdsandAccs.txt \
| sort -u > BACClonesIntandAccs.sort
comm -23 intNamesandAccs.fromdb.sort BACClonesIntandAccs.sort
# there is nothing in the database that is not in BACClonesIntandAccs.sort
comm -13 intNamesandAccs.fromdb.sort BACClonesIntandAccs.sort \
> onlyinzfishAccs
wc -l onlyinzfishAccs
# 86 onlyinzfishAccs
hgsql -N -e 'select intName from bacCloneXRef where genbank is null;' \
danRer4 | sort | uniq > intNamesNoAcc.fromdb.sort
awk '{print $1;}' BACClonesIntandAccs.sort > intNames.withAccs.sort
comm -12 intNamesNoAcc.fromdb.sort intNames.withAccs.sort \
> indbNoAccsandAccs.out
# none of these names are common to both so all accessions from
# BACClonesIdsandAccs.txt are in the database for the internal names stored
# where there are accessions available.
# Test Sanger STS names, internal names and external names are all correct
# Test Sanger STS name and internal BAC clone names are associated correctly
# get internal names and Sanger names from data file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$2}' ../clonemarkers.27.10.06.txt \
| sort | uniq > intNameandSanger.sort
hgsql -N -e 'select intName, sangerName from bacCloneXRef \
where sangerName is not null;' danRer4 \
| sort | uniq > intNameandSanger.fromdb.sort
diff intNameandSanger.sort intNameandSanger.fromdb.sort
# No difference between data from file and that from database so ok
# Check BAC clone internal name and relationship fields
# get internal names and relationships from data file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $1,$3}' ../clonemarkers.27.10.06.txt \
| sort | uniq > intNameandRelation.sort
# get internal names and relationships from database, some internal names
# may have different relationships associated with each internal name
# and Sanger sts name pair
hgsql -N -e 'select intName, relationship from bacCloneXRef \
where relationship != 0;' danRer4 \
| sort | uniq > intNameandrelation.fromdb.sort
# differences unique to database file
comm -13 intNameandRelation.sort intNameandrelation.fromdb.sort \
> intNameRelation.indbonly
# differences unique to data file
comm -23 intNameandRelation.sort intNameandrelation.fromdb.sort \
> intNameRelation.incloneMarkersonly
wc -l intNameRelation*
# 5051 intNameRelation.incloneMarkersonly
# 5051 intNameRelation.indbonly
awk '{print $1}' intNameRelation.indbonly > intNameRelation.indbonly.names
awk '{print $1}' intNameRelation.incloneMarkersonly \
> intNameRelation.incloneMarkersonly.names
diff intNameRelation.indbonly.names intNameRelation.incloneMarkersonly.names
# there is no difference in the internal names with relationship fields.
# The names match, and the only place these files should differ is that
# the second column should all be 3 in the data from the database only.
# This is because all the relationship entries that were blank in the
# clonemarkers file were changed to 3 when entered into the database.
awk '{print $2}' intNameRelation.indbonly | sort | uniq
# 3 - correct so all ok
# all the differences should be that those that are blank in clonemarkers
# are 3 in the database.
# check that those that have 0 in the database bacCloneXRef relationship
# field are not in the list from cloneMarkers
# select these internal names with 0 relationship from the database
hgsql -N -e 'select intName from bacCloneXRef where relationship = 0;' \
danRer4 | sort | uniq > intNameNoRelation.fromdb.sort
# get all the internal names from the data file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.27.10.06.txt \
| sort | uniq > intNamefromCloneMarkers.sort
comm -12 intNameNoRelation.fromdb.sort intNamefromCloneMarkers.sort
# nothing in common between these two files as expected so there are
# no internal names in the db with 0 in the relationship field that
# appear in the clonemarkers file.
# Check all BAC clone internal names and external names from the
# ctgnames file are in the database
# get intName and extName from ctgnames file
awk 'BEGIN {FS="|"} {OFS="\t"} {print $2,$3}' ../ctgnames.27.10.06.txt \
| sort | uniq > intNameandextNamefromCtgNames.sort
# get intName and extName from database
hgsql -N -e 'select intName,name from bacCloneXRef;' danRer4 \
| sort | uniq > intNameandextName.fromdb.sort
wc -l intNameandextName*
# 334890 intNameandextName.fromdb.sort
# 168828 intNameandextNamefromCtgNames.sort
comm -12 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
> intandextindbAndCtgNames
wc -l intandextindbAndCtgNames
# 168828 intandextindbAndCtgNames
# there are 168828 name pairs common between the file and the database
# and this is the same number of name pairs as in the data file
diff intandextindbAndCtgNames intNameandextNamefromCtgNames.sort
# no difference between those name pairs from the data file and those that
# are common between the data file and the database so all internal and
# external names from ctgNames file are in the database
# get the list of extra ones from db
comm -23 intNameandextName.fromdb.sort intNameandextNamefromCtgNames.sort \
> intandextNamesindbNotinCtgNames
wc -l intandextNamesindbNotinCtgNames
# 166062 intandextNamesindbNotinCtgNames
# get list of internal names from the clonemarkers file
awk 'BEGIN {FS="|"} {print $1}' ../clonemarkers.27.10.06.txt | sort | uniq \
> clonemarkers.intName.sort
wc -l clonemarkers.intName.sort
# 14460 clonemarkers.intName.sort
# compare these intNames to those from the database not in the ctgnames file
comm -12 clonemarkers.intName.sort intandextNamesindbNotinCtgNames
# none of these clone markers internal names are in this list so they
# must all be in the ctgnames file too. These extra internal names will be
# translations of external names found in the list of mappings of BAC clones
# to chroms.
# Check that all the BAC clone external names from the list of chromosome
# mappings and from the ctgnames file are in the database.
# get all extNames from bacClones.namesandchrom.uniq and from ctgnames
awk '{print $1}' ../bacClones.namesandchrom.uniq > \
extNames.ctgnamesandbacClones
awk 'BEGIN {FS="|"} {print $3;}' ../ctgnames.27.10.06.txt \
>> extNames.ctgnamesandbacClones
wc -l extNames.ctgnamesandbacClones
# 474907 extNames.ctgnamesandbacClones
sort extNames.ctgnamesandbacClones | uniq \
> extNames.ctgnamesandbacClones.sort
wc -l extNames.ctgnamesandbacClones.sort
# 334890 extNames.ctgnamesandbacClones.sort
# get extNames from the database
hgsql -N -e 'select name from bacCloneXRef;' danRer4 | sort | uniq \
> extNames.fromdb.sort
wc -l extNames.fromdb.sort
# 334890 extNames.fromdb.sort
comm -12 extNames.fromdb.sort extNames.ctgnamesandbacClones.sort \
> extNames.fromdbandfiles
wc -l extNames.fromdbandfiles
# 334890 extNames.fromdbandfiles
# find extNames in common from data files and database
diff extNames.fromdb.sort extNames.fromdbandfiles
# no difference, all extNames from files are in db
# Check that all BAC clone internal names from the ctgnames and clonemarkers
# files are in the database
# get internal names from ctgnames and clonemarkers files
awk 'BEGIN {FS="|"} {print $2;}' ../ctgnames.27.10.06.txt \
> intNames.ctgnamesandclonemarkers
awk 'BEGIN {FS="|"} {print $1;}' ../clonemarkers.27.10.06.txt \
>> intNames.ctgnamesandclonemarkers
wc -l intNames.ctgnamesandclonemarkers
# 201440 intNames.ctgnamesandclonemarkers
sort intNames.ctgnamesandclonemarkers | uniq \
> intNames.ctgnamesandclonemarkers.sort
wc -l intNames.ctgnamesandclonemarkers.sort
# 168828 intNames.ctgnamesandclonemarkers.sort
# get internal names from database
hgsql -N -e 'select intName from bacCloneXRef;' danRer4 | sort | uniq \
> intNames.fromdb.sort
wc -l intNames.fromdb.sort
# 334890 intNames.fromdb.sort
# some of these intNames are derived from the corresponding extNames
# all of the intNames from the file should be in the db
comm -12 intNames.fromdb.sort intNames.ctgnamesandclonemarkers.sort \
> intNames.fromdbandfiles
wc -l intNames.fromdbandfiles
# 168828 intNames.fromdbandfiles
comm -13 intNames.fromdbandfiles intNames.ctgnamesandclonemarkers.sort
comm -23 intNames.fromdbandfiles intNames.ctgnamesandclonemarkers.sort
# no difference, all intNames from files are in db
# Check that all translations are correct between BAC clone
# external and internal names.
# write script to get the prefixes from internal and external names
chmod +x getNamePrefixes.pl
hgsql -N -e 'select name, intName from bacCloneXRef;' danRer4 \
| sort | uniq > extandintNames.fromdb.sort
perl getNamePrefixes.pl < extandintNames.fromdb.sort \
> extandintNames.prefixes
sort extandintNames.prefixes | uniq > extandintNames.prefixes.uniq
# these all look good
# BUSM1 dZ
# CH211 zC
# CH211 zc
# CH73 CHORI
# CT7 bP
# DKEY zK
# DKEY zk
# DKEYP zKp
# RP71 bZ
# XX bY
# zk is an internal name prefix for the external name prefix, DKEY-. There
# is only one example where this is used (DKEY-81G7); it is in the
# ctgnames file and in the bacCloneXRef table so that is ok.
# All data looks good in these tables now.
###########################################################################
# SPLIT SEQUENCE FOR LIFTOVER CHAINS FROM OTHER DANRER ASSEMBLIES
# (DONE, 2006-06-27, hartera)
# ADD TO SAN FOR PK RUNS (DONE, 2006-05-30, hartera)
ssh kkr3u00
# change script to do this and only rsync to 4,5,6,7, and 8 as
# kkr1u00 and kkr2u00 are down.
cd /cluster/data/danRer4/bed
mkdir -p liftOver
cd liftOver
# commented out the lines in the local copy that make the script abort if
# kkr1u00 is not used; can not connect to kkr1u00 at the moment.
~/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh danRer4 \
/cluster/data/danRer4/nib >&! split.log &
# rsync didn't work properly so do manually
foreach R (4 5 6 7 8)
rsync -a --progress /iscratch/i/danRer4/ kkr${R}u00:/iscratch/i/danRer4/
end
ssh kk
# add split10k to san for pk runs (2006-05-30, hartera)
rsync -a --progress /iscratch/i/danRer4/split10k \
/san/sanvol1/scratch/danRer4/
###########################################################################
# LIFTOVER CHAINS TO DANRER3 (DONE, 2006-05-30 - 2006-05-31, hartera)
# Split (using makeLoChain-split) of danRer3 is doc'ed in makeDanRer3.doc
# Do what makeLoChain-split says to do next (start blat alignment)
# Took too long on kk. Try pk. Scripts only run on kk so run manually.
ssh pk
mkdir -p /cluster/data/danRer4/bed/liftOver
cd /cluster/data/danRer4/bed/liftOver
cat << '_EOF_' > align.csh
#!/bin/csh -fe
set oldAssembly = $1
set oldNibDir = $2
set newAssembly = $3
set newSplitDir = $4
set ooc = $5
if ("$ooc" != "") then
set ooc = '-ooc='$ooc
endif
set blatDir = /cluster/data/$oldAssembly/bed/blat.$newAssembly.`date +%Y-%m-%d`
echo "Setting up blat in $blatDir"
rm -fr $blatDir
mkdir $blatDir
cd $blatDir
mkdir raw psl run
cd run
echo '#LOOP' > gsub
echo 'blat $(path1) $(path2) {check out line+ ../raw/$(root1)_$(root2).psl} ' \
'-tileSize=11 '$ooc' -minScore=100 -minIdentity=98 -fastMap' \
>> gsub
echo '#ENDLOOP' >> gsub
# target
ls -1S $oldNibDir/*.{nib,2bit} > old.lst
# query
ls -1S $newSplitDir/*.{nib,fa} > new.lst
gensub2 old.lst new.lst gsub spec
/parasol/bin/para create spec
echo ""
echo "First two lines of para spec:"
head -2 spec
echo ""
echo "DO THIS NEXT:"
echo " cd $blatDir/run"
echo " para try, check, push, check, ..."
echo ""
exit 0
'_EOF_'
# << emacs
chmod +x align.csh
align.csh danRer4 /san/sanvol1/scratch/danRer4/nib danRer3 \
/san/sanvol1/scratch/danRer3/split10k \
/san/sanvol1/scratch/danRer3/danRer3_11.ooc >&! align.log &
# Took a few seconds.
# Do what its output says to do next (start cluster job)
cd /cluster/data/danRer4/bed/blat.danRer3.2006-05-30/run
para try, check, push, check, ...
para time >&! run.time
# Completed: 784 of 784 jobs
# CPU time in finished jobs: 1482693s 24711.54m 411.86h 17.16d 0.047 y
# IO & Wait Time: 2873s 47.89m 0.80h 0.03d 0.000 y
# Average job time: 1895s 31.58m 0.53h 0.02d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 11350s 189.17m 3.15h 0.13d
# Submission to last job: 13914s 231.90m 3.87h 0.16d
ssh pk
cd /cluster/data/danRer4/bed/liftOver
cat << '_EOF_' > lift.csh
#!/bin/csh -ef
set oldAssembly = $1
set newAssembly = $2
set newLiftDir = /san/sanvol1/scratch/$newAssembly/split10k
set prefix = /cluster/data/$oldAssembly/bed/blat.$newAssembly
set blatDir = `ls -td $prefix.20* | head -1`
echo "using dir $blatDir"
if ( ! -e $blatDir/raw ) then
echo "Can't find $blatDir/raw"
exit 1
endif
if (`ls -1 $newLiftDir/*.lft | wc -l` < 1) then
echo "Can't find any .lft files in $newLiftDir"
exit 1
endif
cd $blatDir/raw
foreach chr (`awk '{print $1;}' /cluster/data/$newAssembly/chrom.sizes`)
echo $chr
liftUp -pslQ ../psl/$chr.psl $newLiftDir/$chr.lft warn chr*_$chr.psl
end
set execDir = $0:h
echo ""
echo "DO THIS NEXT:"
echo " ssh pk"
echo " $execDir/makeLoChain-chain $oldAssembly <$oldAssembly-nibdir> $newAssembly <$newAssembly-nibdir>"
echo ""
exit 0
'_EOF_'
# << emacs
chmod +x lift.csh
lift.csh danRer4 danRer3 >&! lift.log &
# makeLoChain-chain can be run on pk. chain alignments
makeLoChain-chain danRer4 /san/sanvol1/scratch/danRer4/nib \
danRer3 /san/sanvol1/scratch/danRer3/nib >&! chain.log &
cd /cluster/data/danRer4/bed/blat.danRer3.2006-05-30/chainRun
para try, check, push, check, ...
para time
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 4030s 67.16m 1.12h 0.05d 0.000 y
# IO & Wait Time: 939s 15.66m 0.26h 0.01d 0.000 y
# Average job time: 177s 2.96m 0.05h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 797s 13.28m 0.22h 0.01d
# Submission to last job: 953s 15.88m 0.26h 0.01d
# net alignment chains
ssh kkstore04
cd /cluster/data/danRer4/bed/liftOver
makeLoChain-net danRer4 danRer3 >&! net.log &
# load reference to over.chain into database table,
# and create symlinks /gbdb and download area
ssh hgwdev
cd /cluster/data/danRer4/bed/liftOver
makeLoChain-load danRer4 danRer3 >&! load.log &
# clean up
rm *.log
# add md5sum.txt to include this new liftOver file
cd /usr/local/apache/htdocs/goldenPath/danRer4/liftOver
md5sum *.gz > md5sum.txt
# copy README.txt from another liftOver directory.
# test by converting a region using the "convert" link on
# the browser, and comparing to blat of the same region
###########################################################################
# PRODUCING GENSCAN PREDICTIONS (DONE, 2006-05-27, hartera)
# Use scaffolds for random chroms to avoid getting false predictions
# spanning scaffolds in chrNA_random and chrUn_random.
ssh kkstore04
cd /cluster/data/danRer4
# already have a file of soft-masked scaffolds for chrNA_random and
# chrUn_random. Use this to create hard-masked scaffolds FASTA file
# for Genscan run.
foreach c (NA_random Un_random)
cd /cluster/data/danRer4/$c
mkdir scaffoldsHardMask
echo "Hard-masking scaffolds for $c ..."
cd scaffoldsSoftMask
foreach f (*.fa)
maskOutFa $f hard ../scaffoldsHardMask/${f}.masked
end
end
ssh hgwdev
mkdir /cluster/data/danRer4/bed/genscan
cd /cluster/data/danRer4/bed/genscan
cvs co hg3rdParty/genscanlinux
ssh pk
cd /cluster/data/danRer4/bed/genscan
# Make 3 subdirectories for genscan to put their output files in
mkdir gtf pep subopt
# Generate a list file, genome.list, of all the hard-masked contigs that
# *do not* consist of all-N's (which would cause genscan to blow up)
cp /dev/null genome.list
foreach c (`cat /cluster/data/danRer4/chrom.lst`)
echo $c
if (($c == "NA_random") || ($c == "Un_random")) then
foreach s (/cluster/data/danRer4/${c}/scaffoldsHardMask/Zv6_*.fa.masked)
egrep '[ACGT]' $s > /dev/null
if ($status == 0) echo $s >> genome.list
end
else
foreach f ( `ls -1S /cluster/data/danRer4/$c/chr*_*/chr*_?{,?}.fa.masked` )
egrep '[ACGT]' $f > /dev/null
if ($status == 0) echo $f >> genome.list
end
endif
end
wc -l genome.list
# 3237 genome.list
# Create template file, gsub, for gensub2. For example (3-line file):
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
gensub2 genome.list single gsub jobList
para create jobList
para try, check, push, check ... etc.
para time
# Completed: 3236 of 3237 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 46601s 776.69m 12.94h 0.54d 0.001 y
# IO & Wait Time: 10409s 173.48m 2.89h 0.12d 0.000 y
# Average job time: 18s 0.29m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 363s 6.05m 0.10h 0.00d
# Submission to last job: 445s 7.42m 0.12h 0.01d
# If there are crashes, diagnose with "para problems" / "para crashed".
# If a job crashes due to genscan running out of memory, re-run it
# manually with "-window=1200000" instead of "-window=2400000".
para problems > problems
nice /cluster/bin/x86_64/gsBig /cluster/data/danRer4/8/chr8_5/chr8_5.fa.masked gtf/chr8_5.fa.gtf -trans=pep/chr8_5.fa.pep -subopt=subopt/chr8_5.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000 >& chr8_5.fa.log &
# Took about 5 minutes to run
# check log and then remove it
rm chr8_5.fa.log
ssh kkstore04
cd /cluster/data/danRer4/bed/genscan
liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
cat pep/*.pep > genscan.pep
# Load into the database as so:
ssh hgwdev
cd /cluster/data/danRer4/bed/genscan
ldHgGene danRer4 genscan genscan.gtf
# Read 44534 transcripts in 325488 lines in 1 files
# 44534 groups 28 seqs 1 sources 1 feature types
# 44534 gene predictions
hgPepPred danRer4 generic genscanPep genscan.pep
hgLoadBed danRer4 genscanSubopt genscanSubopt.bed
# Loaded 332782 elements of size 6
# compare to other assemblies:
featureBits danRer4 genscan
# 64448019 bases of 1626093931 (3.963%) in intersection
featureBits rn4 genscan
# 54781052 bases of 2571531505 (2.130%) in intersection
featureBits monDom4 genscan
# 45991425 bases of 3501643220 (1.313%) in intersection
featureBits tetNig1 genscan
# 30459626 bases of 342403326 (8.896%) in intersection
featureBits -chrom=chr1 danRer4 refGene genscan -enrichment
# refGene 1.129%, genscan 4.195%, both 0.653%, cover 57.80%, enrich 13.78x
# check CDS only
featureBits -chrom=chr1 danRer4 refGene:cds genscan:cds -enrichment
# refGene:cds 0.746%, genscan:cds 4.195%, both 0.631%, cover 84.52%,
# enrich 20.15x
###########################################################################
# BLASTZ/CHAIN/NET GALGAL3 (DONE 5/30/06 angie)
ssh pk
mkdir /cluster/data/danRer4/bed/blastz.galGal3.2006-05-30
cd /cluster/data/danRer4/bed/blastz.galGal3.2006-05-30
cat << '_EOF_' > DEF
# zebrafish vs. chicken
BLASTZ=/cluster/bin/penn/i386/blastz
# Use same params as used for danRer1-xenTro1 (see makeXenTro1.doc)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Zebrafish danRer4
SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.2bit
SEQ1_CTGDIR=/san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit
SEQ1_LIFT=/san/sanvol1/scratch/danRer4/liftNAandUnScaffoldsToChrom.lft
SEQ1_LEN=/cluster/data/danRer4/chrom.sizes
SEQ1_CTGLEN=/san/sanvol1/scratch/danRer4/chromsUnNAScafs.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000
SEQ1_LIMIT=100
# QUERY: Chicken galGal3 - single chunk big enough to run whole chroms
SEQ2_DIR=/san/sanvol1/galGal3/nib
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/cluster/data/danRer4/bed/blastz.galGal3.2006-05-30
'_EOF_'
# << emacs
doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/scratch/danRer4GalGal3 \
-bigClusterHub=pk -smallClusterHub=pk \
-chainMinScore=5000 -chainLinearGap=loose DEF \
>& do.log & tail -f do.log
ln -s blastz.galGal3.2006-05-30 /cluster/data/danRer4/bed/blastz.galGal3
###########################################################################
# CREATE MICROARRAY DATA TRACK BY ADDING ZON LAB WILD TYPE MICROARRAY DATA TO
# AFFY ZEBRAFISH ALIGNMENTS (DONE, 2006-06-10, hartera)
# UPDATE ARRAY DATA TRACK AFTER PROCESSING ARRAY DATA DIFFERENTLY AND
# RELOADING INTO hgFixed (see hgFixed.txt for details).
# (DONE, 2006-10-20, hartera)
# UPDATE ARRAY DATA TRACK AFTER REPROCESSING ARRAY DATA TO ANTILOG THE LOG2
# VALUES FROM NORMALISATION TO GET THE ABSOLUTE VALUES AND
# RELOADING INTO hgFixed (see hgFixed.txt for details).
# (DONE, 2007-01-08, hartera)
# RE-ORDERED DISPLAY IN TRACK - see ZON LAB WILD TYPE MICROARRAY DATA section
# in danRer3.txt make doc. (DONE, hartera, 2007-04-09)
# Array data is for whole embryos of five wild type zebrafish strains.
# Data is in hgFixed (see hgFixed.doc) - from Len Zon's lab at Children's
# Hospital Boston. Contact: adibiase@enders.tch.harvard.edu
ssh hgwdev
mkdir /cluster/data/danRer4/bed/ZonLab/wtArray
cd /cluster/data/danRer4/bed/ZonLab/wtArray
# use AllRatio table for mapping. There are not many arrays in this
# dataset so using AllRatio will allow the selection of All Arrays
# from the track controls on the track description page. Also set up the
# Zebrafish microarrayGroups.ra so that the Medians of replicates or
# Means of replicates can also be selected for display.
# Create mapped data in zebrafishZonWT.bed.
# (for the later updates, first remove the bed file and drop the table
# from the previous load)
rm zebrafishZonWT.bed
hgsql -e 'drop table affyZonWildType;' danRer4
hgMapMicroarray zebrafishZonWT.bed hgFixed.zebrafishZonWTAllRatio \
/cluster/data/danRer4/bed/affyZebrafish/affyZebrafish.psl
# Loaded 15617 rows of expression data from hgFixed.zebrafishZonWTMedian
# Mapped 14952, multiply-mapped 3867, missed 0, unmapped 665
hgLoadBed danRer4 affyZonWildType zebrafishZonWT.bed
# Loaded 18819 elements of size 15
# add trackDb.ra entry at trackDb/zebrafish level
# look at range of scores:
hgsql -N -e 'select expScores from zebrafishZonWTAllRatio;' hgFixed \
> ratioExps.out
perl -pi.bak -e 's/,/\n/g' ratioExps.out
sort ratioExps.out | uniq -c > ratioExps.uniq.count
textHistogram -binSize=0.5 -real -maxBinCount=40 -minVal=-10 \
ratioExps.out > expRatios.hist
# Most values are between -3 and +2.
# Therefore use the following trackDb entry:
# track affyZonWildType
# shortLabel Wild Type Array
# longLabel Zon Lab Expression data for Wild Type Zebrafish strains
# group regulation
# priority 80
# visibility hide
# type expRatio
# expScale 2.0
# expStep 0.2
# groupings affyZonWildTypeGroups
# The .ra file in /usr/local/apache/cgi-bin/hgCgiData/Zebrafish
# (from ~/kent/src/hg/makeDb/hgCgiData/Zebrafish in the source tree)
# which is microarrayGroups.ra defines how the array data is
# displayed and also grouped for the Medians and Means of Replicates.
# It also defines the labels for the track controls for showing
# All Arrays, Arrays Grouped By Replicate Means or
# Arrays Grouped By Replicate Medians. This is in the description field.
# RE-ORDERED DISPLAY IN TRACK - see danRer3.txt make doc
# (hartera, 2007-04-09)
# 14 somites and 15 somites should come before 36 hpf
# 14-19 somites stage is 16-19h.
# from hgFixed.zebrafishZonWTAllExps
# for AB, 0-8 should go after 14,
# for TL, 16-22 should go after 24
# for TU, 25-27 should go after 32
# re-order accordingly in the config file:
# ~/kent/src/hg/makeDb/hgCgiData/Zebrafish/microarrayGroups.ra
###########################################################################
# HUMAN ORTHOLOGS ADDED TO AFFY ZEBRAFISH TRACK DETAILS
# (DONE, 2006-06-08, hartera)
# Human orthologs were mapped to Affy Zebrafish probes by
# Tony DiBiase (adibiase@enders.tch.harvard.edu) from Len Zon's group
# at Children's Hospital, Boston. They map to human hg16.
ssh kkstore04
mkdir -p /cluster/data/danRer4/bed/affyZebrafish/humanOrthologs
cd /cluster/data/danRer4/bed/affyZebrafish/humanOrthologs
sed -e 's/"//g' cumuList.gedi.2005oct12.txt > hg16Orthologs.txt
awk \
'BEGIN {FS="\t"} {OFS="\t"} {if ($2 == $1) print $1,"",""; else print;}' \
hg16Orthologs.txt > hg16Orthologs.tab
# create a table definition for this set:
cat << 'EOF' > orthologs.sql
# Link together an item with an ortholog
CREATE TABLE affyToHg16Orthologs (
name varchar(255) not null, # Item ID
geneSymbol longblob not null, # Gene Symbol of ortholog
description longblob not null, # Description of ortholog
# Indices
INDEX(name(20)),
INDEX(geneSymbol(20))
);
'EOF'
# load table
ssh hgwdev
cd /cluster/data/danRer4/bed/affyZebrafish/humanOrthologs
hgsql -e 'drop table affyToHg16Orthologs;' danRer4
hgLoadSqlTab danRer4 affyToHg16Orthologs orthologs.sql hg16Orthologs.tab
# edit hgc.c to use this table on the affyZebrafish details page and add
# a search by the human ortholog gene symbol:
# affyZebrafishHg16Ortholog, put in trackDb/zebrafish/trackDb.ra
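# The hgFindSpec search stanza itself is not reproduced here; an xref-style
# search of the following general shape is what is meant (field values are
# illustrative, not copied from the checked-in trackDb.ra):
# searchName affyZebrafishHg16Ortholog
# searchTable affyZebrafish
# searchType psl
# xrefTable affyToHg16Orthologs
# xrefQuery select name,geneSymbol from %s where geneSymbol like '%s%%'
# searchPriority 15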
###########################################################################
# SWAP rn4 BLASTZ CHAIN/NET (DONE, 2006-06-19, hartera)
# See also makeRn4.doc
ssh pk
cd /cluster/data/rn4/bed/blastzDanRer4.2006-06-19
# blastz parameters used in the rn4 vs. danRer4 blastz alignment (see makeRn4.doc):
# BLASTZ_ABRIDGE_REPEATS=0
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=6000
# BLASTZ_K=2200
# BLASTZ_M=50
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap `pwd`/DEF >& swap.log &
ssh hgwdev
featureBits danRer4 chainRn4Link
# 68978593 bases of 1626093931 (4.242%) in intersection
featureBits danRer4 refGene:cds chainRn4Link -chrom=chr1 -enrichment
# refGene:cds 0.746%, chainRn4Link 4.333%, both 0.564%,
# cover 75.55%, enrich 17.43x
featureBits danRer3 refGene:cds chainRn4Link -chrom=chr1 -enrichment
# refGene:cds 0.786%, chainRn4Link 4.320%, both 0.604%,
# cover 76.87%, enrich 17.80x
featureBits danRer4 refGene:cds netRn4 -chrom=chr1 -enrichment
# refGene:cds 0.746%, netRn4 29.601%,both 0.623%,cover 83.49%,enrich 2.82x
featureBits danRer3 refGene:cds netRn4 -chrom=chr1 -enrichment
# refGene:cds 0.786%, netRn4 33.103%, both 0.671%,cover 85.33%,enrich 2.58x
# Add symbolic link to new swap directory
ssh kkstore04
cd /cluster/data/danRer4/bed
ln -s blastz.rn4.swap blastz.rn4
# Check README.txt for downloads.
#######################################################################
# VEGA GENES (DONE, 2006-08-14 - 2006-08-25, hartera)
# ADD DESCRIPTIONS FOR VEGA GENES (DONE, 2006-09-25 - 2006-09-26, hartera)
# Data provided by Kerstin Howe from Sanger: kj2@sanger.ac.uk
# and also Mario Caccamo: mc2@sanger.ac.uk
ssh kkstore04
mkdir /cluster/data/danRer4/bed/vega
cd /cluster/data/danRer4/bed/vega
wget --timestamping \
ftp://ftp.sanger.ac.uk/pub/kj2/gff/vega_in_ensembl.gff
wget --timestamping \
ftp://ftp.sanger.ac.uk/pub/kj2/ZFIN/genes_for_tom_new.txt
# checked the list of genes found in vega_in_ensembl.gff but not in
# genes_for_tom_new.txt (genesWithNoInfo.txt, created below) against the
# later file from Sanger, genes_for_tom_20060725.txt:
grep -f genesWithNoInfo.txt genes_for_tom_20060725.txt
# got a list of 20 that were not in this file: genesWithNoInfo2.txt
# e-mailed Kerstin at Sanger and got the information for these 20 genes:
# moreInfo.txt
# Need to rewrite this file (moreInfo.txt) using tabs - see addTabs.pl below.
# Checked the format for VEGA genes in hg17; it includes an alternate name.
cd /cluster/data/hg17/bed/vega30
# to look at human VEGA
# vegaInfo is transcriptId, otterId, geneId, method and geneDesc
# back in the danRer4 vega directory, pull the transcript/gene ID pairs:
cd /cluster/data/danRer4/bed/vega
awk '{if ((($9 ~ /^ID=OTTDART/) && ($9 ~ /Parent=OTTDARG/)) || \
(($9 ~ /^ID=OTTDART/) && ($9 ~ /Parent=OTTDART/))) print $9;}' \
vega_in_ensembl.gff | sort | uniq > vegaIDs.txt
perl -pi.bak -e 's/ID=//' vegaIDs.txt
# list of transcript ID and corresponding gene ID for Vega
perl -pi.bak -e 's/;Parent=/\t/' vegaIDs.txt
perl -pi.bak -e 's/;Note=Only//' vegaIDs.txt
# write a script to reformat the GFF3 file to GFF format.
# some exon and CDS items belong to more than one transcript ID so these
# lines can just be duplicated. Those items that are labelled as mRNA or
# gene can be ignored and not added to the GFF file. Some of these lines
# have an extra comment e.g. Note=" . These will be ignored anyway as
# they are on the lines with mRNA or gene in them so they will not be in
# the final GFF file.
cat << '_EOF_' > formatGff3ToGff.pl
#!/usr/bin/perl -w
use strict;
my (%idsHash, $gffFile, $idsFile);
$gffFile = $ARGV[0];
open(GFF, $gffFile) || die "Can not open $gffFile\n";
while (<GFF>)
{
my ($line, @f, $t, @trans, $r, $chr);
$line = $_;
if ($line !~ /^#/)
{
@f = split(/\t/, $line);
$chr = "chr" . $f[0];
if (($f[2] ne "gene") && ($f[2] ne "mRNA"))
{
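# the Parent attribute may list several transcript IDs, comma-separated;
# emit one GFF line per parent transcript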
$f[8] =~ /Parent=(OTTDART[0-9]+[A-Z0-9,]+)/;
$t = $1;
@trans = split(/,/, $t);
foreach $r (@trans)
{
print "$chr\t$f[1]\t$f[2]\t$f[3]\t$f[4]\t$f[5]\t$f[6]\t$f[7]\t$r\n";
}
}
}
else
{
# print lines beginning with "#"
print $line;
}
}
close GFF;
'_EOF_'
chmod +x formatGff3ToGff.pl
# Use script to format the GFF3 file to GFF format in order to load
# using ldHgGene
perl formatGff3ToGff.pl vega_in_ensembl.gff > vega.gff
# then use the info file to grab those genes that are pseudogenes, get the
# transcript ID from the vegaIDs.txt file. Then grep out the pseudogenes
# to a separate file. Create an info file. Remove the .KNOWN, .NOVEL,
# .PUTATIVE or .PREDICTED suffix from the method column and add it as a separate
# confidence column.
# check number of items on each line: there are 4 or 6.
# Some genes have more than one clone ID in a comma separated list
# so create two files for loading into two tables.
# Found that some of the clone ID fields have comma separated lists
# and for OTTDARG00000006367, there are 30. Therefore create two info
# tables where one is just for clone IDs.
# NOTE: in future, make sure each row of vegaInfoZfish.txt output has
# 8 fields. The pseudogene entries are missing an entry in the
# confidence field so this should be an empty field.
cat << '_EOF_' > formatVegaInfo.pl
#!/usr/bin/perl -w
use strict;
# format Vega additional information into one file for vegaInfoZfish table
# and another for the vegaToCloneIdZfish table which contains the
# geneId and cloneId for each gene since there are multiple clone IDs for
# some of the genes.
my ($idsFile, $infoFile, $outFile1, $outFile2, %idsHash);
$idsFile = $ARGV[0];
$infoFile = $ARGV[1];
$outFile1 = $ARGV[2];
$outFile2 = $ARGV[3];
open (IDS, $idsFile) || die "Can not open $idsFile: $!\n";
open (INFO, $infoFile) || die "Can not open $infoFile: $!\n";
open (OUT1, ">$outFile1") || die "Can not create $outFile1: $!\n";
open (OUT2, ">$outFile2") || die "Can not create $outFile2: $!\n";
open (STDERR, ">info.log") || die "Can not create info.log: $!\n";
while (<IDS>)
{
my ($line, @f);
chomp;
$line = $_;
@f = split(/\t/, $line);
$idsHash{$f[1]} = $f[0];
}
close IDS;
while (<INFO>)
{
my ($line,@fi,$id,$gene,$trans,@transIds, $tr,@clones, $c,@t, $method, $conf);
chomp;
$gene = "";
$line = $_;
@fi = split(/\t/, $line);
$id = $gene = $fi[1];
# get all the transcript IDs for a gene
while (exists($idsHash{$id}))
{
$trans = $idsHash{$id};
push(@transIds, $trans);
$id = $trans;
}
# push clone IDs into an array:
@clones = split(/,/, $fi[2]);
@t = split(/\./, $fi[3]);
$method = $t[0];
if ($#t > 0)
{
$conf = $t[1];
}
elsif ($#t == 0)
{
$conf = "";
}
else
{
print STDERR "Should be 4 or 6 items per row, found $#fi \n";
}
foreach $tr (@transIds)
{
print OUT1 "$tr\t$fi[1]\t$fi[0]";
if ($#fi == 5)
{
print OUT1 "\t$fi[4]\t$fi[5]\t$method\t\t$conf\n";
}
elsif ($#fi == 3)
{
print OUT1 "\t\t\t$method\t\t$conf\n";
}
# print out clone IDs for each transcript
foreach $c (@clones)
{
print OUT2 "$tr\t$c\n";
}
}
if($gene && !exists($idsHash{$gene}))
{
print STDERR "$gene\n";
}
}
close IDS;
close INFO;
close OUT1;
close OUT2;
close STDERR;
'_EOF_'
chmod +x formatVegaInfo.pl
wc -l genes_for_tom_new.txt
# 4822 genes_for_tom_new.txt
awk '{print $2}' genes_for_tom_new.txt | sort | uniq > genesWithInfo.txt
awk '{if ($2 ~ /OTTDARG/) print $2;}' vegaIDs.txt \
| sort | uniq > genesFromGff.txt
wc -l genesFromGff.txt
# 4947 genesFromGff.txt
comm -12 genesWithInfo.txt genesFromGff.txt | wc -l
# 4033
comm -13 genesWithInfo.txt genesFromGff.txt | wc -l
# 914
comm -13 genesWithInfo.txt genesFromGff.txt > genesWithNoInfo.txt
# sent this list to Sanger to ask about getting additional information
# for these genes.
comm -23 genesWithInfo.txt genesFromGff.txt | wc -l
# 789
# got another file from Sanger that should contain the information
# for the 914 genes missing information above.
ftp://ftp.sanger.ac.uk/pub/kj2/ZFIN/060725/genes_for_tom_20060725.txt
# check if this contains all of the list missing before
sort genesWithNoInfo.txt | uniq > genesWithNoInfo.sort
awk '{print $2}' genes_for_tom_20060725.txt | sort | uniq > genes.txt
comm -13 genes.txt genesWithNoInfo.sort > genesWithNoInfo2.txt
# there are 20 of these. Sent these to Sanger and received
# information for these. Copied and pasted these from e-mail into
# moreInfo.txt. Write script to reformat: addTabs.pl
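# (addTabs.pl is not reproduced in this doc; assuming moreInfo.txt is the
# e-mail paste with columns separated by runs of spaces, the reformatting
# amounts to roughly:
#   perl -lpe 's/\s{2,}/\t/g' moreInfo.txt > geneInfo3.txt )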
perl addTabs.pl < moreInfo.txt > geneInfo3.txt
grep -f genesWithInfo.txt genes_for_tom_20060725.txt > tmp
wc -l tmp
# 4738
wc -l genesWithInfo.txt
# 4822 genesWithInfo.txt
# Not all of these are in genes_for_tom_20060725.txt so merge all the
# info files and uniq:
cat genes_for_tom_new.txt genes_for_tom_20060725.txt geneInfo3.txt \
| sort | uniq > allGeneInfo.txt
awk '{print $2}' allGeneInfo.txt | sort | uniq -c | sort -nr > count
# counts gene names - often occur twice but with more information in
# one case than the other. Seems like newer file has most information for
# each gene.
grep -f genesFromGff.txt genes_for_tom_20060725.txt > info1.txt
# then list the genes in info1.txt (this step mirrors the info2.txt step below)
awk '{print $2}' info1.txt | sort | uniq > genesInInfo1.sort
comm -13 genesInInfo1.sort genesFromGff.txt > genes1
wc -l genes1
# 55 genes1
grep -f genes1 genes_for_tom_new.txt > info2.txt
awk '{print $2}' info2.txt | sort | uniq > genesInInfo2.sort
comm -13 genesInInfo2.sort genes1 > genes2
wc -l genes2
# 20 genes2
# genes2 is list of genes not found in either file. Should be 20 left.
awk '{print $2}' geneInfo3.txt | sort | uniq > genes3
comm -12 genes2 genes3 | wc -l
# 20 - so these are the same 20 that are in geneInfo3.txt
# These are in geneInfo3.txt. cat all these files together
cat info1.txt info2.txt geneInfo3.txt > allGeneInfo2.txt
# Recreate the tab file for loading into the vegaInfoZfish table:
rm vegaInfoZfish.txt
# Use new version that prints out one row for each accession in field 3.
perl formatVegaInfo.pl vegaIDs.txt allGeneInfo2.txt vegaInfoZfish.txt \
vegaToCloneId.txt
# info.log contains genes for which are not in the gff file of VEGA
# and this is empty as it should be.
wc -l vegaInfoZfish.txt
# 6606 vegaInfoZfish.txt
wc -l vegaToCloneId.txt
# 7245 vegaToCloneId.txt
awk '{print $1}' vegaInfoZfish.txt | sort | uniq -c | sort -nr > out2
# transcripts only have 1 entry
awk '{print $2}' vegaInfoZfish.txt | sort | uniq > infogenes.txt
comm -13 infogenes.txt genesFromGff.txt
# There are no genes in the GFF file that are not in vegaInfoZfish.txt
# Then remake the pseudogenes track from this.
# Next step is to find which transcripts are pseudogenes.
grep pseudogene vegaInfoZfish.txt | sort | uniq | wc -l
# There are only 51 in the info file, and all of these are in the GFF
# file. Anyway, this is too sparse for a separate track, but
# a subtrack could be created.
# Get transcript IDs for pseudogenes.
grep pseudogene vegaInfoZfish.txt | awk '{print $1}' > pseudogenes.ids
grep -f pseudogenes.ids vega.gff > vegaPseudoGene.gff
awk '{print $9}' vegaPseudoGene.gff |sort | uniq | wc -l
# 51
grep -v -f pseudogenes.ids vega.gff > vegaGene.gff
wc -l vega*ff
# 98170 vega.gff
# 97999 vegaGene.gff
# 171 vegaPseudoGene.gff
# load gff files:
ssh hgwdev
cd /cluster/data/danRer4/bed/vega
hgsql -e 'drop table vegaGene;' danRer4
hgsql -e 'drop table vegaPseudoGene;' danRer4
ldHgGene danRer4 vegaGene vegaGene.gff
# Read 6555 transcripts in 88104 lines in 1 files
# 6555 groups 25 seqs 1 sources 2 feature types
# 6555 gene predictions
ldHgGene danRer4 vegaPseudoGene vegaPseudoGene.gff
# Read 51 transcripts in 171 lines in 1 files
# 51 groups 9 seqs 1 sources 1 feature types
# 51 gene predictions
# Then create SQL table for adding the zebrafish-specific information
# Add clone_id to a separate table instead of this one.
cat << '_EOF_' > ~/kent/src/hg/lib/vegaInfoZfish.as
table vegaInfoZfish
"Vega Genes track additional information"
(
string transcriptId; "Vega transcript ID"
string geneId; "Vega gene ID (OTTER ID)"
string sangerName; "Sanger gene name"
string zfinId; "ZFIN ID"
string zfinSymbol; "ZFIN gene symbol"
string method; "GTF method field"
string geneDesc; "Vega gene description"
string confidence; "Status (KNOWN, NOVEL, PUTATIVE, PREDICTED)"
)
'_EOF_'
cd ~/kent/src/hg/lib/
autoSql vegaInfoZfish.as vegaInfoZfish
mv vegaInfoZfish.h ../inc/
# commit vegaInfoZfish{.h,.c,.as,.sql} files to CVS
# add INDEX(geneId) to vegaInfoZfish.sql
# Need to change geneDesc to longblob type because some descriptions
# are long (2006-09-26, hartera)
cd ~/kent/src/hg/lib
perl -pi.bak -e 's/geneDesc varchar\(255\)/geneDesc longblob/' \
vegaInfoZfish.sql
# create a second table for the cloneId accessions since there
# are multiple ids for some VEGA genes. Otherwise, there would be
# a comma separated list in this field or many rows repeated but just
# different in the cloneId field. Associate transcript ID to clone IDs.
grep ',' allGeneInfo2.txt | wc -l
# 378
cat << '_EOF_' > ~/kent/src/hg/lib/vegaToCloneId.as
table vegaToCloneId
"Vega Genes track cloneId information"
(
string transcriptId; "Vega transcript ID"
string cloneId; "clone ID"
)
'_EOF_'
cd ~/kent/src/hg/lib/
autoSql vegaToCloneId.as vegaToCloneId
# replace PRIMARY KEY(transcriptId) with indices on transcriptId and cloneId:
perl -pi.bak -e \
's/PRIMARY KEY\(transcriptId\)/INDEX\(transcriptId\),\nINDEX\(cloneId\)/' \
vegaToCloneId.sql
rm *.bak
# mv vegaInfoZfish.h ../inc/
cd /cluster/data/danRer4/bed/vega
hgsql -e 'drop table vegaInfoZfish;' danRer4
hgLoadSqlTab danRer4 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
vegaInfoZfish.txt
hgsql -e 'drop table vegaToCloneId;' danRer4
hgLoadSqlTab danRer4 vegaToCloneId ~/kent/src/hg/lib/vegaToCloneId.sql \
vegaToCloneId.txt
# Add code to hgc.c so that this works for Zebrafish and creates the
# relevant links. Add searches by vega transcript ID, ZFIN ID and
# clone ID (see the example search stanza after the trackDb entry below).
# Add a Vega zebrafish-specific description to
# trackDb/zebrafish. The Pseudogenes are a subtrack of the Genes track
# because it is too sparse to show as a separate track.
# Added entry in zebrafish/trackDb.ra to create these tracks as subtracks of
# a Vega Genes track.
# track vegaGeneZfish
# compositeTrack on
# shortLabel Vega Genes
# longLabel Vega Annotations
# group genes
# priority 37
# visibility hide
# chromosomes chr1,chr2,chr3,chr4,chr5,chr6,chr7,chr8,chr9,chr10,chr11,chr12,chr13,chr14,chr15,chr16,chr17,chr18,chr19,chr20,chr21,chr22,chr23,chr24,chr25
# type genePred
# url http://vega.sanger.ac.uk/Danio_rerio/geneview?transcript=$$
# track vegaGene
# subTrack vegaGeneZfish
# shortLabel Vega Genes
# longLabel Vega Gene Annotations
# priority 1
# color 0,100,180
# track vegaPseudoGene
# subTrack vegaGeneZfish
# shortLabel Vega Pseudogenes
# longLabel Vega Annotated Pseudogenes
# priority 2
# color 155,0,125
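# For the searches mentioned above (Vega transcript ID, ZFIN ID and clone ID),
# hgFindSpec-style search stanzas were added to trackDb/zebrafish/trackDb.ra.
# They are not reproduced here; the transcript ID search has this general
# shape (values illustrative, not copied from the checked-in file):
# searchTable vegaGene
# searchType genePred
# termRegex OTTDART[0-9]+
# searchPriority 50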
# ADD Descriptions for Vega Genes (2006-09-25 - 2006-09-26, hartera)
# Looked into using description from BioMart for VEGA genes but easier
# to get them all directly from Sanger. Kerstin sent a list of
# descriptions: for_rachel.txt
ssh kkstore04
mkdir /cluster/data/danRer4/bed/vega/description
# copy file here and rename
cd /cluster/data/danRer4/bed/vega/description
mv for_rachel.txt vegaDesc.txt
# get list of VEGA gene IDs in vegaInfoZfish
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/description
hgsql -e 'select distinct(geneId) from vegaInfoZfish;' danRer4 | sort \
> geneIdsFromInfo.sort
# get sorted list of gene IDs from description file:
awk '{print $2;}' vegaDesc.txt | sort | uniq > vegaDesc.ids.sort
wc -l *.sort
comm -12 geneIdsFromInfo.sort vegaDesc.ids.sort | wc
# 4892
comm -23 geneIdsFromInfo.sort vegaDesc.ids.sort > genesNoDesc.txt
# 55 with no description. sent this list to Sanger and got the
# descriptions for these too: descriptions_for_Rachel.txt
awk '{print $1}' descriptions_for_Rachel.txt | sort | uniq \
> geneIds.newDesc.sort
comm -12 genesNoDesc.txt geneIds.newDesc.sort | wc
# 55 gene names in common
ssh kkstore04
cd /cluster/data/danRer4/bed/vega/description
cat vegaDesc.txt descriptions_for_Rachel.txt > vegaAllDesc.txt
wc -l vegaAllDesc.txt
# 6440 vegaAllDesc.txt
# clean up
rm genesNoDesc.txt geneIds* vegaDesc.ids.sort
# Then add these to the vegaInfoZfish table
cat << 'EOF' > addDesc.pl
#!/usr/bin/perl -w
use strict;
my ($infoFile, $descFile, %descHash);
$infoFile = $ARGV[0]; # vegaInfoZfish.txt file
$descFile = $ARGV[1]; # file of descriptions
open(INFO, $infoFile) || die "Can not open $infoFile : $!\n";
open(DESC, $descFile) || die "Can not open $descFile : $!\n";
while (<DESC>)
{
my($line, @f, $id, $desc);
chomp;
$line = $_;
@f = split(/\t/, $line);
if ($#f > 0 && $f[1] =~ /^OTTDARG/)
{
$id = $f[1];
$desc = $f[2];
}
elsif ($f[0] =~ /^(OTTDARG[0-9]+)\s*(.+)/)
{
# some lines have just id and description with only a space between
$id = $1;
$desc = $2;
}
else
{
print "OTTDARG ID not found \n";
}
$descHash{$id} = $desc;
}
close DESC;
while (<INFO>)
{
my ($li, @fi, $de, $i, $last);
$de = "";
chomp;
$li = $_;
@fi = split(/\t/, $li);
if ($fi[1] =~ /OTTDARG/)
{
if (exists($descHash{$fi[1]}))
{
$de = $descHash{$fi[1]};
}
else
{
print "There is no description for $fi[1] available.\n";
}
}
$last = $#fi;
for ($i = 0; $i <= 5; $i++ )
{
print "$fi[$i]\t";
}
print "$de\t";
if ($last == 5)
{
# if there are only 5 fields, the last one is missing so add extra tab
print "\t\n";
}
else
{
print "$fi[$last]\n";
}
}
close INFO;
'EOF'
chmod +x addDesc.pl
# add new descriptions to vegaInfoZfish.txt file
perl addDesc.pl ../vegaInfoZfish.txt vegaAllDesc.txt \
> vegaInfoZfishWithDesc.txt
# Reload vegaInfoZfish table
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/description
# 105 warnings when loading the table
# remove "\N" from desc
perl -pi.bak -e 's/\\N//' vegaInfoZfishWithDesc.txt
# this removed 3 warnings
# after dumping the contents of the table and diffing with the input
# file, found that the pseudogenes are missing the confidence field
# and so there is a tab missing from the file. Modified addDesc.pl to
# add the extra tab when only 7 tabbed fields instead of 8 are found
# in a row.
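# An illustrative sanity check (not part of the original run) for rows that
# lack the expected 8 tab-separated fields (i.e. 7 tabs) before reloading:
awk '{n = gsub(/\t/, "\t"); if (n != 7) print NR": "n" tabs"}' \
    vegaInfoZfishWithDesc.txt | head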
hgsql -e 'drop table vegaInfoZfish;' danRer4
hgLoadSqlTab danRer4 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
vegaInfoZfishWithDesc.txt
# Try loading GTF format file (2006-10-19)
ssh kkstore04
cd /cluster/data/danRer4/bed/vega/new
wget ftp://ftp.sanger.ac.uk/pub/is1/vega.gtf
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/new
ldHgGene -bin -genePredExt danRer4 vegaNew vega.gtf
# Error: Read 6371 transcripts in 88275 lines in 1 files
# 6371 groups 25 seqs 4 sources 2 feature types
# invalid gffGroup detected on line: chr22 NOVEL exon 6782575
# 6783240 0.000000 - . gene_id "si:rp71-1i20.2"; transcript_id
# "si:rp71-1i20.2-001";
# GFF/GTF group si:rp71-1i20.2-001 on chr22+, this line is on chr22-, all group
# members must be on same seq and strand
# transcript_id is not unique. otter_transcript_id is unique so switch these.
cp vega.gtf vegaNew.gtf
# ldHgGene groups by transcript Id so use OTTER IDS instead
perl -pi.bak -e 's/transcript_id/other_transcript_id/' vegaNew.gtf
perl -pi.bak -e 's/otter_transcript_id/transcript_id/' vegaNew.gtf
ldHgGene -bin -genePredExt danRer4 vegaNew vegaNew.gtf
# worked ok
# Added this as a vegaGeneNew subtrack for Vega Genes
ssh kkstore04
cd /cluster/data/danRer4/bed/vega/new
# find genes that have the same transcript IDs for different OTTER gene_ids
awk 'BEGIN {FS="\t"} {print $9}' vega.gtf > vegaAttributes
awk 'BEGIN {FS=";"} {print $2, $5}' vegaAttributes \
> vegaAttrib.transIdandotterId
sort vegaAttrib.transIdandotterId | uniq \
> vegaAttrib.transIdandotterId.uniq
awk '{print $2}' vegaAttrib.transIdandotterId.uniq | sed -e 's/\s//' \
| sort | uniq -c | sort -nr > vegaAttrib.transId.count
# 88 of these transcripts have more than one entry in gtf file. Need
# to check if they have different OTTER gene ids in each case.
head -88 vegaAttrib.transId.count | awk '{print $2}' > transIds.morethan1
grep -w -f transIds.morethan1 vegaAttrib.transIdandotterId.uniq \
> transIdswithDiffOtterGeneIds.txt
awk '{print $2}' transIdswithDiffOtterGeneIds.txt | sort | uniq \
> transIds.diffOtterGeneIds.txt
# send transIdswithDiffOtterGeneIds.txt to Kerstin at Sanger. List
# of transcript Ids with different instances of OTTER gene ids.
# WAITING NOW FOR VEGA GENE UPDATE (2006-10-19)
# Received e-mail from Ian Sealy at Sanger (is1@sanger.ac.uk) that
# Vega gene update is ready in gtf format (2006-11-02)
ssh kkstore04
cd /cluster/data/danRer4/bed/vega
mkdir update
cd update
wget --timestamping ftp://ftp.sanger.ac.uk/pub/is1/vega.gtf
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/update
ldHgGene -bin -genePredExt danRer4 vegaUpdate vega.gtf
# Read 6823 transcripts in 93253 lines in 1 files
# 6823 groups 25 seqs 4 sources 2 feature types
# invalid gffGroup detected on line: chr22 PUTATIVE exon
# 6790927 6791256 0.000000 - . gene_id "si:rp71-1i20.2";
# transcript_id "RP71-1I20.1-001";
# GFF/GTF group RP71-1I20.1-001 on chr22+, this line is on chr22-, all group
# members must be on same seq and strand
# Still has non-unique transcript IDs - need to wait for next release
# of VEGA genes and Ensembl for this to be fixed.
# Received new update of VEGA from Ian Sealy (is1@sanger.ac.uk) on
# 2007-02-14.
ssh kkstore04
cd /cluster/data/danRer4/bed/vega
wget --timestamping ftp://ftp.sanger.ac.uk/pub/is1/vega.gtf
# Load into database
# 2007-03-09
ssh hgwdev
cd /cluster/data/danRer4/bed/vega
ldHgGene -bin -genePredExt danRer4 vega vega.gtf
# invalid gffGroup detected on line: chr4 NOVEL exon 35259893
# 35259994 0.000000 + . gene_id "sinup"; transcript_id
# "siah2l-001";
# GFF/GTF group siah2l-001 on chr21-, this line is on chr4+, all group members
# must be on same seq and strand
# still get duplicate transcript IDs on different chromosomes.
# Below is what Kerstin Howe (kj2@sanger.ac.uk) advised on these cases:
# "this will continue to happen as long as the map still changes. The
# gene in question was annotated on two adjacent clones which were
# apparently then broken up and assigned to different chromosomes.
# Usually, this is not too alarming (just delete those cases, please)"
# Find other such cases:
awk 'BEGIN{OFS="\t"} {print $1, $12}' vega.gtf > vegachromAndId.txt
sort vegachromAndId.txt | uniq > vegachromAndId.uniq
awk '{print $2}' vegachromAndId.uniq | sort | uniq -c | sort -nr \
> vegaIds.count
# These transcript IDs all appear twice on different chromosomes. There could
# be cases where there are transcripts that are duplicated on the same
# chromosome.
# 2 "taf6-001";
# 2 "siah2l-001";
# 2 "rasgrf2-001";
# 2 "lmx1b-001";
# 2 "fvt1-001";
# 2 "ckmt2-002";
# 2 "ckmt2-001";
# 2 "accn2c-001";
# There are some cases where the gene is on the same chrom but different
# strands.
awk 'BEGIN{OFS="\t"} {print $1, $7, $12}' vega.gtf \
| sort | uniq > vegachromStrandAndId.uniq
awk '{print $1, $3}' vegachromStrandAndId.uniq | sort | uniq -c \
| sort -nr > vegaIdsAndChroms.count
# These occur twice on different strands of the same chromosome:
# 2 chr19 "DKEY-264N13.5-001";
# 2 chr14 "stx5a-001";
# Remove these from the GTF file as suggested by Kerstin Howe (Sanger)
head -8 vegaIds.count | awk '{print $2}' > transcriptIds.remove
head -2 vegaIdsAndChroms.count | awk '{print $3}' >> transcriptIds.remove
grep -v -f transcriptIds.remove vega.gtf > vega2.gtf
# reload into danRer4 database
hgsql -e 'drop table vegaUpdate;' danRer4
ldHgGene -bin -genePredExt danRer4 vegaUpdate vega2.gtf
# successfully loads now.
# ldHgGene groups by transcript Id so use OTTER IDS instead
sed -e 's/transcript_id/other_transcript_id/' vega.gtf > vegaFormat.gtf
perl -pi.bak -e 's/otter_transcript_id/transcript_id/' vegaFormat.gtf
# Now it loads ok without removing duplicate transcript IDs:
ldHgGene -bin -genePredExt danRer4 vegaFormat vegaFormat.gtf
# Read 8817 transcripts in 119707 lines in 1 files
# 8817 groups 29 seqs 4 sources 2 feature types
# 8817 gene predictions
hgsql -N -e 'select distinct(name2) from vegaFormat;' danRer4 > name2
# Extra information obtained from Sanger:
ssh kkstore04
cd /cluster/data/danRer4/bed/vega
wget --timestamping \
ftp://ftp.sanger.ac.uk/pub/kj2/ZFIN/061111/noH/genes_for_tom.txt
sort name2 > name2.sort
awk '{print $1}' genes_for_tom.txt | sort | uniq > genesfortom.symbs.sort
comm -23 name2.sort genesfortom.symbs.sort > vegaGtfOnly
wc -l vegaGtfOnly
# 4021
awk '{print $6}' genes_for_tom.txt | sort | uniq > genesfortom.altsymb.sort
comm -23 vegaGtfOnly genesfortom.altsymb.sort
# rest of symbols found as alternate symbols in column 6 of this file
# subtract this from original list
comm -13 vegaGtfOnly name2.sort > genesincol1
# Also received descriptions file and additional information from Sanger.
# Now the track can be updated since the vega.gtf file loads into the
# database; see the VEGA UPDATE section below.
#######################################################################
# VEGA UPDATE (DONE, 2007-03-26 - 2007-03-28, hartera)
# Data provided by Kerstin Howe from Sanger: kj2@sanger.ac.uk
# and also Ian Sealy: is1@sanger.ac.uk
# GTF file sent on 2007-02-14
# Updated formatVegaInfo.pl script as not all transcripts were being included
# in the vegaInfoZfish and the vegaToCloneId tables so all tables were
# re-made (DONE, 2007-04-06, hartera)
ssh kkstore04
mkdir /cluster/data/danRer4/bed/vega.2007-02-14
cd /cluster/data/danRer4/bed/vega
ln -s /cluster/data/danRer4/bed/vega.2007-02-14 \
/cluster/data/danRer4/bed/vega
wget --timestamping ftp://ftp.sanger.ac.uk/pub/is1/vega.gtf
wget --timestamping \
ftp://ftp.sanger.ac.uk/pub/kj2/ZFIN/061111/noH/genes_for_tom.txt
# list of gene descriptions by Kerstin Howe (2007-03-12)
mv for_rachel.txt vegaDescriptions.txt
mv genes_for_tom.txt vegaInformation.txt
# vegaInfo is transcriptId, otterId, geneId, method and geneDesc
# Get otter transcript ID and otter gene ID:
awk 'BEGIN{OFS="\t"} \
{if (($17 ~ /otter_gene_id/) && ($19 ~ /otter_transcript_id/)) \
print $20, $18;}' vega.gtf \
> vegaIDs.txt
perl -pi.bak -e 's/;//g' vegaIDs.txt
perl -pi.bak -e 's/\"//g' vegaIDs.txt
# list of transcript ID and corresponding gene ID for Vega
sort vegaIDs.txt | uniq > vegaIDs.uniq
# then use the info file to grab those genes that are pseudogenes, get the
# transcript ID from the vegaIDs.txt file. Then grep out the pseudogenes
# to a separate file. Create an info file. Remove the .NOVEL or .PUTATIVE
# or .KNOWN or .PREDICTED suffix from the method column and add it as a separate
# confidence column.
# check number of items on each line: there are 4 or 6.
# Some genes have more than one clone ID in a comma separated list
# so create two files for loading into two tables.
# Found that some of the clone ID fields have comma separated lists
# and for OTTDARG00000006367, there are 30. Therefore create two info
# tables where one is just for clone IDs.
# NOTE: in future, make sure each row of vegaInfoZfish.txt output has
# 8 fields. The pseudogene entries are missing an entry in the
# confidence field so this should be an empty field.
# Updated formatVegaInfo.pl as not getting all transcript IDs in the
# vegaInfoZfish table (hartera, 2007-04-06)
cat << '_EOF_' > formatVegaInfo.pl
#!/usr/bin/perl -w
use strict;
# format Vega additional information into one file for vegaInfoZfish table
# and another for the vegaToCloneIdZfish table which contains the
# geneId and cloneId for each gene since there are multiple clone IDs for
# some of the genes.
my ($idsFile, $infoFile, $outFile1, $outFile2, %idsHash);
$idsFile = $ARGV[0]; # list of Transcript IDs and Gene IDs
$infoFile = $ARGV[1]; # information file for Vega genes
$outFile1 = $ARGV[2]; # output1 is the formatted file of Vega info for table
$outFile2 = $ARGV[3]; # output2 is a vega to clone ID conversion table
open (IDS, $idsFile) || die "Can not open $idsFile: $!\n";
open (INFO, $infoFile) || die "Can not open $infoFile: $!\n";
open (OUT1, ">$outFile1") || die "Can not create $outFile1: $!\n";
open (OUT2, ">$outFile2") || die "Can not create $outFile2: $!\n";
open (STDERR, ">info.log") || die "Can not create info.log: $!\n";
while (<IDS>)
{
my ($line, @f);
chomp;
$line = $_;
@f = split(/\t/, $line);
# hash is keyed by gene ID but there could be more than one transcript
# associated with a gene ID so need to create an array for the hash
push @{$idsHash{$f[1]}}, $f[0];
}
close IDS;
while (<INFO>)
{
my ($line,@fi,$id,$gene,@transIds, $tr,@clones, $c,@t, $method, $conf);
chomp;
$gene = "";
$line = $_;
@fi = split(/\t/, $line);
$id = $gene = $fi[1];
# get all the transcript IDs for a gene
if (exists($idsHash{$id}))
{
@transIds = @{$idsHash{$id}};
}
# push clone IDs into an array:
@clones = split(/,/, $fi[2]);
@t = split(/\./, $fi[3]);
$method = $t[0];
if ($#t > 0)
{
$conf = $t[1];
}
elsif ($#t == 0)
{
$conf = "";
}
else
{
print STDERR "Should be 4 or 6 items per row, found $#fi \n";
}
foreach $tr (@transIds)
{
print OUT1 "$tr\t$fi[1]\t$fi[0]";
if ($#fi == 5)
{
print OUT1 "\t$fi[4]\t$fi[5]\t$method\t\t$conf\n";
}
elsif ($#fi == 3)
{
print OUT1 "\t\t\t$method\t\t$conf\n";
}
# print out clone IDs for each transcript
foreach $c (@clones)
{
print OUT2 "$tr\t$c\n";
}
}
if($gene && !exists($idsHash{$gene}))
{
print STDERR "$gene\n";
}
}
close IDS;
close INFO;
close OUT1;
close OUT2;
close STDERR;
'_EOF_'
chmod +x formatVegaInfo.pl
wc -l vegaInformation.txt
# 7169 vegaInformation.txt
awk '{print $2}' vegaInformation.txt | sort | uniq > genesWithInfo.txt
awk '{if ($2 ~ /OTTDARG/) print $2;}' vegaIDs.uniq \
| sort | uniq > genesFromGtf.txt
# Number of genes with info AND in gtf file:
wc -l genesFromGtf.txt
# 6171 genesFromGtf.txt
comm -12 genesWithInfo.txt genesFromGtf.txt | wc -l
# 6171
# Number of genes with no info:
comm -13 genesWithInfo.txt genesFromGtf.txt | wc -l
# 0
# Use perl script above to extract vegaInfo table information.
# Re-did this with updated perl script to get all transcript IDs
# (hartera, 2007-04-07)
perl formatVegaInfo.pl vegaIDs.uniq vegaInformation.txt \
vegaInfoZfish.txt vegaToCloneId.txt
# info.log contains genes that are not in the GFF file of VEGA,
# and it is empty as it should be.
wc -l vegaInfoZfish.txt
# 8817 vegaInfoZfish.txt
wc -l vegaToCloneId.txt
# 9698 vegaToCloneId.txt
# The vegaToCloneId.txt file is also larger than before as it now
# has all the transcript IDs (hartera, 2007-04-05).
awk '{print $1}' vegaInfoZfish.txt | sort | uniq -c | sort -nr > out2
# transcripts only have 1 entry
awk '{print $2}' vegaInfoZfish.txt | sort | uniq > infogenes.txt
comm -13 infogenes.txt genesFromGtf.txt
# There are no genes in the GFF file that are not in vegaInfoZfish.txt
# However, there are genes in the information file that do not have
# transcripts represented that are in the GFF file.
# Then remake the pseudogenes track from this.
# Next step is to find which transcripts are pseudogenes.
grep pseudogene vegaInfoZfish.txt | sort | uniq | wc -l
# Once vegaInfoZfish.txt updated, found 53 pseudogenes so need to update
# the pseudogene track
# There are only 53 in the info file, and all of these are in the GFF
# file. Anyway, this is too sparse for a separate track, but
# a subtrack could be created.
# Get transcript IDs for pseudogenes.
grep pseudogene vegaInfoZfish.txt | awk '{print $1}' > pseudogenes.ids
grep -w -f pseudogenes.ids vega.gtf > vegaPseudoGene.gtf
awk '{print $20}' vegaPseudoGene.gtf | sort | uniq | wc -l
# 53
# Need to remake the vegaGene table:
grep -vw -f pseudogenes.ids vega.gtf > vegaGene.gtf
wc -l vega*gtf
# 119707 vega.gtf
# 119529 vegaGene.gtf
# 178 vegaPseudoGene.gtf
# Need to relabel IDs to get the name to be the otter transcript ID
# and name2 to be the transcript_id (needs to be labeled as gene_id)
# Also, relabel the otter_transcript_id to be transcript_id as ldHgGene
# groups the rows by this ID.
sed -e 's/gene_id/tmp_id/' vegaGene.gtf > vegaGeneFormat.gtf
perl -pi.bak -e 's/transcript_id/gene_id/' vegaGeneFormat.gtf
perl -pi.bak -e 's/otter_transcript_id/transcript_id/' vegaGeneFormat.gtf
# Do the same for the pseudogene GTF files:
sed -e 's/gene_id/tmp_id/' vegaPseudoGene.gtf > vegaPseudoGeneFormat.gtf
perl -pi.bak -e 's/transcript_id/gene_id/' vegaPseudoGeneFormat.gtf
perl -pi.bak -e 's/otter_transcript_id/transcript_id/' \
vegaPseudoGeneFormat.gtf
rm *.bak
# load GTF files for Vega genes and pseudogenes:
# Reloaded all tables after updating as above (2007-04-06, hartera)
ssh hgwdev
cd /cluster/data/danRer4/bed/vega
hgsql -e 'drop table vegaGene;' danRer4
hgsql -e 'drop table vegaPseudoGene;' danRer4
ldHgGene -bin -genePredExt danRer4 vegaGene vegaGeneFormat.gtf
# Read 8764 transcripts in 119529 lines in 1 files
# 8764 groups 29 seqs 3 sources 2 feature types
# 8764 gene predictions
ldHgGene -bin -genePredExt danRer4 vegaPseudoGene vegaPseudoGeneFormat.gtf
# Read 53 transcripts in 178 lines in 1 files
# 53 groups 11 seqs 1 sources 1 feature types
# 53 gene predictions
hgsql -N -e 'select distinct(chrom) from vegaGene;' danRer4 \
| sort | uniq
hgsql -N -e 'select distinct(chrom) from vegaPseudoGene;' danRer4 \
| sort | uniq
# vegaGene includes several scaffolds so need to lift file to chrom
# level for these and reload vegaGene. vegaPseudoGene has no scaffolds.
# scaffolds in vegaGene:
# chrZv6_scaffold3697
# chrZv6_scaffold3723
# chrZv6_scaffold3731
# chrZv6_scaffold3734
# These are all on the chrUn_random virtual chrom
ssh kkstore04
cd /cluster/data/danRer4/bed/vega
sed -e 's/chrZv6_scaffold/Zv6_scaffold/g' vegaGeneFormat.gtf \
> vegaGeneFormat2.gtf
liftUp vegaGeneFormatLifted.gtf \
/cluster/data/danRer4/jkStuff/liftAll.lft carry vegaGeneFormat2.gtf
# Reload vegaGene table:
ssh hgwdev
cd /cluster/data/danRer4/bed/vega
hgsql -e 'drop table vegaGene;' danRer4
ldHgGene -bin -genePredExt danRer4 vegaGene vegaGeneFormatLifted.gtf
# Read 8764 transcripts in 119529 lines in 1 files
# 8764 groups 26 seqs 3 sources 2 feature types
# 8764 gene predictions
# Vega information tables:
# mySQL table definition and autosql-generated files created previously
# for zebrafish-specific information (vegaInfoZfish) in the VEGA GENES
# section above.
# Add clone_id to a separate table instead of this one.
# created a second table for the cloneId accessions since there
# are multiple ids for some VEGA genes. Otherwise, there would be
# a comma separated list in this field or many rows repeated but just
# different in the cloneId field. Associate transcript ID to clone IDs.
# see VEGA GENES section
# load these tables:
cd /cluster/data/danRer4/bed/vega
hgsql -e 'drop table vegaInfoZfish;' danRer4
hgLoadSqlTab danRer4 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
vegaInfoZfish.txt
hgsql -e 'drop table vegaToCloneId;' danRer4
hgLoadSqlTab danRer4 vegaToCloneId ~/kent/src/hg/lib/vegaToCloneId.sql \
vegaToCloneId.txt
# Add code to hgc.c so that this works for Zebrafish and creates the
# relevant links. Add searches by vega transcript ID, ZFIN ID and
# clone ID. trackDb entry added as in VEGA GENES section above.
# Added track handler to hgTracks.c for vegaGeneZfish so that the
# transcript names from the name2 column of the genePred table is
# used for the item name displayed in the track.
# Add a Vega zebrafish-specific html description to trackDb/zebrafish.
# The Pseudogenes are a subtrack of the Genes track
# because it is too sparse to show as a separate track.
# ADD Descriptions for Vega Genes
# Looked into using description from BioMart for VEGA genes but easier
# to get them all directly from Sanger. Kerstin sent a list of
# descriptions: for_rachel.txt
# Add these again to updated tables (2007-04-06, hartera)
ssh kkstore04
mkdir -p /cluster/data/danRer4/bed/vega/description
# copy file here and rename
cd /cluster/data/danRer4/bed/vega/description
mv ../vegaDescriptions.txt .
# get list of VEGA gene IDs in vegaInfoZfish
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/description
hgsql -N -e 'select distinct(geneId) from vegaInfoZfish;' danRer4 | sort \
> geneIdsFromInfo.sort
# get sorted list of gene IDs from description file:
awk '{print $1;}' vegaDescriptions.txt | sort | uniq > vegaDesc.ids.sort
wc -l *.sort
# 6171 geneIdsFromInfo.sort
# 14150 vegaDesc.ids.sort
comm -12 geneIdsFromInfo.sort vegaDesc.ids.sort | wc
# 6168
comm -23 geneIdsFromInfo.sort vegaDesc.ids.sort > genesNoDesc.txt
# There are 3 with no description
# OTTDARG00000004654
# OTTDARG00000018757
# OTTDARG00000018760
# Searched for these three at
# http://vega.sanger.ac.uk/Danio_rerio/index.html
# and found that these three do not have a description.
# add them to the descriptions list
ssh kkstore04
cd /cluster/data/danRer4/bed/vega/description
# add the three with no description to the descriptions list
cat vegaDescriptions.txt genesNoDesc.txt > vegaAll.txt
# remove header
tail +2 vegaAll.txt | sort | uniq > vegaAllDesc.txt
wc -l vegaAll*
# 23058 vegaAll.txt
# 15460 vegaAllDesc.txt
# clean up
rm genesNoDesc.txt geneIds* vegaDesc.ids.sort
# Then add these to the vegaInfoZfish table
cat << 'EOF' > addDesc.pl
#!/usr/bin/perl -w
use strict;
my ($infoFile, $descFile, %descHash);
$infoFile = $ARGV[0]; # vegaInfoZfish.txt file
$descFile = $ARGV[1]; # file of descriptions
open(INFO, $infoFile) || die "Can not open $infoFile : $!\n";
open(DESC, $descFile) || die "Can not open $descFile : $!\n";
open(ERROR, ">error.log") || die "Can not create error.log : $!\n";
open(OUT, ">out.txt") || die "Can not create out.txt: $!\n";
while (<DESC>)
{
my($line, @f, $id, $desc);
chomp;
$line = $_;
@f = split(/\t/, $line);
if ($f[0] =~ /^OTTDARG/)
{
$id = $f[0];
$desc = $f[1];
}
else
{
print ERROR "OTTDARG ID is not found on a line of the descriptions file.\n";
}
$descHash{$id} = $desc;
}
close DESC;
while (<INFO>)
{
my ($li, @fi, $de, $i, $last);
$de = "";
chomp;
$li = $_;
@fi = split(/\t/, $li);
if ($fi[1] =~ /OTTDARG/)
{
if (exists($descHash{$fi[1]}))
{
$de = $descHash{$fi[1]};
}
else
{
print ERROR "There is no description for $fi[1] available.\n";
}
}
$last = $#fi;
for ($i = 0; $i <= 5; $i++ )
{
print OUT "$fi[$i]\t";
}
print OUT "$de\t";
if ($last == 5)
{
# if there are only 5 fields, the last one is missing so add extra tab
print OUT "\t\n";
}
else
{
print OUT "$fi[$last]\n";
}
}
close INFO;
close ERROR;
'EOF'
chmod +x addDesc.pl
# add new descriptions to vegaInfoZfish.txt file
perl addDesc.pl ../vegaInfoZfish.txt vegaAllDesc.txt
# check output in out.txt then rename
mv out.txt vegaInfoZfishWithDesc.txt
rm error.log # empty
# Reload vegaInfoZfish table
ssh hgwdev
cd /cluster/data/danRer4/bed/vega/description
hgsql -e 'drop table vegaInfoZfish;' danRer4
hgLoadSqlTab danRer4 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
vegaInfoZfishWithDesc.txt
# No errors loading
# Added code already to hgc.c so that this works for Zebrafish and creates the
# relevant links. Add searches by vega transcript ID, ZFIN ID and
# clone ID. trackDb entry added as in VEGA GENES section above.
# Added track handler to hgTracks.c for vegaGeneZfish so that the
# transcript names from the name2 column of the genePred table are
# used for the item name displayed in the track.
# Add a Vega zebrafish-specific html description to trackDb/zebrafish.
# The Pseudogenes are a subtrack of the Genes track because it is too sparse
# to show as a separate track.
##########################################################################
# N-SCAN gene predictions (nscanGene) - (2006-08-30 markd)
cd /cluster/data/danRer4/bed/nscan/
# obtained NSCAN predictions from michael brent's group
# at WUSTL
wget -nv -r -np http://ardor.wustl.edu/jeltje/zebrafish/chr_gtf
wget -nv -r -np http://ardor.wustl.edu/jeltje/zebrafish/chr_ptx
# clean up and rename downloaded directories:
mv ardor.wustl.edu/jeltje/zebrafish/chr_gtf .
mv ardor.wustl.edu/jeltje/zebrafish/chr_ptx .
rm -rf ardor.wustl.edu
rm chr_*/index.html*
gzip chr_*/*
chmod a-w chr_*/*.gz
# load tracks. Note that these have *utr features, rather than
# exon features. currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -bin -gtf -genePredExt danRer4 nscanGene chr_gtf/chr*.gtf.gz
# load protein, add .1 suffix to match transcript id
hgPepPred -suffix=.1 danRer4 generic nscanPep chr_ptx/chr*.fa.gz
rm *.tab
# update trackDb; need a danRer4-specific page to describe informants
#   zebrafish/danRer4/nscanGene.html (copy from mm8 and edit)
#   zebrafish/danRer4/trackDb.ra
# changed search regex to
#   termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9]
#######################################################################
# UPDATE AFFY ZEBRAFISH TRACK USING BLAT WITHOUT -mask OPTION AND
# USING -repeats OPTION AND DIFFERENT FILTERING TO REMOVE SHORT
# ALIGNMENTS (DONE, 2006-09-27 - 2006-09-28, hartera)
# With the previous version of this track, QA found a number of short
# alignments of <= 30 bp and there are a number in the <= 50bp range.
# These do not seem to be meaningful so filtering was changed to try to
# remove these alignments while retaining meaningful alignments.
# pslCDnaFilter was used with the same settings as used for the
# Genbank EST alignments for zebrafish.
# Also use -minIdentity=90 for Blat instead of -minIdentity=95 since as the
# higher minIdentity is causing alignments to be dropped that should not be.
# Blat's minIdentity seems to be more severe than that for pslReps or
# pslCDnaFilter as it takes insertions and deletions into account.
# These are Jim's recommendations.
# NOTE: Also added alignments for NA_random and Un_random, these had not
# been done for the original affyZebrafish track but should have been.
# Array chip sequences already downloaded for danRer1
ssh hgwdev
cd /projects/compbio/data/microarray/affyZebrafish
mkdir -p /san/sanvol1/scratch/affy
cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
/san/sanvol1/scratch/affy/
# Set up cluster job to align Zebrafish consensus sequences to danRer4
mkdir -p /cluster/data/danRer4/bed/affyZebrafish.2006-09-27
# remove old link and create new one
rm /cluster/data/danRer4/bed/affyZebrafish
ln -s /cluster/data/danRer4/bed/affyZebrafish.2006-09-27 \
/cluster/data/danRer4/bed/affyZebrafish
# Align sequences on the pitakluster. Scaffolds were aligned for NA_random
# and Un_random and lifted to chrom level afterwards. Chroms 1-25 and M
# were aligned as ~5 Mb chunks.
ssh pk
cd /cluster/data/danRer4/bed/affyZebrafish
ls -1 /san/sanvol1/scratch/affy/Zebrafish_consensus.fa > affy.lst
ls -1 /san/sanvol1/scratch/danRer4/trfFa/chr[0-9M]*.fa > genome.lst
foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/*.fa)
ls -1 $f >> genome.lst
end
wc -l genome.lst
# 3237 genome.lst
# for output:
mkdir -p /san/sanvol1/scratch/danRer4/affy/psl
# use -repeats option to report matches to repeat bases separately
# to other matches in the PSL output.
echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90
-ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc $(path1) $(path2) {check out
line+ /san/sanvol1/scratch/danRer4/affy/psl/$(root1)_$(root2).psl}\n#ENDLOOP'
> template.sub
gensub2 genome.lst affy.lst template.sub para.spec
para create para.spec
para try, check, push ... etc.
para time
# Completed: 3237 of 3237 jobs
#CPU time in finished jobs: 19319s 321.98m 5.37h 0.22d 0.001 y
#IO & Wait Time: 9297s 154.95m 2.58h 0.11d 0.000 y
#Average job time: 9s 0.15m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 98s 1.63m 0.03h 0.00d
#Submission to last job: 3135s 52.25m 0.87h 0.04d
# need to do pslSort and lift up
ssh pk
cd /san/sanvol1/scratch/danRer4/affy
# Do sort, liftUp and then best in genome filter.
# only use alignments that have at least
# 95% identity in aligned region.
# Previously did not use minCover since a lot of sequence is in
# Un and NA so genes may be split up so good to see all alignments.
# However, found a number of short alignments of <= 50 bp. These are
# not meaningful so maybe need to use minCover. If increased too much,
# then hits on poor parts of the assembly will be missed.
# use pslCDnaFilter with the same parameters as used for zebrafish
# Genbank EST alignments.
pslSort dirs raw.psl tmp psl
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl contig.psl
# seqs aligns
# total: 15272 828202
#drop minNonRepSize: 2763 741674
# drop minIdent: 2656 39188
# drop minCover: 2550 10784
# weird over: 359 1439
# kept weird: 277 347
# drop localBest: 2830 17737
# kept: 14952 18819
# Kept 97.9% of the sequences that aligned. There are 15502 Affy sequences
# in total, so 96.5% of them are now represented by alignments.
# lift up the coordinates to chrom level
liftUp affyZebrafish.psl \
/cluster/data/danRer4/jkStuff/liftAll.lft warn contig.psl
# Got 6247 lifts in /cluster/data/danRer4/jkStuff/liftAll.lft
# Lifting contig.psl
# rsync these psl files
rsync -a --progress /san/sanvol1/scratch/danRer4/affy/*.psl \
/cluster/data/danRer4/bed/affyZebrafish/
ssh kkstore04
cd /cluster/data/danRer4/bed/affyZebrafish
# shorten names in psl file
sed -e 's/Zebrafish://' affyZebrafish.psl > affyZebrafish.psl.tmp
mv affyZebrafish.psl.tmp affyZebrafish.psl
pslCheck affyZebrafish.psl
# psl is good
# load track into database
ssh hgwdev
cd /cluster/data/danRer4/bed/affyZebrafish
hgsql -e 'drop table affyZebrafish;' danRer4
hgLoadPsl danRer4 affyZebrafish.psl
# Add consensus sequences for Zebrafish chip
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/affyProbes
ln -s \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
/gbdb/hgFixed/affyProbes
# these sequences were loaded previously so no need to reload.
hgLoadSeq -abbr=Zebrafish: danRer4 \
/gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
# Clean up
rm batch.bak contig.psl raw.psl
# check number of short alignments:
hgsql -e \
'select count(*) from affyZebrafish where (qEnd - qStart) <= 50;' danRer4
# 7
# for previous filtered set, there were 1272 alignments of <= 50 bp so
# this has improved.
hgsql -e 'select count(distinct(qName)) from affyZebrafish;' danRer4
# 14952
# Previously, there were 14819 so more sequences have aligned but less
# short alignments are retained. Many of the short alignments may also
# have longer alignments to different regions of the genome that are good.
#########################################################################
# COMPUGEN ZEBRAFISH OLIGOS TRACK (in progress, 2006-10-20, hartera)
# Align the zebrafish oligos from Compugen used to create the arrays
# used by GIS to study expression at different developmental stages.
ssh hgwdev
mkdir -p /projects/compbio/data/microarray/compugen/zebrafish
# save Compugen oligos FASTA file here. obtained from
# Sinnakaruppan Mathavan <mathavans@gis.a-star.edu.sg> at the
# Genome Institute of Singapore (GIS).
# Permission was obtained from Compugen to display the sequences
# along with a disclaimer. see README.txt
cd /projects/compbio/data/microarray/compugen/zebrafish
unzip Zebrafish\ Oligos_Compugen_XEBLIB96_pov_070704.zip
# this gives an Excel file, XEBLIB96_pov_070704.xls
# save as a tab separated text file using Excel: XEBLIB96_pov_070704.txt
# Remove quotation marks
sed -e 's/"//g' XEBLIB96_pov_070704.txt > GISArray.txt
# also remove other unwanted characters, ^@, which is ASCII for NULL
tr -d '\0' < GISArray.txt > GISArray.format.txt
awk 'BEGIN{FS="\t"} {if ($2 !~ /Serial/ && ($2 != "")) print ">"$2"\n"$4}' \
GISArray.format.txt > GISZfishArray.fa
grep '>' GISZfishArray.fa | wc -l
# 16399
# align sequences to the zebrafish genome on pk
mkdir -p /san/sanvol1/scratch/compugen
cp /projects/compbio/data/microarray/compugen/zebrafish/GISZfishArray.fa \
/san/sanvol1/scratch/compugen/
# Set up cluster job to align the Compugen oligo sequences to danRer4
mkdir -p /cluster/data/danRer4/bed/compugenZebrafish.2006-11-03
ln -s /cluster/data/danRer4/bed/compugenZebrafish.2006-11-03 \
/cluster/data/danRer4/bed/compugenZebrafish
# Align sequences on the pitakluster. Scaffolds were aligned for NA_random
# and Un_random and lifted to chrom level afterwards. Chroms 1-25 and M
# were aligned as ~5 Mb chunks.
ssh pk
cd /cluster/data/danRer4/bed/compugenZebrafish
ls -1 /san/sanvol1/scratch/compugen/GISZfishArray.fa > oligos.lst
ls -1 /san/sanvol1/scratch/danRer4/trfFa/chr[0-9M]*.fa > genome.lst
foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/*.fa)
ls -1 $f >> genome.lst
end
wc -l genome.lst
# 3237 genome.lst
# oligos are 65 bp in length.
# for output:
mkdir -p /san/sanvol1/scratch/danRer4/compugen/psl
# use -repeats option to report matches to repeat bases separately
# to other matches in the PSL output.
echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90
-ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc $(path1) $(path2) {check out
line+ /san/sanvol1/scratch/danRer4/compugen/psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
gensub2 genome.lst oligos.lst template.sub para.spec
para create para.spec
para try, check, push ... etc.
para time
# Completed: 3237 of 3237 jobs
# CPU time in finished jobs: 1948s 32.46m 0.54h 0.02d 0.000 y
# IO & Wait Time: 11145s 185.75m 3.10h 0.13d 0.000 y
# Average job time: 4s 0.07m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 428s 7.13m 0.12h 0.00d
# Submission to last job: 621s 10.35m 0.17h 0.01d
# need to do pslSort and lift up
ssh pk
cd /san/sanvol1/scratch/danRer4/compugen
# Do sort, liftUp and then best in genome filter.
# only use alignments that have at least
# 95% identity in aligned region.
# Previously did not use minCover since a lot of sequence is in
# Un and NA so genes may be split up so good to see all alignments.
# However, found a number of short alignments of <= 50 bp. These are
# not meaningful so maybe need to use minCover. If increased too much,
# then hits on poor parts of the assembly will be missed.
# use pslCDnaFilter with the same parameters as used for zebrafish
# Genbank EST alignments.
pslSort dirs raw.psl tmp psl
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=8 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl contig.psl
# for Compugen:
# Dropping minCover to 0.10 doesn't make a difference. Decreasing the minId to
# 0.92 increases the number of sequences aligned and does not increase
# the number of alignments for sequences with the most alignments.
# Removing the minimum non-repeat size filter does significantly increase
# the number of alignments for some sequences.
# 145 CGENZEB_456015402_0
#  79 CGENZEB_456008445_0
#  72 CGENZEB_456015991_0
#  53 CGENZEB_456012678_0
#  46 CGENZEB_456004521_0
# Total sequences: 16399
#            seqs  aligns
#     total: 15544 102554
# drop minNonRepSize: 1004 72545
# drop minIdent: 825 3549
# weird over: 13 48
# kept weird: 8 16
# drop localBest: 1288 7040
# kept: 14632 19420
# 89.2% are kept.
# minCov = 0.10 minNonRepSize = 8
#            seqs  aligns
#     total: 15544 102554
# drop minNonRepSize: 1004 72545
# drop minIdent: 825 3549
# weird over: 13 48
# kept weird: 8 16
# drop localBest: 1288 7040
# kept: 14632 19420
# 89.2% are kept.
# minCov=0.10 minNonRepSize = 10
#            seqs  aligns
#     total: 15544 102554
# drop minNonRepSize: 1015 72795
# drop minIdent: 811 3462
# weird over: 13 48
# kept weird: 8 16
# drop localBest: 1278 6901
# kept: 14616 19396
# 89.1% kept.
# minNonRepSize = 0
#            seqs  aligns
#     total: 15544 102554
# drop minIdent: 1344 23893
# weird over: 42 271
# kept weird: 24 44
# drop localBest: 1772 49794
# kept: 15338 28867
# 93.8% kept from total
# but there are large numbers of alignments for some probes:
# 62 CGENZEB_456005547_0
# 603 CGENZEB_456005221_0
# 454 CGENZEB_456010007_0
# 409 CGENZEB_456014900_0
# 372 CGENZEB_456009900_0
# try increase identity but low minReps
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=8 \
-ignoreNs -bestOverlap -minId=0.97 -minCover=0.15 raw.psl \
contigMinRep8minId97.psl
# seqs aligns
#     total: 15544 102554
# drop minNonRepSize: 1004 72545
# drop minIdent: 1982 8772
# weird over: 9 29
# kept weird: 7 14
# drop localBest: 766 2915
# kept: 13715 18322
# this improves the highest number of hits a lot, but it is similar to
# that achieved with the higher identity setting alone;
# however, only 80% of sequences are kept.
# 145 CGENZEB_456015402_0
#  79 CGENZEB_456008445_0
#  72 CGENZEB_456015991_0
#  53 CGENZEB_456012678_0
#  46 CGENZEB_456004521_0
# lower minCov:
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=8 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.08 raw.psl \
contigMinCov8.psl
# seqs aligns
#     total: 15544 102554
# drop minNonRepSize: 1004 72545
# drop minIdent: 825 3549
# weird over: 13 48
# kept weird: 8 16
# drop localBest: 1288 7040
# kept: 14632 19420
# 89.2%, now nearBest = 0.1%
pslCDnaFilter -localNearBest=0.001 -minQSize=20 -minNonRepSize=8 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.10 raw.psl \
contigMinCov10NearBest1percent.psl
# seqs aligns
#     total: 15544 102554
# drop minNonRepSize: 1004 72545
# drop minIdent: 825 3549
# weird over: 13 48
# kept weird: 7 15
# drop localBest: 1350 7451
# kept: 14632 19009
# same number of sequences aligning but less overall alignments:
# 115 CGENZEB_456015402_0
# 71 CGENZEB_456015991_0
# 71 CGENZEB_456008445_0
# 46 CGENZEB_456004521_0
# 38 CGENZEB_456008610_0
# CGENZEB_456012678_0 now went down to 1.
# 89.2% aligned
# use minCover = 0.40
pslCDnaFilter -localNearBest=0.001 -minQSize=20 -minNonRepSize=8 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.40 raw.psl \
contig.psl
#            seqs  aligns
#     total: 15544 102554
# drop minNonRepSize: 1004 72545
# drop minIdent: 825 3549
# weird over: 13 48
# kept weird: 7 15
# drop localBest: 1350 7451
# kept: 14632 19009
# little difference using minCover=0.60
cd /san/sanvol1/scratch/danRer4/compugen
rm contig*
# Use these parameters:
pslCDnaFilter -localNearBest=0.001 -minQSize=20 -minNonRepSize=8 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.40 raw.psl \
contig.psl
# use minCover = 0.40
#            seqs  aligns
#     total: 15544 102554
# drop minNonRepSize: 1004 72545
# drop minIdent: 825 3549
# weird over: 13 48
# kept weird: 7 15
# drop localBest: 1350 7451
# kept: 14632 19009
# use minCover=0.60
#            seqs  aligns
#     total: 15544 102554
# drop minNonRepSize: 1004 72545
# drop minIdent: 825 3549
# drop minCover: 198 507
# weird over: 9 39
# kept weird: 4 12
# drop localBest: 1285 7009
# kept: 14588 18944
# lift up the coordinates to chrom level
liftUp compugenZebrafish.psl \
/cluster/data/danRer4/jkStuff/liftAll.lft warn contig.psl
# Got 6247 lifts in /cluster/data/danRer4/jkStuff/liftAll.lft
# Lifting contig.psl
# rsync these psl files
rsync -a --progress /san/sanvol1/scratch/danRer4/compugen/*.psl \
/cluster/data/danRer4/bed/compugenZebrafish
ssh kkstore04
cd /cluster/data/danRer4/bed/compugenZebrafish
pslCheck compugenZebrafish.psl
# psl is good
# load track into database
ssh hgwdev
cd /cluster/data/danRer4/bed/compugenZebrafish
hgsql -e 'drop table compugenZebrafish;' danRer4
hgLoadPsl danRer4 compugenZebrafish.psl
# Add entry in trackDb/zebrafish/trackDb.ra and a search for hgFindSpec
# Add a description page.
# Need to add disclaimer for sequences.
# Add the Compugen oligo sequences for this track
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/compugenProbes
ln -s \
/projects/compbio/data/microarray/compugen/zebrafish/GISZfishArray.fa \
/gbdb/hgFixed/compugenProbes
hgLoadSeq danRer4 /gbdb/hgFixed/compugenProbes/GISZfishArray.fa
# Clean up
rm batch.bak contig.psl raw.psl
#########################################################################
# ENSEMBL GENES TRACKS FOR ENSEMBL VERSION 42
# ENSEMBL GENES (PROTEIN-CODING) AND ENSEMBL NON-CODING GENES
# (DONE, 2007-01-08 - 2007-01-09 hartera)
# Obtained from BioMart at Ensembl (The Wellcome Trust Sanger Institute)
# Starting downloading Ensembl v41 genes (2006-12-13)
# get "unexpected end of file" error with the peptide download.
# Notified Ensembl (2006-12-15).
# Ensembl helpdesk say that the files sometimes get terminated early
# for large downloads so try using this link to BioMart instead:
# http://www.biomart.org/biomart/martview
# Repeat above using this link. This has Ensembl42 though so e-mailed
# Ensembl to ask if they are releasing Ensembl42 soon (2006-12-18)
# Ensembl was updated to v42 in Dec. 2006 so use this new data set
# (2007-01-08):
ssh kkstore04
mkdir -p /cluster/data/danRer4/bed/ensembl42
cd /cluster/data/danRer4/bed/ensembl42
# Get the Ensembl gene data from BioMart at:
# http://www.biomart.org/biomart/martview
# Follow this sequence through the pages: (NOTE: this interface has changed
# significantly since danRer3). Ensembl version is 42 (Dec 2006).
# 1) The Dataset link in the left panel is selected. Select the
# Ensembl dataset (v42 here) and the Danio_rerio choice (ZFISH6 here).
# 2) Click on the Attributes link in the left panel.
# 3) Select Structures. Click on the + next to GENE to expand it
# and check the boxes for the Ensembl Gene ID and Ensembl
# Transcript ID.
# 4) Clicking on the "Count" link on the top black menu shows that there
# are 28,508 / 28,508 Genes selected in Danio rerio genes (ZFISH6)
# 5) Click on the "Results" link on the top black menu and then select GFF
# as the format and select to export all results to a
# "Compressed web file (notify by e-mail)" and hit the "Go" button and
# enter e-mail address as requested.
# When results are ready, you will receive an e-mail with a link to
# download the results, save as ensemblGene42.gff.gz
# Save as and move file to
# /cluster/data/danRer4/bed/ensembl42
gunzip ensemblGene42.gff.gz
# file unzips ok.
# Repeat above but at step 3, select the Features attribute and
# select Ensembl Transcript ID and Biotype under the GENE section.
# Select "Text, tab separated" as the output format and gzip
# compression. Biotype gives information to separate the genes into
# protein-coding and RNA genes and pseudogenes.
# For step 5, select CSV as the output and then select to export all
# results to a "Compressed web file (notify by e-mail)" and hit the
# "Go" button and enter e-mail address as requested.
# Save as ensemblGene42Biotype.tsv.gz and move to
# /cluster/data/danRer4/bed/ensembl42
gunzip ensemblGene42Biotype.tsv.gz
# file unzips ok.
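# A quick look at the biotype breakdown (an illustrative check, not part of
# the original run); the first line of the file is a header:
tail +2 ensemblGene42Biotype.tsv | awk 'BEGIN{FS="\t"} {print $2}' \
    | sort | uniq -c | sort -nr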
# The Ensembl gene predictions are mapped to chromosomes except for
# chrNA and chrUn. Use lift files for scaffolds to these chroms.
# get chrUn and chrNA Ensembl records.
ssh kkstore04
cd /cluster/data/danRer4/bed/ensembl42
# need to lift up the NA and Un scaffolds to chroms
liftUp -type=.gtf ensemblGene42.lifted \
/cluster/data/danRer4/jkStuff/liftAll.lft carry ensemblGene42.gff
# Got 6247 lifts in /cluster/data/danRer4/jkStuff/liftAll.lft
# Lifting ensemblGene42.gff
wc -l ensemblGene42*
# 807871 ensemblGene42.gff
# 807871 ensemblGene42.lifted
# 39626 ensemblGene42Biotype.tsv
# check there are no scaffolds left in lifted file:
grep Zv6_NA ensemblGene42.lifted
grep Zv6_scaffold ensemblGene42.lifted
# there are none so ok.
# add chr at beginning of each line. NA and Un already have "chr"
# prefix so then remove the extra one.
sed -e "s/^/chr/" ensemblGene42.lifted | sed -e "s/chrchr/chr/" \
> ensGene42.gff
# check file sizes -ok and some of the lifted co-ordinates
# Also remove the suffix that denotes the transcript version number.
# This is not in the ensGtp or ensPep tables.
perl -pi.bak -e 's/\.[0-9]+//'g ensGene42.gff
# Next split up the gff into a protein-coding gene set and a RNA gene and
# pseudogene set and load into different tracks.
# get transcript IDs only for protein coding transcripts
grep "protein_coding" ensemblGene42Biotype.tsv | awk '{print $1}' \
> ensGene42ProteinCoding.ids
# skip header line and grab everything else from the file
tail +2 ensemblGene42Biotype.tsv | grep -v "protein_coding" \
| awk '{print $1}' > ensGene42NonCoding.ids
wc -l ensGene42*ids
# 3560 ensGene42NonCoding.ids
# 36065 ensGene42ProteinCoding.ids
# 39625 total
wc -l ensemblGene42Biotype.tsv
# 39626 ensemblGene42Biotype.tsv
# extra line is the header line
# then get only the protein-coding transcripts from the GFF file
# write a script to do this as grep is slow
cat << 'EOF' > getIds.pl
#!/usr/bin/perl -w
use strict;
my ($in, $file, %ids);
$in = $ARGV[0]; # list of ids
$file = $ARGV[1]; # GFF file or other data file
open(IN, $in) || die "Can not open $in :$!\n";
open (FILE, $file) || die "Can not open $file :$!\n";
open (FOUND, ">found.log") || die "Can not create found.log: $!\n";
while (<IN>) {
chomp;
my $l = $_;
$ids{$l} = 1;
}
close(IN);
# read GFF file or other data file and check whether transcript ID is in
# the hash before printing out that line.
while (<FILE>){
my ($line, $transId);
$line = $_;
$transId = "";
if ($line =~ /(ENSDART[0-9]+)/){
$transId = $1;
}
if (exists($ids{$transId})){
print $line;
print FOUND "$transId\n";
}
}
close(FILE);
'EOF'
chmod +x getIds.pl
perl getIds.pl ensGene42ProteinCoding.ids ensGene42.gff \
> ensGene42ProteinCoding.gff
# uniq found.log and check against input ids
sort found.log | uniq > foundProtein.uniq
sort ensGene42ProteinCoding.ids > ens42ProteinIds.sort
comm -13 foundProtein.uniq ens42ProteinIds.sort
# All ids were found in the gff file
perl getIds.pl ensGene42NonCoding.ids ensGene42.gff \
> ensGene42NonCoding.gff
sort found.log | uniq > foundNonCoding.uniq
sort ensGene42NonCoding.ids > ens42NonCodingIds.sort
comm -13 foundNonCoding.uniq ens42NonCodingIds.sort
# All ids were found in the gff file
rm *.sort *.uniq *.bak found.log
wc -l ensGene42*.gff
# 807871 ensGene42.gff
# 3695 ensGene42NonCoding.gff
# 804176 ensGene42ProteinCoding.gff
# load into database
ssh hgwdev
cd /cluster/data/danRer4/bed/ensembl42
hgsql -e 'drop table ensGene;' danRer4
hgsql -e 'drop table ensGeneNonCoding;' danRer4
/cluster/bin/x86_64/ldHgGene danRer4 ensGene ensGene42ProteinCoding.gff
# Read 36065 transcripts in 804176 lines in 1 files
# 36065 groups 27 seqs 1 sources 4 feature types
# 36065 gene predictions
/cluster/bin/x86_64/ldHgGene danRer4 ensGeneNonCoding ensGene42NonCoding.gff
# Read 3560 transcripts in 3695 lines in 1 files
# 3560 groups 27 seqs 1 sources 1 feature types
# 3560 gene predictions
# The only difference between Ensembl v42 and v41 for zebrafish is two
# extra gene predictions in the non-coding category in v42.
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use BioMart to create it as above, except:
# Step 3) Choose "Features". Expand the GENE section and under
# "Ensembl Attributes", check boxes for Ensembl Gene ID,
# Ensembl Transcript ID, Ensembl Peptide ID.
# After clicking on the Results link in the top black menu,
# Choose CSV as the output format and Export all results to a
# "Compressed web file (notify by e-mail)" and hit the
# "Go" button and enter e-mail address as requested.
# Result name: ensembl42Gtp.tsv.gz
ssh kkstore04
cd /cluster/data/danRer4/bed/ensembl42
gunzip ensembl42Gtp.tsv.gz
# separate the IDs for protein-coding genes and the rest (RNA genes and
# pseudogenes).
# transcript ID and gene ID are in different columns than before so switch
# Gene ID should be in the first column and Transcript ID in the second column.
awk 'BEGIN {FS="\t"} {OFS="\t"} {print $2,$1,$3}' ensembl42Gtp.tsv \
> ens42GtpFormat.tsv
perl getIds.pl ensGene42ProteinCoding.ids ens42GtpFormat.tsv \
> ensGtpProteinCoding.txt
# uniq found.log and check against input ids
sort found.log | uniq > foundProtein.uniq
sort ensGene42ProteinCoding.ids > ens42ProteinIds.sort
comm -13 foundProtein.uniq ens42ProteinIds.sort
perl getIds.pl ensGene42NonCoding.ids ens42GtpFormat.tsv \
> ensGtpNonCoding.txt
# uniq found.log and check against input ids
sort found.log | uniq > foundNonCoding.uniq
sort ensGene42NonCoding.ids > ens42NonCodingIds.sort
comm -13 foundNonCoding.uniq ens42NonCodingIds.sort
# All ids were found in the gff file
rm *.sort *.uniq *.bak found.log
wc -l ensGtp*.txt
# 3560 ensGtpNonCoding.txt
# 36065 ensGtpProteinCoding.txt
# The non-coding set have only gene ids and transcript ids and
# no protein ids.
# Load database
ssh hgwdev
cd /cluster/data/danRer4/bed/ensembl42/
hgsql -e 'drop table ensGtp;' danRer4
# load ensGtp for protein-coding genes
hgLoadSqlTab danRer4 ensGtp ~/kent/src/hg/lib/ensGtp.sql \
ensGtpProteinCoding.txt
# only load IDs for the protein coding genes. The non-coding genes
# have no protein ID.
# Get the ensembl peptide sequences from
# http://www.biomart.org/biomart/martview
# Follow this sequence:
# 1) Choose the Ensembl Genes 42 as the database and then
# Danio rerio genes (ZFISH6) as the dataset.
# 2) Click on the Attributes link in the left panel. Select sequences.
# 3) Expand the SEQUENCES section and choose Peptide as type of sequence
# to export and then expand the Header Information section and select
# Ensembl Gene ID from Gene Attributes and
# Ensembl Transcript ID and Ensembl Peptide ID from
# Transcript Attributes
# 4) Click on the Filters link in the left panel and expand the GENE
# section. Select the Gene type box and then select protein_coding as
# these are the only genes with an associated protein sequence.
# 5) Click on the Results link in the top black menu bar and
# choose FASTA for the output and export all results to
# Compressed file (notify by e-mail).
# save the file as ensembl42Pep.fasta.gz and move to
# /cluster/data/danRer4/bed/ensembl42
# Got results URL by e-mail but BioMart seems to be currently inaccessible
ssh kkstore04
cd /cluster/data/danRer4/bed/ensembl42
gunzip ensembl42Pep.fasta.gz
grep '>' ensembl42Pep.fasta | wc -l
# 36048
grep '>' ensembl42Pep.fasta > headers
awk 'BEGIN {FS="|"} {print $2;}' headers > pepTranscript.ids
sort pepTranscript.ids | uniq > pepTranscript.ids.sort
sort ensGene42ProteinCoding.ids | uniq > proteinCoding.ids.sort
comm -13 proteinCoding.ids.sort pepTranscript.ids.sort
# no difference
comm -23 proteinCoding.ids.sort pepTranscript.ids.sort > noPep
# There are 17 of these.
# found some of them on the Ensembl zebrafish Genome Browser and found
# the peptide sequences. E-mailed Ensembl's helpdesk to ask how to get
# peptide sequences for these 17 transcript IDs (2007-01-09).
# Then downloaded peptide sequences for just this set of 17, but only got
# 16 of them. To do this, follow the instructions as above for the
# obtaining the peptide sequences but on the Filters page, expand the GENE
# section and check the box for ID list limit and select
# Ensembl Transcript ID(s) and paste in the list. Name output file
# otherIDs.fasta.gz
gunzip otherIDs.fasta.gz
grep '>' otherIDs.fasta > headers2
awk 'BEGIN {FS="|"} {print $2;}' headers2 > otherPepTranscript.ids
sort otherPepTranscript.ids | uniq > otherPepTranscript.ids.sort
comm -13 noPep otherPepTranscript.ids.sort
# no difference
comm -23 noPep otherPepTranscript.ids.sort
# ENSDART00000049311
# Repeat above procedure to query for peptide sequence for just this
# transcript ID and name file: otherIDs2.fasta.gz
# E-mailed helpdesk@ensembl.org to report all these problems (2007-01-09)
gunzip otherIDs2.fasta.gz
# Concatenate all sequences:
cat ensembl42Pep.fasta otherIDs.fasta otherIDs2.fasta > ens42Pep.fasta
grep '>' ens42Pep.fasta | wc
# 36065
grep '>' ens42Pep.fasta > all.headers
awk 'BEGIN {FS="|"} {print $2;}' all.headers | sort | uniq > allTxIds.sort
comm -13 proteinCoding.ids.sort allTxIds.sort
# no difference
comm -23 proteinCoding.ids.sort allTxIds.sort
# no difference so got all protein sequences for the protein-coding
# transcript IDs now.
# load into database
ssh hgwdev
cd /cluster/data/danRer4/bed/ensembl42
hgsql -e 'drop table ensPep;' danRer4
hgPepPred danRer4 ensembl ensembl42Pep.fasta
# edit trackDb/zebrafish/danRer4 to have an ensGene entry with the
# archive date for Ensembl v42 which is used for creating stable archive
# links for the transcript ID and protein ID to make sure that these
# always connect to the correct version of Ensembl Genes.
# added track handler to hgTracks.c for ensGeneNonCoding and added
# code to hgc.c to handle creating the correct stable archive link for
# a particular version of Ensembl.
# trackDb/zebrafish/danRer4/trackDb.ra entries for ensGene and
# ensGeneNonCoding include these lines for creating the correct URLs:
# url http://dec2006.archive.ensembl.org/Danio_rerio/transview?transcript=$$
# urlName gene
# archive dec2006
# Add Biotype and External Gene ID to the Ensembl Non-Coding genes table
# These can be retrieved from BioMart using the method as above for
# Biotype but also selecting the External Gene ID. Click on the Filter
# link on the left panel and expand the GENE section and check the box
# for Gene Type and select all types except for protein_coding.
# Select TSV as the output and Compressed file (*.gz) as the format.
# save as ensNonCoding.biotype.txt.gz
ssh hgwdev
cd /cluster/data/danRer4/bed/ensembl42
gunzip ensNonCoding.biotype.txt.gz
tail +2 ensNonCoding.biotype.txt > ensNonCoding.biotype.tab
cat << 'EOF' > ensBiotype.sql
CREATE TABLE ensBiotype (
transcriptId varchar(255) not null,
biotype varchar(255) not null,
extGeneId varchar(255) not null
);
'EOF'
hgLoadSqlTab danRer4 ensBiotype ensBiotype.sql ensNonCoding.biotype.tab
# Add extra fields to ensNonCoding genePred table:
hgsql -e \
'alter table ensGeneNonCoding add biotype varchar(255) NOT NULL;' \
danRer4
hgsql -e \
'alter table ensGeneNonCoding add extGeneId varchar(255) NOT NULL;' \
danRer4
# Add index to the extGeneId column:
hgsql -e 'alter table ensGeneNonCoding add index(extGeneId);' danRer4
hgsql -e 'select count(*) from ensGeneNonCoding;' danRer4
# 3560
hgsql -e 'update ensGeneNonCoding set biotype = "";' danRer4
hgsql -e 'update ensGeneNonCoding set extGeneId = "";' danRer4
# Now populate these columns with data from the ensBiotype table
hgsql -e 'select count(*) from ensGeneNonCoding as g, ensBiotype as b \
where g.name = b.transcriptId;' danRer4
# 3560
hgsql -e 'update ensGeneNonCoding as g, ensBiotype as b \
set g.biotype = b.biotype where g.name = b.transcriptId;' danRer4
hgsql -e 'select count(*) from ensGeneNonCoding where biotype != "";' \
danRer4
# 3560
# then set the External Gene ID:
hgsql -e 'update ensGeneNonCoding as g, ensBiotype as b \
set g.extGeneId = b.extGeneId where g.name = b.transcriptId;' danRer4
hgsql -e 'select count(*) from ensGeneNonCoding where extGeneId != "";' \
danRer4
# 3393
# This is correct since 167 rows in the ensNonCoding.biotype.tab have no
# external Gene ID:
awk '{if ($3 == "") print;}' ensNonCoding.biotype.tab | wc -l
# 167
# 3393 + 167 = 3560
# Now check code in hgc.c for handling the details page for this track.
#########################################################################
# RADIATION HYBRID (RH) MAP TRACK (DONE, 2007-01-12 - 2007-01-23, hartera)
# Data from Yi Zhou at Boston Children's Hospital:
# yzhou@enders.tch.harvard.edu
# Latest RH map sequences and primers received on 2006-10-03 from
# Anhua (Peter) Song - asong@enders.tch.harvard.edu
# Changed the name of rhMapInfo table and related files to rhMapZfishInfo
# to make the name more zebrafish-specific (2007-02-08, hartera)
# Remake track as one of the primer sequences was in the sequence for
# 1942C.INSERTMUT and also changed another marker name to remove a forward
# slash. Remade rhMapZfishInfo table and removed spaces from primer sequences.
# (2007-02-14, hartera)
# Collected stats on RH map alignments for Yi Zhou (DONE, 2007-03-28, hartera)
ssh kkstore04
mkdir /cluster/data/danRer4/bed/ZonLab/rhMap-2006-10-03
cd /cluster/data/danRer4/bed/ZonLab
ln -s rhMap-2006-10-03 rhMap
cd rhMap
# download data files from e-mail:
# rhSequenceSubmit100306.zip and rhSequenceSubmitSeq100306.zip
unzip rhSequenceSubmit100306.zip
unzip rhSequenceSubmitSeq100306.zip
dos2unix rhSequenceSubmit100306.txt
dos2unix rhSequenceSubmitSeq100306.txt
# Sequences are in rhSequenceSubmitSeq100306.txt; primers and other
# information are in rhSequenceSubmit100306.txt
grep '>' rhSequenceSubmitSeq100306.txt | wc -l
# 11514
wc -l rhSequenceSubmit100306.txt
# 13438 rhSequenceSubmit100306.txt
grep '>' rhSequenceSubmitSeq100306.txt > rhMap.names
# remove '>' from names and grab first field
perl -pi.bak -e 's/>//' rhMap.names
awk 'BEGIN {FS="|"} {print $1;}' rhMap.names | sort | uniq \
> rhMap.namesOnly.sort
awk 'BEGIN {FS="|"} {print $1;}' rhSequenceSubmit100306.txt | sort | uniq \
> rhMapPrimers.namesOnly.sort
wc -l *.sort
# 11514 rhMap.namesOnly.sort
# 13436 rhMapPrimers.namesOnly.sort (after removing blank line)
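# (A minimal sketch, not in the original log.) The blank line noted above
# can be stripped before counting with a simple sed filter:
sed '/^$/d' rhMapPrimers.namesOnly.sort | wc -l
# 13436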
# get a list of headers from the FASTA file
grep '>' rhSequenceSubmitSeq100306.txt > rhMap.headers
awk 'BEGIN {FS="|"} {print $5;}' rhMap.headers | sort | uniq
# BAC_END
# EST
# GENE
# SSLP
# STS
# There are 5 types of sequence here.
awk 'BEGIN {FS="|"} {print $9;}' rhMap.headers | sort | uniq
#BACends
#Custom
#Insertion_Mutant
#Insertion_Mutants
#MGH
#NCBI
#Sanger SG
#Sequencing_Project
#ThisseClone
#Thisse_Clone
#other_zfEst
#wu_zfEst
#wz
awk 'BEGIN {FS="|"} {print $10;}' rhMap.headers | sort | uniq
# CHBG
# MPIEB
# Insertion_Mutant = Insertion_Mutants; ThisseClone = Thisse_Clone;
# So there are 11 different sources.
# There are 2 sequences with problem primers. E-mailed Peter Song about
# these and he suggested deleting those primers:
# >fb33f01.u1|5|388|5615|EST|f|cR|f|wu_zfEst|CHBG|+++33333333333333333333.|
# >zfishb-a976e04.p1c|14|16|158|STS|f|cR|f|Sequencing_Project|CHBG|A|A|
# edit rhMap100306.fa and rhMapPrimers100306.txt (created below) and delete these primers.
# need to reformat FASTA headers so they are in the format:
# NAME.SOURCE.TYPE.ORIGIN
# Insertion_Mutant=Insertion_Mutants; Thisse_Clone=ThisseClone
# so change these to have the same name. Also shorten Sanger SG to
# Shotgun.
sed -e 's/Insertion_Mutants/InsertMut/' rhSequenceSubmitSeq100306.txt \
| sed -e 's/Insertion_Mutant/InsertMut/' \
| sed -e 's/Sanger SG/Shotgun/' \
| sed -e 's/ThisseClone/Thisse/' \
| sed -e 's/Thisse_Clone/Thisse/' \
| sed -e 's/Sequencing_Project/Seqproj/' > rhMap100306.fa
# Do the same for the primers and information file:
sed -e 's/Insertion_Mutants/InsertMut/' rhSequenceSubmit100306.txt \
| sed -e 's/Insertion_Mutant/InsertMut/' \
| sed -e 's/Sanger SG/Shotgun/' \
| sed -e 's/ThisseClone/Thisse/' \
| sed -e 's/Thisse_Clone/Thisse/' \
| sed -e 's/Sequencing_Project/Seqproj/' > rhMapPrimers100306.txt
# edit these files to remove the extra newline char after the first primer
# for 1942c and then change "/" in FJ34C05.Y1/FJ56G09.Y1.WU_ZFEST to
# an underscore (2007-02-14, hartera)
perl -pi.bak -e 's/fj34c05\.y1\/fj56g09/fj34c05\.y1_fj56g09/' \
rhMap100306.fa
perl -pi.bak -e 's/fj34c05\.y1\/fj56g09/fj34c05\.y1_fj56g09/' \
rhMapPrimers100306.txt
# use a script to reformat the names for the FASTA headers to the format
# >NAME.SOURCE where name is the first field separated by "|" and source
# is the 9th field. The source is used to make the name unique. Some
# of these names are BAC ends that occur in the BAC ends track so there
# are name clashes in the seq table if the names are not made unique.
# Also make the name upper case, as was done for the danRer1 and danRer2
# RH maps, and remove the base numbering on each sequence line of the FASTA file.
cat << '_EOF_' > rhFix
#!/usr/bin/awk -f
#>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
/^>/ {
split(toupper($0), a, "\\|");
print a[1]"."a[9];
next;
}
/^[0-9]+ / {
$0 = $2;
}
{
print $0;
}
'_EOF_'
# << keep emacs coloring happy
chmod +x rhFix
rhFix rhMap100306.fa > rhMap.fa
# Blat sequences vs danRer4 genome
ssh pk
mkdir -p /cluster/data/danRer4/bed/ZonLab/rhMap/blatRun
cd /cluster/data/danRer4/bed/ZonLab/rhMap
# put the rhMap sequences on the san
mkdir -p /san/sanvol1/scratch/danRer4/rhMap
cp rhMap.fa /san/sanvol1/scratch/danRer4/rhMap/
# do blat run to align RH map sequences to danRer4 and use
# chrNA_random and chrUn_random separated into scaffolds.
cd blatRun
ls -1S /san/sanvol1/scratch/danRer4/rhMap/rhMap.fa > rhMap.lst
ls -1 /san/sanvol1/scratch/danRer4/trfFa/chr[0-9M]*.fa > genome.lst
foreach f (/san/sanvol1/scratch/danRer4/scaffoldsSoftMask/*.fa)
ls -1 $f >> genome.lst
end
wc -l genome.lst
# 3237 genome.lst
# for output:
mkdir -p /san/sanvol1/scratch/danRer4/rhMap/psl
# use -repeats option to report matches to repeat bases separately
# to other matches in the PSL output.
cat << '_EOF_' > template.sub
#LOOP
/cluster/bin/x86_64/blat -repeats=lower -minIdentity=80 -ooc=/san/sanvol1/scratch/danRer4/danRer4_11.ooc $(path1) $(path2) {check out line+ /san/sanvol1/scratch/danRer4/rhMap/psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs coloring happy
gensub2 genome.lst rhMap.lst template.sub para.spec
para create para.spec
para try, check, push ... etc.
para time
# Completed: 3237 of 3237 jobs
#CPU time in finished jobs: 4787s 79.78m 1.33h 0.06d 0.000 y
#IO & Wait Time: 8080s 134.67m 2.24h 0.09d 0.000 y
#Average job time: 4s 0.07m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 18s 0.30m 0.01h 0.00d
#Submission to last job: 752s 12.53m 0.21h 0.01d
# need to do pslSort and lift up
ssh pk
cd /san/sanvol1/scratch/danRer4/rhMap
# Do sort, liftUp and then best in genome filter.
# only use alignments that have at least
# 95% identity in aligned region.
# Previously did not use minCover since a lot of sequence is in
# Un and NA so genes may be split up so good to see all alignments.
# However, found a number of short alignments of <= 50 bp. These are
# not meaningful so maybe need to use minCover. If increased too much,
# then hits on poor parts of the assembly will be missed.
# use pslCDnaFilter with the same parameters as used for zebrafish
# Genbank EST alignments.
pslSort dirs raw.psl tmp psl
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
# drop minNonRepSize: 3068 1286657
# drop minIdent: 3442 104586
# drop minCover: 2838 205568
# weird over: 163 1124
# kept weird: 107 172
# drop localBest: 3011 17130
# kept: 11121 14216
# 11514 sequences in total
# The percentage aligned is 11121/11514 = 96.6%
# Number of alignments for markers with most alignments after filtering:
# 35 BZ83M20.Z.BACENDS
# 17 ZKP63A5.YA.BACENDS
# 17 ZKP117C9.YA.BACENDS
# 16 ZK30E10.SP6.BACENDS
# 15 ZC133H17.ZA.BACENDS
# 12 Z13442.MGH
# 11 ZK105J10.T7.BACENDS
# 10 ZC261G9.ZAF.BACENDS
# 10 ZC261G9.ZA.BACENDS
# 9 ZK19H9.SP6.BACENDS
# 9 Z4910.MGH
# 9 FJ07G09.X1.WU_ZFEST
# 8 ZK4I5.T7.BACENDS
# 8 ZC27I3.ZA.BACENDS
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.92 -minCover=0.15 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
# drop minNonRepSize: 3068 1286657
# drop minIdent: 2740 60578
# drop minCover: 3083 223430
# weird over: 318 3132
# kept weird: 154 249
# drop localBest: 3480 43022
# kept: 11212 14470
# Percentage aligned is 11212/11514 = 97.4%
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=10 \
-ignoreNs -bestOverlap -minId=0.92 -minCover=0.15 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
#drop minNonRepSize: 3026 1258275
# drop minIdent: 2902 72521
# drop minCover: 3256 231002
# weird over: 344 3365
# kept weird: 157 252
# drop localBest: 3604 51799
# kept: 11228 14560
# There isn't much difference: 11228/11514 = 97.5%
awk '{print $10}' contig.psl | sort | uniq -c | sort -nr
# Top numbers of hits:
# 35 BZ83M20.Z.BACENDS
# 17 ZKP63A5.YA.BACENDS
# 17 ZKP117C9.YA.BACENDS
# 16 ZK30E10.SP6.BACENDS
# 15 ZC133H17.ZA.BACENDS
# 13 FJ07G09.X1.WU_ZFEST
# 12 Z13442.MGH
# 11 ZK105J10.T7.BACENDS
# 10 ZC261G9.ZAF.BACENDS
# 10 ZC261G9.ZA.BACENDS
# 9 ZK19H9.SP6.BACENDS
# 9 Z4910.MGH
# 9 Z3157.MGH
# 8 ZK4I5.T7.BACENDS
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.90 -minCover=0.15 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
# drop minNonRepSize: 3068 1286657
# drop minIdent: 2306 34000
# drop minCover: 3166 230461
# weird over: 388 5030
# kept weird: 168 270
# drop localBest: 3647 62505
# kept: 11232 14534
# Percent sequences aligned: 11232/11514 = 97.6%
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.90 -minCover=0.20 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
# drop minNonRepSize: 3068 1286657
# drop minIdent: 2306 34000
# drop minCover: 3418 245102
# weird over: 343 4235
# kept weird: 159 252
# drop localBest: 3206 48291
# kept: 11189 14107
# Percent sequences aligned: 11189/11514 = 97.2%
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.80 -minCover=0.20 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
#drop minNonRepSize: 3068 1286657
# drop minIdent: 1 2
# drop minCover: 3599 256955
# weird over: 414 8594
# kept weird: 173 270
# drop localBest: 3410 70389
# kept: 11205 14154
# Percent sequences aligned: 11205/11514 = 97.3%
# 35 BZ83M20.Z.BACENDS
# 17 ZKP63A5.YA.BACENDS
# 17 ZKP117C9.YA.BACENDS
# 16 ZK30E10.SP6.BACENDS
# 15 ZC133H17.ZA.BACENDS
# 13 FJ07G09.X1.WU_ZFEST
# 11 ZK105J10.T7.BACENDS
# 10 ZC261G9.ZAF.BACENDS
# 10 ZC261G9.ZA.BACENDS
# 9 ZK19H9.SP6.BACENDS
# 9 Z4910.MGH
# 8 ZK4I5.T7.BACENDS
# 8 ZC27I3.ZA.BACENDS
# 8 Z7243.MGH
# 8 Z3157.MGH
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.80 -minCover=0.15 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
#drop minNonRepSize: 3068 1286657
# drop minIdent: 1 2
# drop minCover: 3322 238087
# weird over: 470 9995
# kept weird: 181 288
# drop localBest: 3876 88821
# kept: 11246 14590
# Percent sequences aligned: 11246/11514 = 97.7%
# Use lower minId and higher minCover (0.20) as for the BAC ends and for
# the RH map on other zebrafish assemblies.
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.85 -minCover=0.20 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
#drop minNonRepSize: 3068 1286657
# drop minIdent: 775 3806
# drop minCover: 3552 255528
# weird over: 403 7578
# kept weird: 171 268
# drop localBest: 3358 68020
# kept: 11203 14146
# 97.3% (11203/11514) of sequences are aligned using these filter criteria
# Loaded these sequences as below and then checked the rhMap track in the
# danRer4 Genome Browser to see if there are any pileups.
# there is one big pileup on chr24 that is in the same region as
# was found for danRer3 after using liftOver:
# i.e. chr13:8,112,962-8,113,055 on danRer3 which lifts over to
# chr24:8,191,404-8,191,497 on danRer4 and there is also a pileup
# of RH map sequences here. If you look at Z33743, it has 3 alignments
# to chr23, chr24 and chrNA_random. The chr23 alignment is the best and
# this is where its primers map to. If a higher threshold is taken
# for min coverage in the filtering, this may be avoided. Checked all the
# whole chromosome views in the Browser and chr24 is the only one that
# appears to have this large pileup.
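# One way to inspect a marker like Z33743 outside the Browser (a minimal
# sketch, not from the original log; assumes the trial rhMap table loaded
# above is in place and that names follow the NAME.SOURCE convention):
hgsql -e 'select tName, tStart, tEnd from rhMap where qName like "Z33743.%";' danRer4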
# try increasing the minCover parameter:
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.85 -minCover=0.25 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
#drop minNonRepSize: 3068 1286657
# drop minIdent: 775 3806
# drop minCover: 3754 271241
# weird over: 358 6379
# kept weird: 157 252
# drop localBest: 2916 52769
# kept: 11100 13684
# Percent sequences aligned: 11100/11514 = 96.4%
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.85 -minCover=0.30 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
# drop minNonRepSize: 3068 1286657
# drop minIdent: 775 3806
# drop minCover: 3929 283124
# weird over: 310 5451
# kept weird: 145 236
# drop localBest: 2549 41325
# kept: 10938 13245
# Percent sequences aligned: 10938/11514 = 95.0%
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.85 -minCover=0.40 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
#drop minNonRepSize: 3068 1286657
# drop minIdent: 775 3806
# drop minCover: 4293 298517
# weird over: 245 4052
# kept weird: 128 211
# drop localBest: 2079 26658
# kept: 10489 12519
# Percent sequences aligned: 10489/11514 = 91.1%
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.85 -minCover=0.35 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
# drop minNonRepSize: 3068 1286657
# drop minIdent: 775 3806
# drop minCover: 4119 292022
# weird over: 274 4640
# kept weird: 137 227
# drop localBest: 2279 32801
# kept: 10724 12871
# Percent sequences aligned: 10724/11514 = 93.1%
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.85 -minCover=0.32 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
# drop minNonRepSize: 3068 1286657
# drop minIdent: 775 3806
# drop minCover: 4001 287002
# weird over: 296 5113
# kept weird: 144 235
# drop localBest: 2437 37599
# kept: 10862 13093
# Percent sequences aligned: 10862/11514 = 94.3%
rm contig*
# Final parameters: use minCover=0.33
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.85 -minCover=0.33 raw.psl contig.psl
# seqs aligns
# total: 11326 1628158
# drop invalid: 1 1
# drop minNonRepSize: 3068 1286657
# drop minIdent: 775 3806
# drop minCover: 4045 288763
# weird over: 287 4946
# kept weird: 142 233
# drop localBest: 2375 35906
# kept: 10818 13025
# Percent sequences aligned: 10818/11514 = 94.0%
# This is a compromise between reducing the number of sequences piling
# up on chr24 but not losing all alignments for too many sequences.
cd /cluster/data/danRer4/bed/ZonLab/rhMap
# lift up to genome level coordinates
rm rhMap*psl
liftUp rhMap.psl \
/cluster/data/danRer4/jkStuff/liftAll.lft warn \
/san/sanvol1/scratch/danRer4/rhMap/contig.psl
# Got 6247 lifts in /cluster/data/danRer4/jkStuff/liftAll.lft
pslCheck rhMap.psl
# psl looks ok
# cleanup
rm *.bak rhMap.headers rhMap.names *.sort headers.new
# Load sequence alignments into the database
ssh hgwdev
cd /cluster/data/danRer4/bed/ZonLab/rhMap
# drop test tables and reload final psl file
# drop old rhMap table
hgsql -e 'drop table rhMap;' danRer4
hgLoadPsl danRer4 rhMap.psl
# Copy sequences to gbdb if they are not already there.
mkdir -p /gbdb/danRer4/rhMap
# remove old sequences
rm /gbdb/danRer4/rhMap/rhMap20061003.fa
ln -s \
/cluster/data/danRer4/bed/ZonLab/rhMap/rhMap.fa \
/gbdb/danRer4/rhMap/rhMap20061003.fa
# then add sequences to database:
# remove old sequences (2007-02-14, hartera)
hgsql -e 'select * from extFile where path like "%rhMap%";' danRer4
# | id     | name             | path                                 | size    |
# +--------+------------------+--------------------------------------+---------+
# | 709793 | rhMap20061003.fa | /gbdb/danRer4/rhMap/rhMap20061003.fa | 7456887 |
hgsql -e 'select count(*) from seq where extFile = 709793;' danRer4
# 11514
hgsql -e 'delete from seq where extFile = 709793;' danRer4
hgsql -e 'delete from extFile where id = 709793;' danRer4
# then reload the new sequence file
hgLoadSeq danRer4 /gbdb/danRer4/rhMap/rhMap20061003.fa
# loaded successfully
# Check in the Browser and see if there are many pileups
# Much reduced now on chr24. Took 10 random sequences from the pileup seen
# with minCover=0.20 and found that 7 of them still align to danRer4
# with minCover=0.33, and 2 of those that no longer align also have primers
# that do not map using the hgPcr tool.
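# To list what remains in the chr24 pileup region after the reload (a
# minimal sketch, not part of the original log; coordinates are the
# danRer3 liftOver region noted above):
hgsql -N -e 'select qName from rhMap where tName = "chr24" and tEnd > 8191404 and tStart < 8191497;' danRer4 | sort | uniq -c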
# Add trackDb entry and also an rhMap.html for trackDb/zebrafish/danRer4
# also add the search specs for hgFindSpec to trackDb.ra
# Add table of related information for the RH map details pages:
# Check that all the headers from rhMap.headers are also in the primers
# file which seems to contain the same headers from the FASTA file
# as well as additional markers.
# Remake the rhMapZfishInfo table too (hartera, 2007-02-14) so that
# new line is removed from 1942C.INSERTMUT line and also the underscore is
# added to the FJ34C05.Y1_FJ56G09.Y1.WU_ZFEST ID in place of "/".
ssh kkstore04
cd /cluster/data/danRer4/bed/ZonLab/rhMap/
grep '>' rhMap100306.fa > rhMap.headers
perl -pi.bak -e 's/>//' rhMap.headers
sort rhMap.headers > rhMap.headers.sort
sort rhMapPrimers100306.txt > rhMapPrimers.sort
wc -l *.sort
# 11514 rhMap.headers.sort
# 13437 rhMapPrimers.sort
comm -12 rhMap.headers.sort rhMapPrimers.sort | wc -l
# 11514 in common
# so all FASTA headers from rhMap100306.fa are in the primers file
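# The same check the other way round (a minimal sketch, not in the original
# log): headers missing from the primers file would show up here, so expect
# zero.
comm -23 rhMap.headers.sort rhMapPrimers.sort | wc -l
# 0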
# Get headers again from rhMap.fa file as the names of the sources have
# been changed. Parse out information from headers to add to an rhMapInfo
# table so that this information can be displayed on the details page for
# the RH map markers.
# Fields: 1 - name, 2 - linkage group (chrom), 3 - position number on the
# RH map for that linkage group, 4 - distance (in cR) from the
# top of the linkage group, 5 - type of marker (SSLP, BAC_END, EST, GENE,
# STS), 9 - source, 10 - institute that mapped the marker,
# 11 - 5' forward primer, 12 - 3' reverse primer.
# Sort headers by linkage group and by position
grep '>' rhMap100306.fa > rhMap.headers2
# then use the rhMap.headers2 file to extract the marker information
# and to reformat the names for the FASTA headers to the format
# >NAME.SOURCE where name is the first field separated by "|" and source
# is the 9th field so that names in the rhMap and rhMapInfo tables are
# the same. The source is used to make the name unique.
cat << '_EOF_' > getRhInfo
#!/usr/bin/awk -f
#>z1396|14|418|5707|SSLP|f|cR|f|MGH|MPIEB|ATCCTTCAGCCACTCCTTCA|TGGAACCTGAAAAACACACG|
/^>/ {
sub(/>/,"",$0);
split(toupper($0), a, "\\|");
print a[1]"."a[9]"\tLG"a[2]"\t"a[3]"\t"a[4]"\t"a[5]"\t"a[9]"\t"a[10]"\t"a[11]"\t"a[12];
next;
}
'_EOF_'
# << keep emacs coloring happy
chmod +x getRhInfo
getRhInfo rhMap.headers2 > rhMapInfo.tab
# Sort headers by linkage group (LG) and by position
sort -k 2,2 -k 3,3n rhMapInfo.tab > rhMapInfoSorted.tab
wc -l rhMapInfoSorted.tab
# 11514 rhMapInfoSorted.tab
# Need to add ZFIN IDs - data received on 2006-06-23
# rhSeqWithZdbNameToRachel.zip
unzip rhSeqWithZdbNameToRachel.zip
tail +3 rhSeqWithZdbNameToRachel.txt \
| awk 'BEGIN {OFS= "\t"} {print $1, $7}' \
| sort | uniq > rhSeqZfinIds.txt
# translate names to upper case
cat rhSeqZfinIds.txt | tr '[a-z]' '[A-Z]' > rhSeqZfinIds.format.txt
# then map these marker names and ZFIN IDs to markers in
# rhMapInfoSorted.tab. Also remove spaces - some of the primer sequences
# have spaces (hartera, 2007-02-14)
cat << 'EOF' > mapZfinIds.pl
#!/usr/bin/perl -w
use strict;
my ($zf, $rh, %zfinIds);
$zf = $ARGV[0]; # file of ZFIN IDs and marker names
$rh = $ARGV[1]; # rhMapInfo.tab file
open (ZFIN, $zf) || die "Can not open $zf :$!\n";
open (RH, $rh) || die "Can not open $rh : $!\n";
while (<ZFIN>){
my ($line, @fi);
chomp;
$line = $_;
@fi = split(/\t/, $line);
# store ZFIN ID in hash keyed by marker name
$zfinIds{$fi[1]} = $fi[0];
}
close ZFIN;
# read in the markers from rhMapInfo file
while (<RH>){
my ($li, @f, $marker, @m, $mName, $j, $i);
$mName = "";
$zf = "";
chomp;
$li = $_;
@f = split(/\t/, $li);
$marker = $f[0];
# split by "."
@m = split(/\./, $marker);
# remove the extension after the last "."
$mName = $m[0];
if (($mName ne "") && (exists($zfinIds{$mName}))) {
$zf = $zfinIds{$mName};
}
for ($j = 1; $j < $#m; $j++){
$mName = $mName . "." . $m[$j];
}
if (($mName ne "") && (exists($zfinIds{$mName}))) {
$zf = $zfinIds{$mName};
}
print "$f[0]\t$zf";
# print other fields and remove spaces
for ($i = 1; $i <= $#f; $i++){
$f[$i] =~ s/\s//g;
print "\t$f[$i]";
}
if ($#f == 6){
print "\t\t";
}
print "\n";
}
'EOF'
chmod +x mapZfinIds.pl
perl mapZfinIds.pl rhSeqZfinIds.format.txt rhMapInfoSorted.tab \
> rhMapInfoWithZfinIds.tab
# There are 1867 markers with no ZFIN ID
wc -l rhMapInfo*
# 11514 rhMapInfo.tab
# 11514 rhMapInfoSorted.tab
# 11514 rhMapInfoWithZfinIds.tab
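# The 1867 figure above can be re-derived from the output file (a minimal
# sketch, not in the original log) by counting rows where the ZFIN ID
# column (field 2) is empty:
awk 'BEGIN {FS="\t"} $2 == ""' rhMapInfoWithZfinIds.tab | wc -l
# expect 1867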
# When loading, found that 1942.C has only 1 primer. Problem with
# rhMapPrimers100306.txt. There was a new line between the primers
# for this file so remove it there and in rhMap100306.fa and then
# process it again (now this was done at an earlier step, 2007-02-14).
# Create a table with RH map item information including type, source,
# origin and primer sequences.
# already created rhMapInfo.sql, rhMapInfo.c and rhMapInfo.h files
# using autosql - see danRer3.txt. None of the assemblies with RH
# map on the RR have this rhMapInfo table so it can be redefined.
# load these into a table called rhMapInfo2 - this is rhMapInfo
# with an extra column for the ZFIN ID.
# Use autosql to create a .sql file.
ssh hgwdev
# rename the information table and make it zebrafish specific
# (2007-02-08, hartera)
cat << 'EOF' > ~/kent/src/hg/lib/rhMapZfishInfo.as
table rhMapZfishInfo
"Zebrafish Radiation Hybrid map information"
(
string name; "Name of Radiation Hybrid (RH) map marker"
string zfinId; "ZFIN ID for the marker"
string linkageGp; "Linkage group to which the marker was mapped"
uint position; "Position number in RH map for this linkage group"
uint distance; "Distance from the top of linkage group (cR)"
string markerType; "Type of marker"
string source; "Source of marker"
string mapSite; "Institution that mapped the marker"
string leftPrimer; "Forward primer sequence"
string rightPrimer; "Reverse primer sequence"
)
'EOF'
# << happy emacs
# create .sql, .c and .h files using autoSql
cd ~/kent/src/hg/lib
autoSql rhMapZfishInfo.as rhMapZfishInfo
mv rhMapZfishInfo.h ../inc
# edit rhMapZfishInfo.sql and add an index (INDEX(zfinId)).
# commit these files (*.as, *sql, *.c and *.h) to CVS replacing
# the original rhMapInfo* files.
# make changes to hgc so that it prints the ZFIN ID in addition to the
# other rhMapZfishInfo fields.
# reload table with new name (2007-02-08, hartera):
cd /cluster/data/danRer4/bed/ZonLab/rhMap
hgsql -e 'drop table rhMapInfo;' danRer4
# reloaded the rhMapZfishInfo table (2007-02-08, hartera)
hgsql -e 'drop table rhMapZfishInfo;' danRer4
hgLoadSqlTab danRer4 rhMapZfishInfo ~/kent/src/hg/lib/rhMapZfishInfo.sql \
rhMapInfoWithZfinIds.tab
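# If the INDEX(zfinId) edit mentioned above was missed before loading, the
# index could also be added to the loaded table directly (a minimal sketch,
# not from the original log):
hgsql -e 'alter table rhMapZfishInfo add index(zfinId);' danRer4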
# add code to hgc.c to print ZFIN ID, if available, on the details page
# together with the other marker-related information.
# added track to trackDb.ra in trackDb/zebrafish/danRer4 with a URL for
# the ZFIN IDs to link to the relevant page at http://www.zfin.org
# and added an html page for the track.
# Added the rhMapZfishInfo.h file to the makefile in src/hg/lib
# and replaced rhMapInfo with rhMapZfishInfo in src/hg/hgc/hgc.c
# RH MAP STATISTICS
# Get some stats for Yi Zhou at Harvard (2007-03-20 & 2007-03-28)
# Of the 11514 markers with sequence information, 10818 aligned (94%)
# using a filter for 85% sequence identity and all portions of all
# alignments for a sequence must be within 0.5% of the identity of the
# best alignments for each portion of the marker. The query must have at
# least 0.33 of the sequence aligned and at least 16 bases must not be in
# repeat regions.
cd /cluster/data/danRer4/bed/ZonLab/rhMap
mkdir stats
cd stats
hgsql -e 'select count(distinct(qName)) from rhMap;' danRer4
# 10818
hgsql -N -e 'select qName from rhMap;' danRer4 | sort | uniq -c \
| sort -nr > qNames.count
# send this list too
# 1701 markers have 2 or more BLAT alignments that pass the filter.
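# The 1701 figure can be re-derived from qNames.count (a minimal sketch,
# not in the original log); the first column is the number of alignments
# per marker:
awk '$1 >= 2' qNames.count | wc -l
# expect 1701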
hgsql -N -e 'select name, linkageGp from rhMapZfishInfo;' danRer4 \
> markers.linkageGroups
hgsql -N -e 'select qName, tName from rhMap;' danRer4 > rhMap.align.chroms
ssh kkstore04
cd /cluster/data/danRer4/bed/ZonLab/rhMap/stats
sed -e 's/LG/chr/' markers.linkageGroups > markers.rhMap.chroms
# some marker names contain "LG"
awk '{print $1}' markers.linkageGroups | grep "LG"
# there are 18 and all begin with "TLG"
sed -e 's/Tchr/TLG/' markers.rhMap.chroms > markers.rhMap.chroms2
sort markers.rhMap.chroms2 | uniq > markers.rhMap.chroms.sort
wc -l markers.rhMap.chroms*
# 11514 markers.rhMap.chroms
# 11514 markers.rhMap.chroms.sort
# 11514 markers.rhMap.chroms2
# same when uniqued
sort rhMap.align.chroms | uniq > rhMap.align.chroms.sort
wc -l rhMap.align*
# 13025 rhMap.align.chroms
# 11344 rhMap.align.chroms.sort
# Find how well the RH map and Zv6 agree in terms of chromosome
# assignment given that linkage group number is the same as the
# chromosome number.
comm -23 rhMap.align.chroms.sort markers.rhMap.chroms.sort \
> diffChromInGenome
# need to restrict markers.rhMap.chroms.sort to just those markers that
# aligned (i.e. those present in rhMap.align.chroms.sort).
awk '{print $1}' rhMap.align.chroms.sort | sort | uniq > rhMap.align.names
foreach n (`cat rhMap.align.names`)
echo $n
grep -w $n markers.rhMap.chroms.sort >> markers.rhMap.chroms.aligned
end
# 10818 in markers.rhMap.chroms.aligned
# 10818 rhMap.align.names
# then compare this list to the ones that are aligned to the genome
comm -13 rhMap.align.chroms.sort markers.rhMap.chroms.aligned \
> diffChromInRHMap
wc -l diffChromInRHMap
# 1392 diffChromInRHMap
# these are the markers that have a different chromosome (linkage group)
# assigned in the RH map from that found by BLAT alignment of the marker
# sequence to the genome. This list shows the linkage group (chr) from the
# RH map; next, generate a list of where these markers align in the genome.
# Markers not in this list have at least one alignment to the same chrom
# as in the linkage map (they may also align to other chroms).
awk '{print $1}' diffChromInRHMap > diffChromInRHMap.names
foreach n (`cat diffChromInRHMap.names`)
echo $n
grep -w $n rhMap.align.chroms.sort >> rhMap.genomeAlign.diffInRHmap
end
wc -l rhMap.genomeAlign.diffInRHmap
# 1562 rhMap.genomeAlign.diffInRHmap
# This is the list of markers that differ in chrom between the RH map
# and genome alignment with the list of chroms to which they are
# aligned by BLAT in an alignment of the marker sequence to the genome.
# There are more lines in this file because some markers align more than
# once to the genome so they appear more than once in the file.
# Therefore of those markers aligned, 10818, there are 1392 (12.9%)
# that are aligning to a different chromosome.
# Some of these may be aligning to chrUn_random or chrNA_random
grep random rhMap.genomeAlign.diffInRHmap | awk '{print $1}' \
| sort | uniq > diffInRHmap.alignedToRandom
wc -l diffInRHmap.alignedToRandom
# 142 diffInRHmap.alignedToRandom
# Of the markers with different chroms in the genome alignment and the
# linkage map, 142 (1.3% of 10818) are aligning to chrUn_random or
# chrNA_random so the sequence containing these markers has
# not yet been placed on a chromosome.
#########################################################################
## Reorder Fish organisms (DONE - 2006-12-22 - Hiram)
hgsql -h genome-testdb hgcentraltest \
-e "update dbDb set orderKey = 450 where name = 'danRer4';"
##########################################################################
# GenBank gbMiscDiff table (markd 2007-01-10)
# Supports `NCBI Clone Validation' section of mgcGenes details page
# genbank release 157.0 now contains misc_diff fields for MGC clones
# reloading mRNAs results in gbMiscDiff table being created.
./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna danRer4
#########################################################################
# BLASTZ/CHAIN/NET oryLat1 (DONE - 2007-01-19,20 - Hiram)
ssh kkstore04
mkdir /cluster/data/danRer4/bed/blastz.oryLat1.2007-01-19
cd /cluster/data/danRer4/bed/blastz.oryLat1.2007-01-19
cat << '_EOF_' > DEF
# Zebrafish vs. Medaka
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Zebrafish danRer4, no randoms or Un in this sequence
SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/danRer4/danRer4.noUn.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000
SEQ1_LIMIT=30
# QUERY: Medaka oryLat1 (40M chunks cover the largest chroms in one gulp)
# chrUn in Scaffolds for this alignment run
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=40000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/danRer4/bed/blastz.oryLat1.2007-01-19
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl DEF -verbose=2 \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/danRer4OryLat1 > do.log 2>&1 &
# real 556m6.806s
cat fb.danRer4.chainOryLat1Link.txt
# 209746583 bases of 1626093931 (12.899%) in intersection
cd /cluster/data/danRer4/bed
ln -s blastz.oryLat1.2007-01-19 blastz.oryLat1
## swap to oryLat1 - also in oryLat1.txt
mkdir /cluster/data/oryLat1/bed/blastz.swap.danRer4
cd /cluster/data/oryLat1/bed/blastz.swap.danRer4
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/danRer4/bed/blastz.oryLat1.2007-01-19/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -bigClusterHub=pk > swap.log 2>&1 &
cat fb.oryLat1.chainDanRer4Link.txt
# 156014546 bases of 700386597 (22.275%) in intersection
cd /cluster/data/oryLat1/bed
ln -s blastz.swap.danRer4 blastz.danRer4
#########################################################################
# BLASTZ/CHAIN/NET fr2 (DONE - 2007-01-29 - Hiram)
ssh kkstore04
mkdir /cluster/data/danRer4/bed/blastz.fr2.2007-01-29
cd /cluster/data/danRer4/bed/blastz.fr2.2007-01-29
cat << '_EOF_' > DEF
# Zebrafish vs. Fugu
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Zebrafish danRer4, no randoms or Un in this sequence
SEQ1_DIR=/san/sanvol1/scratch/danRer4/danRer4.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/danRer4/danRer4.noUn.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000
SEQ1_LIMIT=30
# QUERY: Fugu fr2
# Align to the scaffolds, results lifted up to chrUn.sdTrf coordinates
SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/danRer4/bed/blastz.fr2.2007-01-29
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl DEF -verbose=2 \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/danRer4Fr2 > do.log 2>&1 &
## recover from pk kluster problems and finish blastz job
time doBlastzChainNet.pl DEF -verbose=2 \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-continue=cat -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/danRer4Fr2 > cat.log 2>&1 &
## recover from kki kluster problems and finish chain job
time doBlastzChainNet.pl DEF -verbose=2 \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-continue=chainMerge -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/danRer4Fr2 > chainMerge.log 2>&1 &
# real 554m13.214s
## swap
mkdir /cluster/data/fr2/bed/blastz.danRer4.swap
cd /cluster/data/fr2/bed/blastz.danRer4.swap
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/danRer4/bed/blastz.fr2.2007-01-29/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -bigClusterHub=pk > swap.log 2>&1 &
# running 2007-01-30 - 16:35
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/danRer4/bed/blastz.fr2.2007-01-29/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-continue=net -swap -bigClusterHub=pk > net_swap.log 2>&1 &
ssh hgwdev
cd /cluster/data/danRer4/bed/blastz.fr2.2007-01-29
time nice -n +19 featureBits danRer4 chainFr2Link \
> fb.danRer4.chainFr2Link.txt 2>&1
# 138918185 bases of 1626093931 (8.543%) in intersection
time nice -n +19 featureBits fr2 chainDanRer4Link \
> fb.fr2.chainDanRer4Link.txt 2>&1
# 80963231 bases of 393312790 (20.585%) in intersection
# ASZ (3-22-2007) this process failed to create four tables, so I created
# them and left them empty (as discussed with Hiram).
CREATE TABLE `danRer4`.`chrUn_random_chainFr2` (
`bin` smallint( 5 ) unsigned NOT NULL default '0',
`score` double NOT NULL default '0',
`tName` varchar( 255 ) NOT NULL default '',
`tSize` int( 10 ) unsigned NOT NULL default '0',
`tStart` int( 10 ) unsigned NOT NULL default '0',
`tEnd` int( 10 ) unsigned NOT NULL default '0',
`qName` varchar( 255 ) NOT NULL default '',
`qSize` int( 10 ) unsigned NOT NULL default '0',
`qStrand` char( 1 ) NOT NULL default '',
`qStart` int( 10 ) unsigned NOT NULL default '0',
`qEnd` int( 10 ) unsigned NOT NULL default '0',
`id` int( 10 ) unsigned NOT NULL default '0',
KEY `bin` ( `bin` ) ,
KEY `id` ( `id` )
) TYPE = MYISAM ;
CREATE TABLE `danRer4`.`chrUn_random_chainFr2Link` (
`bin` smallint( 5 ) unsigned NOT NULL default '0',
`tName` varchar( 255 ) NOT NULL default '',
`tStart` int( 10 ) unsigned NOT NULL default '0',
`tEnd` int( 10 ) unsigned NOT NULL default '0',
`qStart` int( 10 ) unsigned NOT NULL default '0',
`chainId` int( 10 ) unsigned NOT NULL default '0',
KEY `bin` ( `bin` ) ,
KEY `chainId` ( `chainId` )
) TYPE = MYISAM ;
CREATE TABLE `danRer4`.`chrNA_random_chainFr2` (
`bin` smallint( 5 ) unsigned NOT NULL default '0',
`score` double NOT NULL default '0',
`tName` varchar( 255 ) NOT NULL default '',
`tSize` int( 10 ) unsigned NOT NULL default '0',
`tStart` int( 10 ) unsigned NOT NULL default '0',
`tEnd` int( 10 ) unsigned NOT NULL default '0',
`qName` varchar( 255 ) NOT NULL default '',
`qSize` int( 10 ) unsigned NOT NULL default '0',
`qStrand` char( 1 ) NOT NULL default '',
`qStart` int( 10 ) unsigned NOT NULL default '0',
`qEnd` int( 10 ) unsigned NOT NULL default '0',
`id` int( 10 ) unsigned NOT NULL default '0',
KEY `bin` ( `bin` ) ,
KEY `id` ( `id` )
) TYPE = MYISAM ;
CREATE TABLE `danRer4`.`chrNA_random_chainFr2Link` (
`bin` smallint( 5 ) unsigned NOT NULL default '0',
`tName` varchar( 255 ) NOT NULL default '',
`tStart` int( 10 ) unsigned NOT NULL default '0',
`tEnd` int( 10 ) unsigned NOT NULL default '0',
`qStart` int( 10 ) unsigned NOT NULL default '0',
`chainId` int( 10 ) unsigned NOT NULL default '0',
KEY `bin` ( `bin` ) ,
KEY `chainId` ( `chainId` )
) TYPE = MYISAM ;
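# One way the empty tables could have been created (a minimal sketch, not
# from the original log; the file name emptyFr2ChainTables.sql is
# hypothetical): save the four CREATE TABLE statements above to a file,
# load them, then confirm the tables exist:
hgsql danRer4 < emptyFr2ChainTables.sql
hgsql -e 'show tables like "%chainFr2%";' danRer4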
###########################################################################
# CREATE LIFTOVER FROM danRer4 TO danRer5
# (DONE, 2007-09-21 - 2007-09-22, hartera)
ssh kkstore04
mkdir /cluster/data/danRer4/bed/blat.danRer5
cd /cluster/data/danRer4/bed/blat.danRer5
time nice doSameSpeciesLiftOver.pl danRer4 danRer5 \
-bigClusterHub pk \
-ooc /san/sanvol1/scratch/danRer4/danRer4_11.ooc \
-buildDir=/cluster/data/danRer4/bed/blat.danRer5 >& do.log &
# 0.337u 0.208s 4:58:26.59 0.0% 0+0k 0+0io 28pf+0w
# Remove symbolic link to liftOver chains and copy over the file
rm ../liftOver/danRer4ToDanRer5.over.chain.gz
cp -p danRer4ToDanRer5.over.chain.gz ../liftOver
# a link in /usr/local/apache/htdocs/goldenPath/danRer5/liftOver has
# already been made to this file and md5sum.txt needs to be updated
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/danRer4/liftOver
md5sum *.gz > md5sum.txt
md5sum *.gz > ../../goldenPath/liftOver/md5sum.txt
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/danRer4/liftOver
ln -s /cluster/data/danRer5/bed/liftOver/danRer4ToDanRer5.over.chain.gz .
#############################################################################
# CONTRAST GENES (2007-10-02 markd)
# received predictions from Sam Gross <ssgross@stanford.edu>
cd /cluster/data/danRer4/bed/contrastGene/
wget http://www.stanford.edu/~ssgross/contrast.danRer4.bed
# this is a custom track, not a pure BED
tail +2 contrast.danRer4.bed | hgLoadBed -tab danRer4 contrastGene stdin
# verify
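# A minimal verification sketch (not from the original log): confirm the
# table loaded and get its coverage.
hgsql -e 'select count(*) from contrastGene;' danRer4
featureBits danRer4 contrastGene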
# load track db (ra and contrastGene.html are global)
# request push of contrastGene
###########################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
danRer4.upstreamGeneTbl = refGene
danRer4.upstreamMaf = multiz7way /hive/data/genomes/danRer4/bed/multiz7way/species.lst