src/hg/makeDb/doc/danRer5.txt 1.17
1.17 2009/08/10 16:38:35 hartera
Documented production of dog canFam2 chains, nets and downloads.
Index: src/hg/makeDb/doc/danRer5.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/danRer5.txt,v
retrieving revision 1.16
retrieving revision 1.17
diff -b -B -U 1000000 -r1.16 -r1.17
--- src/hg/makeDb/doc/danRer5.txt 21 Jul 2009 21:01:41 -0000 1.16
+++ src/hg/makeDb/doc/danRer5.txt 10 Aug 2009 16:38:35 -0000 1.17
@@ -1,3147 +1,3162 @@
# for emacs: -*- mode: sh; -*-
# Danio rerio (zebrafish) from Sanger, version Zv7 (released 07/13/07)
# Project website:
# http://www.sanger.ac.uk/Projects/D_rerio/
# Assembly notes:
# http://www.sanger.ac.uk/Projects/D_rerio/
# ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_assembl_information.shmtl
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
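# For example, a hedged sketch (table and file names below are placeholders,
# not from this build): loading with hgLoadGenePred creates the table with
# a bin column, so loads done that way avoid the problem:
# hgLoadGenePred danRer5 someGeneTable someGenes.gp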
###########################################################################
# DOWNLOAD SEQUENCE (DONE, 2007-07-16, hartera)
# MOVE FILES TO SEPARATE DIRECTORY (DONE, 2007-07-20, hartera)
ssh kkstore06
mkdir /cluster/store4/danRer5
ln -s /cluster/store4/danRer5 /cluster/data
cd /cluster/data/danRer5
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/README
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_chr.agp
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_contigs.fa
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_scaffold.agp
wget --timestamp \
ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv7release/Zv7_scaffolds.fa
# move assembly files to a separate directory
mkdir Zv7release
mv *.agp *.fa README ./Zv7release/
###########################################################################
# CREATE AGP FILES FOR RANDOMS (chrNA and chrUn)
# (2007-08-16 - 2007-08-18, hartera)
ssh kkstore06
mkdir /cluster/data/danRer5/Zv7release/randoms
cd /cluster/data/danRer5/Zv7release/randoms
# first find the contigs assigned to chromosomes:
awk '{if ($5 !~ /N/) print $6}' ../Zv7_chr.agp | sort | uniq \
> chromContigs.txt
# get list of all contigs:
awk '{if ($5 !~ /N/) print $6}' ../Zv7_scaffold.agp | sort | uniq \
> allContigs.txt
# find contigs not assigned to a chromosome
comm -23 allContigs.txt chromContigs.txt > contigsRandomsAndAccs.txt
# get all those that are in the Zv7_scaffold.agp and get a list of
# scaffold names.
foreach c (`cat contigsRandomsAndAccs.txt`)
grep -w $c ../Zv7_scaffold.agp >> contigsRandomsScafs.agp
end
# check that all the contigs/clones names in contigsRandomsAndAccs.txt
# are also in contigsRandomsScafs.agp
awk '{if ($6 != 100) print $6}' contigsRandomsScafs.agp \
| sort | uniq > names.sort
wc -l names.sort contigsRandomsAndAccs.txt
# 19845 names.sort
# 19845 contigsRandomsAndAccs.txt
comm -12 contigsRandomsAndAccs.txt names.sort | wc -l
# 19845
# get list of scaffold names
awk '{print $1}' contigsRandomsScafs.agp | sort -k 1.7,1n | uniq \
> scaffoldsRandoms.txt
# make an AGP of the randoms scaffolds only
grep -w -f scaffoldsRandoms.txt ../Zv7_scaffold.agp > randomsScaffold.agp
# check that we have all the scaffolds to be selected
awk '{print $1}' randomsScaffold.agp | sort | uniq > names2.sort
sort scaffoldsRandoms.txt > scafsTmp.sort
wc -l scafsTmp.sort names2.sort
# 5010 scafsTmp.sort
# 5010 names2.sort
comm -12 scafsTmp.sort names2.sort | wc -l
# 5010
# all scaffolds are in the new AGP file that occur in scaffoldsRandoms.txt
# get the list of contigs from the agp
awk '{if ($5 !~ /N/) print $6}' randomsScaffold.agp | sort | uniq \
> randomContigs.list
# extract the FASTA sequences for just these contigs
faSomeRecords ../Zv7_contigs.fa randomContigs.list Zv7contigs_random.fa
# remove excess part of the names for the contigs with accessions
perl -pi.bak -e 's/^(>[A-Z]{2}[0-9]+\.[0-9]+)\s.+$/$1/' Zv7contigs_random.fa
# check that all the contigs from the list are in the FASTA file
grep '>' Zv7contigs_random.fa | sed -e 's/^>//' | sort | uniq \
> Zv7contigs.list
wc -l *.list
# 19736 Zv7contigs.list
# 19845 randomContigs.list
comm -13 Zv7contigs.list randomContigs.list > notFound
wc -l notFound
# 109 notFound
# (2007-08-06, hartera)
# RE-DONE (2007-08-18, hartera)
# These are present in the original, but there are several sequences for
# each accession since there are Ns in the sequence. Need to fix the names
# in the AGP to match each part of the sequence in the FASTA file.
# First, get the list of headers from the FASTA for all contigs
grep '>' ../Zv7_contigs.fa > contigs.headers
# remove ">"
perl -pi.bak -e 's/>//' contigs.headers
foreach a (`cat notFound`)
grep $a contigs.headers >> contigsHeaders.notFound
end
awk '{print $1}' contigsHeaders.notFound > contigsHeaders.notFound.accs
perl -pi.bak -e 's/_.+$//' contigsHeaders.notFound.accs
sort contigsHeaders.notFound.accs | uniq \
> contigsHeaders.notFound.accs.sort
sort notFound | uniq > notFound.sort
wc -l notFound.sort
# 109 notFound.sort
wc -l contigsHeaders.notFound.accs.sort
# 109 contigsHeaders.notFound.accs.sort
comm -12 notFound.sort contigsHeaders.notFound.accs.sort | wc -l
# 109
# So all the notFound accessions are in the list of contig headers
# accessions: contigsHeaders.notFound.accs.sort
# Then extract the names for the accession parts
# e.g. BX649254.13_01364 zK228P6.01364 BX649254 1 32480
# and add them to the AGP file for the correct lines for these
# components. The last 2 fields above are the start and end coordinates
# relative to the accession (BX649254.13 in this case). Also, add
# 50,000 Ns between scaffolds.
# Wrote program to do this: agpAddCtgNamesAndGaps.c
/cluster/home/hartera/bin/x86_64/agpAddCtgNamesAndGaps \
contigsHeaders.notFound randomsScaffold.agp \
randomsScafFixed.agp >& agpScafs.log
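# As an aside, a minimal awk sketch of the renaming step this program
# performs (an illustration only: the real program also renumbers parts and
# inserts the 50,000 N inter-scaffold gaps, which this sketch does not do).
# Each header line is keyed on accession + start + end, and matching AGP
# component lines are renamed to the fragment name:
cat << '_EOF_' > renameFrags.awk
BEGIN { OFS = "\t" }
# pass 1 (headers file): map accession+start+end to the fragment name
NR == FNR { split($1, a, "_"); frag[a[1] SUBSEP $4 SUBSEP $5] = $1; next }
# pass 2 (AGP file): rename matching component lines (field 6)
$5 != "N" && (($6 SUBSEP $7 SUBSEP $8) in frag) {
    $6 = frag[$6 SUBSEP $7 SUBSEP $8]
}
{ print }
'_EOF_'
awk -f renameFrags.awk contigsHeaders.notFound randomsScaffold.agp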
awk '{print $1}' randomsScaffold.agp | sort | uniq > scafs.lst.uniq
wc -l scafs.lst.uniq
# 5010 scafs.lst.uniq
wc -l *.agp
# 46688 contigsRandomsScafs.agp
# 40641 randomsScafFixed.agp
# 35633 randomsScaffold.agp
# 35633 + 5010 = 40643, but there are 2 fewer gap rows since there are none
# at the ends of the random chromosomes. So the number of lines in the
# AGP file is correct: 40641.
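# quick sanity check of that arithmetic:
echo '35633 + 5010 - 2' | bc
# 40641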
# get the list of contigs again for the randoms from the FASTA headers
# and check all of these are present in the AGP file and vice versa.
cp contigs.headers contigs.names
# remove excess part of the names for the contigs with accessions
perl -pi.bak -e \
's/^([A-Z]{2}[0-9]+\.[0-9]+_?[0-9]*\.?[0-9]*)\s.+$/$1/' \
contigs.names
sort contigs.names | uniq > contigs.names.sort
awk '{print $6}' randomsScafFixed.agp | sort | uniq > contigsFromAgp.sort
wc -l contigs*.sort
# 60092 contigs.names.sort
# 20351 contigsFromAgp.sort
comm -13 contigs.names.sort contigsFromAgp.sort
# CR293502.4
# CR356227.33
# CR388165.16
# CR854948.10
# CR931788.11
# CR954226.7
# CT573234.3
# CT573263.4
# CT737182.2
# CT997808.4
# These accessions are not matched from the headers to the AGP file.
# e.g. CR293502.4
# CR293502.4_00001 zK31A1.00001 CR293502 1 27131
# CR293502.4_02478 zK31A1.02478 CR293502 27232 29631
# CR293502.4_01210.0 zK31A1.01210.0 CR293502 119649 233137
# The last 2 fields above are coordinates relative to the accession and should
# match fields 7 and 8 in the relevant lines of the AGP file but, in this
# case, they do not.
# in the AGP file:
# Zv7_scaffold2558 1944730 1960624 49 D CR293502.4 11237
# 27131 +
# Zv7_scaffold2558 1960625 1960724 50 N 100 fragment no
# Zv7_scaffold2558 1960725 1963124 51 D CR293502.4 27232
# 29631 +
# Zv7_scaffold2558 1963125 1963224 52 N 100 fragment no
# Zv7_scaffold2558 1963225 1995007 53 D CR293502.4 201355
# 233137 -
# Co-ordinates are relative to CR293502.4 in fields 7 and 8
grep CR293502.4 randomsScaffold.agp > CR293502.4.agp
# E-mailed Tina Eyre (te3@sanger.ac.uk) and Ian Sealy (is1@sanger.ac.uk) at
# Sanger to ask them about these discrepancies and how to fix them (2007-08-09).
# Received the Zv7_all_scaffolds.agp file on 2007-08-14 from Ian Sealy
# (is1@sanger.ac.uk). This contains the unfinished clones and should not be
# shared with the public. The contig/clone names match up to those for clone
# name fragments in the Zv7_contigs.fa file.
# Received Zv7_all_scaffolds.agp:
grep CR293502.4 ../Zv7_all_scaffolds.agp > CR293502.4.allScafs.agp
# Zv7_scaffold2558 1944730 1960624 49 U CR293502.4_00001
# 11237 27131 + zK31A1.00001 15895 27131
# Zv7_scaffold2558 1960725 1963124 51 U CR293502.4_02478
# 12400 + zK31A1.02478 2400 2400
# Zv7_scaffold2558 1963225 1995007 53 U CR293502.4_01210.0
# 81707 113489 - zK31A1.01210.0 31783 113489
# Coordinates in fields 7 and 8 of this file are relative to the clone
# fragment names in field 6.
foreach f (CR293502*.agp)
awk \
'{if ($5 !~ /N/) print "faFrag ", $6".fa", $7-1, $8, $6$7"-"$8".fa";}' \
$f >> faFrag${f}
end
chmod +x faFrag*
awk '{print $6}' CR293502.4.allScafs.agp > ctgList.txt
foreach f (`cat ctgList.txt`)
echo $f > list
faSomeRecords ../Zv7_contigs.fa list ${f}.fa
end
./faFragCR293502.4.agp
# Wrote 15895 bases to CR293502.411237-27131.fa
# Wrote 2400 bases to CR293502.427232-29631.fa
# Wrote 31783 bases to CR293502.4201355-233137.fa
./faFragCR293502.4.allScafs.agp
# Wrote 15895 bases to CR293502.4_0000111237-27131.fa
# Wrote 2400 bases to CR293502.4_024781-2400.fa
# Wrote 31783 bases to CR293502.4_01210.081707-113489.fa
# When diff on each pair of files of the same size, the sequence is
# identical, only the headers are different.
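# For instance, for the first pair above (a hedged check; both files come
# from faFrag so the line wrapping is identical):
tail -n +2 CR293502.411237-27131.fa > a.seq
tail -n +2 CR293502.4_0000111237-27131.fa > b.seq
diff a.seq b.seq
# no output, so the sequences are identical; only the headers differ
rm a.seq b.seq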
# Decided to base assembly on scaffolds not contigs (2007-08-22)
##########################################################################
# ALL CHROMS AGP (2007-08-18, hartera)
ssh kkstore06
cd /cluster/data/danRer5/Zv7release
awk '{if ($5 !~ /N/) print $6;}' Zv7_all_scaffolds.agp | sort | uniq \
> contigNamesFromAllScafs.sort
# compare to contig names from FASTA file
comm -13 contigNamesFromAllScafs.sort ./randoms/contigs.names.sort
# no output: all contig names from the FASTA file are in the AGP file
comm -23 contigNamesFromAllScafs.sort ./randoms/contigs.names.sort \
> notInAllScafs
wc -l notInAllScafs
# 3924 notInAllScafs
grep "Zv7_NA" notInAllScafs | wc -l
# 3924
# So the only ones not in FASTA file are the 3924 Zv7_NA contigs
# get clone names without underscore and extension
# remove excess part of the names for the contigs with accessions
cp contigNamesFromAllScafs.sort contigNamesFromAllScafs2.sort
perl -pi.bak -e \
's/^([A-Z]{2}[0-9]+\.[0-9]+)_?[0-9]*\.?[0-9]*$/$1/' \
contigNamesFromAllScafs2.sort
grep -v "Zv7_NA" contigNamesFromAllScafs2.sort \
| sort | uniq > contigNamesFromAllScafs2NoNAs.sort
# get list of contigs and clones in randoms only
awk '{if ($5 !~ /N/) print $6;}' ./randoms/randomsScaffold.agp \
| sort | uniq > randomsContigsNames.sort
# remove randoms scaffolds from the list of contigs/clones from
# Zv7_all_scaffolds.agp
comm -13 randomsContigsNames.sort contigNamesFromAllScafs2NoNAs.sort \
> contigNamesFromAllScafs2NoRandoms.txt
sort contigNamesFromAllScafs2NoRandoms.txt | uniq \
> contigNamesFromAllScafs2NoRandoms.sort
# then compare this list to a list of clones/contigs
# from Zv7_chr.agp
awk '{if ($5 !~ /N/) print $6;}' Zv7_chr.agp | sort | uniq \
> chromsAgpContigs.sort
comm -23 contigNamesFromAllScafs2NoRandoms.sort chromsAgpContigs.sort \
| wc -l
# 0
# So there are no new contigs in the Zv7_all_scaffolds.agp file that
# are not randoms or Zv7_NA.
# Try agpAddCtgNamesAndGaps.c on Zv7_chr.agp and see if all
# clone fragments can be found in the FASTA file.
cp ./randoms/contigs.headers .
# get the names from the headers
awk '{print $1}' contigs.headers | sort | uniq > contigsNames.headers.sort
# get the contig/clone names from the Zv7_chr.agp file: sorted file is
# chromContigs.txt
comm -13 contigsNames.headers.sort ./randoms/chromContigs.txt \
> contigsInChromAgpOnly.txt
wc -l contigsInChromAgpOnly.txt
# 575 contigsInChromAgpOnly.txt
# Get FASTA file headers for just this set of contigs. These are the ones
# with fragments named XXNNNN_NN e.g. BX511136.3_00285.0
grep -f contigsInChromAgpOnly.txt contigs.headers \
> contigsInChromAgpOnly.headers
/cluster/home/hartera/bin/x86_64/agpAddCtgNamesAndGaps \
contigsInChromAgpOnly.headers Zv7_chr.agp \
chrFixed.agp >& agpChroms.log
# check if all the contig/clone names in the AGP file have now been
# found in the FASTA file.
sort ./randoms/contigs.names | uniq > contigs.names.sort
# get contig/clone names from fixed AGP file
awk '{if ($5 !~ /N/) print $6}' chrFixed.agp | sort | uniq \
> contigsFromChrFixedAgp.sort
# get list of names in the FASTA contig headers
cp ./randoms/contigs.names.sort .
wc -l contigs*.sort
# 60092 contigs.names.sort
# 39659 contigsFromChrFixedAgp.sort
comm -13 contigs.names.sort contigsFromChrFixedAgp.sort \
> notFoundInChrFixedAgp.txt
wc -l notFoundInChrFixedAgp.txt
# 334 notFoundInChrFixedAgp.txt
echo BX005112.16 > list
nice faSomeRecords Zv7_contigs.fa list BX005112.16_02613.fa
faSize BX005112.16_02613.fa
# 171673 bases (0 N's 171673 real 171673 upper 0 lower) in 1 sequences in
# 1 files
grep BX005112.16 Zv7_chr.agp
# chr21 31909678 32080531 1296 D BX005112.16 820
# 171673 +
# chr21 32080632 32084318 1298 D BX005112.16 171774
# 175460 +
grep BX005112.16 chrFixed.agp
# chr21 1104388161 1104559014 1296 D BX005112.16 820
# 171673 +
# chr21 1104559115 1104562801 1298 D BX005112.16_02934
# 171774 175460 +
grep BX005112.16 Zv7_all_scaffolds.agp
# Zv7_scaffold2077 678529 849382 54 U BX005112.16_02613
# 820 171673 + zK85G15.02613 170854 171673
# Zv7_scaffold2077 849483 853169 56 U BX005112.16_02934
# 1 3687 + zK85G15.02934 3687 3687
grep BX005112.16 contigs.headers
# BX005112.16_02613 zK85G15.02613 BX005112 1 171673
# BX005112.16_02934 zK85G15.02934 BX005112 171774 175460
echo BX005112.16 > list2
nice faSomeRecords Zv7_scaffolds.fa list2 BX005112.fa
# not found, these accessions are not in the FASTA file.
# In order to create the chroms, need to extract the relevant coords from
# the accessions to create the FASTA file.
# Now basing assembly on scaffolds instead of contigs so create
# AGP files for scaffolds.
##########################################################################
# CREATE A SCAFFOLDS AGP WITH SCAFFOLDS ONLY FOR RANDOMS
# (2007-08-22 and 2007-08-24, hartera)
# Make AGP using just the unmapped scaffolds and not making virtual
# chromosomes for unmapped scaffolds so that this is the same as the
# way Ensembl handles these. Therefore there will be 25 chromosomes, chrM,
# plus 5010 unmapped scaffolds.
ssh kkstore06
# make agps and fasta directories
mkdir -p /cluster/data/danRer5/Zv7release/agps
mkdir -p /cluster/data/danRer5/Zv7release/fasta
# move assembly FASTA files to this directory
cd /cluster/data/danRer5/Zv7release
mv Zv7_*.fa ./fasta/
cd /cluster/data/danRer5/Zv7release/randoms
# get list of scaffolds for randoms - one for Un_random and one for
# NA_random scaffolds.
awk '{if ($1 ~ /Zv7_scaffold/) print $1;}' randomsScaffold.agp \
| sort -k 1.7,1n | uniq > Un_randomScafs.list
awk '{if ($1 ~ /Zv7_NA/) print $1;}' randomsScaffold.agp \
| sort -k 1.7,1n | uniq > NA_randomScafs.list
wc -l *randomScafs.list
# 4844 NA_randomScafs.list
# 166 Un_randomScafs.list
# 5010 total
# get sequences for just these scaffolds
foreach f (NA Un)
faSomeRecords ../fasta/Zv7_scaffolds.fa ${f}_randomScafs.list \
Zv7${f}_randomScafs.fa
end
# check that they are all there
foreach f (NA Un)
grep '>' Zv7${f}_randomScafs.fa > ${f}_Random.headers
end
wc -l *Random.headers
# 4844 NA_Random.headers
# 166 Un_Random.headers
# 5010 total
perl -pi.bak -e 's/>//' *Random.headers
foreach f (NA Un)
sort -k 1.7,1n ${f}_Random.headers | uniq > ${f}_Random.headers.sort
end
comm -12 NA_Random.headers.sort NA_randomScafs.list | wc -l
# 4844
comm -12 Un_Random.headers.sort Un_randomScafs.list | wc -l
# 166
# Total is 4844 + 166 = 5010
# so all the sequences in the scaffolds lists are in the FASTA files.
# Make an AGP from these scaffolds FASTA sequences. Use a scaffold gap
# size of 0 since the unmapped scaffolds stay separate sequences; the
# gap lines are removed below anyway.
foreach c (NA_random Un_random)
scaffoldFaToAgp -scaffoldGapSize=0 Zv7${c}Scafs.fa
end
# scaffold gap size is 0, total scaffolds: 4844
# chrom size is 117689868
# writing Zv7NA_randomScafs.agp
# writing Zv7NA_randomScafs.gap
# writing Zv7NA_randomScafs.lft
# scaffold gap size is 0, total scaffolds: 166
# chrom size is 45800611
# writing Zv7Un_randomScafs.agp
# writing Zv7Un_randomScafs.gap
# writing Zv7Un_randomScafs.lft
# Create AGP with just the scaffolds
# sort NA by scaffold number:
# first remove gap lines:
grep -w -v "N" Zv7NA_randomScafs.agp > Zv7NA_randomScafsNoGaps.agp
grep -w -v "N" Zv7Un_randomScafs.agp > Zv7Un_randomScafsNoGaps.agp
sort -k 6.7n -k 6.8n -k 6.9,6.10n -k 6.10,6.11n \
Zv7NA_randomScafsNoGaps.agp > Zv7NA_randomScafsSorted.agp
foreach f (Zv7Un_randomScafsNoGaps.agp Zv7NA_randomScafsSorted.agp)
set g = $f:r
echo $g
awk 'BEGIN {OFS="\t"} {print $6, $7, $8, "1", "W", $6, $7, $8, "+"}' \
$f > ${g}2.agp
end
wc -l Zv7*_randomScafs*agp
# 9687 Zv7NA_randomScafs.agp
# 4844 Zv7NA_randomScafsNoGaps.agp
# 4844 Zv7NA_randomScafsSorted.agp
# 4844 Zv7NA_randomScafsSorted2.agp
# 331 Zv7Un_randomScafs.agp
# 166 Zv7Un_randomScafsNoGaps.agp
# 166 Zv7Un_randomScafsNoGaps2.agp
# 4844 + 166 = 5010 -> total unmapped scaffolds
cat Zv7NA_randomScafsSorted2.agp Zv7Un_randomScafsNoGaps2.agp \
> Zv7AllRandomScafs.agp
wc -l Zv7AllRandomScafs.agp
# 5010 Zv7AllRandomScafs.agp
# move scaffolds AGP to agps directory:
mv Zv7AllRandomScafs.agp /cluster/data/danRer5/Zv7release/agps/
# move sequences to FASTA directory
mv Zv7*_randomScafs.fa \
/cluster/data/danRer5/Zv7release/fasta/
#######################################################################
# PROCESS CHROMOSOME AGP AND SCAFFOLDS AGP TO CREATE A SCAFFOLDS TO
# CHROMOSOMES AGP FILE (DONE, 2007-08-24, hartera)
# The Zv7_chr.agp file contains a mapping of contigs to chromosomes
# and the Zv7_scaffold.agp file contains a mapping of contigs to scaffolds.
# To build the Browser and assemble chromosomes, we need an AGP file
# that maps scaffolds to chromosomes.
# A program, createScaffoldsAgp.c, in kent/src/hg/oneShot/createScaffoldsAgp/,
# was written to create this AGP file.
ssh kkstore06
cd /cluster/data/danRer5/Zv7release/agps
createScaffoldsAgp ../Zv7_scaffold.agp ../Zv7_chr.agp \
Zv7ScafToChrom.agp >& createAgp.log
wc -l *.agp
# 5010 Zv7AllRandomScafs.agp
# 4943 Zv7ScafToChrom.agp
# 9953 total
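# As an illustration only, a rough awk sketch of the core idea (the real
# createScaffoldsAgp.c also validates contig coordinates, assigns part
# numbers, handles strand and emits gap lines; this just derives each
# scaffold's span on its chromosome from the outermost chromosome
# coordinates of its contigs, in arbitrary output order):
cat << '_EOF_' > scafSpans.awk
NR == FNR { if ($5 != "N") scaf[$6] = $1; next }  # pass 1: contig -> scaffold
$5 != "N" && ($6 in scaf) {                       # pass 2: chromosome lines
    s = scaf[$6]
    if (!(s in chromStart) || $2 < chromStart[s]) chromStart[s] = $2
    if ($3 > chromEnd[s]) chromEnd[s] = $3
    chrom[s] = $1
}
END {
    for (s in chromStart)
        print chrom[s], chromStart[s], chromEnd[s], s
}
'_EOF_'
awk -f scafSpans.awk ../Zv7_scaffold.agp ../Zv7_chr.agp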
###########################################################################
# CHECK AGP FILES AND FASTA CONTENTS AND SIZE CONSISTENCY
# (DONE, 2007-08-22, hartera)
#
# Check that all the scaffolds in AGPs are in the FASTA file and vice versa
# get list of scaffolds in AGP files: Zv7ScafToChrom.agp and
# Zv7AllRandomScafs.agp
ssh kkstore06
cd /cluster/data/danRer5/Zv7release/agps
foreach f (*.agp)
awk '{if ($5 !~ /N/) print $6}' $f >> allScafsFromAgps.txt
end
sort allScafsFromAgps.txt | uniq > allScafsFromAgps.sort
grep '>' ../fasta/Zv7_scaffolds.fa | sed -e 's/>//' | sort | uniq \
> scafsHeaders.sort
wc -l *.sort
# 7494 allScafsFromAgps.sort
# 7494 scafsHeaders.sort
comm -12 allScafsFromAgps.sort scafsHeaders.sort | wc -l
# 7494
# 7494 are common to both files so the AGP files contain all the
# scaffolds in the FASTA file and vice versa.
# create one AGP file for the assembly
cat Zv7ScafToChrom.agp Zv7AllRandomScafs.agp > Zv7ScafToChromAndRandom.agp
cd /cluster/data/danRer5/Zv7release/fasta
# make a scaffolds directory to work in
mkdir scaffolds
cd scaffolds
faSize -detailed ../Zv7_scaffolds.fa > Zv7.scaffolds.sizes
# Check that these sizes correspond to the sizes in the scaffolds agp file
# use script compareSizes2.pl
cat << '_EOF_' > compareSizes2.pl
#!/usr/bin/perl -w
# compareSizes2.pl - check that scaffold sizes (from faSize -detailed)
# match the scaffold end coordinates (field 8) in an AGP file.
# Usage: compareSizes2.pl <sizes file> <AGP file>; results go to log.txt
use strict;
my ($file, $agp);
$file = $ARGV[0];
$agp = $ARGV[1];
open(FILE, $file) || die "Can not open $file: $!\n";
open(AGP, $agp) || die "Can not open $agp: $!\n";
open(OUT, ">log.txt") || die "Can not create log.txt: $!\n";
# read the scaffold sizes into a hash keyed by scaffold name
my ($l, @f, $name, $size, %scafsHash);
while (<FILE>)
{
    $l = $_;
    @f = split(/\t/, $l);
    $name = $f[0];
    $size = $f[1];
    $scafsHash{$name} = $size;
}
close FILE;
# for each scaffold line in the AGP, check that the end coordinate
# (field 8) equals the sequence size
while (<AGP>)
{
    my ($line, @fi, $scaf, $end);
    $line = $_;
    if ($line =~ /Zv/)
    {
        @fi = split(/\t/, $line);
        $scaf = $fi[5];
        $end = $fi[7];
        if (exists($scafsHash{$scaf}))
        {
            if ($scafsHash{$scaf} == $end)
            {
                print OUT "$scaf - ok\n";
            }
            else
            {
                print OUT "$scaf - different size to sequence\n";
            }
        }
        else
        {
            print OUT "$scaf - does not exist in list of sizes\n";
        }
    }
}
close AGP;
close OUT;
'_EOF_'
# << happy emacs
chmod +x compareSizes2.pl
perl compareSizes2.pl Zv7.scaffolds.sizes \
../../agps/Zv7ScafToChromAndRandom.agp
grep different log.txt
grep not log.txt
# these are all consistent with the sequence sizes
# check that the co-ordinates in the agp files are consistent:
# field 2 is the start position, field 3 is the end and field 8 is the size
# so check that this is consistent.
cd /cluster/data/danRer5/Zv7release/fasta/scaffolds
awk '{if ($5 !~ /N/ && (($3-$2+1) != $8)) print $6;}' \
../../agps/Zv7AllRandomScafs.agp > Zv7.scaffolds.coordCheck
# this file is empty so they are ok. do the same for the contigs to
# chromsomes .agp file
awk '{if ($5 !~ /N/ && (($3-$2+1) != ($8-$7 +1))) print $6;}' \
../../Zv7_chr.agp > Zv7.contigsToChroms.coordCheck
# this file is empty so ok
# in order to create the scaffolds to chroms AGP file with
# createScaffoldsAgp, the coordinates were checked between the
# Zv7_scaffold.agp and Zv7_chr.agp files.
cd ..
rm -r scaffolds
#######################################################################
# MAKE GENOME DB FROM CHROMOSOMES AND UNMAPPED SCAFFOLDS
# (DONE, 2007-08-24, hartera)
# CHANGE dbDb ENTRY MADE BY SCRIPT TO DISPLAY SAME DEFAULT POSITION
# AS FOR DANRER4, GENE vdac2 (DONE, 2007-09-10, hartera)
# CHANGE dbDb ENTRY SO THAT THE FULL NAME OF THE SANGER INSTITUTE IS GIVEN
# IN THE SOURCENAME FIELD (DONE, 2007-09-22, hartera)
# CHANGED FRAG IN GOLD TABLE FROM GI NUMBER TO ACCESSION FOR chrM
# (DONE, 2007-10-01, hartera)
# Since there will be a mixture of chroms and unmapped scaffolds, the
# chroms must be built first before inputting the assembly sequence to
# makeGenomeDb.pl
ssh kkstore06
cd /cluster/data/danRer5/Zv7release/
cd fasta
agpToFa -simpleMultiMixed ../agps/Zv7ScafToChrom.agp all \
Zv7_chromosomes.fa ./Zv7_scaffolds.fa
# check the chroms are all there
grep '>' Zv7_chromosomes.fa | sed -e 's/>//' > headers
# all chroms 1-25 are there
rm headers
# check agp and FASTA
checkAgpAndFa ../agps/Zv7ScafToChrom.agp Zv7_chromosomes.fa
# All AGP and FASTA entries agree - both files are valid
# cat together chromosomes FASTA and scaffolds FASTA:
cat Zv7_chromosomes.fa Zv7NA_randomScafs.fa Zv7Un_randomScafs.fa \
> Zv7ChromsAndRandomScafs.fa
# To find the gi number for chrM, go to http://www.ncbi.nih.gov/ and search
# Nucleotide for "Danio mitochondrion genome".
# That shows gi number: 15079186, accession: NC_002333.2, 16596 bp
# also GI:8576324 and accession: AC024175
cd /cluster/data/danRer5/
cat << 'EOF' > danRer5.config.ra
db danRer5
clade vertebrate
scientificName Danio rerio
commonName Zebrafish
assemblyDate Jul. 2007
assemblyLabel Sanger Centre, Danio rerio Sequencing Project Zv7
orderKey 449
dbDbSpeciesDir zebrafish
# NC_002333.2
mitoAcc 15079186
agpFiles /cluster/data/danRer5/Zv7release/agps/Zv7ScafToChromAndRandom.agp
fastaFiles /cluster/data/danRer5/Zv7release/fasta/Zv7ChromsAndRandomScafs.fa
'EOF'
# << keep emacs coloring happy
# corrected the wget URL for the mitochondrion genome to:
# http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&id=15079186&retmode=text
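# e.g. (quote the URL so the shell does not interpret the '&'s):
wget -O chrM.fa \
'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&id=15079186&retmode=text'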
# remove directories created
rm -r TemporaryTrackDbCheckout bed html jkStuff
rm dbDbInsert.sql
ssh hgwdev
cd /cluster/data/danRer5/
hgsql -e 'delete from dbDb where name="danRer5";' hgcentraltest
ssh kkstore06
cd /cluster/data/danRer5
# Run with -debug to see what it would do / check for issues:
makeGenomeDb.pl danRer5.config.ra -debug -verbose=3
makeGenomeDb.pl danRer5.config.ra >& makeGenomeDb.log &
# Took about 10 minutes.
tail -f makeGenomeDb.log
# Follow the directions at the end of the log after
#NOTES -- STUFF THAT YOU WILL HAVE TO DO --
# Search for '***' notes in each file and make corrections (sometimes the
# files used for a previous assembly might make a better template):
# description.html /cluster/data/danRer5/html/{trackDb.ra,gap.html,gold.html}
# Then cd ../.. (to trackDb/) and
# - edit makefile to add danRer5 to DBS.
# - (if necessary) cvs add zebrafish
# - cvs add zebrafish/danRer5
# - cvs add zebrafish/danRer5/*.{ra,html}
# - cvs ci -m "Added danRer5 to DBS." makefile
# - cvs ci -m "Initial descriptions for danRer5." zebrafish/danRer5
# - (if necessary) cvs ci zebrafish
# - Run make update DBS=danRer5 and make alpha when done.
# - (optional) Clean up /cluster/data/danRer5/TemporaryTrackDbCheckout
# - cvsup your ~/kent/src/hg/makeDb/trackDb and make future edits there.
# makeGenomeDb.pl creates the db for the genome, makes the hgcentraltest
# entries and loads the gold (scaffolds) and gap tables. Creates the GC
# percent, short match and restriction enzyme tracks. It also creates
# a danRer5 directory in the trackDb/zebrafish directory with a trackDb.ra
# file and a template for the assembly description, and for the
# gold and gap tracks.
# Then need to load in fragment gaps manually for gap table.
# (2007-09-10, hartera), Change the default Browser position in dbDb
# so that it displays vdac2 - chr13:14,786,820-14,801,993 - same as for
# danRer4.
ssh hgwdev
hgsql -h genome-testdb -e \
'update dbDb set defaultPos = "chr13:14,786,820-14,801,993" \
where name = "danRer5";' hgcentraltest
# (2007-09-22, hartera), Change the source name for danRer5 to give
# Sanger's full name and use similar format to other genome assembly
# entries in dbDb.
hgsql -h genome-testdb -e \
'update dbDb set sourceName = "Wellcome Trust Sanger Institute, Zv7
assembly" where name = "danRer5";' \
hgcentraltest
# (2007-10-01, hartera), Added the accession instead of the gi number
# for the frag field for chrM in the gold table. gi number is gi|15079186
# and accession is NC_002333.2. This is the same as AC024175.3 which was
# used previously.
ssh hgwdev
hgsql -e \
'update gold set frag = "NC_002333.2" where chrom = "chrM";' danRer5
##########################################################################
# REBUILD GAP TABLE WITH FRAGMENT GAPS AS WELL AS CONTIG GAPS
# (DONE, 2007-08-24, hartera)
# It is confusing not to display the fragment gaps in the gap track -
# these are gaps within contigs.
ssh kkstore06
cd /cluster/data/danRer5/Zv7release/
# need to use the contigs to chroms AGP
# also copy over the contigs to scaffolds AGP for the unmapped scaffolds
cp /cluster/data/danRer5/Zv7release/randoms/randomsScaffold.agp \
/cluster/data/danRer5/Zv7release/agps/
mkdir /cluster/data/danRer5/gap
awk '{if ($5 ~ /N/) print;}' Zv7_chr.agp \
> /cluster/data/danRer5/gap/chroms.gap
# 34449 gaps
awk '{if ($5 ~ /N/) print;}' ./agps/randomsScaffold.agp \
> /cluster/data/danRer5/gap/randomScafs.gap
# 15278 gaps
cd /cluster/data/danRer5/gap
cat chroms.gap randomScafs.gap > danRer5.gap
wc -l /cluster/data/danRer5/gap/*.gap
# 34449 /cluster/data/danRer5/gap/chroms.gap
# 15278 /cluster/data/danRer5/gap/randomScafs.gap
# 49727 /cluster/data/danRer5/gap/danRer5.gap
rm chroms.gap randomScafs.gap
# 2459 is current count in table.
ssh hgwdev
cd /cluster/data/danRer5/
hgLoadGap -unsplit danRer5 /cluster/data/danRer5/gap/danRer5.gap
hgsql -e 'select count(*) from gap;' danRer5
# 49727
# So all the gaps were loaded.
###########################################################################
# REPEATMASKER RUN (DONE, 2007-08-25 - 2007-08-28, hartera)
# RE-RUN REPEATMASKER (DONE, 2007-08-30 - 2007-09-05, hartera)
# The sequence was under-RepeatMasked because the new zebrafish repeats
# library with the unclassified repeats had to be distributed first to all
# the nodes for pk.
# NESTED REPEATS TRACK RE-CREATED AFTER FIX TO extractNestedRepeats.pl
# SCRIPT (DONE, 2007-09-19, angie)
# RepeatMasker,v 1.19 2007/05/23 (May 17 2007 (open-3-1-8) version of
# RepeatMasker) with RepeatMasker library RELEASE 20061006.
# Added zebunc.ref (Zebrafish Unclassified) repeats from RepBase12.07
# from the Genetic Information Research Institute (GIRI,
# http://www.girinst.org/server/RepBase/index.php). Some repeats were
# removed from zebunc.ref because they mask out real genes (advised by
# Zebrafish bioinformatics group at Sanger) - more details below.
# Download the zebunc.ref unclassified repeats file for zebrafish
# from RepBase:
ssh kkstore06
cd /cluster/data/danRer5/
gunzip repeatmaskerlibraries-20061006.tar.gz
# no unclassified zebrafish repeats here
# Download zebunc.ref from
# http://www.girinst.org/server/RepBase/RepBase12.07.fasta/zebunc.ref
# last updated 21-Aug-2007 16:21 451K
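# a hedged fetch of that file (RepBase may require a registered login):
wget --timestamp \
    http://www.girinst.org/server/RepBase/RepBase12.07.fasta/zebunc.ref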
# copy the one used for danRer4 and see if it is any different
cp /cluster/bluearc/RepeatMasker060320/Libraries/zebunc.ref.txt \
zebuncref2006.txt
diff zebunc.ref zebuncref2006.txt
# no difference so check format is still correct for the current
# version of RepeatMasker on pk /scratch/data/RepeatMasker/Libraries/
# format still looks the same.
ssh pk
mkdir /tmp/danRer5
cd /tmp/danRer5
faOneRecord /cluster/data/danRer5/Zv7release/fasta/Zv7_contigs.fa \
Zv7_scaffold1.1 > Zv7scaffold1.1.fa
# Run RepeatMasker on this to get it to create the danio library
/scratch/data/RepeatMasker/RepeatMasker -ali -s -species danio \
Zv7scaffold1.1.fa
cp /cluster/bluearc/RepeatMasker060320/Libraries/zebunc.ref.format \
/scratch/data/RepeatMasker/Libraries/
# Add this to the specieslib created for danio
cd /scratch/data/RepeatMasker/Libraries/
cat zebunc.ref.format \
>> /scratch/data/RepeatMasker/Libraries/20061006/danio/specieslib
grep '>' 20061006/danio/specieslib | wc -l
# 1184
# then run the RepeatMasker script
# Need to re-do this and add the extra zebrafish repeats to the danio
# libraries on /cluster/bluearc/scratch/data/RepeatMasker/
# and get it rsynced to all the parasol hubs, nodes, workhorse machines
# and fileservers.
# E-mail from Kerstin Howe about zebrafish repeats on Aug 31, 2007
# states that some of these unknown repeats were removed because they
# align to real genes. Save this e-mail as README.Repeats
# (2007-09-03, hartera)
ssh kkstore06
cd /cluster/data/danRer5/
# get list of zebunc.ref sequence names
grep '>' zebunc.ref | sed -e 's/>//' > zebunc.ref.names
# make a list of the repeats that Sanger removed: unknownRepsRemoved.txt
# Sanger removed some of these repeats because they masked out real genes
# get this list and remove those repeats from the list of repeat names.
grep -w -v -f unknownRepsRemoved.txt zebunc.ref.names \
> zebuncFixed.ref.names
wc -l *.names
# 958 zebunc.ref.names
# 943 zebuncFixed.ref.names
# select just those remaining names from the zebunc.ref FASTA file
faSomeRecords zebunc.ref zebuncFixed.ref.names zebuncFixed.ref.fa
grep '>' zebuncFixed.ref.fa | wc -l
# 943
# then add these repeats to the library for zebrafish on bluearc
ssh hgwdev
# do a dummy run first to create the specieslib for zebrafish
/cluster/bluearc/scratch/data/RepeatMasker/RepeatMasker \
-species 'danio' /dev/null
# then cat the new library to the specieslib for danio
cat /cluster/data/danRer5/zebuncFixed.ref.fa >> \
/cluster/bluearc/scratch/data/RepeatMasker/Libraries/20061006/danio/specieslib
cd /cluster/bluearc/scratch/data/RepeatMasker/Libraries/20061006/danio
grep '>' specieslib | grep "Dr" | wc -l
# 943
# (2007-09-04, hartera)
# also copy to /cluster/bluearc/RepeatMasker
# do a dummy run first to create the specieslib for zebrafish
/cluster/bluearc/RepeatMasker/RepeatMasker -species 'danio' /dev/null
cp -p \
/cluster/bluearc/scratch/data/RepeatMasker/Libraries/20061006/danio/specieslib \
/cluster/bluearc/RepeatMasker/Libraries/20061006/danio/
# ask cluster-admins for an rsync from
# /cluster/bluearc/scratch/data/RepeatMasker/
# to /scratch/data/RepeatMasker/ on all machines (parasol hubs, nodes,
# fileservers, workhorse machines)
ssh hgwdev
# cleanup
rm /gbdb/danRer5/danRer5.2bit
rm -r /san/sanvol1/scratch/danRer5/RMPart
rm /san/sanvol1/scratch/danRer5/danRer5.2bit
rm /san/sanvol1/scratch/danRer5/*.ooc
hgsql -e 'drop table rmsk;' danRer5
ssh kkstore06
# clean up
cd /cluster/data/danRer5
rm danRer5.rmsk.2bit danRer5.rmskTrf.2bit danRer5.2bit
mv RepeatMasker.2007-08-25/ OldRepeatMask
## use screen for this
time nice doRepeatMasker.pl -bigClusterHub=pk -species danio \
danRer5 >& repeatmask.log &
tail -f repeatmask.log
# 0.338u 0.267s 20:33:39.53 0.0% 0+0k 0+0io 5pf+0w
# Checked part of danRer5.rmsk.2bit with twoBitToFa and it looks masked.
## From previous run of RepeatMasker, this is fixed now:
# Output is in
# /scratch/tmp/doRepeatMasker.cluster.s15477/chr3:16000000-16500000.fa.out
# Checked in danRer4 RM run files and some of them have "*" at the end of
# a line e.g. /cluster/data/danRer4/1/chr1_11/chr1_11_10.fa.out
# Also, the IDs in the current run do not run consecutively
# e.g. 1,2,3,4,5,6,2,7,8 etc.
# E-mailed Angie to ask for advice
# (2007-08-27, hartera)
# liftUp is fixed so it will skip this repeat in the liftUp step when
# it finds a "*" but no ID.
# After the repeat DNA11TA1_DR with ID 821, there is a line with a
# repeat but no ID, just a "*" at the end which confuses liftUp.
# Angie sent this problem as an example to Robert Hubley to fix the bug.
# Here is the offending line:
# 294 9.8 0.0 0.0 chr3:16000000-16500000 372065 372105 (127895) +
# DNA11TA1_DR DNA/Tc1 164 204 (0) *
# Angie fixed liftUp to just warn when it finds a "*" instead of an ID.
# NOTE: the Nested Repeats track is created automatically by the
# doRepeatMasker.pl script. When the script was run to create the
# RepeatMasker track it was found that the extractNestedRepeats.pl
# needed fixing which was done by angie.
# The Nested Repeats track was then re-created (angie, 2007-09-19)
cd /cluster/data/danRer5/bed/RepeatMasker.2007-09-04
extractNestedRepeats.pl danRer5.fa.out > danRer5.nestedRepeats.bed
hgLoadBed danRer5 nestedRepeats danRer5.nestedRepeats.bed \
-sqlTable=$HOME/kent/src/hg/lib/nestedRepeats.sql
###########################################################################
# SIMPLE REPEAT (TRF) TRACK (DONE, 2007-08-25 - 2007-08-27, hartera)
# TRF can be run in parallel with RepeatMasker on the file server
# since it doesn't require masked input sequence.
ssh kolossus
## use screen for this
mkdir /cluster/data/danRer5/bed/simpleRepeat
cd /cluster/data/danRer5/bed/simpleRepeat
time nice twoBitToFa ../../danRer5.unmasked.2bit stdout \
| trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
-bedAt=simpleRepeat.bed -tempDir=/scratch/tmp \
>& trf.log &
# Took about 8 hours to run
tail -f trf.log
# Crashed on kolossus so re-run as before on kki:
# Broken pipe twoBitToFa ../../danRer5.unmasked.2bit
# stdout |
# 12.260u 4.233s 8:05:02.78 0.0% 0+0k 0+0io 0pf+0w
# Segmentation fault trfBig -trf=/cluster/bin/i386/trf stdin
# /dev/null -bedAt=simpleRepeat.bed ...
# This time split the sequence up and run on kki
ssh kkstore06
cd /cluster/data/danRer5/
mkdir splitNoMask
cd splitNoMask
twoBitToFa ../danRer5.unmasked.2bit danRer5.unmasked.fa
faSplit byname danRer5.unmasked.fa .
rm danRer5.unmasked.fa
# Copy the split sequences to iscratch dir on kkr1u00
ssh kkr1u00
rm -r /iscratch/i/danRer5/splitNoMask
mkdir -p /iscratch/i/danRer5/splitNoMask
foreach s (/cluster/data/danRer5/splitNoMask/*.fa)
echo "Copying $s ..."
cp $s /iscratch/i/danRer5/splitNoMask/
end
# Count files transferred
foreach f (/iscratch/i/danRer5/splitNoMask/*.fa)
ls $f >> seq.lst
end
wc -l seq.lst
# 5036 seq.lst
# correct: 25 chroms, 1 chrM, and 5010 unmapped scaffolds.
rm seq.lst
# rsync to cluster machines
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/danRer5/ kkr${R}u00:/iscratch/i/danRer5/
end
## use screen
ssh kki
mkdir -p /cluster/data/danRer5/bed/simpleRepeat
cd /cluster/data/danRer5/bed/simpleRepeat
mkdir trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
# run trfBig on one input sequence file, working in /tmp and copying
# the bed result back to the given output path
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
# << keep emacs coloring happy
chmod +x runTrf
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
# << keep emacs coloring happy
foreach f (/iscratch/i/danRer5/splitNoMask/*.fa)
ls -1S $f >> genome.lst
end
gensub2 genome.lst single gsub jobList
/parasol/bin/para create jobList
# 5036 jobs written to batch
/parasol/bin/para try, check, push, check etc...
/parasol/bin/para time
# still crashing on one sequence:
# sh: line 1: 23369 Segmentation fault /cluster/bin/i386/trf
# /tmp/Zv7_scaffold2487.tf 2 7 7 80 10 50 2000 -m -d
# can't open /tmp/Zv7_scaffold2487.tf.2.7.7.80.10.50.2000.mask
# Completed: 5035 of 5036 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 31429s 523.82m 8.73h 0.36d 0.001 y
# IO & Wait Time: 12949s 215.81m 3.60h 0.15d 0.000 y
# Average job time: 9s 0.15m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 3818s 63.63m 1.06h 0.04d
# Submission to last job: 5013s 83.55m 1.39h 0.06d
# test this sequence
ssh kkstore06
cd /cluster/data/danRer5/bed/simpleRepeat
mkdir noMaskSplit
mkdir run2
cd noMaskSplit
faSplit -minGapSize=100 -lift=scaf2487.lft gap \
/cluster/data/danRer5/splitNoMask/Zv7_scaffold2487.fa \
10000 scaf2487_
ssh kkr1u00
mkdir /iscratch/i/danRer5/splitScaf2487/
cp /cluster/data/danRer5/bed/simpleRepeat/noMaskSplit/*.fa \
/iscratch/i/danRer5/splitScaf2487/
ls /iscratch/i/danRer5/splitScaf2487/*.fa | wc -l
# 250
# rsync to all iServers
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/danRer5/ kkr${R}u00:/iscratch/i/danRer5/
end
ssh kki
cd /cluster/data/danRer5/bed/simpleRepeat/run2
cp ../runTrf .
cp ../gsub .
mkdir trf
ls -1S /iscratch/i/danRer5/splitScaf2487/*.fa > genome.lst
gensub2 genome.lst single gsub jobList
/parasol/bin/para create jobList
# 250 jobs written to batch
/parasol/bin/para try, check, push, check etc...
/parasol/bin/para time
# Completed: 249 of 250 jobs
# Crashed: 1 jobs
#CPU time in finished jobs: 73s 1.22m 0.02h 0.00d 0.000 y
#IO & Wait Time: 622s 10.36m 0.17h 0.01d 0.000 y
#Average job time: 3s 0.05m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 23s 0.38m 0.01h 0.00d
#Submission to last job: 63s 1.05m 0.02h 0.00d
/parasol/bin/para problems >& problems
# trf/scaf2487_082.bed does not exist
# stderr:
# sh: line 1: 7469 Segmentation fault /cluster/bin/i386/trf
# /tmp/scaf2487_082.tf 2 7 7 80 10 50 2000 -m -d
# can't open /tmp/scaf2487_082.tf.2.7.7.80.10.50.2000.mask
# Failed sequence looks fine - just A,C,G,T or N.
# Downloaded the latest version of trf (v4.0, previous was v3.21)
# for both 64 bit and i386. Tried 64 bit version on the scaf2487_082.fa
# and it works fine. So now run on kolossus.
mv /cluster/data/danRer5/bed/simpleRepeat \
/cluster/data/danRer5/bed/OldsimpleRepeat
# run on the pk cluster, too slow on kolossus using the 2bit sequence file.
ssh pk
mkdir -p /san/sanvol1/scratch/danRer5/splitNoMask
# copy the split sequence files to the san - 1 for each chrom and 1
# for each unmapped scaffold
foreach f (/cluster/data/danRer5/splitNoMask/*.fa)
cp $f /san/sanvol1/scratch/danRer5/splitNoMask
end
mkdir /cluster/data/danRer5/bed/simpleRepeat
cd /cluster/data/danRer5/bed/simpleRepeat
mkdir trf
cat << '_EOF_' > runTrf
#!/bin/csh -fe
# as above, but using the new 64-bit trf binary
set path1 = $1
set inputFN = $1:t
set outpath = $2
set outputFN = $2:t
mkdir -p /tmp/$outputFN
cp $path1 /tmp/$outputFN
pushd .
cd /tmp/$outputFN
/cluster/bin/i386/trfBig -trf=/cluster/bin/x86_64/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/tmp
popd
rm -f $outpath
cp -p /tmp/$outputFN/$outputFN $outpath
rm -fr /tmp/$outputFN/*
rmdir --ignore-fail-on-non-empty /tmp/$outputFN
'_EOF_'
# << keep emacs coloring happy
chmod +x runTrf
cat << '_EOF_' > gsub
#LOOP
./runTrf {check in line+ $(path1)} {check out line trf/$(root1).bed}
#ENDLOOP
'_EOF_'
# << keep emacs coloring happy
foreach f (/san/sanvol1/scratch/danRer5/splitNoMask/*.fa)
ls -1S $f >> genome.lst
end
/parasol/bin/gensub2 genome.lst single gsub jobList
/parasol/bin/para create jobList
# 5036 jobs written to batch
/parasol/bin/para try, check, push, check etc...
/parasol/bin/para time
# Completed: 5036 of 5036 jobs
# CPU time in finished jobs: 45132s 752.20m 12.54h 0.52d 0.001 y
# IO & Wait Time: 16660s 277.67m 4.63h 0.19d 0.001 y
# Average job time: 12s 0.20m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 5804s 96.73m 1.61h 0.07d
# Submission to last job: 6042s 100.70m 1.68h 0.07d
# (2007-08-27, hartera)
# the input to trf was the chromosomes and scaffolds so no liftUp needed.
# cat all the files together
cat ./trf/*.bed > simpleRepeat.bed
# load table
ssh hgwdev
cd /cluster/data/danRer5/bed/simpleRepeat
hgLoadBed danRer5 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
# Loaded 966523 elements of size 16
###########################################################################
# PROCESS SIMPLE REPEATS INTO A MASK (DONE, 2007-08-28, hartera)
# RE-DONE AFTER RE-RUNNING REPEATMASKER (DONE, 2007-09-05, hartera)
# The danRer5.rmsk.2bit is already masked with RepeatMasker output.
# After the simpleRepeats track has been built, make a filtered
# version of the trf output for masking: keep trfs with period <= 12.
ssh kkstore06
cd /cluster/data/danRer5/bed/simpleRepeat
rm -r trfMask
rm trfMask.bed
mkdir trfMask
foreach f (trf/*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
cat ./trfMask/*.bed > trfMask.bed
# don't need to lift up. these are already in chrom and unmapped
# scaffold coordinates.
## Add trfMask to repeat masked sequence
ssh kkstore06
cd /cluster/data/danRer5
# cleanup old sequence:
rm danRer5.rmskTrf.2bit danRer5.2bit
cat << '_EOF_' > addTrf.csh
#!/bin/csh -efx
#This script will fail if any of its commands fail.
set DB = danRer5
set WORK_DIR = /cluster/data/${DB}
cd ${WORK_DIR}
set inputTwoBit = ${WORK_DIR}/${DB}.rmsk.2bit
set outputTwoBit = ${WORK_DIR}/${DB}.rmskTrf.2bit
cat /cluster/data/${DB}/bed/simpleRepeat/trfMask.bed \
| twoBitMask -add -type=.bed ${inputTwoBit} stdin ${outputTwoBit}
twoBitToFa ${outputTwoBit} stdout | faSize stdin > faSize.${DB}.rmskTrf.txt
'_EOF_'
# << happy emacs
chmod +x ./addTrf.csh
time ./addTrf.csh
# Warning: BED file stdin has >=13 fields which means it might contain
# block coordinates, but this program uses only the first three fields
# (the entire span -- no support for blocks).
# This is ok, there are no blocks, only the first 3 fields are
# standard BED format.
# 44.550u 6.030s 0:50.66 99.8% 0+0k 0+0io 0pf+0w
cat faSize.danRer5.rmskTrf.txt
# 1440582308 bases (4986636 N's 1435595672 real 724596899 upper 710998773
# lower) in 5036 sequences in 1 files
# Total size: mean 286056.9 sd 3643515.2 min 2000 (Zv7_NA5415) max 70371393
# (chr5) median 7336
# N count: mean 990.2 sd 9834.4
# U count: mean 143883.4 sd 1862761.1
# L count: mean 141183.2 sd 1772056.1
# %49.35 masked total, %49.53 masked real
# this is similar to previous masking of zebrafish assembly sequence
# for danRer4: %44.10 masked total, %48.94 masked real
ln -s danRer5.rmskTrf.2bit danRer5.2bit
# add symlink to /gbdb/danRer5 to symlink the danRer5.2bit file
ssh hgwdev
rm /gbdb/danRer5/danRer5.2bit
ln -s /cluster/data/danRer5/danRer5.rmskTrf.2bit \
/gbdb/danRer5/danRer5.2bit
# copy over to the san
rm /san/sanvol1/scratch/danRer5/danRer5.2bit
cp -p /cluster/data/danRer5/danRer5.rmskTrf.2bit \
/san/sanvol1/scratch/danRer5/danRer5.2bit
# also copy to iscratch
ssh kkr1u00
rm /iscratch/i/danRer5/danRer5.2bit
cp -p /cluster/data/danRer5/danRer5.rmskTrf.2bit \
/iscratch/i/danRer5/danRer5.2bit
# rsync to iServers
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/danRer5/ \
kkr${R}u00:/iscratch/i/danRer5/
end
###########################################################################
# MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE, 2007-08-28, hartera)
# RE-RUN AFTER RE-RUNNING REPEATMASKER AND RE-MASKING SEQUENCE
# (DONE, 2007-09-05, hartera)
# Use -repMatch=512 (based on size -- for human we use 1024, and
# the zebrafish genome is ~50% of the size of the human genome:
# zebrafish is about 1.44 Gb and human is 3.1 Gb).
ssh kolossus
rm -r /cluster/data/danRer5/bed/ooc
mkdir /cluster/data/danRer5/bed/ooc
cd /cluster/data/danRer5/bed/ooc
time blat /cluster/data/danRer5/danRer5.2bit \
/dev/null /dev/null -tileSize=11 -makeOoc=danRer5_11.ooc -repMatch=512
# 75.647u 3.159s 1:21.04 97.2% 0+0k 0+0io 2pf+0w
# Wrote 40297 overused 11-mers to danRer5_11.ooc
# then make 10.ooc file for overused 10mers, repMatch = 4096
# for humans so use 2048 for danRer5
time blat /cluster/data/danRer5/danRer5.2bit \
/dev/null /dev/null -tileSize=10 -makeOoc=danRer5_10.ooc -repMatch=2048
# 72.078u 2.587s 1:14.83 99.7% 0+0k 0+0io 0pf+0w
# Wrote 9589 overused 10-mers to danRer5_10.ooc
# copy the *.ooc files over to the san for cluster runs and Genbank run
rm /san/sanvol1/scratch/danRer5/*.ooc
cp -p *.ooc /san/sanvol1/scratch/danRer5/
# also copy to iscratch
ssh kkr1u00
rm /iscratch/i/danRer5/*.ooc
cp -p /cluster/data/danRer5/bed/ooc/*.ooc /iscratch/i/danRer5/
# rsync to iServers
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/danRer5/*.ooc \
kkr${R}u00:/iscratch/i/danRer5/
end
###########################################################################
# CREATE LIFT FILE SEPARATED BY NON-BRIDGED GAPS
# (DONE, 2007-08-28, hartera)
# This is needed for the GenBank run. Use Hiram's gapToLift program
ssh hgwdev
cd /cluster/data/danRer5/jkStuff
gapToLift danRer5 nonBridgedGap.lft
# copy over to the san
cp -p nonBridgedGap.lft /san/sanvol1/scratch/danRer5
# there are 14973 rows in this file, compared to only 370 for mouse,
# but that's ok.
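# e.g. to check the row count:
wc -l nonBridgedGap.lft
# 14973 nonBridgedGap.lft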
###########################################################################
# AUTO UPDATE GENBANK MRNA AND EST AND ZGC GENES RUN
# (DONE, 2007-08-28, hartera)
# RE-RUN AFTER RE-DOING REPEATMASKING OF GENOME SEQUENCE
# (DONE, 2007-09-05, hartera)
ssh hgwdev
cd $HOME/kent/src/hg/makeDb/genbank
cvs update -dP
# edit etc/genbank.conf to add danRer5 and commit to CVS
# done already for first run so no need to update.
# danRer5 (zebrafish)
danRer5.serverGenome = /cluster/data/danRer5/danRer5.2bit
danRer5.clusterGenome = /iscratch/i/danRer5/danRer5.2bit
danRer5.ooc = /iscratch/i/danRer5/danRer5_11.ooc
danRer5.lift = /cluster/data/danRer5/jkStuff/nonBridgedGap.lft
danRer5.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
danRer5.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
danRer5.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
danRer5.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
danRer5.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
danRer5.downloadDir = danRer5
danRer5.mgcTables.default = full
danRer5.mgcTables.mgc = all
danRer5.perChromTables = no
# end of section added to etc/genbank.conf
cvs commit -m "Added danRer5." etc/genbank.conf
# also added the perChromTables line afterwards. This means that there
# will not be one table per chromosome. Would be too many tables as
# there are 5010 scaffolds.
# update /cluster/data/genbank/
make etc-update
# ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains
# danRer genome information
ssh kkstore06
cd /cluster/data/genbank
# for re-run, remove everything under these directories:
rm -r /cluster/data/genbank/data/aligned/genbank.161.0/danRer5/*
rm -r /cluster/data/genbank/data/aligned/refseq.24/danRer5/*
time nice bin/gbAlignStep -initial danRer5 &
# 3691.965u 996.052s 4:08:37.03 31.4% 0+0k 0+0io 32pf+0w
# logFile: var/build/logs/2007.09.05-10:04:19.danRer5.initalign.log
# check log file - looks ok, it has finished (last line in log file)
# kkstore06.cse.ucsc.edu 2007.09.05-14:12:56 danRer5.initalign: finish
# then load database
ssh hgwdev
cd /cluster/data/genbank
# -drop parameter will ensure that old tables are dropped before loading
time nice ./bin/gbDbLoadStep -drop -initialLoad danRer5 &
# logFile: var/dbload/hgwdev/logs/2007.09.05-14:36:03.dbload.log
# 1108.286u 310.389s 41:23.51 57.1% 0+0k 0+0io 16pf+0w
# This time ran smoothly and the only problem was an NFS issue with
# removing directories at the end - the load went ok though.
# On the first time loading the GenBank alignments, it failed
# because there were too many open files when trying to load a table
# per chrom and per scaffold, so add the danRer5.perChromTables = no
# line to the genbank.conf file (see above) so that it will all be loaded
# into one table. Too many tables otherwise, as there are 25 chroms, chrM
# and 5010 scaffolds.
# Solution: remove lock file:
# rm /cluster/data/genbank/var/dbload/hgwdev/run/dbload.lock
# then run again
###########################################################################
# ENSEMBL GENES TRACK FOR PROTEIN-CODING GENES FROM
# ENSEMBL VERSION 46 (AUG 2007) (DONE, 2007-09-05, markd)
# Compare peptides translated from genome to those downloaded from
# BioMart for these transcripts and LOAD ensPep TABLE WITH PEPTIDE
# SEQUENCES FOR THE PROTEIN-CODING ENSEMBL GENES
# (DONE, 2007-09-24 - 2007-09-25, hartera)
# ADD AUTOSQL CODE FOR ensInfo TABLE AND ADD CODE TO HGC TO PRINT
# THIS ADDITIONAL INFORMATION ON THE DETAILS PAGE FOR EACH GENE
# (DONE, 2007-09-28 and 2007-10-03, hartera)
# Used the programs created by Robert Baertsch to download Ensembl
# genes from Ensembl's ftp site, load it into the danRer5 database.
ssh hgwdev
mkdir /cluster/data/danRer5/bed/ensembl46
cd /cluster/data/danRer5/bed/ensembl46
# Download Ensembl genes from ftp site and create tracks in browser:
# current version is danio_rerio core_46_7
hgLoadEnsembl danio_rerio core_46_7 danRer5
# these problems were detected, add to problem.ids file:
genePredCheck -db=danRer5 ensGene.gp
# Error: ensGene.gp:2720: ENSDART00000019661 exonFrame on non-CDS exon 14
# Error: ensGene.gp:3874: ENSDART00000027605 exonFrame on non-CDS exon 0
# Error: ensGene.gp:9416: ENSDART00000062681 exonFrame on non-CDS exon 12
# checked: 31743 failed: 3
hgLoadEnsembl -f problem.ids danio_rerio core_46_7 danRer5
# hgLoadEnsembl creates the ensGene, ensGeneXref, ensGtp and ensInfo
# tables but no ensPep peptide sequence table.
# Compare the peptides as translated from the genomic sequence of the
# CDS for each transcript and the peptides for the transcripts downloaded
# from BioMart (2007-09-23 - 2007-09-24, hartera)
ssh kkstore06
mkdir /cluster/data/danRer5/bed/ensembl46/peptides
ssh hgwdev
cd /cluster/data/danRer5/bed/ensembl46/peptides
hgsql -N -e 'select distinct(name) from ensGene;' danRer5 | sort \
> ensGene.txIds.sort
# First get peptides from the genome translation of the genes defined in
# ensGene. This does not work for the mitochondrial genome. The ability
# to deal with vertebrate mitochondrial genetic code for translation
# of genes on chrM was added (markd).
getRnaPred -peptides -genePredExt danRer5 ensGene all ensGeneTablePep.fa
# Took about 10 minutes
# Then download the peptide sequences from BioMart and compare
# differences may include selenocysteines and translational frameshifts
# Get the ensembl peptide sequences from
# http://www.ensembl.org/biomart/martview
# Follow this sequence:
# 1) Choose the Ensembl Genes 46 as the database and then
# Danio Rerio genes (ZFISH7) as the dataset.
# 2) Click on the Attributes link in the left panel. Select sequences.
# 3) Expand the SEQUENCES section and choose Peptide as type of sequence
# to export and then expand the Header Information section and select
# Ensembl Gene ID from Gene Attributes and Ensembl Transcript ID
# and Ensembl Peptide ID from Transcript Attributes
# 4) Click on the Filters link in the left panel and expand the GENE
# section. Select the Gene type box and then select protein_coding as
# these are the only genes with an associated protein sequence.
# 5) Click on the Results link in the top black menu bar and
# choose FASTA for the output and export all results to
# Compressed file (.gz). Select unique results only.
# save the file as ensembl46Pep.fasta.gz and move to
# /cluster/data/danRer5/bed/ensembl46
ssh kkstore06
cd /cluster/data/danRer5/bed/ensembl46/peptides
# gunzip the Ensembl peptides file downloaded from BioMart
gunzip ensembl46Pep.fasta.gz
grep '>' ensembl46Pep.fasta | wc -l
# 31743
grep '>' ensembl46Pep.fasta > headers
awk 'BEGIN {FS="|"} {print $1;}' headers | sort | uniq \
> pepBioMart.txIds.sort
perl -pi.bak -e 's/>//' pepBioMart.txIds.sort
# compare list of transcript IDs from the ensGene table to those for the
# downloaded peptides from BioMart.
comm -13 ensGene.txIds.sort pepBioMart.txIds.sort
# from the BioMart peptide download there were 3 extra:
# ENSDART00000019661
# ENSDART00000027605
# ENSDART00000062681
# These were the problem IDs that were removed from the set - see above.
comm -23 ensGene.txIds.sort pepBioMart.txIds.sort
# no difference
# change headers so that they only have the Ensembl transcript ID.
awk 'BEGIN{FS="|"} {if ($1 ~ /ENSDART/) print $1; else print;}' \
ensembl46Pep.fasta > ensembl46PepTxIdsOnly.fasta
# use faCmp (~/kent/src/utils/faCmp/faCmp.c) to compare the two files.
# it requires the same number of sequences in both files.
# Use faFilter to remove the ones not in ensGeneTablePep.fa
faFilter -v -name=ENSDART00000019661 ensembl46PepTxIdsOnly.fasta tmp1.fa
faFilter -v -name=ENSDART00000027605 tmp1.fa tmp2.fa
faFilter -v -name=ENSDART00000062681 \
tmp2.fa ensembl46PepTxIdsOnlyFilt.fasta
grep '>' ensembl46PepTxIdsOnlyFilt.fasta | wc -l
# 31740
rm tmp*
# faCmp assumed that DNA sequences were being compared. An option was
# added so that protein sequences could be compared (markd added this).
faCmp -sortName -peptide \
ensGeneTablePep.fa ensembl46PepTxIdsOnlyFilt.fasta \
>& ensTablevsBioMartFasta.diff
# ENSDART00000038434 in ensGeneTablePep.fa differs from ENSDART00000038434 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 114 (X != V)
# ENSDART00000046903 in ensGeneTablePep.fa differs from ENSDART00000046903 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 71 (X != T)
# ENSDART00000047515 in ensGeneTablePep.fa differs from ENSDART00000047515 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 149 (X != L)
# ENSDART00000049170 in ensGeneTablePep.fa differs from ENSDART00000049170 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 172 (X != S)
# ENSDART00000080260 in ensGeneTablePep.fa differs from ENSDART00000080260 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 353 (X != G)
# ENSDART00000081978 in ensGeneTablePep.fa differs from ENSDART00000081978 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 21 (X != R)
# ENSDART00000082954 in ensGeneTablePep.fa differs from ENSDART00000082954 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 747 (X != A)
# ENSDART00000083022 in ensGeneTablePep.fa differs from ENSDART00000083022 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 344 (X != L)
# ENSDART00000105255 in ensGeneTablePep.fa differs from ENSDART00000105255 at
# ensembl46PepTxIdsOnlyFilt.fasta at base 427 (X != A)
# There are differences in just 9 of the protein sequences where we have an
# "X" because there is an "N" in the genome sequence in the codon for the
# amino acid whereas Ensembl used transcript evidence to predict the
# missing base. Since there are so few differences, the predicted
# translation from the genome is good enough without loading an ensPep table.
# Load anyway for completeness since there are some differences. Use
# "generic" as the type for loading using hgPepPred, not "ensembl" since
# all the IDs were removed from the header line but the transcript ID.
# If type "ensembl" is defined, it expects at least 3 IDs separated by "|".
ssh hgwdev
cd /cluster/data/danRer5/bed/ensembl46/peptides
# load the BioMart downloaded Ensembl 46 peptide sequences into database:
hgPepPred danRer5 generic ensPep ensembl46PepTxIdsOnlyFilt.fasta
hgsql -e 'select count(*) from ensPep;' danRer5
# 31740
# All the sequences were loaded.
# Added a zebrafish/danRer5/trackDb.ra entry for ensGene with the
# archive setting, aug2007, so that URLs for Ensembl Genes point to the
# correct archive. Added the Ensembl version number to the long label and
# as a dataVersion setting (Ensembl Release 46).
# clean up
ssh kkstore06
cd /cluster/data/danRer5/bed/ensembl46/peptides
rm tmp*.fa headers ensPep.tab ensembl46PepTxIdsOnly.fasta *.bak
rm pepBioMart.txIds.sort ensGene.txIds.sort
# (2007-09-28, hartera)
# Add code to use the ensInfo table, use autoSql. There is already an
# ensInfo.sql table definition in ~/kent/src/hg/lib
ssh hgwdev
cat << 'EOF' > $HOME/kent/src/hg/lib/ensInfo.as
table ensInfo
"ensembl Genes track additional information"
(
string name; "Ensembl transcript ID"
string otherId; "other ID"
string geneId; "Ensembl gene ID"
string class; "Ensembl gene type"
string geneDesc; "Ensembl gene description"
string status; "Ensembl gene confidence"
)
'EOF'
cd $HOME/kent/src/hg/lib
# move the current ensInfo.sql out of the way
mv ensInfo.sql tmp.sql
autoSql ensInfo.as ensInfo
# rename the autogenerated file and reinstate the original file as
# ensInfo.sql. The difference is that the original file has a longblob for
# the description whereas the description field in the autoSql-generated
# code just has a varchar(255) which may not be big enough in all cases.
mv ensInfo.sql ensInfoTemp.sql
mv tmp.sql ensInfo.sql
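# Sanity check on the description field types (a sketch; assumes both files
# name the column geneDesc as in the .as definition above):
grep -i geneDesc ensInfo.sql ensInfoTemp.sql
# ensInfo.sql should show longblob, ensInfoTemp.sql only varchar(255)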
mv ensInfo.h ../inc/
# commit ../inc/ensInfo.h, ensInfo.c and ensInfo.as to CVS
# add ensInfo.o to makefile and commit to CVS
cd $HOME/kent/src/hg/
# add code to hgc.c to doEnsemblGene function to write information
# from the ensInfo table on the details page for each gene.
# (2007-10-03, hartera). Check whether the name2 field in ensGene is
# always the same as otherId in the ensInfo table.
ssh hgwdev
cd /cluster/data/danRer5/bed/ensembl46
hgsql -N -e 'select name, name2 from ensGene;' danRer5 \
| sort | uniq > names.ensGene.sort
hgsql -N -e 'select name, otherId from ensInfo;' danRer5 \
| sort | uniq > names.ensInfo.sort
wc -l names*.sort
# 31740 names.ensGene.sort
# 31991 names.ensInfo.sort
comm -12 names.ensGene.sort names.ensInfo.sort | wc -l
# 31740
# So the name, name2 pair in ensGene is the same as the name, otherId
# pair in ensInfo. ensInfo has more entries than ensGene because
# it includes pseudogene and non-coding RNAs. hgc already prints the
# Alternate Name using name2 from ensGene so do not need to print out
# the otherId field from ensInfo.
###########################################################################
# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR danRer5 (DONE, 2007-09-11, hartera)
ssh hgwdev
# DNA port is "0", trans prot port is "1"
echo 'insert into blatServers values("danRer5", "blat8", "17784", "1",
"0"); insert into blatServers values("danRer5", "blat8", "17785", "0",
"1");' \
| hgsql hgcentraltest
# this enables blat and isPcr; isPcr is enabled by loading the blat server
# with tileSize=5 (ask for this when requesting blat servers from
# cluster admin).
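# quick sanity check on the new entries (sketch):
hgsql -e 'select * from blatServers where db="danRer5";' hgcentraltest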
# if you need to delete those entries
echo 'delete from blatServers where db="danRer5";' | hgsql hgcentraltest
###########################################################################
# VEGA GENES (DONE, 2007-09-05 - 2007-09-12, hartera)
# Data provided by Tina Eyre from Sanger: te3@sanger.ac.uk
# GTF file sent on 2007-08-30, Vegav27
ssh kkstore06
mkdir -p /cluster/data/danRer5/bed/vega27/data/
cd /cluster/data/danRer5/bed/vega27/data
# 2 files: vegav27.gtf.zip vegav27_information.txt.zip
unzip vegav27.gtf.zip
mv ./lustre/work1/ensembl/te3/otter_vega_builds/vega/070523/vegav27.gtf .
rm -r lustre
unzip vegav27_information.txt.zip
mv \
./lustre/work1/ensembl/te3/otter_vega_builds/vega/070523/vegav27_information.txt \
.
rm -r lustre
# asked for the vegav27_information.txt file to be in the format
# required to load the table directly.
# Check the format then prepare data files to load database tables.
# Can grep for "otter" to get only gene description lines from the GTF
# file - the rest is output and header lines from the program that
# generated it.
# 2007-09-06, hartera
ssh kkstore06
cd /cluster/data/danRer5/bed/vega27
# get just lines describing genes and remove "chr" prefix from scaffolds
# NOTE: there are no NA scaffolds.
grep "otter" ./data/vegav27.gtf | sed -e 's/chrZv7_/Zv7_/' > vegav27Fixed.gtf
# vegaInfo is transcriptId, otterId, geneId, method and geneDesc
# Get otter transcript ID and otter gene ID:
awk 'BEGIN{FS=";"} {OFS="\t"} \
{if (($5 ~ /otter_gene_id/) && ($6 ~ /otter_transcript_id/)) \
print $5, $6;}' vegav27Fixed.gtf | sort | uniq > vega27OtterIds.txt
# remove the otter_gene_id and otter_transcript_id and
# extra spaces and quotation marks.
perl -pi.bak -e 's/\sotter_gene_id\s\"//;' vega27OtterIds.txt
perl -pi.bak -e 's/"\t\sotter_transcript_id\s"(OTTDART[0-9]+)"/\t$1/' \
vega27OtterIds.txt
wc -l vega27OtterIds.txt
# 12819 vega27OtterIds.txt
# get list of the otter gene IDs only
awk '{print $1}' vega27OtterIds.txt | sort | uniq > otterGeneIds.only
# then get list of otter gene IDs from information file
awk '{if ($1 ~ /OTTDARG/) print $1}' data/vegav27_information.txt \
| sort | uniq > infoOtterGeneIds.only
comm -12 otterGeneIds.only infoOtterGeneIds.only | wc -l
# 8649
wc -l *.only
# 10374 infoOtterGeneIds.only
# 8649 otterGeneIds.only
# All the gene IDs from the GTF file are in the information file, but the
# information file contains 1725 gene IDs (10374 - 8649) that are not in
# the GTF file.
comm -13 otterGeneIds.only infoOtterGeneIds.only > geneIds.InfoOnly.txt
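# sanity check (sketch): the count should be 10374 - 8649 = 1725
wc -l geneIds.InfoOnly.txt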
cat << 'EOF' > mergeTransIdAndInfo.pl
#!/usr/bin/perl -w
use strict;
my ($idsFile, $infoFile, %idsHash);
$idsFile = $ARGV[0];
$infoFile = $ARGV[1];
open (IDS, $idsFile) || die "Can not open $idsFile: $!\n";
open (INFO, $infoFile) || die "Can not open $infoFile: $!\n";
# Read in the otter gene IDs and transcript IDs
while (<IDS>)
{
my (@f, $line);
chomp;
$line = $_;
# split line on tab
@f = split(/\t/, $line);
# hash keyed by gene ID, stores transcript ID
# more than one ID so store as an array
if ($f[0] =~ /OTTDARG/)
{
# add to array in hash if it is an otter gene ID
push @{$idsHash{$f[0]}}, $f[1];
}
}
close IDS;
while (<INFO>)
{
my ($line, @info, $geneId, @transIds);
# read in information file and get the otter gene ID
$line = $_;
# split line on tab
@info = split(/\t/, $line);
if ($info[0] =~ /OTTDARG/)
{
$geneId = $info[0];
# look up transcript ID in hash
# if gene ID exists in the hash then print out the information
# file line with the correct transcript ID
if (exists($idsHash{$geneId}))
{
@transIds = @{$idsHash{$geneId}};
# rewrite line with transcript ID
foreach my $t (@transIds)
{
print "$t\t$line";
}
}
}
}
close INFO;
'EOF'
# << happy emacs
chmod +x mergeTransIdAndInfo.pl
mergeTransIdAndInfo.pl vega27OtterIds.txt ./data/vegav27_information.txt \
> vegav27InfoWithTransIds.txt
wc -l vega27OtterIds.txt
# 12819 vega27OtterIds.txt
wc -l vegav27InfoWithTransIds.txt
# 12820 vegav27InfoWithTransIds.txt
awk 'BEGIN {OFS="\t"} {print $2, $1}' vegav27InfoWithTransIds.txt \
| sort | uniq > infoWithTrans.Ids
sort vega27OtterIds.txt | uniq > otterIds.sort
comm -13 otterIds.sort infoWithTrans.Ids
comm -23 otterIds.sort infoWithTrans.Ids
# No difference - there must be an extra duplicated line
wc -l infoWithTrans.Ids
# 12819 infoWithTrans.Ids
awk 'BEGIN {OFS="\t"} {print $2, $1}' vegav27InfoWithTransIds.txt \
| sort | uniq -c | sort -nr > count
# 2 OTTDARG00000014371 OTTDART00000027056
# this appears twice in info file
grep OTTDARG00000014371 ./data/vegav27_information.txt
# OTTDARG00000014371 DKEY-26M21.1 ZDB-GENE-041210-211
# si:ch211-162i19.2 protein_coding novel protein NOVEL
# OTTDARG00000014371 DKEY-26M21.1 ZDB-GENE-041210-211
# si:ch211-162i19.2 protein_coding novel protein NOVEL
# remove one copy from the vegav27_information.txt file.
# so just sort and uniq the resulting information file with transcript IDs
sort vegav27InfoWithTransIds.txt | uniq > vegav27InfoWithTransIdsUniq.txt
wc -l vegav27InfoWithTransIdsUniq.txt
# 12819 vegav27InfoWithTransIdsUniq.txt
# There are extra IDs in the information file as it includes all Vega
# genes (including clones not in Ensembl). Those actually in the Vega
# v27 GTF are those that are found on clones that are in Ensembl.
# Vega includes only clone sequence and Zv7_NA scaffolds are WGS so
# these are not annotated in Vega. This information is from
# an e-mail from Tina Eyre (te3@sanger.ac.uk) on 2007-09-07.
# then use the info file to grab those genes that are pseudogenes, get the
# transcript ID from the vegaIDs.txt file. Then grep out the pseudogenes
# to a separate file.
# Next step is to find which transcripts are pseudogenes.
grep pseudogene vegav27InfoWithTransIdsUniq.txt | sort | uniq | wc -l
# found 58 pseudogenes so need to create the pseudogene track
# Get transcript IDs for pseudogenes.
grep pseudogene vegav27InfoWithTransIdsUniq.txt | awk '{print $1}' \
> pseudogenes.ids
grep -w -f pseudogenes.ids vegav27Fixed.gtf > vegav27Pseudogene.gtf
awk '{print $20}' vegav27Pseudogene.gtf | sort | uniq | wc -l
# 58
# Need to make the GTF file for the vegaGene table by subtracting pseudogenes:
grep -vw -f pseudogenes.ids vegav27Fixed.gtf > vegaGenev27.gtf
wc -l vega*gtf
# 168381 vegaGenev27.gtf
# 168602 vegav27Fixed.gtf
# 221 vegav27Pseudogene.gtf
# Need to relabel IDs to get the name to be the otter transcript ID
# and name2 to be the transcript_id (which needs to be labeled as gene_id).
# Also, relabel the otter_transcript_id to be transcript_id as ldHgGene
# groups the rows by this ID.
sed -e 's/gene_id/tmp_id/' vegaGenev27.gtf \
| sed -e 's/transcript_id/gene_id/' \
| sed -e 's/otter_transcript_id/transcript_id/' > vegaGenev27Format.gtf
# remove an extra line
grep -v "rachels_data" vegaGenev27Format.gtf > tmp
mv tmp vegaGenev27Format.gtf
# Do the same for the pseudogene GTF file:
sed -e 's/gene_id/tmp_id/' vegav27Pseudogene.gtf \
| sed -e 's/transcript_id/gene_id/' \
| sed -e 's/otter_transcript_id/transcript_id/' \
> vegaPseudogenev27Format.gtf
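# The net effect of the relabeling on one (hypothetical) attribute list:
#   before: gene_id "A"; transcript_id "B"; otter_transcript_id "OTTDART...";
#   after:  tmp_id "A"; gene_id "B"; transcript_id "OTTDART...";
# so the rows group by the otter transcript ID and name2 picks up the
# original transcript_id. Eyeball one line to confirm:
head -1 vegaGenev27Format.gtf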
gtfToGenePred -genePredExt -infoOut=tmpInfo.txt \
vegaGenev27Format.gtf vegaGenev27.gp
genePredCheck vegaGenev27.gp
# genepred ok checked: 12761 failed: 0
gtfToGenePred -genePredExt -infoOut=tmpInfo.txt \
vegaPseudogenev27Format.gtf vegaPseudoGenev27.gp
genePredCheck vegaPseudoGenev27.gp
# genepred ok checked: 58 failed: 0
# load GTF files for Vega genes and pseudogenes:
ssh hgwdev
cd /cluster/data/danRer5/bed/vega27
# load vegaGene table
hgLoadGenePred -genePredExt danRer5 vegaGene vegaGenev27.gp
# load vegaPseudoGene table
hgLoadGenePred -genePredExt danRer5 vegaPseudoGene vegaPseudoGenev27.gp
hgsql -N -e 'select distinct(chrom) from vegaGene;' danRer5
# There are 36: chr1-25 plus these 11 annotated scaffolds:
# Zv7_scaffold2491
# Zv7_scaffold2498
# Zv7_scaffold2509
# Zv7_scaffold2516
# Zv7_scaffold2523
# Zv7_scaffold2533
# Zv7_scaffold2537
# Zv7_scaffold2551
# Zv7_scaffold2560
# Zv7_scaffold2633
# Zv7_scaffold2650
hgsql -N -e 'select distinct(chrom) from vegaPseudoGene;' danRer5
# 12 chroms and 1 scaffold have pseudogenes annotated
# chr2, chr3, chr4, chr5, chr9, chr12, chr13, chr18, chr19, chr20, chr24,
# Zv7_scaffold2509
# Only finished sequence is annotated by Vega
# Vega information tables:
# mySQL table definition and autosql-generated files created previously
# for zebrafish-specific information (vegaInfoZfish).
# Add clone_id to a separate table instead of this one. A tab-separated
# file of transcript ID and clone ID was provided by Tina Eyre
# (te3@sanger.ac.uk) at Sanger.
# Need to grep for the transcript IDs
# created a second table for the cloneId accessions since there
# are multiple ids for some VEGA genes. Otherwise, there would be
# a comma separated list in this field or many rows repeated but just
# different in the cloneId field. Associate transcript ID to clone IDs.
# Table definitions are in danRer4.txt.
# first check that the clone IDs file is complete.
# this is from Tina Eyre sent on 2007-09-07 and the file is
# vegav27_transcript_to_clone.txt.zip
ssh kkstore06
cd /cluster/data/danRer5/bed/vega27
unzip vegav27_transcript_to_clone.txt.zip
awk '{print $1}' vegav27_transcript_to_clone.txt | sort | uniq \
> transWithCloneId.txt
wc -l transWithCloneId.txt
# 12819 transWithCloneId.txt
awk '{print $1}' vegav27InfoWithTransIdsUniq.txt | sort | uniq \
> vegaInfoTransIds.txt
wc -l vegaInfoTransIds.txt
# 12819 vegaInfoTransIds.txt
# compare to vega27OtterIds.txt
comm -12 transWithCloneId.txt vegaInfoTransIds.txt | wc -l
# 12819
# so all the transcript IDs are in this list.
# sort this table:
sort vegav27_transcript_to_clone.txt | uniq > vegav27TransToClone.txt
# load these tables:
ssh hgwdev
cd /cluster/data/danRer5/bed/vega27
hgLoadSqlTab danRer5 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
vegav27InfoWithTransIdsUniq.txt
hgLoadSqlTab danRer5 vegaToCloneId ~/kent/src/hg/lib/vegaToCloneId.sql \
vegav27TransToClone.txt
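# Some transcripts map to several clones; a quick look at the one-to-many
# rows (a sketch - column names assumed to match the danRer4 definitions):
hgsql -N -e 'select transcriptId, count(*) from vegaToCloneId
    group by transcriptId having count(*) > 1 limit 5;' danRer5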
# Add trackDb.ra entry. Keep Vega Genes and Vega Pseudogenes separate as for
# human and add version number to description.
###########################################################################
# BLASTZ FOR MEDAKA (oryLat1) (DONE, 2007-09-10, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# No lineage-specific repeats for this species pair.
# Medaka also has no species-specific repeats in the RepeatMasker
# library so run this using dynamic masking.
ssh kkstore06
mkdir /cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10
cd /cluster/data/danRer5/bed
ln -s blastz.oryLat1.2007-09-10 blastz.oryLat1
cd /cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10
# Use Blastz parameters as for tetNig1 in danRer4.txt. Using scaffolds makes this run
# slower so it is best to have the scaffolds in the query. Use HoxD55.q
# matrix as medaka is quite distant from zebrafish. Blastz uses
# lineage-specific repeats but there are none for these two species.
# Use soft-masked scaffolds for oryLat1 (and dynamic masking) and also use
# windowMasker plus SDust repeats since there are no species-specific
# RepeatMasker repeats for medaka. The random scaffolds were used in Blastz
# separately (oryLat1UnScaffolds.2bit) rather than as an artificial unordered
# chromosome to avoid getting Blastz alignments across non-contiguous
# scaffolds.
cat << '_EOF_' > DEF
# zebrafish (danRer5) vs. Medaka (oryLat1)
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2500
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - zebrafish (danRer5)
SEQ1_DIR=/san/sanvol1/scratch/danRer5/danRer5.2bit
SEQ1_LEN=/cluster/data/danRer5/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY - Medaka (oryLat1)
# (40M chunks cover the largest chroms in one gulp)
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=40000000
SEQ2_LIMIT=50
SEQ2_LAP=0
BASE=/cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
## use screen
time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust \
-blastzOutRoot /san/sanvol1/scratch/danRer5OryLat1 \
`pwd`/DEF >& doBlastz.log &
# 0.154u 0.096s 6:13:59.67 0.0% 0+0k 0+0io 4pf+0w
# TEST PARAMETERS:
# RUN 1 - parameters above used for fugu queried against zebrafish before.
# BLASTZ_H=2500
# BLASTZ_M=50
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
#
# TRY WITH PARAMETERS USED FOR DANRER4 - Run 2:
# use parameters for oryLat1 in danRer4.txt. Using scaffolds makes this run
# slower so it is best to have the scaffolds in the query. Use HoxD55.q
# matrix as medaka is quite distant from zebrafish. Blastz uses
# lineage-specific repeats but there are none for these two species.
# Use soft-masked scaffolds and dynamic masking.
# zebrafish (danRer5) vs. Medaka (oryLat1)
# BLASTZ=blastz.v7.x86_64
# BLASTZ_H=2000
# BLASTZ_Y=3400
# BLASTZ_L=6000
# BLASTZ_K=2200
# BLASTZ_M=50
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# Run 1:
featureBits danRer5 refGene:cds chainOryLat1Run1Link -enrichment
# refGene:cds 1.019%, chainOryLat1Run1Link 10.478%, both 0.895%, cover 87.79%,
# enrich 8.38x
# Run 2:
featureBits danRer5 refGene:cds chainOryLat1Link -enrichment
# refGene:cds 1.019%, chainOryLat1Link 11.164%, both 0.894%, cover 87.77%,
# enrich 7.86x
# Chains danRer4, parameters like Run 2:
featureBits danRer4 refGene:cds chainOryLat1Link -enrichment
# refGene:cds 0.952%, chainOryLat1Link 12.366%, both 0.813%, cover 85.43%,
# enrich 6.91x
# Run 1:
hgsql -e 'select count(*) from chainOryLat1Run1Link;' danRer5
+----------+
| count(*) |
+----------+
| 32786056 |
+----------+
# Run 2:
hgsql -e 'select count(*) from chainOryLat1Link;' danRer5
+----------+
| count(*) |
+----------+
| 46341892 |
+----------+
# Run 1:
hgsql -e 'select count(*) from chainOryLat1Run1;' danRer5
+----------+
| count(*) |
+----------+
| 2197571 |
+----------+
# Run 2:
hgsql -e 'select count(*) from chainOryLat1;' danRer5
+----------+
| count(*) |
+----------+
| 2961333 |
+----------+
# Nets Run 1:
featureBits danRer5 refGene:cds netOryLat1Run1 -enrichment
# refGene:cds 1.019%, netOryLat1Run1 65.343%, both 0.977%, cover 95.85%,
# enrich 1.47x
# Nets Run 2:
featureBits danRer5 refGene:cds netOryLat1 -enrichment
# refGene:cds 1.019%, netOryLat1 66.238%, both 0.974%, cover 95.63%, enrich
# 1.44x
# Nets danRer4, parameters like Run 2:
featureBits danRer4 refGene:cds netOryLat1 -enrichment
# refGene:cds 0.952%, netOryLat1 64.174%, both 0.888%, cover 93.29%, enrich
# 1.45x
# [1] Exit 25
# /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk
# 0.188u 0.132s 7:33:05.44 0.0% 0+0k 0+0io 3pf+0w
# Command failed:
# ssh -x hgwdev nice
# /cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10/axtChain/installDownloads.csh
ssh hgwdev
# remove downloads and run install downloads again
cd /usr/local/apache/htdocs/goldenPath/danRer5/
rm -r vsOryLat1
rm -r liftOver
ssh kkstore06
cd /cluster/data/danRer5/bed/blastz.oryLat1.2007-09-10
# Run script again starting at download step
time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust -continue download \
-blastzOutRoot /san/sanvol1/scratch/danRer5OryLat1 \
`pwd`/DEF >& doBlastz2.log &
# 0.095u 0.047s 17:35.63 0.0% 0+0k 0+0io 0pf+0w
## DO BLASTZ SWAP TO CREATE THE DANRER5 CHAINS, NETS, AXTNET, MAFNET AND
# ALIGNMENT DOWNLOADS FOR DANRER5 ON ORYLAT1 - documented in oryLat1.txt
###########################################################################
# SPLIT MASKED SEQUENCE INTO CHROM AND SCAFFOLD FASTA FILES
# (DONE, 2007-09-10, hartera)
ssh kkstore06
cd /cluster/data/danRer5/
mkdir splitSoftMask
cd splitSoftMask
twoBitToFa ../danRer5.2bit danRer5.softMasked.fa
faSplit byname danRer5.softMasked.fa .
rm danRer5.softMasked.fa
# Copy the chrom and scaffold sequences to /san/sanvol1/scratch/danRer5
mkdir -p /san/sanvol1/scratch/danRer5/splitSoftMask
foreach f (/cluster/data/danRer5/splitSoftMask/*.fa)
cp -p $f /san/sanvol1/scratch/danRer5/splitSoftMask/
end
foreach f (/san/sanvol1/scratch/danRer5/splitSoftMask/*.fa)
ls $f >> seq.lst
end
wc -l seq.lst
# 5036 seq.lst
# correct: 25 chroms, 1 chrM, and 5010 unmapped scaffolds.
rm seq.lst
# Copy the split sequences to iscratch dir on kkr1u00
ssh kkr1u00
rm -r /iscratch/i/danRer5/splitSoftMask
mkdir -p /iscratch/i/danRer5/splitSoftMask
foreach s (/cluster/data/danRer5/splitSoftMask/*.fa)
echo "Copying $s ..."
cp $s /iscratch/i/danRer5/splitSoftMask/
end
# Count files transferred
foreach f (/iscratch/i/danRer5/splitSoftMask/*.fa)
ls $f >> seq.lst
end
wc -l seq.lst
# 5036 seq.lst
# correct: 25 chroms, 1 chrM, and 5010 unmapped scaffolds.
rm seq.lst
# rsync to cluster machines
foreach R (2 3 4 5 6 7 8)
rsync -a --progress /iscratch/i/danRer5/ kkr${R}u00:/iscratch/i/danRer5/
end
###########################################################################
# AFFY ZEBRAFISH GENOME ARRAY CHIP TRACK (DONE, 2007-09-10 - 2007-09-12)
# With the first version of this track on danRer4, QA found a number of short
# alignments of <= 30 bp and there are a number in the <= 50bp range.
# These do not seem to be meaningful so filtering was changed to try to
# remove these alignments while retaining meaningful alignments.
# pslCDnaFilter was used with the same settings as used for the
# Genbank EST alignments for zebrafish.
# Also use -minIdentity=90 for Blat instead of -minIdentity=95 since the
# higher minIdentity causes alignments to be dropped that should not be.
# Blat's minIdentity seems to be more severe than that for pslReps or
# pslCDnaFilter as it takes insertions and deletions into account.
# These are Jim's recommendations.
# Load both consensus and target sequences to form a composite track. Users
# have asked for Affy target sequences in the past.
# Array chip consensus sequences already downloaded for danRer1
ssh hgwdev
cd /projects/compbio/data/microarray/affyZebrafish
mkdir -p /san/sanvol1/scratch/affy
# Affy Zebrafish consensus sequences already in this directory:
cp /projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
/san/sanvol1/scratch/affy/
# Set up cluster job to align Zebrafish consensus sequences to danRer5
mkdir -p /cluster/data/danRer5/bed/affyZebrafish.2007-09-10
ln -s /cluster/data/danRer5/bed/affyZebrafish.2007-09-10 \
/cluster/data/danRer5/bed/affyZebrafish
ssh pk
cd /cluster/data/danRer5/bed/affyZebrafish
ls -1 /san/sanvol1/scratch/affy/Zebrafish_consensus.fa > affy.lst
foreach f (/san/sanvol1/scratch/danRer5/splitSoftMask/*.fa)
ls -1 $f >> genome.lst
end
wc -l genome.lst
# 5036 genome.lst
# for output:
mkdir -p /san/sanvol1/scratch/danRer5/affy/psl
# use -repeats option to report matches to repeat bases separately
# from other matches in the PSL output.
echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90
-ooc=/san/sanvol1/scratch/danRer5/danRer5_11.ooc $(path1) $(path2) {check out
line+ /san/sanvol1/scratch/danRer5/affy/psl/$(root1)_$(root2).psl}\n#ENDLOOP'
> template.sub
/parasol/bin/gensub2 genome.lst affy.lst template.sub para.spec
/parasol/bin/para create para.spec
# 5036 jobs written to batch
/parasol/bin/para try, check, push ... etc.
/parasol/bin/para time
# Completed: 5036 of 5036 jobs
#CPU time in finished jobs: 22117s 368.61m 6.14h 0.26d 0.001 y
#IO & Wait Time: 14290s 238.17m 3.97h 0.17d 0.000 y
#Average job time: 7s 0.12m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 840s 14.00m 0.23h 0.01d
#Submission to last job: 988s 16.47m 0.27h 0.01d
# need to do pslSort
ssh pk
cd /san/sanvol1/scratch/danRer5/affy
# Do sort and then best in genome filter.
# only use alignments that have at least
# 95% identity in aligned region.
# Previously did not use minCover since a lot of sequence is in
# Un and NA so genes may be split up, and it is good to see all alignments.
# However, found a number of short alignments of <= 50 bp. These are
# not meaningful so maybe need to use minCover. If increased too much,
# then hits on poor parts of the assembly will be missed.
# use pslCDnaFilter with the same parameters as used for zebrafish
# Genbank EST alignments.
pslSort dirs raw.psl tmp psl
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 raw.psl \
affyZebrafishConsensus.psl
# seqs aligns
# total: 15295 459469
# drop minNonRepSize: 2791 391655
# drop minIdent: 2590 30197
# drop minCover: 1712 7112
# weird over: 237 1038
# kept weird: 176 228
# drop localBest: 1393 13532
# kept: 14971 16973
# 96.6% of sequences aligned. There are 15502 Affy consensus sequences.
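# the percentage above (a sketch, using the same calc utility as elsewhere
# in this doc):
calc 14971/15502
# expect ~0.966, i.e. the 96.6% quoted above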
# FOR DANRER4:
# seqs aligns
# total: 15272 828202
#drop minNonRepSize: 2763 741674
# drop minIdent: 2656 39188
# drop minCover: 2550 10784
# weird over: 359 1439
# kept weird: 277 347
# drop localBest: 2830 17737
# kept: 14952 18819
# Kept 97.9% of the aligned sequences. There are 15502 Affy sequences in
# total so 96.5% of them remain aligned after filtering.
# rsync these psl files
rsync -a --progress /san/sanvol1/scratch/danRer5/affy/*.psl \
/cluster/data/danRer5/bed/affyZebrafish/
ssh kkstore06
cd /cluster/data/danRer5/bed/affyZebrafish
# shorten names in psl file
sed -e 's/Zebrafish://' affyZebrafishConsensus.psl > tmp.psl
mv tmp.psl affyZebrafishConsensus.psl
pslCheck affyZebrafishConsensus.psl
# psl is good
# load track into database
ssh hgwdev
cd /cluster/data/danRer5/bed/affyZebrafish
hgLoadPsl danRer5 affyZebrafishConsensus.psl
# Add consensus sequences for Zebrafish chip
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/affyProbes
ln -s \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_consensus.fa \
/gbdb/hgFixed/affyProbes
# these sequences were loaded previously so no need to reload.
hgLoadSeq -abbr=Zebrafish: danRer5 \
/gbdb/hgFixed/affyProbes/Zebrafish_consensus.fa
# Clean up
rm batch.bak raw.psl
# check number of short alignments:
hgsql -e \
'select count(*) from affyZebrafishConsensus where (qEnd - qStart) <= 50;'\
danRer5
# 6
# There were 7 for danRer4 - this is ok.
### TARGET SEQUENCES SUBTRACK: (2007-09-11 - 2007-09-12)
### Create subtrack of target sequences; this is just the region of the
# consensus sequence that is used for designing the probesets.
# Array chip target sequences downloaded from
# http://www.affymetrix.com/support/technical/byproduct.affx?product=zebrafish
ssh hgwdev
# copy sequences to /projects/compbio/data/microarray/affyZebrafish
cd /projects/compbio/data/microarray/affyZebrafish
unzip Zebrafish_target.zip
grep '>' Zebrafish_target > target.headers
wc -l target.headers
# 15617 target.headers
# includes the control sequences
# remove these
grep -v "AFFX" target.headers > targetNoControls.headers
wc -l targetNoControls.headers
# 15502 targetNoControls.headers
mv Zebrafish_target Zebrafish_target.fa
grep '>' Zebrafish_target.fa | wc -l
# 15617 sequences
awk 'BEGIN {FS=";"} {print $1;}' targetNoControls.headers \
| sed -e 's/>//' > targetNoControls.ids
wc -l *.ids
# 15502 targetNoControls.ids
# remove semi-colon after name in the target FASTA file (edit in place
# since the file was already renamed to Zebrafish_target.fa above):
perl -pi.bak -e 's/;//' Zebrafish_target.fa
faSomeRecords Zebrafish_target.fa targetNoControls.ids \
Zebrafish_targetNoControls.fa
grep '>' Zebrafish_targetNoControls.fa | wc -l
# 15502
# Remove "Zebrafish:" from name, leave in "tg:" to differentiate
# sequences from the consensus ones.
perl -pi.bak -e 's/>target:Zebrafish:/>tg:/' Zebrafish_targetNoControls.fa
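# e.g. a header like ">target:Zebrafish:Dr.1.1.S1_at" (hypothetical
# probeset name) becomes ">tg:Dr.1.1.S1_at". Spot-check:
grep '^>' Zebrafish_targetNoControls.fa | head -3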
# make affy directory on the san if it does not exist already:
mkdir -p /san/sanvol1/scratch/affy
cp -p \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_targetNoControls.fa \
/san/sanvol1/scratch/affy/
# Set up cluster job to align Zebrafish target sequences to danRer5
# Directories below were set up above
# mkdir -p /cluster/data/danRer5/bed/affyZebrafish.2007-09-10
# ln -s /cluster/data/danRer5/bed/affyZebrafish.2007-09-10 \
# /cluster/data/danRer5/bed/affyZebrafish
mkdir -p /cluster/data/danRer5/bed/affyZebrafish.2007-09-10/target
ssh pk
cd /cluster/data/danRer5/bed/affyZebrafish/target
ls -1 /san/sanvol1/scratch/affy/Zebrafish_targetNoControls.fa \
> affyTarget.lst
foreach f (/san/sanvol1/scratch/danRer5/splitSoftMask/*.fa)
ls -1 $f >> genome.lst
end
wc -l genome.lst
# 5036 genome.lst
# for output:
mkdir -p /san/sanvol1/scratch/danRer5/affy/pslTarget
# use -repeats option to report matches to repeat bases separately
# from other matches in the PSL output.
echo '#LOOP\n/cluster/bin/x86_64/blat -fine -repeats=lower -minIdentity=90
-ooc=/san/sanvol1/scratch/danRer5/danRer5_11.ooc $(path1) $(path2) {check out
line+ /san/sanvol1/scratch/danRer5/affy/pslTarget/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
/parasol/bin/gensub2 genome.lst affyTarget.lst template.sub para.spec
/parasol/bin/para create para.spec
# 5036 jobs written to batch
/parasol/bin/para try, check, push ... etc.
# Keeps crashing - can't write to directory - permissions look ok.
# Tried again (2007-09-12, hartera) and it worked fine.
/parasol/bin/para time
# Completed: 5036 of 5036 jobs
#CPU time in finished jobs: 17604s 293.40m 4.89h 0.20d 0.001y
#IO & Wait Time: 14325s 238.75m 3.98h 0.17d 0.000 y
#Average job time: 6s 0.11m 0.00h 0.00d
#Longest running job: 0s 0.00m 0.00h 0.00d
#Longest finished job: 767s 12.78m 0.21h 0.01d
#Submission to last job: 842s 14.03m 0.23h 0.01d
# need to do pslSort
ssh pk
cd /san/sanvol1/scratch/danRer5/affy
# Do sort and then best in genome filter.
pslSort dirs rawTarget.psl tmp pslTarget
# only use alignments that have at least 95% identity in aligned region.
pslCDnaFilter -localNearBest=0.005 -minQSize=20 -minNonRepSize=16 \
-ignoreNs -bestOverlap -minId=0.95 -minCover=0.15 rawTarget.psl \
affyZebrafishTarget.psl
# seqs aligns
# total: 15216 308151
# drop invalid: 1 1
# drop minNonRepSize: 2051 256676
# drop minIdent: 1917 20458
# drop minCover: 741 2951
# weird over: 143 607
# kept weird: 92 116
# drop localBest: 1306 11562
# kept: 14820 16503
# 95.6% of sequences aligned. There are 15502 Affy target sequences.
# rsync these psl files
rsync -a --progress /san/sanvol1/scratch/danRer5/affy/*Target.psl \
/cluster/data/danRer5/bed/affyZebrafish/target/
ssh kkstore06
cd /cluster/data/danRer5/bed/affyZebrafish/target
# no need to shorten names in this psl file - the target sequence names
# were already shortened to the "tg:" prefix before alignment.
pslCheck affyZebrafishTarget.psl
# psl is good
# load track into database
ssh hgwdev
cd /cluster/data/danRer5/bed/affyZebrafish/target
hgLoadPsl danRer5 affyZebrafishTarget.psl
# Add target sequences for Zebrafish chip
# Copy sequences to gbdb if they are not there already
mkdir -p /gbdb/hgFixed/affyProbes
ln -s \
/projects/compbio/data/microarray/affyZebrafish/Zebrafish_targetNoControls.fa \
/gbdb/hgFixed/affyProbes
# these sequences were loaded previously so no need to reload.
hgLoadSeq danRer5 \
/gbdb/hgFixed/affyProbes/Zebrafish_targetNoControls.fa
# Clean up
rm batch.bak rawTarget.psl
# check number of short alignments:
hgsql -e \
'select count(*) from affyZebrafishTarget where (qEnd - qStart) <= 50;'\
danRer5
# 33
# This is ok, these are shorter regions anyway.
# Added trackDb.ra entry to zebrafish/danRer5/trackDb.ra. This is
# a composite track consisting of 2 subtracks for the consensus
# sequence Blat alignments and the target sequence Blat alignments.
# The composite track is called affyZebrafish. Add searches for both
# tracks.
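# A rough sketch of the composite structure (abbreviated, from memory of
# the trackDb conventions of the time - not the exact entry committed):
#   track affyZebrafish
#   compositeTrack on
#   ...
#     track affyZebrafishConsensus
#     subTrack affyZebrafish
#     track affyZebrafishTarget
#     subTrack affyZebrafish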
###########################################################################
# MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE, 2007-09-11, hartera)
# ADDED A CHROMOSOMES DOWNLOAD DIRECTORY (DONE, 2007-09-14, hartera)
ssh hgwdev
cd /cluster/data/danRer5
ln -s /cluster/data/danRer5/bed/RepeatMasker.2007-09-04/danRer5.fa.out \
/cluster/data/danRer5
/cluster/bin/scripts/makeDownloads.pl danRer5 -verbose=2 \
>& jkStuff/downloads.log &
tail -f jkStuff/downloads.log
# Edit README.txt files when done:
# *** All done!
# *** Please take a look at the downloads for danRer5 using a web browser.
# *** Edit each README.txt to resolve any notes marked with "***":
# /cluster/data/danRer5/goldenPath/database/README.txt
# /cluster/data/danRer5/goldenPath/bigZips/README.txt
# (The htdocs/goldenPath/danRer5/*/README.txt "files" are just links to
# those.)
# ADD CHROMOSOMES DOWNLOADS DIRECTORY
# Add chromosome and scaffolds downloads, these are already in
# /cluster/data/danRer5/splitSoftMask (2007-09-14, hartera)
# compress files:
ssh kkstore06
mkdir /cluster/data/danRer5/goldenPath/chromosomes
cd /cluster/data/danRer5/splitSoftMask
foreach c (chr*.fa)
gzip $c
end
# now scaffolds, compress together into one file
foreach s (Zv7_*.fa)
gzip -c $s >> unmappedScaffolds.fa.gz
end
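# concatenated gzip members decompress as a single stream, so the combined
# file can be checked directly (sketch):
zcat unmappedScaffolds.fa.gz | grep -c '>'
# expect 5010 (one record per unmapped scaffold)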
mv *.gz /cluster/data/danRer5/goldenPath/chromosomes/
ssh hgwdev
# make symbolic links to the goldenPath downloads directory for danRer5
mkdir /usr/local/apache/htdocs/goldenPath/danRer5/chromosomes
ln -s /cluster/data/danRer5/goldenPath/chromosomes/*.gz \
/usr/local/apache/htdocs/goldenPath/danRer5/chromosomes/
cd /usr/local/apache/htdocs/goldenPath/danRer5/chromosomes
md5sum *.gz > md5sum.txt
cp /usr/local/apache/htdocs/goldenPath/danRer4/chromosomes/README.txt \
/cluster/data/danRer5/goldenPath/chromosomes/
# edit README.txt to make it specific for the danRer5 (Sanger Zv7)
# assembly and then link it to the downloads directory.
ln -s /cluster/data/danRer5/goldenPath/chromosomes/README.txt \
/usr/local/apache/htdocs/goldenPath/danRer5/chromosomes/
########################################################################
# BLASTZ Swap Mouse mm9 (DONE - 2007-09-12 - Hiram)
ssh kkstore06
# the original
cd /cluster/data/mm9/bed/blastzDanRer5.2007-09-11
cat fb.mm9.chainDanRer5Link.txt
# 48497464 bases of 2620346127 (1.851%) in intersection
# the swap
mkdir /cluster/data/danRer5/bed/blastz.mm9.swap
cd /cluster/data/danRer5/bed/blastz.mm9.swap
time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
-chainMinScore=5000 \
/cluster/data/mm9/bed/blastzDanRer5.2007-09-11/DEF \
-swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
> swap.log 2>&1 &
# real 9m47.163s
cat fb.danRer5.chainMm9Link.txt
# 34017483 bases of 1435609608 (2.370%) in intersection
########################################################################
# ADD CONTIGS TRACK (DONE, 2007-09-14, hartera)
# make ctgPos2 (contig name, size, chrom, chromStart, chromEnd) from
# chunks (contigs) agp files.
ssh kkstore06
mkdir -p /cluster/data/danRer5/bed/ctgPos2
# get chrM information from danRer4:
ssh hgwdev
cd /cluster/data/danRer5/bed/ctgPos2
hgsql -N -e 'select * from ctgPos2 where chrom = "chrM";' danRer4 \
> chrM.ctgPos2.txt
ssh kkstore06
cd /cluster/data/danRer5/bed/ctgPos2
# update chrM accession to NC_002333.2
perl -pi.bak -e 's/AC024175\.3/NC_002333\.2/' chrM.ctgPos2.txt
# ctgPos2 .sql .as .c and .h files exist - see makeDanRer1.doc
# get the contigs for the chroms, chr1-25:
awk 'BEGIN {OFS="\t"} \
{if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \
/cluster/data/danRer5/Zv7release/Zv7_chr.agp >> ctgPos2.tab
# Add chrM to ctgPos2 file:
cat chrM.ctgPos2.txt >> ctgPos2.tab
# then do the same for the unmapped scaffolds:
awk 'BEGIN {OFS="\t"} \
{if ($5 != "N") print $6, $3-$2+1, $1, $2-1, $3, $5}' \
/cluster/data/danRer5/Zv7release/randoms/randomsScaffold.agp \
>> ctgPos2.tab
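# The awk above shifts the 1-based AGP start to a 0-based chromStart.
# On a hypothetical AGP line:
#   chr1  1  40000  1  F  CR792456.7  1  40000  +
# the resulting ctgPos2 row is:
#   CR792456.7  40000  chr1  0  40000  F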
# load the ctgPos2 table
ssh hgwdev
cd /cluster/data/danRer5/bed/ctgPos2
# use hgLoadSqlTab as it gives more error messages than using
# "load data local infile ...".
hgLoadSqlTab danRer5 ctgPos2 \
~/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab
# check that all chroms and scaffolds are there
hgsql -N -e 'select distinct(chrom) from ctgPos2;' danRer5 \
| sort > chromsAndScafsCtgPos2.txt
hgsql -N -e 'select distinct(chrom) from chromInfo;' danRer5 \
| sort > chromsAndScafsChromInfo.txt
wc -l chroms*
# 5036 chromsAndScafsChromInfo.txt
# 5036 chromsAndScafsCtgPos2.txt
comm -12 chromsAndScafsCtgPos2.txt chromsAndScafsChromInfo.txt | wc -l
# 5036
# So all chroms and unmapped scaffolds are represented in the table.
# cleanup
ssh kkstore06
cd /cluster/data/danRer5/bed/ctgPos2
rm *.bak chromsAndScafs*
# create trackDb.ra entry and html page for ctgPos2 track.
# add search for the track and make sure the termRegex will handle
# contigs named "Zv7_scaffoldN.N" where N is an integer and all the
# accessions in the *.agp files.
###########################################################################
# CREATE MICROARRAY DATA TRACK BY ADDING ZON LAB WILD TYPE MICROARRAY DATA TO
# AFFY ZEBRAFISH ALIGNMENTS (DONE, 2007-09-16, hartera)
# Array data is for whole embryos of five wild type zebrafish strains.
# Data is in hgFixed (see hgFixed.doc) - from Len Zon's lab at Children's
# Hospital Boston. Contact: adibiase@enders.tch.harvard.edu
# see danRer4.txt and hgFixed.txt for the history of this track and the
# expression data.
ssh hgwdev
mkdir -p /cluster/data/danRer5/bed/ZonLab/wtArray
cd /cluster/data/danRer5/bed/ZonLab/wtArray
# Map the data to the consensus sequence alignments first.
# use AllRatio table for mapping. There are not many arrays in this
# dataset so using AllRatio will allow the selection of All Arrays
# from the track controls on the track description page. Also set up the
# Zebrafish microarrayGroups.ra so that the Medians of replicates or
# Means of replicates can also be selected for display.
# Create mapped data in zebrafishZonWTAffyConsensus.bed.
hgMapMicroarray zebrafishZonWTAffyConsensus.bed \
hgFixed.zebrafishZonWTAllRatio \
/cluster/data/danRer5/bed/affyZebrafish/affyZebrafishConsensus.psl
# Loaded 15617 rows of expression data from hgFixed.zebrafishZonWTAllRatio
# Mapped 14971, multiply-mapped 2002, missed 0, unmapped 646
hgLoadBed danRer5 affyZonWildType zebrafishZonWTAffyConsensus.bed
# Loaded 16973 elements of size 15
# then for the Affy Zebrafish chip target sequences, map these to the
# Wild Type expression data
hgMapMicroarray -pslPrefix="tg:" zebrafishZonWTAffyTarget.bed \
hgFixed.zebrafishZonWTAllRatio \
/cluster/data/danRer5/bed/affyZebrafish/target/affyZebrafishTarget.psl
# Loaded 15617 rows of expression data from hgFixed.zebrafishZonWTAllRatio
# Mapped 14820, multiply-mapped 1683, missed 0, unmapped 797
hgLoadBed danRer5 affyTargetZonWildType zebrafishZonWTAffyTarget.bed
# Loaded 16503 elements of size 15
# add trackDb.ra entry at trackDb/zebrafish/danRer5 level to create
# a composite affyZonWildType with alignments and array data for both
# Affy Zebrafish chip consensus sequences and target sequences - see
# danRer4.txt for more details of history of trackDb.ra and
# microarrayGroups.ra entries for this data.
###########################################################################
# BLASTZ FOR TETRAODON (tetNig1) (DONE, 2007-09-16, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# No lineage-specific repeats for this species pair.
# Tetraodon also has no species-specific repeats in the RepeatMasker
# library so run this using dynamic masking.
# The tetraodon 2bit file of scaffolds was used
# (tetNig1.randomContigs.sdTrf.2bit) - this contains sequences for
# scaffolds from randoms. These were used for Blastz rather than the
# random chroms so that alignments do not occur across non-contiguous
# scaffolds.
## use screen to run
ssh kkstore06
mkdir /cluster/data/danRer5/bed/blastz.tetNig1.2007-09-16
cd /cluster/data/danRer5/bed
ln -s blastz.tetNig1.2007-09-16 blastz.tetNig1
cd /cluster/data/danRer5/bed/blastz.tetNig1.2007-09-16
# Use Blastz parameters for tetNig1 in danRer4.txt. Using scaffolds makes this run
# slower so it is best to have the scaffolds in the query. Use HoxD55.q
# matrix as tetraodon is quite distant from zebrafish. Blastz uses
# lineage-specific repeats but there are none for these two species.
# Use soft-masked scaffolds for tetNig1 (and dynamic masking) and also use
# windowMasker plus SDust repeats since there are no species-specific
# RepeatMasker repeats for tetraodon.
cat << '_EOF_' > DEF
# zebrafish (danRer5) vs. tetraodon (tetNig1)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2500
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - zebrafish (danRer5)
SEQ1_DIR=/san/sanvol1/scratch/danRer5/danRer5.2bit
SEQ1_LEN=/cluster/data/danRer5/chrom.sizes
SEQ1_LIMIT=30
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY - Tetraodon (tetNig1)
# soft-masked chroms and random scaffolds in 2bit format
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.lift
SEQ2_CHUNK=1000000000
SEQ2_LAP=0
BASE=/cluster/data/danRer5/bed/blastz.tetNig1.2007-09-16
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
mkdir /san/sanvol1/scratch/danRer5TetNig1
time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust \
-blastzOutRoot /san/sanvol1/scratch/danRer5TetNig1 \
`pwd`/DEF >& doBlastz.log &
# Took about 5 hours 7 minutes to run.
featureBits danRer5 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 1.019%, chainTetNig1Link 6.359%, both 0.866%, cover 84.94%,
# enrich 13.36x
featureBits danRer4 refGene:cds chainTetNig1Link -enrichment
# refGene:cds 0.953%, chainTetNig1Link 7.345%, both 0.836%, cover 87.72%,
# enrich 11.94x
featureBits danRer5 refGene:cds netTetNig1 -enrichment
# refGene:cds 1.019%, netTetNig1 63.685%, both 0.961%, cover 94.30%, enrich
# 1.48x
featureBits danRer4 refGene:cds netTetNig1 -enrichment
# refGene:cds 0.953%, netTetNig1 63.389%, both 0.909%, cover 95.41%, enrich
# 1.51x
## DO BLASTZ SWAP TO CREATE THE DANRER5 CHAINS, NETS, AXTNET, MAFNET AND
# ALIGNMENT DOWNLOAD FOR DANRER5 ON TETNIG1 - documented in tetNig1.txt
###########################################################################
# BLASTZ FOR FUGU (fr2) (DONE, 2007-09-18 - 2007-09-19, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# No lineage-specific repeats for this species pair.
# Fugu also has no species-specific repeats in the RepeatMasker
# library so run this using dynamic masking.
# Use the Fugu 2bit file of scaffolds (fr2.scaffolds.2bit) - this
# contains sequences for all the scaffolds of chrUn. The whole assembly
# is unmapped, unordered scaffolds. The individual scaffolds were
# used for Blastz rather than the random chroms so that alignments do not
# occur across non-contiguous scaffolds.
## use screen to run
ssh kkstore06
mkdir /cluster/data/danRer5/bed/blastz.fr2.2007-09-18
cd /cluster/data/danRer5/bed
ln -s blastz.fr2.2007-09-18 blastz.fr2
cd /cluster/data/danRer5/bed/blastz.fr2.2007-09-18
# Use Blastz parameters for fr1 in danRer4.txt. Using scaffolds makes this run
# slower so it is best to have the scaffolds in the query. Use HoxD55.q
# matrix as Fugu is quite distant from zebrafish. Blastz uses
# lineage-specific repeats but there are none for these two species.
# Use soft-masked scaffolds for fr2 (and dynamic masking) and also use
# windowMasker plus SDust repeats since there are no species-specific
# RepeatMasker repeats for Fugu. Fugu is about 400.5 Mb of scaffolds plus
# chrM which is about 16.4 kb.
cat << '_EOF_' > DEF
# zebrafish (danRer5) vs. Fugu (fr2)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
BLASTZ=blastz.v7.x86_64
BLASTZ_H=2500
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET - zebrafish (danRer5)
SEQ1_DIR=/san/sanvol1/scratch/danRer5/danRer5.2bit
SEQ1_LEN=/cluster/data/danRer5/chrom.sizes
SEQ1_LIMIT=30
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY - Fugu (fr2)
# soft-masked chroms and random scaffolds in 2bit format
SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/danRer5/bed/blastz.fr2.2007-09-18
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
chmod +x DEF
mkdir /san/sanvol1/scratch/danRer5Fr2
time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust \
-blastzOutRoot /san/sanvol1/scratch/danRer5Fr2 \
`pwd`/DEF >& doBlastz.log &
# Took about 6 hours and 50 minutes to run.
cat fb.danRer5.chainFr2Link.txt
# 104476108 bases of 1435609608 (7.277%) in intersection
# Compare coverage to refGene CDS:
featureBits danRer5 refGene:cds chainFr2Link -enrichment
# refGene:cds 1.019%, chainFr2Link 7.277%, both 0.886%, cover 86.93%,
# enrich 11.95x
featureBits danRer4 refGene:cds chainFr2Link -enrichment
# refGene:cds 0.955%, chainFr2Link 8.543%, both 0.810%, cover 84.73%,
# enrich 9.92x
featureBits danRer5 refGene:cds netFr2 -enrichment
# refGene:cds 1.019%, netFr2 64.210%, both 0.968%, cover 94.95%, enrich 1.48x
featureBits danRer4 refGene:cds netFr2 -enrichment
# refGene:cds 0.955%, netFr2 62.980%, both 0.885%, cover 92.68%, enrich 1.47x
## DO BLASTZ SWAP TO CREATE THE DANRER5 CHAINS, NETS, AXTNET, MAFNET AND
# ALIGNMENT DOWNLOAD FOR DANRER5 ON FR2 - documented in fr2.txt
###########################################################################
# CREATE LIFTOVER CHAINS FROM danRer5 TO danRer4
# (DONE 2007-09-19 AND 2007-09-21, hartera)
ssh kkstore06
mkdir /cluster/data/danRer5/bed/blat.danRer4
cd /cluster/data/danRer5/bed/blat.danRer4
time nice doSameSpeciesLiftOver.pl danRer5 danRer4 \
-bigClusterHub pk \
-ooc /san/sanvol1/scratch/danRer5/danRer5_11.ooc \
-buildDir /cluster/data/danRer5/bed/blat.danRer4 >& do.log &
# 0.401u 0.253s 8:09:03.62 0.0% 0+0k 0+0io 11pf+0w
# Remove symbolic link to liftOver chains and copy over the file
rm ../liftOver/danRer5ToDanRer4.over.chain.gz
cp -p danRer5ToDanRer4.over.chain.gz ../liftOver
# a link in /usr/local/apache/htdocs/goldenPath/danRer5/liftOver has
# already been made to this file and md5sum.txt needs to be updated
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/danRer5/liftOver
md5sum *.gz > md5sum.txt
###########################################################################
# VEGA GENES UPDATE TO v28 (DONE, 2007-09-28 - 2007-10-01, hartera)
# Data provided by Tina Eyre from Sanger: te3@sanger.ac.uk
# UPDATE SENT ON 2007-09-27, Vega v28. RE-LOAD TABLES TO UPDATE THE DATA.
ssh kkstore06
mkdir -p /cluster/data/danRer5/bed/vega28/data/
cd /cluster/data/danRer5/bed/vega28/data
# 3 files: vegav28.gtf.zip vegav28_information.txt.zip
# vegav28_transcript_to_clone.txt.zip
unzip vegav28.gtf.zip
unzip vegav28_information.txt.zip
unzip vegav28_transcript_to_clone.txt.zip
# asked for the vegav28_information.txt file to be in the format
# required to load the table directly.
# Check the format then prepare data files to load database tables.
cd /cluster/data/danRer5/bed/vega28
# this time all the lines describe genes and there are no extra
# header lines or output from the program that generated the file.
# NOTE: there are no NA scaffolds.
# Remove "chr" prefix from scaffolds
sed -e 's/chrZv7_/Zv7_/' ./data/vegav28.gtf > vegav28Fixed.gtf
# vegaInfo is transcriptId, otterId, geneId, method and geneDesc
# Get otter transcript ID and otter gene ID:
awk 'BEGIN{FS=";"} {OFS="\t"} \
{if (($5 ~ /otter_gene_id/) && ($6 ~ /otter_transcript_id/)) \
print $5, $6;}' vegav28Fixed.gtf | sort | uniq > vega28OtterIds.txt
# remove the otter_gene_id and otter_transcript_id and
# extra spaces and quotation marks.
perl -pi.bak -e 's/\sotter_gene_id\s\"//;' vega28OtterIds.txt
perl -pi.bak -e 's/"\t\sotter_transcript_id\s"(OTTDART[0-9]+)"/\t$1/' \
vega28OtterIds.txt
wc -l vega28OtterIds.txt
# 14001 vega28OtterIds.txt
# there were 12819 in Vega v27
# get list of the otter gene IDs only
awk '{print $1}' vega28OtterIds.txt | sort | uniq > otterGeneIds.only
# then get list of otter gene IDs from information file
awk '{if ($1 ~ /OTTDARG/) print $1}' data/vegav28_information.txt \
| sort | uniq > infoOtterGeneIds.only
comm -12 otterGeneIds.only infoOtterGeneIds.only | wc -l
# 8489
wc -l *.only
# 10374 infoOtterGeneIds.only
# 9318 otterGeneIds.only
comm -23 otterGeneIds.only infoOtterGeneIds.only > geneIdsGtfNotInfo.txt
# also check the transcripts in transcript to clone are all there.
awk 'BEGIN {FS="\t"} {print $2;}' vega28OtterIds.txt \
| sort | uniq > otterTxIds.only
wc -l otterTxIds.only
# 14001 otterTxIds.only
awk 'BEGIN {FS="\t"} {print $1;}' ./data/vegav28_transcript_to_clone.txt \
| sort | uniq > vegaCloneTxIds.sort
wc -l vegaCloneTxIds.sort
# 14001 vegaCloneTxIds.sort
comm -12 otterTxIds.only vegaCloneTxIds.sort | wc -l
# 14001
# All otter transcript IDs in the GTF vegav28 gene file are in the
# vegav28_transcript_to_clone.txt file as expected.
wc -l geneIdsGtfNotInfo.txt
# 829 geneIdsGtfNotInfo.txt
# E-mailed Tina Eyre at Sanger - on 2007-09-28 - to ask about
# these otter gene Ids that are in the GTF file but not in the
# vegav28_information.txt file.
# Received correct file: 2007-10-01
# Check again
ssh kkstore06
cd /cluster/data/danRer5/bed/vega28/data
unzip vegav28_information.txt.zip
cd ..
# then get list of otter gene IDs from information file
awk '{if ($1 ~ /OTTDARG/) print $1}' ./data/vegav28_information.txt \
| sort | uniq > infoOtterGeneIds.only
comm -12 otterGeneIds.only infoOtterGeneIds.only | wc -l
# 9318
wc -l *.only
# 11115 infoOtterGeneIds.only
# 9318 otterGeneIds.only
# All IDs from the GTF file are contained in the information file.
# The information file contains 1797 gene IDs (11115 - 9318) that are
# not in the GTF file.
# Extra genes are included that are found on clones that do not map
# to Ensembl.
comm -13 otterGeneIds.only infoOtterGeneIds.only > geneIds.InfoOnly.txt
cat << 'EOF' > mergeTransIdAndInfo.pl
#!/usr/bin/perl -w
use strict;
my ($idsFile, $infoFile, %idsHash);
$idsFile = $ARGV[0];
$infoFile = $ARGV[1];
open (IDS, $idsFile) || die "Can not open $idsFile: $!\n";
open (INFO, $infoFile) || die "Can not open $infoFile: $!\n";
# Read in the otter gene IDs and transcript IDs
while (<IDS>)
{
my (@f, $line);
chomp;
$line = $_;
# split line on tab
@f = split(/\t/, $line);
# hash keyed by gene ID, stores transcript ID
# more than one ID so store as an array
if ($f[0] =~ /OTTDARG/)
{
# add to array in hash if it is an otter gene ID
push @{$idsHash{$f[0]}}, $f[1];
}
}
close IDS;
while (<INFO>)
{
my ($line, @info, $geneId, @transIds);
# read in information file and get the otter gene ID
$line = $_;
# split line on tab
@info = split(/\t/, $line);
if ($info[0] =~ /OTTDARG/)
{
$geneId = $info[0];
# look up transcript ID in hash
# if gene ID exists in the hash then print out the information
# file line with the correct transcript ID
if (exists($idsHash{$geneId}))
{
@transIds = @{$idsHash{$geneId}};
# rewrite line with transcript ID
foreach my $t (@transIds)
{
print "$t\t$line";
}
}
}
}
close INFO;
'EOF'
# << happy emacs
chmod +x mergeTransIdAndInfo.pl
mergeTransIdAndInfo.pl vega28OtterIds.txt ./data/vegav28_information.txt \
> vegav28InfoWithTransIds.txt
wc -l vega28OtterIds.txt
# 14001 vega28OtterIds.txt
wc -l vegav28InfoWithTransIds.txt
# 14001 vegav28InfoWithTransIds.txt
# so just sort and uniq the resulting information file with transcript IDs
sort vegav28InfoWithTransIds.txt | uniq > vegav28InfoWithTransIdsUniq.txt
wc -l vegav28InfoWithTransIdsUniq.txt
# 14001 vegav28InfoWithTransIdsUniq.txt
# So no duplicated lines.
# There are extra IDs in the information file as it includes all Vega
# genes (including clones not in Ensembl). Those actually in the Vega
# v28 GTF are those that are found on clones that are in Ensembl.
# Vega includes only clone sequence and Zv7_NA scaffolds are WGS so
# these are not annotated in Vega. This information is from
# an e-mail from Tina Eyre (te3@sanger.ac.uk) on 2007-09-07.
# then use the info file to grab those genes that are pseudogenes, get the
# transcript ID from the vegaIDs.txt file. Then grep out the pseudogenes
# to a separate file.
# Next step is to find which transcripts are pseudogenes.
grep pseudogene vegav28InfoWithTransIdsUniq.txt | sort | uniq | wc -l
# found 68 pseudogenes so need to create the pseudogene track
# Get transcript IDs for pseudogenes.
grep pseudogene vegav28InfoWithTransIdsUniq.txt | awk '{print $1}' \
> pseudogenes.ids
grep -w -f pseudogenes.ids vegav28Fixed.gtf > vegav28Pseudogene.gtf
awk '{print $20}' vegav28Pseudogene.gtf | sort | uniq | wc -l
# 68
# Need to make the GTF file for the vegaGene table by subtracting pseudogenes:
grep -vw -f pseudogenes.ids vegav28Fixed.gtf > vegaGenev28.gtf
wc -l vega*gtf
# 178987 vegaGenev28.gtf
# 179249 vegav28Fixed.gtf
# 262 vegav28Pseudogene.gtf
# Need to relabel IDs to get the name to be the otter transcript ID
# and name2 to be the transcript_id (which needs to be labeled as gene_id).
# Also, relabel the otter_transcript_id to be transcript_id as ldHgGene
# groups the rows by this ID.
sed -e 's/gene_id/tmp_id/' vegaGenev28.gtf \
| sed -e 's/transcript_id/gene_id/' \
| sed -e 's/otter_transcript_id/transcript_id/' > vegaGenev28Format.gtf
# Do the same for the pseudogene GTF file:
sed -e 's/gene_id/tmp_id/' vegav28Pseudogene.gtf \
| sed -e 's/transcript_id/gene_id/' \
| sed -e 's/otter_transcript_id/transcript_id/' \
> vegaPseudogenev28Format.gtf
gtfToGenePred -genePredExt -infoOut=tmpInfo.txt \
vegaGenev28Format.gtf vegaGenev28.gp
genePredCheck vegaGenev28.gp
# checked: 13933 failed: 0
gtfToGenePred -genePredExt -infoOut=tmpInfo.txt \
vegaPseudogenev28Format.gtf vegaPseudogenev28.gp
genePredCheck vegaPseudogenev28.gp
# checked: 68 failed: 0
# load GTF files for Vega genes and pseudogenes:
ssh hgwdev
cd /cluster/data/danRer5/bed/vega28
hgsql -e 'drop table vegaGene;' danRer5
# load vegaGene table
hgLoadGenePred -genePredExt danRer5 vegaGene vegaGenev28.gp
hgsql -e 'drop table vegaPseudoGene;' danRer5
# load vegaPseudoGene table
hgLoadGenePred -genePredExt danRer5 vegaPseudoGene vegaPseudogenev28.gp
hgsql -N -e 'select distinct(chrom) from vegaGene;' danRer5
# For vegav28 there are 29: chr1-25 plus 4 scaffolds:
# Zv7_scaffold2498
# Zv7_scaffold2509
# Zv7_scaffold2516
# Zv7_scaffold2572
# For Vega v27, there were more scaffolds annotated so e-mailed Tina Eyre
# to ask why this is the case.
# In v27 there were 36: chr1-25 plus these 11 annotated scaffolds:
# Zv7_scaffold2491
# Zv7_scaffold2498
# Zv7_scaffold2509
# Zv7_scaffold2516
# Zv7_scaffold2523
# Zv7_scaffold2533
# Zv7_scaffold2537
# Zv7_scaffold2551
# Zv7_scaffold2560
# Zv7_scaffold2633
# Zv7_scaffold2650
hgsql -N -e 'select distinct(chrom) from vegaPseudoGene;' danRer5
# 13 chroms and 1 scaffold have pseudogenes annotated
# chr2, chr3, chr4, chr5, chr9, chr12, chr13, chr18, chr19, chr20, chr22, chr24,
# Zv7_scaffold2509
# Only finished sequence is annotated by Vega
# Vega information tables:
# mySQL table definition and autosql-generated files created previously
# for zebrafish-specific information (vegaInfoZfish).
# Add clone_id to a separate table instead of this one. A tab-separated
# file of transcript ID and clone ID was provided by Tina Eyre
# (te3@sanger.ac.uk) at Sanger.
# Need to grep for the transcript IDs
# created a second table for the cloneId accessions since there
# are multiple ids for some VEGA genes. Otherwise, there would be
# a comma separated list in this field or many rows repeated but just
# different in the cloneId field. Associate transcript ID to clone IDs.
# Table definitions are in danRer4.txt.
# Load the vega to clone ID information:
ssh kkstore06
cd /cluster/data/danRer5/bed/vega28
sort ./data/vegav28_transcript_to_clone.txt | uniq \
> vegav28TransToClone.txt
# load the vegaInfo and vegaToCloneId tables:
ssh hgwdev
cd /cluster/data/danRer5/bed/vega28
hgsql -e 'drop table vegaInfoZfish;' danRer5
hgLoadSqlTab danRer5 vegaInfoZfish ~/kent/src/hg/lib/vegaInfoZfish.sql \
vegav28InfoWithTransIdsUniq.txt
hgsql -e 'drop table vegaToCloneId;' danRer5
hgLoadSqlTab danRer5 vegaToCloneId ~/kent/src/hg/lib/vegaToCloneId.sql \
vegav28TransToClone.txt
# A peptide file was requested this time (vegav28_protein.fa) so load
# it into vegaPep:
ssh kkstore06
cd /cluster/data/danRer5/bed/vega28
# There are more peptides in the FASTA file than there are transcript IDs
# in the GTF file so get the peptide sequences for transcripts in the GTF.
# otterTxIds.only is the list of transcript IDs (14,001 IDs).
faSomeRecords ./data/vegav28_protein.fa otterTxIds.only vega28Pep.fa
grep '>' vega28Pep.fa | wc -l
# 10110
grep '>' vega28Pep.fa | sed -e 's/>//' | sort | uniq > vega28PepTxIds.txt
comm -12 vega28PepTxIds.txt otterTxIds.only | wc -l
# 10110
comm -13 vega28PepTxIds.txt otterTxIds.only > otterTxNotInPeptides.txt
wc -l otterTxNotInPeptides.txt
# 3891 otterTxNotInPeptides.txt
# Checked a few and these appear to be novel processed transcripts or
# novel protein-coding genes, so probably the novel ones do not have an
# associated protein - check with Tina Eyre.
ssh hgwdev
cd /cluster/data/danRer5/bed/vega28
hgPepPred danRer5 generic vegaPep vega28Pep.fa
# hgsql -e 'select count(*) from vegaPep;' danRer5
# 10110
# Check which type of transcripts do not have a peptide translation.
hgsql -N -e 'select distinct(name) from vegaPep;' danRer5 | \
sort > vegaPep.names.sort
hgsql -N -e 'select distinct(transcriptId) from vegaInfoZfish;' danRer5 \
| sort > vegaInfo.txId.sort
comm -12 vegaPep.names.sort vegaInfo.txId.sort | wc -l
# 10110
wc -l *.sort
# 14001 vegaInfo.txId.sort
# 10110 vegaPep.names.sort
comm -13 vegaPep.names.sort vegaInfo.txId.sort > vegaTx.inInfoNotPep.txt
wc -l vegaTx.inInfoNotPep.txt
# 3891 vegaTx.inInfoNotPep.txt
foreach t (`cat vegaTx.inInfoNotPep.txt`)
hgsql -N -e "select transcriptId, method, confidence from vegaInfoZfish
where transcriptId = '${t}';" danRer5 >> vegaTx.inInfoNotPep.type
end
awk 'BEGIN {OFS="\t"} {print $2, $3}' vegaTx.inInfoNotPep.type \
| sort | uniq -c | sort -nr > vegaTx.inInfoNotPep.counts
# Add trackDb.ra entry for trackDb/zebrafish/danRer5/trackDb.ra if it is not
# there already. Keep Vega Genes and Vega Pseudogenes separate as for
# human and add version number to description.
###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-10-18)
ssh kkstore06
# bash if not using bash shell already
mkdir /cluster/data/danRer5/blastDb
cd /cluster/data/danRer5
awk '{if ($2 > 1000000) print $1}' chrom.sizes > great1M.lst
twoBitToFa danRer5.2bit stdout | faSomeRecords stdin great1M.lst temp.fa
faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
rm temp.fa
twoBitToFa danRer5.2bit stdout | faSomeRecords -exclude stdin great1M.lst temp.fa
faSplit sequence temp.fa 156 blastDb/y
rm temp.fa
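# Optional sanity check (a sketch, not part of the original run): the
# split pieces together should add up to the assembly size.
# faSize blastDb/*.fa
# awk '{total += $2} END {print total}' chrom.sizes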
cd blastDb
for i in *.fa
do
/cluster/bluearc/blast229/formatdb -i $i -p F
done
rm *.fa
mkdir -p /cluster/bluearc/danRer5/blastDb
cd /cluster/data/danRer5/blastDb
for i in nhr nin nsq
do
echo $i
cp *.$i /cluster/bluearc/danRer5/blastDb
done
mkdir -p /cluster/data/danRer5/bed/tblastn.hg18KG
cd /cluster/data/danRer5/bed/tblastn.hg18KG
echo /cluster/bluearc/danRer5/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 1456 query.lst
# we want around 200000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(200000/1456) = 267.372560
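# i.e. split the 36,727 KG alignments into chunks of ~267 so that
# (number of chunks) x (1456 database pieces) lands near the 200,000
# jobs wanted: ceil(36727/267) = 138 chunks, 138 x 1456 = 200,928.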
mkdir -p /cluster/bluearc/danRer5/bed/tblastn.hg18KG/kgfa
split -l 267 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/danRer5/bed/tblastn.hg18KG/kgfa/kg
ln -s /cluster/bluearc/danRer5/bed/tblastn.hg18KG/kgfa kgfa
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/danRer5/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
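# run blastall; if a run fails, retry with progressively stricter e-values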
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
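# lift target coords from the blastDb chunks back to chromosomes, then
# lift query coords from the split protein files back to the full-length
# hg18 KG proteins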
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/danRer5/blastDb.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
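# gensub2 pairs every database piece in query.lst with every protein chunk
# in kg.lst, writing one job per pair to blastSpec using the blastGsub
# template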
exit
ssh kk
cd /cluster/data/danRer5/bed/tblastn.hg18KG
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 75786 of 75786 jobs
# CPU time in finished jobs: 13289094s 221484.91m 3691.42h 153.81d 0.421 y
# IO & Wait Time: 6534562s 108909.36m 1815.16h 75.63d 0.207 y
# Average job time: 262s 4.36m 0.07h 0.00d
# Longest finished job: 1158s 19.30m 0.32h 0.01d
# Submission to last job: 37516s 625.27m 10.42h 0.43d
ssh kkstore06
cd /cluster/data/danRer5/bed/tblastn.hg18KG
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=100000 stdin /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
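# here "single" stands in for the second list, so gensub2 emits one
# chainOne job per blastOut/kg?? directory in chain.lst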
# do the cluster run for chaining
ssh pk
cd /cluster/data/danRer5/bed/tblastn.hg18KG/chainRun
para create chainSpec
para maxNode 30
# para try, check, push, check etc.
# Completed: 138 of 138 jobs
# CPU time in finished jobs: 83388s 1389.80m 23.16h 0.97d 0.003 y
# IO & Wait Time: 43355s 722.58m 12.04h 0.50d 0.001 y
# Average job time: 918s 15.31m 0.26h 0.01d
# Longest finished job: 21693s 361.55m 6.03h 0.25d
# Submission to last job: 21713s 361.88m 6.03h 0.25d
ssh kkstore06
cd /cluster/data/danRer5/bed/tblastn.hg18KG/blastOut
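# Combine and filter the chained alignments. PSL column reminders:
# $1=matches, $11=qSize, $12=qStart, $13=qEnd. c60 keeps alignments
# whose query span covers >60% of the query; u.* keeps one alignment
# per query (best match count first via sort -rn, then pslUniq); m60
# keeps alignments where matches exceed 60% of the query size.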
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/danRer5/bed/tblastn.hg18KG/blastHg18KG.psl
cd ..
pslCheck blastHg18KG.psl
# load table
ssh hgwdev
cd /cluster/data/danRer5/bed/tblastn.hg18KG
hgLoadPsl danRer5 blastHg18KG.psl
# check coverage
featureBits danRer5 blastHg18KG
# 21056644 bases of 1435609608 (1.467%) in intersection
featureBits danRer4 blastHg18KG
# 21159392 bases of 1626093931 (1.301%) in intersection
featureBits danRer5 ensGene:cds blastHg18KG -enrichment
# ensGene:cds 2.114%, blastHg18KG 1.467%, both 1.077%, cover 50.92%, enrich 34.72x
featureBits danRer4 ensGene:cds blastHg18KG -enrichment
# ensGene:cds 2.230%, blastHg18KG 1.301%, both 1.058%, cover 47.46%, enrich 36.47x
ssh kkstore06
rm -rf /cluster/data/danRer5/bed/tblastn.hg18KG/blastOut
rm -rf /cluster/bluearc/danRer5/bed/tblastn.hg18KG/blastOut
#end tblastn
##########################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
Vertebrate-wide transMap alignments were built. Tracks are created and
loaded by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
See doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
Vertebrate-wide transMap alignments were built. Tracks are created and
loaded by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
See doc/builds.txt for specific details.
############################################################################
# SWAP BLASTZ Medaka oryLat2 (DONE - 2008-08-28 - Hiram)
cd /cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27
cat fb.oryLat2.chainDanRer5Link.txt
# 138070427 bases of 700386597 (19.713%) in intersection
# and for the swap
mkdir /cluster/data/danRer5/bed/blastz.oryLat2.swap
cd /cluster/data/danRer5/bed/blastz.oryLat2.swap
time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
/cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27/DEF \
-swap -tRepeats=windowmaskerSdust \
-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 &
# real 58m15.303s
# Had to finish the net load step manually: -qRepeats=windowmaskerSdust
# was mistakenly included, but that option is not valid for danRer5.
cat fb.danRer5.chainOryLat2Link.txt
# 158709399 bases of 1435609608 (11.055%) in intersection
# Then, continuing:
doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium \
/cluster/data/oryLat2/bed/blastzDanRer5.2008-08-27/DEF \
-continue=download -swap -tRepeats=windowmaskerSdust \
-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > download.log 2>&1 &
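# (-continue=download resumes the doBlastzChainNet.pl run at the download
# step, after the net load was fixed by hand above.)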
#########################################################################
############################################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
danRer5.upstreamGeneTbl = refGene
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
Vertebrate-wide transMap alignments were built. Tracks are created and
loaded by a single Makefile, which is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
See doc/builds.txt for specific details.
############################################################################
+# SWAP BLASTZ FOR DOG (canFam2) (DONE, 2009-08-08, hartera)
+# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
+# See canFam2.txt for creation of zebrafish chain and net tracks and downloads
+# on the dog genome assembly.
+ mkdir /hive/data/genomes/danRer5/bed/blastz.canFam2.swap
+ cd /hive/data/genomes/danRer5/bed/blastz.canFam2.swap
+
+    time nice doBlastzChainNet.pl -chainMinScore=5000 -chainLinearGap=loose \
+        -swap /hive/data/genomes/canFam2/bed/blastz.danRer5.2009-08-07/DEF \
+        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
+        >& doBlastz.swap.log &
+ cat fb.danRer5.chainCanFam2Link.txt
+ # 32053647 bases of 1435609608 (2.233%) in intersection
+