src/hg/makeDb/doc/danRer6.txt 1.5
1.5 2009/07/23 19:12:38 galt
minor correction, thanks Hiram
Index: src/hg/makeDb/doc/danRer6.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/danRer6.txt,v
retrieving revision 1.4
retrieving revision 1.5
diff -b -B -U 1000000 -r1.4 -r1.5
--- src/hg/makeDb/doc/danRer6.txt 10 Jul 2009 02:42:07 -0000 1.4
+++ src/hg/makeDb/doc/danRer6.txt 23 Jul 2009 19:12:38 -0000 1.5
@@ -1,719 +1,719 @@
##################################################
#
# danRer6 = Danio rerio Zv8 Dec. 2008
#
# by Galt Barber 2009-06-17
#
# Danio Rerio (zebrafish) from Sanger, version Zv8 (released 2008-12-12)
# Project website:
# http://www.sanger.ac.uk/Projects/D_rerio/
# Assembly notes:
# http://www.sanger.ac.uk/Projects/D_rerio/
# http://www.sanger.ac.uk/Projects/D_rerio/Zv8_assembly_information.shtml
# TODO this comment is copied over and may need removal
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
###############################################
# set up main genome directory
ssh hgwdev
cd /hive/data/genomes
mkdir danRer6
cd danRer6
###########################################################################
# DOWNLOAD SEQUENCE (DONE, 2009-06-17, galt)
cd /hive/data/genomes/danRer6
mkdir download
cd download
# get sequence from Sanger
# (See below, they later give us corrected files at another location)
wget --timestamping -r -np -l 2 -nd -L 'ftp://ftp.ensembl.org/pub/assembly/zebrafish/Zv8release'
# matches ftp site sizes
ls -l --block-size=K
-rw-rw-r-- 1 galt protein 1K Apr 9 15:56 README
-rw-rw-r-- 1 galt protein 3999K Dec 12 2008 Zv8_chr.agp
-rw-rw-r-- 1 galt protein 1553865K Dec 12 2008 Zv8_contigs.fa
-rw-rw-r-- 1 galt protein 6475K Dec 12 2008 Zv8_scaffold.agp
-rw-rw-r-- 1 galt protein 1470539K Dec 12 2008 Zv8_scaffolds.fa
-rw-rw-r-- 1 galt protein 1471257K Apr 9 15:55 Zv8_toplevel.fa
# Zv8_chr.agp maps contigs (in Zv8_contigs.fa) to chr1-25
# Oddly enough, the contig names are
# all in the form of scaffoldName.1, ...
# so that any original contig name is lost.
# Zv8_scaffold.agp maps contigs to scaffolds,
# producing Zv8_scaffolds.fa
# Not entirely sure yet about Zv8_toplevel.fa
# but it seems to be more scaffolds only.
# I think only the configs.fa is useful fasta.
# There is nothing that maps the scaffolds to
# the chroms except that they gave the contig names
# like scaffold1.1 etc. This connection via name
# parts is not official agp structure, it's a hack.
# People want to see the detailed internal gap structure
# of the scaffolds even inside their chroms, so that
# creates a minor problem.
# Additionally, we want to include the chroms for sure,
# but we want to also include any other scaffolds not
# already included in the chroms. I am going to write
# a c program to merge the chrom and scaffold agps
# so that we don't duplicate scaffolds that are in
# the chroms. It will have to rely on the name
# hack they provide to connect scaffold to chrom.
# This agp/fasta structure has existed at least since Zv7
# and I don't know if Sanger or others use it more
# generally.
# I created a new program agpSangerUnfinished in kent/src/hg
# because Sanger wants to use unfinished clones but can't
# officially report it that way, so they hack their agp file
# but they don't change contigs.fa to match.
# compile it
cd kent/src/hg/agpSangerUnfinished
make
rehash
cd /hive/data/genomes/danRer6/download
# fix unfinished names and coordinates
agpSangerUnfinished Zv8_chr.agp Zv8_contigs.fa Zv8_chr.fix.agp
agpSangerUnfinished Zv8_scaffold.agp Zv8_contigs.fa Zv8_scaffold.fix.agp
# I created a new program agpMergeChromScaf in kent/src/hg
# as mentioned above for merging scaffold data
# nog alreadhromScaf Zv8_
# compile it
cd kent/src/hg/agpMergeChromScaf
make
rehash
cd /hive/data/genomes/danRer6/download
agpMergeChromScaf Zv8_chr.fix.agp Zv8_scaffold.fix.agp danRer6.agp
# Now make final fasta from fixed/merged agp
agpAllToFaFile danRer6.agp Zv8_contigs.fa danRer6.fa
# check it out!
faToTwoBit danRer6.fa danRer6.2bit
checkAgpAndFa danRer6.agp danRer6.2bit >& checkAgpAndFa.log
tail -1 checkAgpAndFa.log
#All AGP and FASTA entries agree - both files are valid
-------------------------------------------------------------------
#RE-DOING with their corrected clone names: (done 2009-06-24 galt)
ftp://ftp.sanger.ac.uk/pub/zfish/jt8/rt
"
I should stress that the sequence content of the FASTA file has not changed
from the versions you
already have; I have just altered the headers for the first set of clones
mentioned in the previous
email. Only the contig-level FASTA file is affected; the scaffold-level and
top-level FASTA files are
not concerned with the names of clones, so they remain as before.
"
# move old version out of the way
cd /hive/data/genomes/danRer6
mv download download0
mkdir download
cd download
# get sequence from Sanger
# James Torrance <jt8@sanger.ac.uk>
# created this version of Zv8 with fixed clone-ids.
wget --timestamping -r -np -l 2 -nd -L 'ftp://ftp.sanger.ac.uk/pub/zfish/jt8/rt'
[hgwdev:download> ll
-rw-rw-r-- 1 galt protein 1051781 Jun 23 17:23 Zv8_chr.agp.gz
-rw-rw-r-- 1 galt protein 468741052 Jun 23 22:43 Zv8_contigs.fa.gz
-rw-rw-r-- 1 galt protein 1683358 Jun 23 17:23 Zv8_scaffold.agp.gz
-rw-rw-r-- 1 galt protein 155 Jun 23 22:44 md5sum.dat
[hgwdev:download> cat md5sum.dat
1bb31f1f3290038066f74680a854ff69 Zv8_chr.agp.gz
c4cc4c9574721ce57fde0c79357a4a13 Zv8_contigs.fa.gz
fc4f9261b77fd177096f0cc9c8718a8d Zv8_scaffold.agp.gz
[hgwdev:download> md5sum *.gz
1bb31f1f3290038066f74680a854ff69 Zv8_chr.agp.gz
c4cc4c9574721ce57fde0c79357a4a13 Zv8_contigs.fa.gz
fc4f9261b77fd177096f0cc9c8718a8d Zv8_scaffold.agp.gz
# they match! no corruption during transfer
# decompress
gunzip *.gz
[hgwdev:download> ll
-rw-rw-r-- 1 galt protein 4094225 Jun 23 17:23 Zv8_chr.agp
-rw-rw-r-- 1 galt protein 1591157262 Jun 23 22:43 Zv8_contigs.fa
-rw-rw-r-- 1 galt protein 6630120 Jun 23 17:23 Zv8_scaffold.agp
# repeat all of the above steps from the first section
# (after the wget) down to where it says
#All AGP and FASTA entries agree - both files are valid
#######################################################################
# MAKE GENOME DB FROM CHROMOSOMES AND UNMAPPED SCAFFOLDS
# (DONE, 2009-06-24, galt)
# CHANGE dbDb ENTRY MADE BY SCRIPT TO DISPLAY SAME DEFAULT POSITION
# AS FOR DANRER5, GENE vdac2 (DONE, 2007-09-10, hartera)
# To find the accession number for chrM, go to http://www.ncbi.nih.gov/ and search
# Nucleotide for "Danio mitochondrion genome".
# That shows accession: NC_002333.2, 16596 bp
# wget for mitochondron genome so:
# http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&rettype=fasta&id=NC_002333.2&retmode=text
ssh hgwdev
cd /hive/data/genomes/danRer6/
cat << 'EOF' > danRer6.config.ra
db danRer6
clade vertebrate
scientificName Danio rerio
commonName Zebrafish
assemblyDate Dec. 2008
assemblyLabel Wellcome Trust Sanger Institute, Zv8 assembly (CAAK00000000.5)
-orderKey 449
+orderKey 448
genomeCladePriority 90
dbDbSpeciesDir zebrafish
mitoAcc NC_002333.2
agpFiles /hive/data/genomes/danRer6/download/danRer6.agp
fastaFiles /hive/data/genomes/danRer6/download/danRer6.fa
taxId 7955
'EOF'
# << keep emacs coloring happy
ssh hgwdev
cd /hive/data/genomes/danRer6/
# makeGenomeDb.pl creates the db for the genome, makes the hgcentraltest
# entries and loads the gold and gap tables. Creates the GC
# percent, short match and restriction enzyme tracks. It also creates
# a danRer6 directory in the trackDb/zebrafish directory with a trackDb.ra
# file and a template for the assembly descripton, and for the
# gold and gap track tracks
# Run makeGenomeDb.pl
makeGenomeDb.pl danRer6.config.ra > & makeGenomeDb.pl.out &
# Follow the directions at the end of the log
tail -20 makeGenomeDb.log
#----
#NOTES -- STUFF THAT YOU WILL HAVE TO DO --
#cd /hive/data/genomes/danRer6/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/zebrafish/danRer6
#
#Search for '***' notes in each file in and make corrections (sometimes the
#files used for a previous assembly might make a better template):
# description.html /hive/data/genomes/danRer6/html/{trackDb.ra,gap.html,gold.html}
#
#Then cd ../.. (to trackDb/) and
# - edit makefile to add danRer6 to DBS.
# - (if necessary) cvs add zebrafish
# - cvs add zebrafish/danRer6
# - cvs add zebrafish/danRer6/*.{ra,html}
# - cvs ci -m "Added danRer6 to DBS." makefile
# - cvs ci -m "Initial descriptions for danRer6." zebrafish/danRer6
# - (if necessary) cvs ci zebrafish
# - Run make update DBS=danRer6 and make alpha when done.
# - (optional) Clean up /hive/data/genomes/danRer6/TemporaryTrackDbCheckout
# - cvsup your ~/kent/src/hg/makeDb/trackDb and make future edits there.
#----
cd /hive/data/genomes/danRer6/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/zebrafish/danRer6
# save in case
mkdir orig
cp * orig/
# start with version from danRer5
cp ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer5/description.html .
cp ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer5/gap.html .
cp ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer5/gold.html .
# edit them for danRer6
vi description.html
vi gap.html
vi gold.html
cd ../..
vi makefile # - edit makefile to add danRer6 to DBS.
cvs add zebrafish/danRer6
cvs add zebrafish/danRer6/*.{ra,html}
cvs ci -m "Added danRer6 to DBS." makefile
cvs ci -m "Initial descriptions for danRer6." zebrafish/danRer6
make update DBS=danRer6
make alpha DBS=danRer6
# (2009-06-24, galt), Change the default Browser position in dbDb
# so that it displays vdac2 - chr13:14,786,820-14,801,993 - same as for
# danRer5.
ssh hgwdev
hgsql -h genome-testdb \
-e 'update dbDb set defaultPos = "chr13:14,786,820-14,801,993" where name = "danRer6";' \
hgcentraltest
# temporarily add in a 2bit file so we can run the browser:
ln -s `pwd`/danRer6.unmasked.2bit /gbdb/danRer6/danRer6.2bit
###########################################################
# Repeatmasking (DONE 2009-06-26 galt)
#
#
ssh hgwdev
cd /hive/data/genomes/danRer6/
screen
# repeat mask
doRepeatMasker.pl danRer6 > & doRepeatMasker.pl.out &
# about 12 hours.
#/cluster/bin/scripts/extractNestedRepeats.pl danRer6.fa.out
#RepeatMasker bug?: Undefined id, line 330298 of input:
# 261 27.7 3.1 1.2 Zv8_NA4692 12290 12450 (10778) C ERV1-N3-I_DR-int LTR (824) 3767 3604
#At least one ID was missing (see warnings above) -- please report to Robert
#Hubley. -continue at your disgression.
# Seems to only be one line that is a problem, and it may even have
# been duplicated, so remove the problem line from danRer6.fa.out
#261 27.7 3.1 1.2 Zv8_NA4692 12290 12450 (10778) C ERV1-N3-I_DR-int LTR (824) 3767 3604
#267 24.9 1.8 1.2 Zv8_NA4692 12300 12313 (10915) C ERV1-N3-I_DR-int LTR (824) 3722 3604 298373
# keep the second of these lines, remove the first (line 330298) which is missing the last column.
# since danRer6.nestedRepeats.bed did get created,
# and doCat.csh appears to have finished,
# I am going to ignore this error and continue
doRepeatMasker.pl -continue mask \
-buildDir /hive/data/genomes/danRer6/bed/RepeatMasker.2009-06-26/ danRer6 \
>> & doRepeatMasker.pl.out &
cat bed/RepeatMasker.2009-06-26/faSize.rmsk.txt
#1512402306 bases (5525484 N's 1506876822 real 729112148 upper 777764674 lower)
#in 11724 sequences in 1 files
[hgwdev:danRer6> featureBits -countGaps danRer6 rmsk
#777997590 bases of 1512402306 (51.441%) in intersection
# simple repeat masker trf
doSimpleRepeat.pl danRer6 > & doSimpleRepeat.pl.out &
#[1] Exit 25 doSimpleRepeat.pl danRer6 >& doSimpleRepeat.pl.out
#oops messed up on chrM producing no output, liftUp produced nothing
#this is not a problem, and the bug is in the automated process,
#they should either tolerate nothing to do and create an empty output,
#or just filter chrM out of the input.
doSimpleRepeat.pl -continue filter \
-buildDir /hive/data/genomes/danRer6/bed/simpleRepeat.2009-06-29/ danRer6 \
>> & doSimpleRepeat.pl.out &
featureBits -countGaps danRer6 simpleRepeat
#111749881 bases of 1512402306 (7.389%) in intersection
# make final masked .2bit
twoBitMask danRer6.rmsk.2bit -add bed/simpleRepeat.2009-06-29/trfMask.bed danRer6.2bit
#Warning: BED file bed/simpleRepeat.2009-06-29/trfMask.bed has >=13 fields
#which means it might contain block coordinates, but this program uses only the
#first three fields (the entire span -- no support for blocks).
#(this seems to be normal/ok)
############################################################################
# prepare cluster data (DONE - 2009-07-06 - Galt)
ssh hgwdev
cd /hive/data/genomes/danRer6
# create gbdb symlink
rm /gbdb/danRer6/danRer6.2bit
ln -s `pwd`/danRer6.2bit /gbdb/danRer6/
calc '1024*1.14/2.88'
#405
blat danRer6.2bit /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=405
#Wrote 71088 overused 11-mers to 11.ooc
mkdir /hive/data/staging/data/danRer6
cp -p danRer6.2bit /hive/data/staging/data/danRer6
cp -p 11.ooc /hive/data/staging/data/danRer6
cp -p chrom.sizes /hive/data/staging/data/danRer6
# ask admin to sync this directory: /hive/data/staging/data/danRer6/
# to the kluster nodes /scratch/data/danRer6/
############################################################################
# running cpgIsland (DONE - 2009-07-07 - Galt)
ssh hgwdev
cd /hive/data/genomes/danRer6
mkdir bed/cpgIsland
cd bed/cpgIsland
cvs -d /projects/compbio/cvsroot checkout -P hg3rdParty/cpgIslands
cd hg3rdParty/cpgIslands
# comment out the following two lines if it compiles cleanly
# some day (there were some other fixups too, adding include lines)
sed -i -e "s#\(extern char\* malloc\)#// \1#" cpg_lh.c
make
#warning: incompatible implicit declaration of built-in function
# ignore the warnings
cd ../../
ln -s hg3rdParty/cpgIslands/cpglh.exe
# make hardmasked fasta files
mkdir -p hardMaskedFa
bash
cut -f1 ../../chrom.sizes | while read C
do
echo ${C}
twoBitToFa ../../danRer6.2bit:$C stdout \
| maskOutFa stdin hard hardMaskedFa/${C}.fa
done
#exit bash
exit
cut -f1 ../../chrom.sizes > chr.list
cat << '_EOF_' > template
#LOOP
./runOne $(root1) {check out line results/$(root1).cpg}
#ENDLOOP
'_EOF_'
# << happy emacs
cat << '_EOF_' > runOne
#!/bin/csh -fe
./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$
mv /scratch/tmp/$1.$$ $2
'_EOF_'
# << happy emacs
chmod +x runOne
mkdir results
pk
cd /hive/data/genomes/danRer6/bed/cpgIsland
gensub2 chr.list single template jobList
para create jobList
para try
para check
para push
para time
#Completed: 11724 of 11724 jobs
#CPU time in finished jobs: 111s 1.85m 0.03h 0.00d 0.000 y
#IO & Wait Time: 73830s 1230.50m 20.51h 0.85d 0.002 y
#Average job time: 6s 0.11m 0.00h 0.00d
#Longest finished job: 41s 0.68m 0.01h 0.00d
#Submission to last job: 1141s 19.02m 0.32h 0.01d
exit # return to hgwdev
# should be in /hive/data/genomes/danRer6/bed/cpgIsland
# Transform cpglh output to bed +
catDir results | awk '{ \
$2 = $2 - 1; \
width = $3 - $2; \
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", \
$1, $2, $3, $5,$6, width, \
$6, width*$7*0.01, 100.0*2*$6/width, $7, $9); \
}' > cpgIsland.bed
# took around 2 minutes
hgLoadBed danRer6 cpgIslandExt -tab \
-sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
#Loaded 13361 elements of size 10
###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE 2009-07-09 braney )
# bash if not using bash shell already
cd /cluster/data/danRer6
mkdir /cluster/data/danRer6/blastDb
awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst
twoBitToFa -seqList=1meg.lst danRer6.2bit temp.fa
faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
rm temp.fa 1meg.lst
awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst
twoBitToFa -seqList=less1meg.lst danRer6.2bit temp.fa
faSplit about temp.fa 1000000 blastDb/y
rm temp.fa less1meg.lst
cd blastDb
for i in *.fa
do
/hive/data/outside/blast229/formatdb -i $i -p F
done
rm *.fa
ls *.nsq | wc -l
# 1513
mkdir -p /cluster/data/danRer6/bed/tblastn.hg18KG
cd /cluster/data/danRer6/bed/tblastn.hg18KG
echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 1513 query.lst
# we want around 250000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(250000/`wc query.lst | awk '{print $1}'`\)
# 36727/(250000/1513) = 222.271804
mkdir -p kgfa
split -l 222 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
wc kg.lst
# 166 166 2158 kg.lst
mkdir -p blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/danRer6/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/hive/data/outside/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/danRer6/blastDb.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
exit
ssh swarm
cd /cluster/data/danRer6/bed/tblastn.hg18KG
gensub2 query.lst kg.lst blastGsub blastSpec
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 251158 of 251158 jobs
# CPU time in finished jobs: 8587545s 143125.76m 2385.43h 99.39d 0.272 y
# IO & Wait Time: 885281s 14754.68m 245.91h 10.25d 0.028 y
# Average job time: 38s 0.63m 0.01h 0.00d
# Longest finished job: 1167s 19.45m 0.32h 0.01d
# Submission to last job: 9852s 164.20m 2.74h 0.11d
ssh swarm
cd /cluster/data/danRer6/bed/tblastn.hg18KG
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=12000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS ../blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
para create chainSpec
para try, check, push, check etc.
# Completed: 166 of 166 jobs
# CPU time in finished jobs: 107801s 1796.69m 29.94h 1.25d 0.003 y
# IO & Wait Time: 23945s 399.08m 6.65h 0.28d 0.001 y
# Average job time: 794s 13.23m 0.22h 0.01d
# Longest finished job: 25056s 417.60m 6.96h 0.29d
# Submission to last job: 25085s 418.08m 6.97h 0.29d
cd /cluster/data/danRer6/bed/tblastn.hg18KG/blastOut
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl
cd ..
pslCheck blastHg18KG.psl
# checked: 43305 failed: 0 errors: 0
# load table
ssh hgwdev
cd /cluster/data/danRer6/bed/tblastn.hg18KG
hgLoadPsl danRer6 blastHg18KG.psl
# check coverage
featureBits danRer6 blastHg18KG
# 19115900 bases of 1506896106 (1.269%) in intersection
featureBits danRer6 blastHg18KG refGene -enrichment
# blastHg18KG 1.269%, refGene 1.827%, both 0.659%, cover 51.97%, enrich 28.45x
rm -rf blastOut
#end tblastn
############################################################################
# create lift file on unBridged gaps for genbank splits (2009-07-07 - Hiram)
mkdir /hive/data/genomes/danRer6/bed/gap
cd /hive/data/genomes/danRer6/bed/gap
gapToLift danRer6 danRer6.unBridged.lift -bedFile=unBridged.lift.bed
cp -p danRer6.unBridged.lift ../../jkStuff
cp -p danRer6.unBridged.lift /hive/data/staging/data/danRer6
# ask admin to sync this directory: /hive/data/staging/data/danRer6/
# to the kluster nodes /scratch/data/danRer6/
###########################################################################
# AUTO UPDATE GENBANK MRNA AND EST AND ZGC GENES RUN
# (DONE, 2009-07-07, galt)
ssh hgwdev
cd $HOME/kent/src/hg/makeDb/genbank
cvs update -dP
# edit etc/genbank.conf to add danRer6 and commit to CVS
# done already for first run so no need to update.
# danRer6 (zebrafish)
danRer6.serverGenome = /hive/data/genomes/danRer6/danRer6.2bit
danRer6.clusterGenome = /scratch/data/danRer6/danRer6.2bit
danRer6.ooc = /scratch/data/danRer6/11.ooc
danRer6.lift = /hive/data/genomes/danRer6/jkStuff/danRer6.unBridged.lift
danRer6.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
danRer6.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
danRer6.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
danRer6.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
danRer6.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
danRer6.downloadDir = danRer6
danRer6.perChromTables = no
danRer6.refseq.mrna.xeno.load = yes
# end of section added to etc/genbank.conf
cvs commit -m "Added danRer6." etc/genbank.conf
# also added the perChromTables line afterwards. This means that there
# will not be one table per chromosome. Would be too many tables as
# there are 5010 scaffolds.
# update /cluster/data/genbank/
make etc-update
# ~/kent/src/hg/makeDb/genbank/src/lib/gbGenome.c already contains
# danRer genome information
ssh genbank
cd /cluster/data/genbank
nice bin/gbAlignStep -initial danRer6 &
#var/build/logs/2009.07.07-20:23:34.danRer6.initalign.log
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
nice ./bin/gbDbLoadStep -drop -initialLoad danRer6 &
#var/dbload/hgwdev/logs/2009.07.08-09:59:18.dbload.log
# enable daily alignment and update of hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add danRer6 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added danRer6." etc/align.dbs etc/hgwdev.dbs
make etc-update
#########################################################################
# BLATSERVERS ENTRY (DONE - 2009-07-08 - Galt)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("danRer6", "blat14", "17790", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("danRer6", "blat14", "17791", "0", "1");' \
hgcentraltest
# test it with some sequence
############################################################################
# Making download files (DONE - 2009-07-08 - Galt)
ssh hgwdev
cd /hive/data/genomes/danRer6
ln -s bed/RepeatMasker.2009-06-26/danRer6.fa.out .
cd bed
ln -s simpleRepeat.2009-06-29 simpleRepeat
cd ..
makeDownloads.pl danRer6 >& downloads.log
# *** Please take a look at the downloads for danRer6 using a web browser.
# *** The downloads url is: http://hgwdev.cse.ucsc.edu/goldenPath/danRer6.
# *** Edit each README.txt to resolve any notes marked with "***":
# /hive/data/genomes/danRer6/goldenPath/database/README.txt
# /hive/data/genomes/danRer6/goldenPath/bigZips/README.txt
# (The htdocs/goldenPath/danRer6/*/README.txt "files" are just links to those.)