src/hg/makeDb/doc/caeJap2.txt 1.3
1.3 2009/09/21 20:26:39 hiram
Turned on updates for genbank
Index: src/hg/makeDb/doc/caeJap2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/caeJap2.txt,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/hg/makeDb/doc/caeJap2.txt 23 Jul 2009 21:40:01 -0000 1.2
+++ src/hg/makeDb/doc/caeJap2.txt 21 Sep 2009 20:26:39 -0000 1.3
@@ -1,349 +1,350 @@
# for emacs: -*- mode: sh; -*-
# Caenorhabditis japonica
# Washington University School of Medicine GSC and Sanger Institute
# WUSTL version 4.0.1 Jan 2009
# $Id$
########################################################################
# Download sequence (DONE - 2009-07-22 - Hiram)
mkdir -p /hive/data/genomes/caeJap2/wustl
cd /hive/data/genomes/caeJap2/wustl
wget --timestamping \
ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_japonica/assembly/Caenorhabditis_japonica-4.0.1/ASSEMBLY
wget --timestamping \
ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_japonica/assembly/Caenorhabditis_japonica-4.0.1/README
wget --timestamping \
ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_japonica/assembly/Caenorhabditis_japonica-4.0.1/output/*.*
cat << '_EOF_' > superToAgp.pl
#!/usr/bin/env perl
use strict;
use warnings;
my %superSizes;
open (FH, "<faCount.supercontigs.txt") or die "can not read faCount.supercontigs
.txt";
while (my $line = <FH>) {
next if ($line =~ m/^#/);
next if ($line =~ m/^total/);
my ($name, $size, $rest) = split('\s+',$line,3);
$superSizes{$name} = $size;
}
close (FH);
my %contigSizes;
open (FH, "<faCount.contigs.txt") or die "can not read faCount.contigs.txt";
while (my $line = <FH>) {
next if ($line =~ m/^#/);
next if ($line =~ m/^total/);
my ($name, $size, $rest) = split('\s+',$line,3);
$contigSizes{$name} = $size;
}
close (FH);
my $superName = "";
my $contigPart = "";
my $chromStart = 1;
my $chromEnd = 1;
my $id = 1;
my $a = "";
my $b = "";
my $size = 0;
my $skipSuper = 0;
open (CB, ">Contig.bed") or die "can not write to Contig.bed";
open (FH, "zcat supercontigs.gz|") or die "can not read supercontigs.gz";
while (my $line = <FH>) {
next if ($line =~ m/^\s*$/);
chomp $line;
if ($line =~ m/^supercontig /) {
($a, $b) = split('\s+', $line);
$superName = $b;
$superName =~ s/Superc/C/;
if (!exists($superSizes{$superName})) {
$skipSuper = 1;
next;
} else {
$skipSuper = 0;
}
my $superEnd = $chromStart + $superSizes{$superName} - 1;
if ($chromEnd > 1) {
printf CB "chrUn\t%d\t%d\t%s\n", $chromStart-1+1000, $superEnd+1000,
$superName;
} else {
printf CB "chrUn\t%d\t%d\t%s\n", $chromStart-1, $superEnd,
$superName;
}
if ($chromEnd > 1) {
$size = 1000;
$chromEnd = $chromStart + $size - 1;
printf "chrUn\t%d\t%d\t%d\tN\t%d\tcontig\tno\n", $chromStart,
$chromEnd, $id++, $size;
$chromStart = $chromEnd + 1;
}
next;
} elsif ($line =~ m/^contig /) {
next if ($skipSuper > 0);
($a, $b) = split('\s+', $line);
$contigPart = $b;
die "can not find size for $contigPart"
if (!exists($contigSizes{$contigPart}));
$size = $contigSizes{$contigPart};
$chromEnd = $chromStart + $size - 1;
printf "chrUn\t%d\t%d\t%d\tW\t%s\t1\t%d\t+\n", $chromStart,
$chromEnd, $id++, $contigPart, $size;
$chromStart = $chromEnd + 1;
next;
} elsif ($line =~ m/^gap /) {
next if ($skipSuper > 0);
my ($g, $gapSize, $gapDev, $star, $x) = split('\s+', $line);
$size = $gapSize;
$size = 10 if ($size < 0);
die "gap size is 0" if ($size == 0);
$chromEnd = $chromStart + $size - 1;
printf "chrUn\t%d\t%d\t%d\tN\t%d\tfragment\tyes\n", $chromStart,
$chromEnd, $id++, $size;
$chromStart = $chromEnd + 1;
} else {
die "do not recognize line: $line";
}
}
close (FH);
close (CB);
'_EOF_'
# << happy emacs
chmod +x superToAgp.pl
./superToAgp.pl > chrUn.agp
qaToQac contigs.fa.qual.gz contigs.qac
qacAgpLift chrUn.agp contigs.qac chrUn.qual.qac
# and create a ctgPos2 file to show ContigN locations
awk '{
size = $3-$2
printf "%s\t%d\tchrUn\t%d\t%d\tW\n", $4, size, $2, $3
}' Contig.bed > ctgPos2.tab
########################################################################
# initial genome browser build (DONE - 2009-07-23 - Hiram)
cd /hive/data/genomes/caeJap2
cat << '_EOF_' > caeJap2.config.ra
# Config parameters for makeGenomeDb.pl:
db caeJap2
clade worm
genomeCladePriority 10
scientificName Caenorhabditis japonica
commonName C. japonica
assemblyDate Jan. 2009
assemblyLabel Washington University School of Medicine GSC C. japonica 4.0.1
orderKey 881
mitoAcc none
fastaFiles /hive/data/genomes/caeJap2/wustl/contigs.fa.gz
agpFiles /hive/data/genomes/caeJap2/wustl/chrUn.agp
qualFiles /hive/data/genomes/caeJap2/wustl/chrUn.qual.qac
dbDbSpeciesDir worm
taxId 281687
'_EOF_'
# << happy emacs
# verify sequence and AGP specs are OK
makeGenomeDb.pl -stop=agp caeJap2.config.ra
makeGenomeDb.pl -continue=db caeJap2.config.ra > makeGenomeDb.db.log 2>&1
ln -s `pwd`/caeJap2.unmasked.2bit /gbdb/caeJap2/caeJap2.2bit
# your personal browser should be functioning now
########################################################################
# Repeat Masker (DONE - 2009-07-23 - Hiram)
# repeat masker was run, but it did not mask much, so that table
# was removed so that windowmaskerSdust could be the masking track
# when fetching DNA from the browser
mkdir /hive/data/genomes/caeJap2/bed/repeatMasker
cd /hive/data/genomes/caeJap2/bed/repeatMasker
doRepeatMasker.pl -buildDir=`pwd` caeJap2 > do.log 2>&1
cat faSize.rmsk.txt
# 170655152 bases (41359841 N's 129295311 real 126749989 upper 2545322 lower)
# in 1 sequences in 1 files
# %1.49 masked total, %1.97 masked real
ssh hgwdev
hgsql -e "drop table chrUn_rmsk;" caeJap2
# this leaves the interrupted repeats track showing on genome-test
########################################################################
# Simple Repeats (DONE - 2009-07-23 - Hiram)
mkdir /hive/data/genomes/caeJap2/bed/simpleRepeat
cd /hive/data/genomes/caeJap2/bed/simpleRepeat
doSimpleRepeat.pl -buildDir=`pwd` caeJap2 > do.log 2>&1
########################################################################
# Window Masker (DONE - 2009-07-23 - Hiram)
mkdir /hive/data/genomes/caeJap2/bed/windowMasker
cd /hive/data/genomes/caeJap2/bed/windowMasker
doWindowMasker.pl -buildDir=`pwd` caeJap2 > do.log 2>&1
# load this initial data to get ready to clean it
ssh hgwdev
cd /hive/data/genomes/caeJap2/bed/windowMasker
hgLoadBed caeJap2 windowmaskerSdust windowmasker.sdust.bed.gz
featureBits -countGaps caeJap2 windowmaskerSdust
# 98788858 bases of 170655152 (57.888%) in intersection
# eliminate the gaps from the masking
featureBits caeJap2 -not gap -bed=notGap.bed
time nice -n +19 featureBits caeJap2 windowmaskerSdust notGap.bed \
-bed=stdout | gzip -c > cleanWMask.bed.gz
# 57429460 bases of 129295754 (44.417%) in intersection
# reload track to get it clean
hgLoadBed caeJap2 windowmaskerSdust cleanWMask.bed.gz
# Loaded 874463 elements of size 4
featureBits -countGaps caeJap2 windowmaskerSdust
# 57429460 bases of 170655152 (33.652%) in intersection
# mask the sequence with this clean mask
zcat cleanWMask.bed.gz \
| twoBitMask ../../caeJap2.unmasked.2bit stdin \
-type=.bed caeJap2.cleanWMSdust.2bit
twoBitToFa caeJap2.cleanWMSdust.2bit stdout | faSize stdin \
> caeJap2.cleanWMSdust.faSize.txt
cat caeJap2.cleanWMSdust.faSize.txt
# 170655152 bases (41359841 N's 129295311 real 71865902 upper
# 57429409 lower) in 1 sequences in 1 files
# %33.65 masked total, %44.42 masked real
#########################################################################
# MASK SEQUENCE WITH WM+TRF (DONE - 2009-07-23 - Hiram)
cd /hive/data/genomes/caeJap2
twoBitMask -add bed/windowMasker/caeJap2.cleanWMSdust.2bit \
bed/simpleRepeat/trfMask.bed caeJap2.2bit
twoBitToFa caeJap2.2bit stdout | faSize stdin \
> faSize.caeJap2.wmskSdust.TRF.txt
cat faSize.caeJap2.wmskSdust.TRF.txt
# 170655152 bases (41359841 N's 129295311 real 71749466 upper
# 57545845 lower) in 1 sequences in 1 files
# %33.72 masked total, %44.51 masked real
# create symlink to gbdb
ssh hgwdev
rm /gbdb/caeJap2/caeJap2.2bit
ln -s `pwd`/caeJap2.2bit /gbdb/caeJap2/caeJap2.2bit
########################################################################
# add ctgPos2 track (DONE - 2008-04-03 - Hiram)
ssh hgwdev
cd /hive/data/genomes/caeJap2/wustl
hgLoadSqlTab caeJap2 ctgPos2 $HOME/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab
#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2009-07-23 - Hiram)
# Use -repMatch=100 (based on size -- for human we use 1024, and
# worm size is ~5.1% of human judging by gapless caeJap2 vs. hg18 genome
# size from featureBits. So we would use 50, but that yields a very
# high number of tiles to ignore, especially for a small more compact
# genome. Bump that up a bit to be more conservative.
cd /hive/data/genomes/caeJap2
blat caeJap2.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=jkStuff/caeJap2.11.ooc -repMatch=100
# Wrote 14991 overused 11-mers to jkStuff/caeJap2.11.ooc
cd /hive/data/genomes/caeJap2/jkStuff
# note the full size of chrUn
tail ../chrom.sizes
# chrUn 170655152
# use that full size in the following awk:
awk '{
printf "%d\t%s\t%d\tchrUn\t170655152\n", $2, $4, $3-$2
}' ../wustl/Contig.bed > caeJap2.chrUn.lift
#########################################################################
## Make maskedContigs (DONE - 2009-07-23 - Hiram)
mkdir /hive/data/genomes/caeJap2/maskedContigs
cd /hive/data/genomes/caeJap2/maskedContigs
ln -s ../jkStuff/caeJap2.chrUn.lift ./supers.lift
cp -p /hive/data/genomes/caeJap1/maskedContigs/lft2BitToFa.pl .
time nice -n +19 ./lft2BitToFa.pl ../caeJap2.2bit supers.lift > supers.fa
faToTwoBit supers.fa caeJap2.supers.2bit
# real 4m18.841s
# verify size is not broken from this procedure, should be same number
# of real, upper and lower. Only the N count and total count changes
twoBitToFa caeJap2.supers.2bit stdout | faSize stdin
# 163120152 bases (33824841 N's 129295311 real 71749466 upper
# 57545845 lower) in 7536 sequences in 1 files
# %35.28 masked total, %44.51 masked real
twoBitToFa ../caeJap2.2bit stdout | faSize stdin
# 170655152 bases (41359841 N's 129295311 real 71749466 upper
# 57545845 lower) in 1 sequences in 1 files
# %33.72 masked total, %44.51 masked real
twoBitInfo caeJap2.supers.2bit stdout \
| sort -k2rn > caeJap2.supers.sizes
# copy all of this stuff to the klusters:
cd /hive/data/genomes/caeJap2
mkdir /hive/data/staging/data/caeJap2
cp -p jkStuff/caeJap2.11.ooc jkStuff/caeJap2.chrUn.lift \
maskedContigs/caeJap2.supers.2bit maskedContigs/caeJap2.supers.sizes \
chrom.sizes caeJap2.2bit /hive/data/staging/data/caeJap2
#########################################################################
# GENBANK AUTO UPDATE (DONE - 2009-07-23 - Hiram)
# align with latest genbank process.
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add caeJap2 just before caeJap1
# caeJap2 (C. remanei)
caeJap2.serverGenome = /hive/data/genomes/caeJap2/caeJap2.2bit
caeJap2.clusterGenome = /scratch/data/caeJap2/caeJap2.2bit
caeJap2.ooc = /scratch/data/caeJap2/caeJap2.11.ooc
caeJap2.lift = /scratch/data/caeJap2/caeJap2.chrUn.lift
caeJap2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter}
caeJap2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
caeJap2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
caeJap2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
caeJap2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter}
caeJap2.refseq.mrna.native.load = yes
caeJap2.refseq.mrna.xeno.load = yes
caeJap2.refseq.mrna.xeno.loadDesc = yes
caeJap2.genbank.mrna.xeno.load = yes
caeJap2.genbank.est.native.load = yes
caeJap2.genbank.est.native.loadDesc = no
caeJap2.downloadDir = caeJap2
caeJap2.perChromTables = no
cvs ci -m "Added caeJap2" etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
ssh genbank
screen # use a screen to manage this job
cd /cluster/data/genbank
time nice -n +19 bin/gbAlignStep -initial caeJap2 &
# logFile: var/build/logs/2009.07.23-14:13:01.caeJap2.initalign.log
-XXX - running Thu Jul 23 14:14:03 PDT 2009
- # real 124m10.365s
+ # real 137m55.074s
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad caeJap2
- # logFile: var/dbload/hgwdev/logs/2008.04.04-10:07:40.dbload.log
+ # logFile: var/dbload/hgwdev/logs/2009.08.10-16:42:51.dbload.log
# real 16m14.090s
# enable daily alignment and update of hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add caeJap2 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added caeJap2 - C. japonica" etc/align.dbs etc/hgwdev.dbs
make etc-update
+ # done - 2009-09-21 - Hiram
+#########################################################################