src/hg/makeDb/doc/ce1.txt 1.4
1.4 2009/11/25 21:48:38 hiram
change autoScaleDefault to autoScale
Index: src/hg/makeDb/doc/ce1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/ce1.txt,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 1000000 -r1.3 -r1.4
--- src/hg/makeDb/doc/ce1.txt 25 Nov 2009 18:29:18 -0000 1.3
+++ src/hg/makeDb/doc/ce1.txt 25 Nov 2009 21:48:38 -0000 1.4
@@ -1,1542 +1,1542 @@
# for emacs: -*- mode: sh; -*-
# This file describes how to make the browser database for the
# worm C. elegans
# Currently 2003-05-28 this file is in a state of flux as it is being
# worked out.
DOWNLOAD SEQUENCE (DONE 2003-05-12 - Hiram)
# next machine
ssh eieio
mkdir -p /cluster/store5/worm/ce1/sanger
mkdir -p /cluster/store5/worm/ce1/tmp
cd /cluster/store5/worm/ce1/sanger
wget -o ce1.fetch.log -r -l1 --no-directories \
ftp://ftp.sanger.ac.uk/pub/wormbase/current_release/CHROMOSOMES/
# Takes about five minutes
# This current_release is updated every two weeks. Although the
# sequence is quite stable at this time and changes very little.
# Check that you have some valid files. Should be chroms I II III IV V X
# in dna, gff and agp formats
ls -ogrt
# get out of this download data directory, and create your home symlink
# shortcut
cd ..
ln -s /cluster/store5/worm/ce1 ~/ce1
cd ~/ce1
# There is a Mitochondria sequence. It can be found at NCBI
# This search sequence at NCBI in 'nucleotide' selects one
# entry: X54252:
# Caenorhabditis Elegans[Organism] AND mitochondria AND MTCE
# I'd like to figure out a wget command to fetch this, but doing
# that manually via the WEB browser is not such a big deal. And
# besides, once it is fetched, it is stable (haven't seen it
# change in years) so you can
# merely copy from a previous Ce build. A copy has been manually
# placed in ~/ce1/ncbi/x54252.fa
#
# translate to unzipped .fa, all upper case, and
# rename the agp files so hgGoldGap can find them
mkdir sangerFa
sed -e "s/^>.*/>chrM/" ./ncbi/x54252.fa > sangerFa/chrM.fa
foreach f (I II III IV V X)
echo -n "${f} "
zcat sanger/CHROMOSOME_${f}.dna.gz | tr '[a-z]' '[A-Z]' | \
sed -e "s/CHROMOSOME_/chr/" > sangerFa/chr${f}.fa
mkdir sangerFa/${f}
ln -s ~/ce1/sanger/CHROMOSOME_${f}.agp sangerFa/${f}/chr${f}.agp
end
# hgGoldGap does not handle dir names over 2 characters:
mv sangerFa/III sangerFa/3
# CREATE AN .agp FILE FOR ChrM (DONE, 2004-04-19, hartera)
ssh hgwdev
cd /cluster/data/ce1/sanger
cat << '_EOF_' > CHROMOSOME_M.agp
M 1 13794 1 F X54252.1 1 13794 +
'_EOF_'
mkdir -p /cluster/data/ce1/sangerFa/M
ln -s /cluster/data/ce1/sanger/CHROMOSOME_M.agp \
/cluster/data/ce1/sangerFa/M/chrM.agp
# CREATING DATABASE (DONE 2003-05-12 - Hiram)
# Create the database.
# next machine
ssh hgwdev
echo 'create database ce1' | hgsql ''
# if you need to delete that database: !!! WILL DELETE EVERYTHING !!!
echo 'drop database ce1' | hgsql ce1
# Use df to ake sure there is at least 5 gig free on
df -h /var/lib/mysql
# Before loading data:
# Filesystem Size Used Avail Use% Mounted on
# /dev/sda1 472G 389G 58G 87% /var/lib/mysql
# CREATING GRP TABLE FOR TRACK GROUPING (DONE 2003-05-12 - Hiram)
# next machine
ssh hgwdev
# the following command copies all the data from the table
# grp in the database rn1 to our new database ce1
echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \
| hgsql ce1
# if you need to delete that table: !!! WILL DELETE ALL grp data !!!
echo 'drop table grp;' | hgsql ce1
# PREPARE Split contigs into 100,000 bp chunks for cluster runs
# next machine
ssh eieio
cd ~/ce1
rm -fr ./split
mkdir split
foreach f (I II III IV V X M)
mkdir split/$f
faSplit size sangerFa/chr${f}.fa 100000 split/$f/c -lift=split/chr${f}.lft
end
# 151 pieces of 151 written
# 153 pieces of 153 written
# 138 pieces of 138 written
# 175 pieces of 175 written
# 210 pieces of 210 written
# 178 pieces of 178 written
# 1 pieces of 1 written
cat split/*.lft > liftAll.lft
# copy them to /iscratch/i for cluster rsync
# next machine
ssh kkr1u00
cd ~/ce1/split
foreach c (I II III IV V X M)
echo -n "${c} "
mkdir -p /iscratch/i/worms/Celegans/unmaskedSplit/${c}
cp -p ${c}/c*.fa /iscratch/i/worms/Celegans/unmaskedSplit/${c}
end
~kent/bin/iSync
# Run RepeatMasker on the chromosomes (DONE 2003-06-04 - Hiram)
# next machine
ssh kk
cd ~/ce1
mkdir RMRun
cd RMRun
# create the script to use for the RepeatMasker run
cat << '_EOF_' > RMWorm
#!/bin/csh -fe
#
# This is a slight rearrangement of the
# /cluster/bin/scripts/RMWorm used in makeHg15.doc
# The results here need to go to a different location
# $1 == chrom name: I II III IV V X M
# $2 == directory where split contig .fa is found
# $3 == name of contig .fa file
cd $1
pushd .
cd $2
/bin/mkdir -p /tmp/$3/$1
/bin/cp $3 /tmp/$3/$1
cd /tmp/$3/$1
/scratch/hg/RepeatMasker/RepeatMasker -s -el -ali $3
popd
/bin/cp /tmp/$3/$1/$3.out ./
if( -e /tmp/$3/$1/$3.align ) /bin/cp /tmp/$3/$1/$3.align ./
# /bin/cp /tmp/$2*.masked ../masked/
/bin/rm -r /tmp/$3/$1
/bin/rmdir --ignore-fail-on-non-empty /tmp/$3
'_EOF_'
chmod +x RMWorm
# create job list
rm -f RMJobs
foreach c (I II III IV V X M)
mkdir ~/ce1/RMRun/${c}
foreach t ( /iscratch/i/worms/Celegans/unmaskedSplit/$c/c*.fa )
set d = $t:h
set f = $t:t
echo ./RMWorm ${c} ${d} ${f} \
'{'check out line+ ~/ce1/RMRun/$c/${f}.out'}' \
>> RMJobs
end
end
para create RMJobs
para try
para check
# when they are finished, liftUp and load the .out files into the database:
# next machine
ssh eieio
cd ~/ce1/RMRun
foreach c (I II III IV V X M)
liftUp chr${c}.fa.out ../split/chr${c}.lft warn ${c}/*.fa.out
end
# next machine
ssh hgwdev
cd ~/ce1/RMRun
hgLoadOut ce1 chr*.fa.out
# Noticed one error in this load:
# Processing chrI.fa.out
# Strange perc. field -0.6 line 791 of chrI.fa.out
# SIMPLE REPEAT [TRF] TRACK (DONE 2003-06-05 - Hiram)
# ensure chr*.fa files exist on /iscratch/i
# next machine
ssh kkr1u00
cp -p /cluster/store5/worm/ce1/sangerFa/*.fa \
/iscratch/i/worms/Celegans/unmaskedFa
~kent/bin/iSync
# Create cluster parasol job:
# next machine
ssh kk
mkdir -p ~/ce1/bed/simpleRepeat
cd ~/ce1/bed/simpleRepeat
mkdir trf
ls -1S /iscratch/i/worms/Celegans/unmaskedFa/chr*.fa > genome.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)} /dev/null -bedAt={check out line trf/$(root1).bed} -tempDir=/tmp
#ENDLOOP
'_EOF_'
echo "" > dummy.lst
gensub2 genome.lst dummy.lst gsub spec
para create spec
para try # there are only 7, so this runs them all
para check
Average job time: 468s 7.80m 0.13h 0.01d
Longest job: 963s 16.05m 0.27h 0.01d
# When cluster run is done, combine into one:
cat trf/*.bed > simpleRepeat.bed
# Load into the database:
# next machine
ssh hgwdev
cd ~/ce1/bed/simpleRepeat
/cluster/bin/i386/hgLoadBed ce1 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
# Loaded 28590 elements
# PROCESS SIMPLE REPEATS INTO MASK (DONE 2003-06-05 - Hiram)
# next machine
ssh eieio
cd ~/ce1/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# Create Soft and Hard masks from RepeatMaster and TRF outputs:
# and rebuild the nib files
# next machine
ssh eieio
cd ~/ce1
mkdir softMask
cd ~/ce1
foreach c (I II III IV V X M)
echo -n "masking chr${c} "
/cluster/bin/i386/maskOutFa sangerFa/chr${c}.fa RMRun/chr${c}.fa.out \
softMask/chr${c}.fa -soft
/cluster/bin/i386/maskOutFa softMask/chr${c}.fa \
bed/simpleRepeat/trfMask/chr${c}.bed \
softMask/chr${c}.fa -softAdd
faToNib -softMask softMask/chr${c}.fa nib/chr${c}.nib
end
# output:
# masking chrI Writing 15080473 bases in 7540245 bytes
# masking chrII Writing 15279303 bases in 7639660 bytes
# masking chrIII Writing 13783268 bases in 6891642 bytes
# masking chrIV Writing 17493790 bases in 8746903 bytes
# masking chrV Writing 20922238 bases in 10461127 bytes
# masking chrX Writing 17705013 bases in 8852515 bytes
# masking chrM Writing 13794 bases in 6905 bytes
# unknown if hardMasks are actually needed for anything
mkdir hardMask
foreach c (I II III IV V X M)
echo "masking chr${c}"
/cluster/bin/i386/maskOutFa softMask/chr${c}.fa hard \
hardMask/chr${c}.fa
end
# With masked nib files ready, prepare cluster for blastz runs
# next machine
ssh kkr1u00
cd ~/ce1/softMask
mkdir -p /iscratch/i/worms/Celegans/bothMasksFa
mkdir -p /iscratch/i/worms/Celegans/bothMasksNib
cp -p *.fa /iscratch/i/worms/Celegans/bothMasksFa
cd ~/ce1/nib
cp -p c*.nib /iscratch/i/worms/Celegans/bothMasksNib
~kent/bin/iSync
STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE 2003-05-12 - Hiram)
# Make symbolic links from /gbdb/ce1/nib to the real nibs.
# next machine
ssh hgwdev
mkdir -p /gbdb/ce1/nib
foreach f (/cluster/store5/worm/ce1/nib/*.nib)
ln -s $f /gbdb/ce1/nib
end
cd ~/ce1/tmp
# Load /gbdb/ce1/nib paths into database and save size info.
hgsql ce1 < ~/kent/src/hg/lib/chromInfo.sql
# if you need to delete that table: !!! DELETES ALL DATA IN TABLE !!!
echo 'drop table chromInfo;' | hgsql ce1
hgNibSeq -preMadeNib ce1 /gbdb/ce1/nib sangerFa/chr*.fa
# Typical output:
# Processing chrI.fa to /gbdb/ce1/nib/chrI.nib
# Processing chrII.fa to /gbdb/ce1/nib/chrII.nib
# Processing chrIII.fa to /gbdb/ce1/nib/chrIII.nib
# Processing chrIV.fa to /gbdb/ce1/nib/chrIV.nib
# Processing chrM.fa to /gbdb/ce1/nib/chrM.nib
# Processing chrV.fa to /gbdb/ce1/nib/chrV.nib
# Processing chrX.fa to /gbdb/ce1/nib/chrX.nib
# 100277879 total bases
# Verify the hgNibSeq load functioned OK:
echo "select chrom,size from chromInfo" | hgsql -N ce1 > chrom.sizes
cat chrom.sizes
# Typical contents of chrom.sizes:
chrI 15080473
chrII 15279303
chrIII 13783268
chrIV 17493790
chrM 13794
chrV 20922238
chrX 17705013
# Set up relational mrna tables.
hgLoadRna new ce1
# that created a bunch of tables. If you need to delete them:
hgLoadRna drop ce1
MAKE GCPERCENT AND GAP tracks (DONE 2003-05-12 - Hiram)
# next machine
ssh hgwdev
mkdir -p /cluster/store5/worm/ce1/bed/gcPercent
cd /cluster/store5/worm/ce1/bed/gcPercent
hgsql ce1 < ~/kent/src/hg/lib/gcPercent.sql
# If you need to delete that table created
echo 'drop table gcPercent;' | hgsql ce1;
hgGcPercent ce1 ../../nib
# hgGoldGap does not handle dir names over 2 characters
# directory III has been moved to 3 when all this was created above
hgGoldGapGl -noGl ce1 /cluster/store5/worm/ce1 sangerFa
# there are no gaps in these chromosomes. They are all making
# empty tables except for chrM. We tried running the browser
# with these empty tables removed, but then it complained when
# we tried to display the assembly track. The browser needs to
# be fixed. In the meantime, for completeness, and to allow
# featureBits to function, create an empty chrM_gap:
cat << '_EOF_' | hgsql ce1
CREATE TABLE chrM_gap (
bin smallint not null,
chrom varchar(255) not null,
chromStart int unsigned not null,
chromEnd int unsigned not null,
ix int not null,
n char(1) not null,
size int unsigned not null,
type varchar(255) not null,
bridge varchar(255) not null,
UNIQUE(chromStart),
UNIQUE(chromEnd)
);
'_EOF_'
# hgGoldGapGl needs to be fixed so it can create a gold track
# and not need to create a gap track. And the browser needs to
# be fixed so it can display the assembly track without the need
# for the gap tables to exist.
# REMAKE GAP TRACKS - ADD POSITIONS OF Ns TO ChrN_gap TABLES
# (DONE, 2004-04-30, hartera)
# Add Ns to gap tables as for Ce2, this is described on Gap details page
# next machine
ssh hgwdev
mkdir -p /cluster/data/ce1/bed/gap
cd /cluster/data/ce1/bed/gap
# finds motifs and finds location of gaps as part of output
foreach c (I II III IV V X M)
findMotif -chr=chr${c} -verbose=4 -motif=gcatg \
/gbdb/ce1/nib >& chr${c}Bed.stderr
end
grep -h GAP *.stderr | sed -e "s/#GAP //" > gap.bed
# All the gap tables are empty so
# Load in gap.bed
# Need to add extra fields to gap.bed file
cat << '_EOF_' > /cluster/data/ce1/scripts/createGapFile.pl
#!/usr/bin/perl -w
use strict;
my $oldchr = "";
while (<STDIN>) {
my @fields = split(/\t/);
my $chr = $fields[0];
if ($chr ne $oldchr) {
open(OUT, ">$chr"."_gap.bed");
$oldchr = $chr;
}
print OUT "$fields[0]\t$fields[1]\t$fields[2]\t$fields[3]\tN\t$fields[4]\tfragment\tyes\n";
}
'_EOF_'
perl /cluster/data/ce1/scripts/createGapFile.pl < gap.bed
# load into relevant tables
foreach c (I II III IV V X M)
hgLoadBed -tab -oldTable ce1 chr${c}_gap chr${c}_gap.bed
end
# MAKE chrM_gold TABLE FROM NEW chrM.agp FILE CREATED ABOVE
# (DONE, 2004-04-19, hartera)
# next machine
ssh hgwdev
cd /cluster/data/ce1/
hgGoldGapGl -noGl -chrom=chrM ce1 /cluster/data/ce1 sangerFa
MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR CE1 (DONE 2002-05-12 - Hiram)
# next machine
ssh hgwdev
echo 'insert into defaultDb values("C. elegans", "ce1");' \
| hgsql -h genome-testdb hgcentraltest
# If you need to delete that entry:
echo 'delete from defaultDb where name="ce1";' \
| hgsql -h genome-testdb hgcentraltest
# Note: for next assembly, set scientificName column to
# "Caenorhabditis elegans"
echo 'insert into dbDb values("ce1", "May 2003", \
"/gbdb/ce1/nib", "Worm", "chrII:14642289-14671631", 1, 10, \
"C. elegans");' \
| hgsql -h genome-testdb hgcentraltest
# If you need to delete that entry:
echo 'delete from dbDb where name="ce1";' \
| hgsql -h genome-testdb hgcentraltest
# Make trackDb table so browser knows what tracks to expect:
cd ~/kent/src/hg/makeDb/trackDb
cvs up -d -P
# Edit that makefile to add ce1 in all the right places and do
make update
make alpha
cvs commit makefile
MAKE SANGER GENE TRACK (DONE - 2003-05-19 - Hiram)
# next machine
ssh eieio
mkdir -p /cluster/store5/worm/ce1/bed/sangerGene
cd /cluster/store5/worm/ce1/bed/sangerGene
# the perl trims these files down to a reasonable size. As they are
# they cause ldHgGene to hangup due to memory size.
foreach f (I II III IV V X)
echo -n "${f} "
zcat ../../sanger/CHROMOSOME_${f}.gff.gz | \
sed -e "s/CHROMOSOME_III/chrIII/g" -e "s/CHROMOSOME_II/chrII/g" \
-e "s/CHROMOSOME_IV/chrIV/g" -e "s/CHROMOSOME_I/chrI/g" \
-e "s/CHROMOSOME_X/chrX/g" -e "s/CHROMOSOME_V/chrV/g" \
-e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' | \
perl -ne '@a = split; \
print if($a[1] =~ /curated|DNA|RNA/i && \
$a[2] =~ /intron|exon|cds|sequence|transcri/i);' > chr${f}.gff
end
echo
# check file sizes, should be reasonable
ls -ogrt
# [hiram@eieio sangerGene]$ ls -ogrt
# total 22656
# -rw-r--r-- 1 hiram 3495873 May 19 11:52 chrI.gff
# -rw-r--r-- 1 hiram 3670984 May 19 11:54 chrII.gff
# -rw-r--r-- 1 hiram 3264917 May 19 11:56 chrIII.gff
# -rw-r--r-- 1 hiram 3642256 May 19 11:59 chrIV.gff
# -rw-r--r-- 1 hiram 4704564 May 19 12:02 chrV.gff
# -rw-rw-r-- 1 hiram 3787721 May 19 12:03 chrX.gff
# now load database with those transformed gff files
# next machine
ssh hgwdev
cd /cluster/store5/worm/ce1/bed/sangerGene
ldHgGene ce1 sangerGene *.gff
# Read 37330 transcripts in 411148 lines in 6 files
# 37330 groups 6 seqs 11 sources 6 feature types
# 22128 gene predictions
# if you need to delete that table:
echo 'drop table sangerGene' | hgsql ce1
# MAKING AND STORING mRNA AND EST ALIGNMENTS (WORKING 2003-05-28 - Hiram)
# next machine
ssh kkr1u00
cd /iscratch/i/worms/Celegans
mkdir mRNA
# there are 1644 sequences in mrna.fa, the 1000000 makes 1644 files
faSplit sequence /iscratch/i/mrna.134/Caenorhabditis_elegans/mrna.fa \
1000000 mRNA/m
~kent/bin/iSync
# next machine
ssh kk
cd ~/ce1/bed
mkdir mrna est
cd mrna
mkdir psl
ls -1S /iscratch/i/worms/Celegans/bothMasksNib/chr*.nib > worm.lst
ls -1S /iscratch/i/worms/Celegans/mRNA/m*.fa > mrna.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fine -q=rna -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 worm.lst mrna.lst gsub spec
para create spec
para try
para check
para push
... etc ...
# These jobs are a bit on the short side. I wonder if the mrna
# fa's could be put together in larger chunks ?
# Average job time: 8s 0.13m 0.00h 0.00d
# Longest job: 63s 1.05m 0.02h 0.00d
# cluster run done
pslSort dirs raw.psl /tmp psl
pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl all_mrna.psl \
/dev/null
pslSortAcc nohead chrom /tmp all_mrna.psl
# load mrna tables
# next machine
ssh hgwdev
cd ~/ce1/bed/mrna/chrom
# names must be chr*_mrna.psl
foreach d (I II III IV V X M)
rm -f chr${d}_mrna.psl
mv chr${d}.psl chr${d}_mrna.psl
end
# this load command rewrites these table entirely if they exist
hgLoadPsl ce1 *.psl
mkdir /gbdb/ce1/mrna.134
ln -s /cluster/store5/mrna.134/org/Caenorhabditis_elegans/mrna.fa \
/gbdb/ce1/mrna.134
# ! ! ! DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files
cd ~/ce1/bed/mrna
hgLoadRna add -type=mRNA ce1 /gbdb/ce1/mrna.134/mrna.fa \
/cluster/store5/mrna.134/org/Caenorhabditis_elegans/mrna.ra
hgLoadPsl ce1 all_mrna.psl
# setup /iscratch/i with the est.fa file for blat run
# next machine
ssh kkr1u00
mkdir -p /iscratch/i/worms/Celegans/EST
cd /iscratch/i/worms/Celegans/EST
# create about a 200 files to provide about 1400 jobs to the cluster
# (these jobs are very small despite this small number of est files)
faSplit sequence \
/cluster/store5/mrna.134/org/Caenorhabditis_elegans/est.fa 200 e
~kent/bin/iSync
# next machine
ssh kk
cd ~/ce1/bed/est
mkdir psl
ls -1S /iscratch/i/worms/Celegans/bothMasksNib/chr*.nib > worm.lst
ls -1S /iscratch/i/worms/Celegans/EST/e*.fa > est.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 worm.lst est.lst gsub spec
para create spec
para try
para check
para push
# These 1358 jobs are pretty short:
# Average job time: 43s 0.71m 0.01h 0.00d
# Longest job: 77s 1.28m 0.02h 0.00d
# cluster run done
# next machine
ssh hgwdev
cd ~/ce1/bed/est
pslSort dirs raw.psl /tmp psl
pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl all_est.psl \
/dev/null
pslSortAcc nohead chrom /tmp all_est.psl
cd chrom
# names must be chr*_est.psl
foreach d (I II III IV V X M)
rm -f chr${d}_est.psl
mv chr${d}.psl chr${d}_est.psl
end
# this load command rewrites these table entirely if they exist
hgLoadPsl ce1 *.psl
ln -s /cluster/store5/mrna.134/org/Caenorhabditis_elegans/est.fa \
/gbdb/ce1/mrna.134
# ! ! ! DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files
cd ~/ce1/bed/est
hgLoadRna add -mrnaType=EST -type=EST ce1 /gbdb/ce1/mrna.134/est.fa \
/cluster/store5/mrna.134/org/Caenorhabditis_elegans/est.ra
hgLoadPsl ce1 all_est.psl -nobin
MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR CE1 (DONE - 2003-05-29 - Hiram)
# next machine
ssh hgwdev
# DNA port is "0", trans prot port is "1"
echo 'insert into blatServers values("ce1", "blat10", "17784", "0"); \
insert into blatServers values("ce1", "blat10", "17785", "1");' \
| hgsql -h genome-testdb hgcentraltest
# if you need to delete those entries
echo 'delete from blatServers where db="ce1";' \
| hgsql -h genome-testdb hgcentraltest
# to check the entries:
echo 'select * from blatServers where db="ce1";' \
| hgsql -h genome-testdb hgcentraltest
PRODUCING FUGU FISH ALIGNMENTS (DONE 2003-06-05 - Hiram)
# Assumes masked NIBs have been prepared as above
# and Fugu pieces are already on kluster /iscratch/i.
# next machine
ssh kk
mkdir -p ~/ce1/bed/blatFugu
cd ~/ce1/bed/blatFugu
mkdir psl
ls -1S /iscratch/i/fugu/*.fa > fugu.lst
ls -1S /iscratch/i/worms/Celegans/bothMasksNib/chr*.nib > worm.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -q=dnax -t=dnax -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 worm.lst fugu.lst gsub spec
para create spec
para try
para check
para push
para check
# Average job time: 237s 3.95m 0.07h 0.00d
# Longest job: 1326s 22.10m 0.37h 0.02d
# want names chrI_blatFugu
# When cluster run is done, sort alignments
# next machine
ssh eieio
cd ~/ce1/bed/blatFugu
foreach d (I II III IV V X M)
echo -n "${d} "
pslCat psl/chr${d}_*.psl | \
pslSortAcc nohead chrom temp stdin
rm -f chrom/chr${d}_blatFugu.psl
mv chrom/chr${d}.psl chrom/chr${d}_blatFugu.psl
end
# next machine
ssh hgwdev
cd ~/ce1/bed/blatFugu/chrom
hgLoadPsl ce1 chr*_blatFugu.psl
# Make fugu /gbdb/ symlink and load Fugu sequence data.
mkdir /gbdb/ce1/fuguSeq
ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta /gbdb/ce1/fuguSeq
# ! ! ! DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files
cd ~/ce1/bed/blatFugu
hgLoadRna addSeq ce1 /gbdb/ce1/fuguSeq/fugu_v3_mask.fasta
# RUN Waba alignment with briggsae (DONE - 2003-06-02 - Hiram)
# prepare contigs from C. briggsae
# Assumes C. briggsae data has been downloaded according to
# makeCb1.doc
# next machine
ssh kkr1u00
mkdir -p /iscratch/i/worms/Cbriggsae/waba_contigs
cd /iscratch/i/worms/Cbriggsae/waba_contigs
faSplit sequence ../contigs.fa 2000 c
~kent/bin/iSync
# next machine
ssh kk
mkdir ~/ce1/bed/waba/out
cd ~/ce1/bed/waba
ls /iscratch/i/worms/Cbriggsae/contigs/c*.fa > briggsae.lst
ls /iscratch/i/worms/Celegans/unmaskedFa/chr*.fa > elegans.lst
echo "" > dummy.lst
# create scripts to be used here
cat << '_EOF_' > wabaRun
#!/bin/csh -fe
#
# $1 - full pathname to a briggsae contig
# $2 - file path to an elegans chrom.fa
# $3 - result file full pathname
#
set f = $1:t
set chr = $2:t
set d = $chr:r
mkdir -p /tmp/$d/$f
cp $1 /tmp/$d/$f
pushd .
cd /tmp/$d/$f
set t = $f:r
/cluster/home/hiram/bin/i386/waba 1 $f $2 $t.1
/cluster/home/hiram/bin/i386/waba 2 $t.1 $t.2
/cluster/home/hiram/bin/i386/waba 3 $t.2 $t.wab
cp $t.wab $3
popd
rm -f /tmp/$d/$f/$t.*
rmdir --ignore-fail-on-non-empty /tmp/$d/$f
rmdir --ignore-fail-on-non-empty /tmp/$d
'_EOF_'
chmod +x wabaRun
echo
cat << '_EOF_' > gsub
#LOOP
/cluster/store5/worm/ce1/bed/waba/wabaRun {check in exists+ $(path1)} {check in exists+ $(path2)} {check out exists /cluster/store5/worm/ce1/bed/waba/out/$(root2)/$(root1).wab}
#ENDLOOP
'_EOF_'
gensub2 briggsae.lst elegans.lst gsub spec
para create spec
para try
para check
para push
... etc ...
# one of the jobs takes quite a while. Most of the others are OK:
# Average job time: 2308s 38.46m 0.64h 0.03d
# Longest job: 47442s 790.70m 13.18h 0.55d
# next machine
ssh hgwdev
# you may need to build hgWaba
cd ~/kent/src/hg/makeDb/hgWaba
make
# hgWaba.c has been adjusted to avoid the squeezed assert by
# ignoring them. That may want to be looked at some day.
# (The waba cluster run has a number of errors too. waba is not
# perfect, but it is as good as the Intronerator was...)
cd ~/ce1/bed/waba
# one of the outputs produces a result that won't process properly
# with hgWaba. Move it out of the way
mv out/chrII/c0911.wab out/chrII_c0911.wab.broken
mkdir Load
cd Load
# The cat through the pipe to hgWaba will avoid making large files
# that are not needed.
cat << '_EOF_' > loadEm.sh
#!/bin/sh
#
for c in I II III IV V X M
do
echo -n "${c} "
cat /cluster/store5/worm/ce1/bed/waba/out/chr${c}/c*.wab |
~/bin/i386/hgWaba ce1 Cbr chr${c} 0 stdin > proc${c}.out 2>&1
done
exit 0
'_EOF_'
# run it to load the waba track:
./loadEm.sh
# remove garbage temp file:
rm full_waba.tab
rm chrom_waba.tab
# worm/ce1/trackDb.ra entry:
# track cbrWaba
# shortLabel Briggsae Waba
# longLabel C. briggsae Waba Alignments
# group genes
# priority 50
# visibility dense
# color 140,0,200
# altColor 210,140,250
# BEGINNING OF Cb1 BLASTZ
# next machine
ssh kkr1u00
# These nibs were created in the combined RepeatMasker and TRF above
mkdir -p /iscratch/i/worms/Celegans/rmsk
cd /iscratch/i/worms/Celegans/rmsk
cp -p /cluster/store5/worm/ce1/RMRun/chr*.fa.out .
~kent/bin/iSync
# These blastz sections usually start by saying:
# FIRST FOLLOW INSTRUCTIONS IN ~angie/hummus/Notes
# This Notes file is the actual blastz run.
# But I found those notes a bit obscure, for clarity I have
# reproduced the exact steps here.
# The initial DEF file has been created for this.
# It is in ~angie/hummus/DEF.ce1-cb1.2003-05-29
# which will blastz soft masked Cbriggsae ultra contigs
# against the soft masked Celegans chroms
# 1. created the DEF file for this particular blastz run
# The TARGET files are the masked NIBS created above, in:
# SEQ1_DIR=/iscratch/i/worms/Celegans/bothMasksNib
# and the repeatmasker *.out files in:
# SEQ1_RMSK=/iscratch/i/worms/Celegans/rmsk
# There is no lineage specific repeats for the worms as far as
# I can tell, thus:
# SEQ1_SMSK=/dev/null
# This requires:
# BLASTZ_ABRIDGE_REPEATS=0
# The QUERY sequence are the repeat and trf soft masked chrUn.fa
# nib file from the briggsae processing:
# SEQ2_DIR=/iscratch/i/worms/Cbriggsae/bothMasksNib
# And the repeat masker run from the briggsae processing:
# SEQ2_RMSK=/iscratch/i/worms/Cbriggsae/rmsk
# No lineage specific repeats:
# SEQ2_SMSK=/dev/null
#
# Our base of operations:
# BASE=/cluster/store5/worm/ce1/bed/blastzCb1
#
# Reproduced here are the instructions as specified in the
# hummus/Notes file.
# Evidently historical archives of the DEF files exist in
# in ~angie/hummus/DEF* and they are copied out of there to
# to the file: $BASE/DEF for the actual run. Don't worry, none of
# this seems to make any sense.
# next machine
ssh kk
cd ~/ce1/bed/blastzCb1
# source the DEF file which was carefully written as above.
. ./DEF
mkdir -p $BASE/run
~angie/hummus/make-joblist $DEF > $BASE/run/j
# that created xdir.sh and joblist run/j
sh $BASE/xdir.sh
# xdir.sh makes a bunch of result directories in $BASE/raw/
# based on chrom name and CHUNK size
cd $BASE/run
# now edit j because it is not correct:
sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > j2
wc -l j*
head j2
# *** make sure the j2 edits are OK, then use it:
mv j2 j
# para create will create the file: 'batch' for the cluster run
para create j
para try
para check
para push
... etc ...
Average job time: 1222s 20.36m 0.34h 0.01d
Longest job: 3772s 62.87m 1.05h 0.04d
# When that cluster run is done, results are in $BASE/raw/chr*/*
# continuing with ~angie/hummus/Notes:
# --- normalize and single_cov
cd ~/ce1/bed/blastzCb1
# source the DEF file again in case you are coming back to this
. ./DEF
# a new run directory
mkdir -p $BASE/run.1
# another obscure script creates a new job list:
~angie/hummus/do.out2lav $DEF >$BASE/run.1/j
cd $BASE/run.1
# the job list is once again incorrect, edit it:
sed -e 's/^/\/cluster\/home\/angie\/schwartzbin\//' j > j2
wc -l j*
head j2
# make sure the edited j2 is OK, then use it:
mv j2 j
para create j
para try
para push;para check; ... etc ...
Average job time: 65s 1.08m 0.02h 0.00d
Longest job: 136s 2.27m 0.04h 0.00d
# Translate the .lav files created by the end of ~angie/hummus Notes
# into axt files
ssh eieio
set base="/cluster/store5/worm/ce1/bed/blastzCb1"
set seq1_dir="/cluster/store5/worm/ce1/nib"
set seq2_dir="/cluster/store5/worm/cb1/nib"
set tbl="blastzCb1"
cd $base
mkdir -p axtChrom
foreach c (lav/*)
pushd $c
set chr=$c:t
set out=$base/axtChrom/$chr.axt
echo "Translating $chr lav to $out"
cat `ls -1 *.lav | sort -g` \
| /cluster/home/hiram/bin/i386/lavToAxt stdin $seq1_dir $seq2_dir stdout \
| /cluster/bin/i386/axtSort stdin $out
popd
end
# Translate the sorted axt files into psl:
cd $base
mkdir -p pslChrom
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# load these blastz results
# next machine
ssh hgwdev
cd ~/ce1/bed/blastzCb1/pslChrom
/cluster/bin/i386/hgLoadPsl ce1 chr*_*.psl
# trackDb/worm/ce1/trackDb.ra entry:
# track blastzCb1
# shortLabel briggsae Blastz
# longLabel Blastz C briggsae
# group compGeno
# priority 159
# visibility dense
# color 0,0,0
# altColor 50,128,50
# spectrum on
# type psl xeno cb1
# CHAINING Briggsae blastz
# next machine, small cluster is good for these tiny jobs
ssh kkr1u00
cd /cluster/store5/worm/ce1/bed/blastzCb1
mkdir axtChain
cd axtChain
mkdir run1
cd run1
ls -1S ../../axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /cluster/store5/worm/ce1/nib /cluster/store5/worm/cb1/nib $2 > $3
'_EOF_'
chmod a+x doChain
mkdir out chain
gensub2 input.lst single gsub spec
para create spec
para try
Average job time: 303s 5.04m 0.08h 0.00d
Longest job: 1006s 16.77m 0.28h 0.01d
# now on the cluster server, sort chains
# next machine
ssh eieio
cd ~/ce1/bed/blastzCb1/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd ~/ce1/bed/blastzCb1/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain ce1 ${c}_cb1Chain $i
echo done $c
end
# Create the nets XXXX STOPPED HERE 2003-06-11
# next machine
ssh eieio
cd ~/ce1/bed/blastzCb1/axtChain
mkdir preNet
cd chain
# MAKING AND STORING refSeq ALIGNMENTS (WORKING 2003-06-02 - Hiram)
# MAKING AND STORING refSeq ALIGNMENTS - hgRefSeqMrna load REDONE (2004-05-18, hartera)
# hgRefSeqStatus - refSeqStatus table rebuilt with updated loc2ref
# (2004-06-16, hartera)
# next machine
ssh kkr1u00
mkdir -p /iscratch/i/worms/Celegans/refSeq
cd /iscratch/i/worms/Celegans
# there are about 20000 sequences in the refSeq.fa file,
# split them into about 300 files
faSplit sequence \
/cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.fa \
300 refSeq/r
~kent/bin/iSync
# next machine
ssh kk
cd ~/ce1/bed
mkdir -p refSeq/psl
cd refSeq
# for reference:
ln -s \
/cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.* .
ls -1S /cluster/bluearc/iscratch/i/worms/Celegans/bothMasksNib/chr*.nib > worm.lst
ls -1S /cluster/bluearc/iscratch/i/worms/Celegans/refSeq/r*.fa > refSeq.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -fine -q=rna -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
gensub2 worm.lst refSeq.lst gsub spec
para create spec
para try
para check
para push ... etc ...
# Average job time: 55s 0.92m 0.02h 0.00d
# Longest job: 626s 10.43m 0.17h 0.01d
# when cluster run done
# next machine
ssh eieio
cd ~/ce1/bed/refSeq
pslSort dirs raw.psl /tmp psl
pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl all_refSeq.psl \
/dev/null
pslSortAcc nohead chrom /tmp all_refSeq.psl
pslCat -dir chrom > refSeqAli.psl
# load refSeq tables
# next machine
ssh hgwdev
cd ~/ce1/bed/refSeq
hgLoadPsl ce1 -tNameIx refSeqAli.psl
ln -s \
/cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.fa \
/gbdb/ce1/mrna.134
# ! ! DO NOT RUN hgLoadRna in /gbdb - it leaves .tab files
cd ~/ce1/bed/refSeq
hgLoadRna add -type=refSeq ce1 /gbdb/ce1/mrna.134/refSeq.fa \
/cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.ra
# Reload refLink table as LocusLink ID column is all 0 and protein
# accessions missing. (DONE, 2004-05-12, hartera)
# RefSeq accessions for ce1 appear not to be in
# /cluster/store5/mrna.134/refSeq/041003/loc2ref so could not also
# map the LocusLink ID to OMIM ID through mim2loc
# Dowload the latest loc2ref and mim2loc from NCBI and put in
# /cluster/store5/mrna.134/refSeq/051204/
# Previous error in loading refPep was because there were duplicate
# accessions in hs.faa once the suffix of .1 and .2 were removed
# A search in GenBank showed that NP_002900.1 has been replaced
# by NP_002900.2 so removed the former from hs.faa
# (2004-05-18, hartera) Deleted contents of refPep table since
# hs.faa contains only human sequences. No C. elegans peptide sequences
# available from this NCBI Build 134 so added protein translation of genomic DNA
# on the RefSeq genes details pages.
hgRefSeqMrna ce1 /gbdb/ce1/mrna.134/refSeq.fa \
/cluster/store5/mrna.134/refSeq/041003/org/Caenorhabditis_elegans/refSeq.ra \
all_refSeq.psl \
/cluster/store5/mrna.134/refSeq/051204/loc2ref \
/cluster/store5/mrna.134/refSeq/041003/hs.faa \
/cluster/store5/mrna.134/refSeq/051204/mim2loc
# Missing locusLinkIds for 184 of 20661
# Missing protein accessions for 184 of 20661
# Add RefSeq status info (updated 2004-06-16 as mrnaAcc values not found
# in refLink.mrnaAcc column). Only load in those for C. elegans (Taxonomy
# ID is 6239)
cd /cluster/data/ce1/bed/refSeq
hgRefSeqStatus -taxId=6239 ce1 \
/cluster/store5/mrna.134/refSeq/051204/loc2ref
# Create precomputed join of refFlat and refGene:
echo 'CREATE TABLE refFlat \
(KEY geneName (geneName), KEY name (name), KEY chrom (chrom)) \
SELECT refLink.name as geneName, refGene.* \
FROM refLink,refGene \
WHERE refLink.mrnaAcc = refGene.name' \
| hgsql ce1
# END - MAKING AND STORING refSeq ALIGNMENTS
# BEGINNING OF Self BLASTZ
# DEF file here is the same as the blastzCb1/DEF file, with
# the following changes:
# SEQ1_DIR=/iscratch/i/worms/Celegans/bothMasksNib
# SEQ2_DIR=/iscratch/i/worms/Celegans/bothMasksNib
# SEQ2_RMSK=/iscratch/i/worms/Celegans/rmsk
# SEQ2_CHUNK=10000000
# SEQ2_LAP=10000
# BASE=/cluster/store5/worm/ce1/bed/blastzSelf
# Reproduced here are the instructions as specified in the
# hummus/Notes file.
# Evidently historical archives of the DEF files exist in
# in ~angie/hummus/DEF* and they are copied out of there to
# to the file: $BASE/DEF for the actual run. Don't worry, none of
# this seems to make any sense.
# next machine
ssh kk
mkdir -p ~/ce1/bed/blastzSelf
cd ~/ce1/bed/blastzSelf
. ./DEF
mkdir -p $BASE/run
~angie/hummus/make-joblist $DEF > $BASE/run/j
# that created xdir.sh and joblist run/j
sh $BASE/xdir.sh
# xdir.sh makes a bunch of result directories in $BASE/raw/
# based on chrom name and CHUNK size
cd $BASE/run
# now edit j because it is not correct:
sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > j2
wc -l j*
head j2
# *** make sure the j2 edits are OK, then use it:
mv j2 j
# para create will create the file: 'batch' for the cluster run
para create j
para try
para check
para push
... etc ...
Average job time: 2228s 37.14m 0.62h 0.03d
Longest job: 15281s 254.68m 4.24h 0.18d
# That longest job time is not correct. I was having trouble with
# the cluster during that run.
# When that cluster run is done, results are in $BASE/raw/chr*/*
# continuing with ~angie/hummus/Notes:
# --- normalize and single_cov
cd ~/ce1/bed/blastzSelf
# source the DEF file again in case you are coming back to this
. ./DEF
# a new run directory
mkdir -p $BASE/run.1
# another obscure script creates a new job list:
~angie/hummus/do.out2lav $DEF >$BASE/run.1/j
cd $BASE/run.1
# the job list is once again incorrect, edit it:
sed -e 's/^/\/cluster\/home\/angie\/schwartzbin\//' j > j2
wc -l j*
head j2
# make sure the edited j2 is OK, then use it:
mv j2 j
para create j
para try
para push;para check; ... etc ...
Average job time: 241s 4.01m 0.07h 0.00d
Longest job: 459s 7.65m 0.13h 0.01d
# Translate the .lav files created by the end of ~angie/hummus Notes
# into axt files
ssh eieio
set base="/cluster/store5/worm/ce1/bed/blastzSelf"
set seq1_dir="/cluster/store5/worm/ce1/nib"
set seq2_dir="/cluster/store5/worm/ce1/nib"
set tbl="blastzSelf"
cd $base
mkdir -p axtChrom
foreach c (lav/*)
pushd $c
set chr=$c:t
set out=$base/axtChrom/$chr.axt
echo "Translating $chr lav to $out"
cat `ls -1 *.lav | sort -g` \
| /cluster/home/hiram/bin/i386/lavToAxt stdin $seq1_dir $seq2_dir stdout \
| /cluster/bin/i386/axtSort stdin $out
popd
end
# DropOverlap cleans out a bit more than DropSelf:
cd $base
mkdir DropOverlap
foreach c (M I II III IV V X)
echo "dropping chr${c}"
/cluster/bin/i386/axtDropOverlap axtChrom/chr${c}.axt \
DropOverlap/chr${c}.axt
end
# Translate the sorted axt files into psl:
mkdir -p pslChrom
foreach f (DropOverlap/chr*.axt)
set c=$f:t:r
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# load these blastz results (they already have the correct names
# chr*_blastzSelf.psl from axtToPsl above)
# next machine
ssh hgwdev
cd ~/ce1/bed/blastzSelf/pslChrom
/cluster/bin/i386/hgLoadPsl ce1 chr*_*.psl
# trackDb/worm/ce1/trackDb.ra entry:
# track blastzSelf
# shortLabel Blastz Self
# longLabel Blastz C elegans (self)
# group compGeno
# priority 159
# visibility dense
# color 0,0,0
# altColor 50,128,50
# spectrum on
# type psl xeno ce1
# CHAINING Self blastz
# next machine, small cluster is good for these tiny jobs
ssh kkr1u00
cd /cluster/store5/worm/ce1/bed/blastzSelf
mkdir -p axtChain/run1
cd /cluster/store5/worm/ce1/bed/blastzSelf/axtChain/run1
ls -1S /cluster/store5/worm/ce1/bed/blastzSelf/DropOverlap/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# Nibs are on /cluster/bluearc now
cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /cluster/bluearc/iscratch/i/worms/Celegans/bothMasksNib /iscratch/i/worms/Celegans/bothMasksNib $2 > $3
'_EOF_'
chmod a+x doChain
mkdir out chain
gensub2 input.lst single gsub spec
para create spec
para try
para push ... etc ...
Average job time: 502s 8.37m 0.14h 0.01d
Longest job: 874s 14.57m 0.24h 0.01d
# now on the cluster server, sort chains
# next machine
ssh eieio
cd ~/ce1/bed/blastzSelf/axtChain
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# optionally: rm run1/chain/*.chain
# Load chains into database
# next machine
ssh hgwdev
cd ~/ce1/bed/blastzSelf/axtChain/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain ce1 ${c}_chainSelf $i
echo done $c
end
# Loading 845504 chains into ce1.chrI_chainSelf
# Loading 845307 chains into ce1.chrII_chainSelf
# Loading 1152790 chains into ce1.chrIII_chainSelf
# Loading 859566 chains into ce1.chrIV_chainSelf
# Loading 2 chains into ce1.chrM_chainSelf
# Loading 1114939 chains into ce1.chrV_chainSelf
# Loading 604293 chains into ce1.chrX_chainSelf
# END of Blastz Self
LOAD SOFTBERRY GENES (DONE - 2003-08-14 - Hiram)
mkdir -p /cluster/data/ce1/bed/softberry
cd /cluster/data/ce1/bed/softberry
wget \
ftp://www.softberry.com/pub/SC_CEL_MAY03/Soft_fgenesh_Ce03.tar.gz
tar xvzf Soft_fgenesh_Ce03.tar.gz
ldHgGene ce1 softberryGene chr*.gff
# Read 19112 transcripts in 276202 lines in 6 files
# 19112 groups 6 seqs 1 sources 4 feature types
# 19112 gene predictions
hgPepPred ce1 softberry *.protein
hgSoftberryHom ce1 *.protein
# Add entry to worm/ce1/trackDb.ra and add specific
# softberryGene.html file
CREATE ORFTOGENE TABLE (Used by hgGene and hgNear)
mkdir /cluster/data/ce1/bed/orfToGene
cd !$
wget ftp://ftp.wormbase.org/pub/wormbase/GENE_DUMPS/gene_names.txt
awk -F '\t' '$2 == "Caenorhabditis elegans" && $3 == "Gene" {printf("%s\t%s\n", $4, $1)}' gene_names.txt > orfToGene.txt
hgCeOrfToGene ce1 gene_names.txt sangerGene orfToGene
CREATE SANGER LINKS TABLE FROM WORMBASE INFO (used by hgGene and hgNear)
mkdir /cluster/data/ce1/bed/steinHelp
# Mail Lincoln Stein and ask for mappings from wormPep to
# SwissProt and from ORF to wormPep to description. Place
# these files in the steinHelp directory:
# seqPepDesc.ace - contains ORF/WormPep/Description
# spWp.ace - contains WormPep/SwissProt
# Then go to the AQL search page at
# http://www.wormbase.org/db/searches/aql_query.
# and create the following files in that directory:
# wbGeneClass.txt - associates 3 letter first part of worm gene
# name with a brief description. Produced with query:
# select l,l->Description from l in class Gene_Class
# wbConcise.txt - associates worm gene name with a several
# sentence description. Produces with query:
# select l,g from l in class Locus, g in l->Gene_information[Provisional_description]
hgWormLinks spWp.ace seqPepDesc.ace wbConcise.txt wbGeneClass.txt ../orfToGene/orfToGene sangerLinks.txt
hgsql ce1 < ~/kent/src/hg/near/hgWormLinks/sangerLinks.sql
hgsql -e \
'load data local infile "sangerLinks.txt" into table sangerLinks.sql' \
ce1
TWINSCAN GENE PREDICTIONS (DONE - 2004-01-05 - Hiram)
mkdir /cluster/data/ce1/bed/twinscan
cd /cluster/data/ce1/bed/twinscan
wget --timestamping \
http://genes.cs.wustl.edu/predictions/worm/C_elegans/WS100/12-15-2003/Celegans_WS100_12-15-03.tgz
tar xvzf Celegans_WS100_12-15-03.tgz
rm -r chr_tx
# clean up chrom field of GTF files
foreach c (I II III IV V X)
set f = chr_gtf/CHROMOSOME_${c}.pred_gtf
echo ${f}
sed -e "s/CHROMOSOME_/chr/g" ${f} | \
sed -e "s/_[0-9][0-9]*.seq\t/\t/" > \
chr_gtf/chr${c}-fixed.gtf
end
# pare down protein FASTA header to id and add missing .1:
foreach c (I II III IV V X)
set f = chr_ptx/CHROMOSOME_${c}.ptx
echo ${f}
perl -wpe 's/^\>CHROMOSOME_(\S+).*$/\>chr$1.1/;' < \
${f} > chr_ptx/chr${c}-fixed.fa
end
ldHgGene ce1 twinscan chr_gtf/chr*-fixed.gtf -exon=CDS
# Read 20889 transcripts in 146518 lines in 6 files
# 20889 groups 206 seqs 1 sources 3 feature types
# 20889 gene predictions
hgPepPred ce1 generic twinscanPep chr_ptx/chr*-fixed.fa
# MYTOUCH FIX - Jen - 2006-01-25
sudo mytouch ce1 twinscanPep 0404031400.00
# MRNAORIENT, creating tables for clusterRna operation
# (working 2004-02-10 - Hiram)
mkdir -p /cluster/bluearc/ce1/bed/est
mkdir -p /cluster/bluearc/ce1/bed/mrna
cd /cluster/bluearc/ce1/bed/est
cp -p /cluster/data/ce1/bed/est/chrom/*.psl .
pslSortAcc nohead contig /tmp/psl *.psl
cd /cluster/bluearc/ce1/bed/mrna
cp -p /cluster/data/ce1/bed/mrna/chrom/*.psl .
pslSortAcc nohead contig /tmp/psl *.psl
mkdir mRNA
faSplit sequence /cluster/store5/mrna.134/org/Caenorhabditis_elegans/mrna.fa 1000000 mRNA/m
mkdir -p /cluster/data/ce1/bed/mrnaOrientInfo/oi
cd /cluster/data/ce1/bed/mrnaOrientInfo
ls -1S /cluster/bluearc/ce1/bed/mrna/contig/*.psl > psl.lst
ls -1S /iscratch/i/worms/Celegans/mrna.134/mrna.fa > mrna.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/polyInfo {check in line $(path1)} {check in line+ /iscratch/i/worms/Celegans/trfFa/$(root1).fa} /iscratch/i/worms/Celegans/mrna.134/mrna.fa {check out line+ oi/$(root1).tab}
#ENDLOOP
'_EOF_'
ssh kk
cd /cluster/data/ce1/bed/mrnaOrientInfo
gensub2 psl.lst mrna.lst gsub jobList
para create jobList
para try # there are only six jobs, this runs them
cat oi/*.tab > mrnaOrientInfo.bed
hgLoadBed ce1 mrnaOrientInfo mrnaOrientInfo.bed \
-sqlTable=$HOME/kent/src/hg/lib/mrnaOrientInfo.sql
mkdir -p /cluster/data/ce1/bed/estOrientInfo/oi
cd /cluster/data/ce1/bed/estOrientInfo
ls -1S /cluster/bluearc/ce1/bed/est/contig/*.psl > psl.lst
ls -1S /iscratch/i/worms/Celegans/mrna.134/est.fa > est.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/polyInfo {check in line $(path1)} {check in line+ /iscratch/i/worms/Celegans/trfFa/$(root1).fa} /iscratch/i/worms/Celegans/mrna.134/est.fa {check out line+ oi/$(root1).tab}
#ENDLOOP
'_EOF_'
ssh kkr1u00
cd /cluster/data/ce1/bed/estaOrientInfo
gensub2 psl.lst est.lst gsub jobList
para create jobList
para try # there are only six jobs, this runs them
cat oi/*.tab > estOrientInfo.bed
bedSort estOrientInfo.bed estOrientInfo.bed
hgLoadBed ce1 estOrientInfo estOrientInfo.bed \
-sqlTable=$HOME/kent/src/hg/lib/estOrientInfo.sql
mkdir /cluster/data/ce1/bed/rnaCluster
cd /cluster/data/ce1/bed/rnaCluster
foreach f (I II III IV V X)
echo "clusterRna ce1 /dev/null chr${f}.bed -chrom=${f}"
clusterRna ce1 /dev/null chr${f}.bed -chrom=chr${f}
end
hgLoadBed ce1 rnaCluster *.bed
hgsql -A -e "select chrom from chromInfo" ce1 | egrep -v chrom > chrom.tab
(perl -e 'print "#\!/bin/sh\n"; while(<>) {chomp($_); printf "hgsql -A -e =select * from rnaCluster where chrom like \"$_\" order by chromStart, name= ce1 | egrep -v chromStart | cut -f2\-13 > $_.ce1.rnaCluster.bed\n";}' | sed -e "s/=/'/g") < chrom.tab > rnaCluster.sh
mkdir -p /cluster/data/ce1/bed/altSplice/agx
cd /cluster/data/ce1/bed/altSplice
for c in I II III IV V X M
do
hgsql -e "create table chr${c}_intronEst as select chr${c}_est.* from chr${c}_est,estOrientInfo where (chr${c}_est.qName = estOrientInfo.name) and (estOrientInfo.intronOrientation != 0)" ce1
done
for c in I II III IV V X M
do
hgsql -e "alter table chr${c}_intronEst add index bin (bin);" ce1
hgsql -e "alter table chr${c}_intronEst add index tStart (tStart);" ce1
hgsql -e "alter table chr${c}_intronEst add index qName (qName(12));" ce1
hgsql -e "alter table chr${c}_intronEst add index tEnd (tEnd);" ce1
echo "done chr${c}"
done
hgsql -A -e "select chrom from chromInfo" ce1 | egrep -v chrom > chrom.tab
perl -e 'while(<>) {chomp($_); print "~/bin/i386/altSplice -db=ce1 -beds=../rnaCluster/$_.ce1.rnaCluster.bed -agxOut=agx/$_.ce1.rnaCluster.agx -consensus -weightMrna\n";}' < chrom.tab > altSplice.para.spec
# these jobs are no big deal on the worm, just run this file:
chmod +x altSplice.para.spec
mkdir agx
./altSplice.para.spec
cat agx/*.agx > altSplice.agx
agxToBed altSplice.agx altSplice.bed
cat << '_EOF_' >> altGraphX.sql
CREATE TABLE altGraphX (
bin smallint unsigned not null,
tName varchar(255) NOT NULL default '',
tStart int(11) NOT NULL default '0',
tEnd int(11) NOT NULL default '0',
name varchar(255) NOT NULL default '',
id int(10) unsigned NOT NULL auto_increment,
strand char(2) NOT NULL default '',
vertexCount int(10) unsigned NOT NULL default '0',
vTypes longblob NOT NULL,
vPositions longblob NOT NULL,
edgeCount int(10) unsigned NOT NULL default '0',
edgeStarts longblob NOT NULL,
edgeEnds longblob NOT NULL,
evidence longblob NOT NULL,
edgeTypes longblob NOT NULL,
mrnaRefCount int(11) NOT NULL default '0',
mrnaRefs longblob NOT NULL,
mrnaTissues longblob NOT NULL,
mrnaLibs longblob NOT NULL,
PRIMARY KEY (id),
KEY tName (tName(16),tStart,tEnd)
);
'_EOF_'
# << emacs
hgLoadBed -sqlTable=altGraphX.sql ce1 altGraphX altSplice.agx
# trackDb entry:
track altGraphX
shortLabel Alt-Splice
longLabel Alternative Splicing
group rna
priority 74.1
visibility pack
type bed 3
canPack off
# copy frame based view to goldenPath 2006-01-12 - Hiram
cd /usr/local/apache/htdocs/goldenPath
ln -s ceMay2003 ce1
cd /cluster/data/ce1/bed/altSites
mkdir /usr/local/apache/htdocs/goldenPath/ce1/altGraphX
sed -e "s#http://hgwdev-sugnet.cse.ucsc.edu##" links.html \
> /usr/local/apache/htdocs/goldenPath/ce1/altGraphX/links.html
sed -e \
"s/Human Alt Splicing Conserved in Mouse/Alt Splicing in C. elegans/" \
-e "s/NM_005487/chrII:14689493-14690232/" \
-e "s#http://hgwdev-sugnet.cse.ucsc.edu##" \
frame.html \
> /usr/local/apache/htdocs/goldenPath/ce1/altGraphX/frame.html
# GC 5 BASE WIGGLE TRACK (DONE - 2004-02-13 - Hiram)
# working on eieio, the fileserver for this data.
# was named gc20KBase to start with, became a 5 base dataset later
# the generic trackDb html file is named gc20KBase.html
ssh eieio
mkdir /cluster/data/ce1/bed/gc20KBase
cd /cluster/data/ce1/bed/gc20KBase
mkdir wigData5 dataLimits5
for n in ../../nib/*.nib
do
c=`basename ${n} | sed -e "s/.nib//"`
C=`echo $c | sed -e "s/chr//"`
echo -n "working on ${c} - ${C} ... "
hgGcPercent -chr=${c} -doGaps \
-file=stdout -win=5 ce1 ../../nib | grep -w GC | awk '
{
bases = $3 - $2
perCent = $5/10.0
printf "%d\t%.1f\n", $2+1, perCent
}' | wigAsciiToBinary -dataSpan=5 -chrom=${c} \
-wibFile=wigData5/gc5Base_${C} -name=${C} stdin > dataLimits5/${c}
echo "done ${c}"
done
# data is compete, load track on hgwdev
ssh hgwdev
cd /cluster/data/ce1/bed/gc20KBase
hgLoadWiggle ce1 gc20KBase wigData5/*.wig
# create symlinks for .wib files
mkdir /gbdb/ce1/wib
ln -s `pwd`/wigData5/*.wib /gbdb/ce1/wib
# the trackDb entry
track gc20KBase
shortLabel GC Percent
longLabel GC Percent in 5 base windows
group map
priority 23.5
visibility hide
-autoScaleDefault Off
+autoScale Off
maxHeightPixels 128:36:16
graphTypeDefault Bar
gridDefault OFF
windowingFunction Mean
color 0,128,255
altColor 255,128,0
viewLimits 30:70
type wig 0 100
# miRNA track (DONE - 2004-03-02 - Hiram)
# data from: Sam Griffiths-Jones <sgj@sanger.ac.uk>
# and Michel.Weber@ibcg.biotoul.fr
# notify them if this assembly updates to renew this track
mkdir /cluster/data/ce1/bed/miRNA
cd /cluster/data/ce1/bed/miRNA
wget --timestamping \
"http://www.sanger.ac.uk/Software/Rfam/mirna/data/ucsc_cel_mir_track.txt"
grep "^chr" ucsc_cel_mir_track.txt | sed -e "s/ /\t/g" > worm.mir.bed
hgLoadBed ce1 miRNA worm.mir.bed
# entry in trackDb/trackDb.ra already there for same track in Hg16
# see makeHg16.doc for more comments on this
#####################################################################
# SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie)
# File emailed from Xinwei She <xws@u.washington.edu>
mkdir /cluster/data/ce1/bed/genomicSuperDups
cd /cluster/data/ce1/bed/genomicSuperDups
sed -e 's/CHROMOSOME_/chr/g; s/\t_\t/\t-\t/;' \
ce1_genomicSuperDup.tab \
| awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
| hgLoadBed ce1 genomicSuperDups stdin \
-sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql