# for emacs: -*- mode: sh; -*-
# This file describes how to make the browser database for the
# worm C. elegans
# 2004-03-30 [DONE, 2004-05-20, hartera]
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +------------------+-------------------------+
# | tableName | type |
# +------------------+-------------------------+
# | sangerGene | genePred sangerPep |
# | sangerGenefinder | genePred |
# | refGene | genePred refPep refMrna |
# | twinscan | genePred twinscanPep |
# | geneid | genePred geneidPep |
# +------------------+-------------------------+
# DOWNLOAD SEQUENCE (DONE, 2004-03-31, hartera)
# next machine
ssh eieio
mkdir -p /cluster/store5/worm/ce2/sanger
mkdir -p /cluster/store5/worm/ce2/tmp
ln -s /cluster/store5/worm/ce2 /cluster/data/ce2
cd /cluster/data/ce2/sanger
wget -o ce2.fetch.log -r -l1 --no-directories \
ftp://ftp.sanger.ac.uk/pub/wormbase/WS120/CHROMOSOMES/
# This site is now at:
# ftp://ftp.sanger.ac.uk/pub/wormbase/FROZEN_RELEASES/WS120/CHROMOSOMES/
# Takes about five minutes
# This current_release is updated every two weeks, although the
# sequence is quite stable at this time and changes very little.
# Check that you have some valid files.  There should be chroms I II III IV V X
# in dna, gff and agp formats, plus the mitochondrial DNA, MtDNA, in dna and
# gff formats.
ls -ogrt
# Rename CHROMOSOME_MtDNA files to CHROMOSOME_M and change to
# "CHROMOSOME_M" inside files
chmod u+w CHROMOSOME_M*
zcat CHROMOSOME_MtDNA.dna.gz | sed -e "s/CHROMOSOME_MtDNA/CHROMOSOME_M/g" \
| gzip > CHROMOSOME_M.dna.gz
zcat CHROMOSOME_MtDNA.gff.gz | sed -e "s/CHROMOSOME_MtDNA/CHROMOSOME_M/g" \
| gzip > CHROMOSOME_M.gff.gz
# remove old files
rm CHROMOSOME_MtDNA*
# CHROMOSOME_M sequence is identical to X54252.1 in GenBank
# create a .agp file for chrM as there is none and hgGoldGapGl and other
# programs require a .agp file so create CHROMOSOME_M.agp:
# M 1 13794 1 F X54252.1 1 13794 +
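# e.g., one way to create that single-line file (a sketch; the fields are the
# tab-separated AGP columns shown in the comment above):
printf "M\t1\t13794\t1\tF\tX54252.1\t1\t13794\t+\n" > CHROMOSOME_M.agp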
# translate to unzipped .fa, all upper case, and
# link the agp files where hgGoldGap can find them
cd /cluster/data/ce2
mkdir sangerFa
foreach f (I II III IV V X M)
echo -n "${f} "
zcat sanger/CHROMOSOME_${f}.dna.gz | tr '[a-z]' '[A-Z]' | \
sed -e "s/CHROMOSOME_/chr/" > sangerFa/chr${f}.fa
mkdir -p sangerFa/${f}
ln -s /cluster/data/ce2/sanger/CHROMOSOME_${f}.agp sangerFa/${f}/chr${f}.agp
end
# hgGoldGap does not handle dir names over 2 characters:
mv sangerFa/III sangerFa/3
# CREATING DATABASE (DONE, 2004-03-31 - hartera)
# Create the database.
# next machine
ssh hgwdev
echo 'create database ce2' | hgsql ''
# if you need to delete that database: !!! WILL DELETE EVERYTHING !!!
echo 'drop database ce2' | hgsql ce2
# Use df to make sure there is at least 5 gig free on /var/lib/mysql
df -h /var/lib/mysql
# Before loading data:
# Filesystem Size Used Avail Use% Mounted on
# /dev/sdc1 1.8T 280G 1.4T 17% /var/lib/mysql
# CREATING GRP TABLE FOR TRACK GROUPING (DONE, 2004-03-31 - hartera)
# next machine
ssh hgwdev
# the following command copies all the data from the table
# grp in the database galGal2 to our new database ce2
echo "create table grp (PRIMARY KEY(NAME)) select * from galGal2.grp" \
| hgsql ce2
# if you need to delete that table: !!! WILL DELETE ALL grp data !!!
echo 'drop table grp;' | hgsql ce2
# MAKE JKSTUFF AND BED DIRECTORIES (DONE, 2004-04-01, hartera)
# This used to hold scripts -- better to keep them inline here so
# they're in CVS. Now it should just hold lift file(s) and
# temporary scripts made by copy-paste from this file.
mkdir /cluster/data/ce2/jkStuff
# This is where most tracks will be built:
mkdir /cluster/data/ce2/bed
# PREPARE: SPLIT CHROMOSOMES INTO 100,000 BP CHUNKS FOR CLUSTER RUNS
# (DONE, 2004-04-01, hartera)
# next machine
ssh eieio
cd /cluster/data/ce2
rm -fr ./split
mkdir split
foreach f (I II III IV V X M)
mkdir split/$f
faSplit size sangerFa/chr${f}.fa 100000 split/$f/c -lift=split/chr${f}.lft
end
# 151 pieces of 151 written
# 153 pieces of 153 written
# 138 pieces of 138 written
# 175 pieces of 175 written
# 210 pieces of 210 written
# 178 pieces of 178 written
# 1 pieces of 1 written
cat split/*.lft > liftAll.lft
# copy them to /iscratch/i for cluster rsync
# next machine
ssh kkr1u00
cd /cluster/data/ce2/split
foreach c (I II III IV V X M)
echo -n "${c} "
mkdir -p /iscratch/i/worms/Celegans2/unmaskedSplit/${c}
cp -p ${c}/c*.fa /iscratch/i/worms/Celegans2/unmaskedSplit/${c}
end
iSync
# Run RepeatMasker on the chromosomes (DONE, 2004-04-02 - hartera)
# next machine
ssh kk
cd /cluster/data/ce2
# make run directory and job list, create the script to use
# for the RepeatMasker run
cat << '_EOF_' > jkStuff/RMWorm
#!/bin/csh -fe
#
# This is a slight rearrangement of the
# RMChicken script used in makeGalGal2.doc
# The results here need to go to a different location
# $1 == chrom name: I II III IV V X M
# $2 == directory where split contig .fa is found
# $3 == name of contig .fa file
cd $1
pushd .
cd $2
/bin/mkdir -p /tmp/ce2/$3/$1
/bin/cp $3 /tmp/ce2/$3/$1
cd /tmp/ce2/$3/$1
/cluster/bluearc/RepeatMasker/RepeatMasker -alignments -s -species elegans $3
popd
/bin/cp /tmp/ce2/$3/$1/$3.out ./
if( -e /tmp/ce2/$3/$1/$3.align ) /bin/cp /tmp/ce2/$3/$1/$3.align ./
if (-e /tmp/ce2/$3/$1/$3.tbl) /bin/cp /tmp/ce2/$3/$1/$3.tbl ./
if (-e /tmp/ce2/$3/$1/$3.cat) /bin/cp /tmp/ce2/$3/$1/$3.cat ./
/bin/rm -r /tmp/ce2/$3/$1
/bin/rmdir --ignore-fail-on-non-empty /tmp/ce2/$3
/bin/rmdir --ignore-fail-on-non-empty /tmp/ce2
'_EOF_'
# << this line makes emacs coloring happy
chmod +x jkStuff/RMWorm
# create job list
mkdir RMRun
rm -f RMRun/RMJobs
foreach c (I II III IV V X M)
mkdir /cluster/data/ce2/RMRun/${c}
foreach t ( /iscratch/i/worms/Celegans2/unmaskedSplit/$c/c*.fa )
set d = $t:h
set f = $t:t
echo /cluster/data/ce2/jkStuff/RMWorm ${c} ${d} ${f} \
'{'check out line+ /cluster/data/ce2/RMRun/$c/${f}.out'}' \
>> RMRun/RMJobs
end
end
# Do the run
cd /cluster/data/ce2/RMRun
para create RMJobs
para try, para check, para check, para push, para check, ...
# para time
# Checking finished jobs
# Completed: 1006 of 1006 jobs
# CPU time in finished jobs: 821747s 13695.78m 228.26h 9.51d 0.026 y
# IO & Wait Time: 13643s 227.39m 3.79h 0.16d 0.000 y
# Average job time: 830s 13.84m 0.23h 0.01d
# Longest job: 935s 15.58m 0.26h 0.01d
# Submission to last job: 5837s 97.28m 1.62h 0.07d
# when they are finished, liftUp and load the .out files into the database:
# next machine
ssh eieio
cd /cluster/data/ce2/RMRun
foreach c (I II III IV V X M)
liftUp chr${c}.fa.out /cluster/data/ce2/split/chr${c}.lft warn ${c}/*.fa.out
end
# next machine
ssh hgwdev
cd /cluster/data/ce2/RMRun
hgLoadOut ce2 chr*.fa.out
# Noticed one error in this load (reported to Robert Hubley):
# Processing chrV.fa.out
# Strange perc. field -0.1 line 2196 of chrV.fa.out
# SIMPLE REPEAT [TRF] TRACK (DONE, hartera 2004-04-01)
# ensure chr*.fa files exist on /iscratch/i
# next machine
ssh kkr1u00
mkdir -p /iscratch/i/worms/Celegans2/unmaskedFa
cp -p /cluster/data/ce2/sangerFa/*.fa \
/iscratch/i/worms/Celegans2/unmaskedFa
iSync
# done iSync,
# Create cluster parasol job:
# next machine
ssh kk
mkdir -p /cluster/data/ce2/bed/simpleRepeat
cd /cluster/data/ce2/bed/simpleRepeat
mkdir trf
ls -1S /iscratch/i/worms/Celegans2/unmaskedFa/chr*.fa > genome.lst
cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)} /dev/null -bedAt={check out line trf/$(root1).bed} -tempDir=/tmp
#ENDLOOP
'_EOF_'
echo "" > dummy.lst
gensub2 genome.lst dummy.lst gsub spec
para create spec
para try # there are only 7, so this runs them all
para check
# para time
# Checking finished jobs
# Completed: 7 of 7 jobs
# CPU time in finished jobs: 3301s 55.01m 0.92h 0.04d 0.000 y
# IO & Wait Time: 38s 0.64m 0.01h 0.00d 0.000 y
# Average job time: 477s 7.95m 0.13h 0.01d
# Longest job: 975s 16.25m 0.27h 0.01d
# Submission to last job: 975s 16.25m 0.27h 0.01d
# When cluster run is done, combine into one:
cat trf/*.bed > simpleRepeat.bed
# Load into the database:
# next machine
ssh hgwdev
cd /cluster/data/ce2/bed/simpleRepeat
hgLoadBed ce2 simpleRepeat simpleRepeat.bed \
-sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
# Loaded 28598 elements of size 16
# PROCESS SIMPLE REPEATS INTO MASK (DONE, 2004-04-02 - hartera)
# After the simpleRepeats track has been built, make a filtered version
# of the trf output: keep trf's with period <= 12:
# next machine
ssh eieio
cd /cluster/data/ce2/bed/simpleRepeat
mkdir -p trfMask
foreach f (trf/*.bed)
awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
end
# create soft and hard masks from the RepeatMasker and TRF outputs,
# and build the nib files using the soft masking in the fa:
# next machine
ssh eieio
cd /cluster/data/ce2
mkdir softMask
mkdir nib
cd /cluster/data/ce2
foreach c (I II III IV V X M)
echo -n "masking chr${c} "
maskOutFa sangerFa/chr${c}.fa RMRun/chr${c}.fa.out \
softMask/chr${c}.fa -soft
maskOutFa softMask/chr${c}.fa \
bed/simpleRepeat/trfMask/chr${c}.bed \
softMask/chr${c}.fa -softAdd
faToNib -softMask softMask/chr${c}.fa nib/chr${c}.nib
end
# output:
# masking chrI Writing 15080483 bases in 7540250 bytes
# masking chrII Writing 15279308 bases in 7639662 bytes
# masking chrIII Writing 13783313 bases in 6891665 bytes
# masking chrIV Writing 17493791 bases in 8746904 bytes
# masking chrV Writing 20922231 bases in 10461124 bytes
# masking chrX Writing 17718849 bases in 8859433 bytes
# masking chrM Writing 13794 bases in 6905 bytes
# create hard masks
mkdir hardMask
foreach c (I II III IV V X M)
echo "masking chr${c}"
/cluster/bin/i386/maskOutFa softMask/chr${c}.fa hard \
hardMask/chr${c}.fa
end
ssh kkr1u00
cd /cluster/data/ce2/softMask
mkdir -p /iscratch/i/worms/Celegans2/bothMasksFa
mkdir -p /iscratch/i/worms/Celegans2/nib
cp -p *.fa /iscratch/i/worms/Celegans2/bothMasksFa
cd /cluster/data/ce2/nib
cp -p c*.nib /iscratch/i/worms/Celegans2/nib
iSync
# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION (DONE, 2004-04-02 - hartera)
# Make symbolic links from /gbdb/ce2/nib to the real nibs.
# next machine
ssh hgwdev
mkdir -p /gbdb/ce2/nib
foreach f (/cluster/data/ce2/nib/*.nib)
ln -s $f /gbdb/ce2/nib
end
cd /cluster/data/ce2/tmp
# Load /gbdb/ce2/nib paths into database and save size info
# hgNibSeq creates chromInfo table
hgNibSeq -preMadeNib ce2 /gbdb/ce2/nib /cluster/data/ce2/sangerFa/chr*.fa
# Typical output:
# Processing /cluster/data/ce2/sangerFa/chrI.fa to /gbdb/ce2/nib/chrI.nib
# Processing /cluster/data/ce2/sangerFa/chrII.fa to /gbdb/ce2/nib/chrII.nib
# Processing /cluster/data/ce2/sangerFa/chrIII.fa to /gbdb/ce2/nib/chrIII.nib
# Processing /cluster/data/ce2/sangerFa/chrIV.fa to /gbdb/ce2/nib/chrIV.nib
# Processing /cluster/data/ce2/sangerFa/chrM.fa to /gbdb/ce2/nib/chrM.nib
# Processing /cluster/data/ce2/sangerFa/chrV.fa to /gbdb/ce2/nib/chrV.nib
# Processing /cluster/data/ce2/sangerFa/chrX.fa to /gbdb/ce2/nib/chrX.nib
# 100291769 total bases
# Verify the hgNibSeq load functioned OK:
hgsql -e "select chrom, size from chromInfo" ce2 > chrom.sizes
cat chrom.sizes
# chrom.sizes:
# chrom size
# chrI 15080483
# chrII 15279308
# chrIII 13783313
# chrIV 17493791
# chrM 13794
# chrV 20922231
# chrX 17718849
# MAKE GAP tracks AND LOAD ASSEMBLY FRAGMENTS INTO DATABASE (DONE, 2004-04-05, hartera)
# next machine
ssh hgwdev
mkdir -p /cluster/data/ce2/bed/gap
cd /cluster/data/ce2/bed/gap
# findMotif finds motifs and reports the locations of gaps as part of its output
foreach c (I II III IV V X M)
findMotif -chr=chr${c} -verbose=4 -motif=gcatg /gbdb/ce2/nib >& chr${c}Bed.stderr
end
grep -h GAP *.stderr | sed -e "s/#GAP //" > gap.bed
# hgGoldGap does not handle dir names over 2 characters;
# directory III was renamed to 3 when the agp links were set up above
hgGoldGapGl -noGl ce2 /cluster/data/ce2 sangerFa
# All the gap tables are empty
# Load in gap.bed
# Need to add extra fields to gap.bed file
cat << '_EOF_' > /cluster/data/ce2/jkStuff/createGapFile.pl
#!/usr/bin/perl -w
use strict;
my $oldchr = "";
# read gap.bed from findMotif and write one chrN_gap.bed file per chrom,
# in the gap table's column order (chrom, start, end, ix, N, size, type, bridge)
while (<STDIN>) {
    chomp;
    my @fields = split(/\t/);
    my $chr = $fields[0];
    # start a new output file whenever the chrom changes
    if ($chr ne $oldchr) {
        open(OUT, ">$chr"."_gap.bed") || die "Can not create ${chr}_gap.bed: $!";
        $oldchr = $chr;
    }
    print OUT "$fields[0]\t$fields[1]\t$fields[2]\t$fields[3]\tN\t$fields[4]\tfragment\tyes\n";
}
'_EOF_'
perl /cluster/data/ce2/jkStuff/createGapFile.pl < gap.bed
# load into relevant tables
foreach c (I II III IV V X M)
hgLoadBed -tab -oldTable ce2 chr${c}_gap chr${c}_gap.bed
end
# CREATE gc5Base wiggle TRACK (DONE, 2004-04-05, hartera)
# Perform a gc count with a 5 base window.
# Also compute a "zoomed" view for display efficiency.
mkdir /cluster/data/ce2/bed/gc5Base
cd /cluster/data/ce2/bed/gc5Base
# in the script below, the 'grep -w GC' selects the lines of
# output from hgGcPercent that are real data and not just
# informational messages from hgGcPercent.  The awk converts the
# 0-based start coordinate to a 1-based <position> ($2+1), and the
# division by 10.0 scales the per-mil GC values from hgGcPercent
# down to the range [0-100].  Two columns come out of the awk print
# statement: <position> and <value>, which are fed into
# wigAsciiToBinary through the pipe.  It is set at a dataSpan of 5
# because each value represents the measurement over five bases
# beginning with <position>.  The result files end up in ./wigData5.
cat << '_EOF_' > /cluster/data/ce2/jkStuff/runGcPercent.sh
#!/bin/sh
mkdir -p wigData5
mkdir -p dataLimits5
for n in /cluster/data/ce2/nib/*.nib
do
c=`basename ${n} | sed -e "s/.nib//"`
C=`echo $c | sed -e "s/chr//"`
echo -n "working on ${c} - ${C} ... "
hgGcPercent -chr=${c} -doGaps \
-file=stdout -win=5 ce2 /cluster/data/ce2/nib | grep -w GC | \
awk '{printf "%d\t%.1f\n", $2+1, $5/10.0 }' | \
wigAsciiToBinary \
-dataSpan=5 -chrom=${c} -wibFile=wigData5/gc5Base_${C} \
-name=${C} stdin 2> dataLimits5/${c}
echo "done"
done
'_EOF_'
chmod +x /cluster/data/ce2/jkStuff/runGcPercent.sh
# This is going to take perhaps two hours to run.  It is a lot of
# data.  Make sure you do it on the fileserver:
ssh eieio
cd /cluster/data/ce2/bed/gc5Base
/cluster/data/ce2/jkStuff/runGcPercent.sh
# load the .wig files back on hgwdev:
ssh hgwdev
cd /cluster/data/ce2/bed/gc5Base
hgLoadWiggle ce2 gc5Base wigData5/*.wig
# and symlink the .wib files into /gbdb
mkdir /gbdb/ce2/wib
ln -s `pwd`/wigData5/*.wib /gbdb/ce2/wib
# to speed up display for whole chromosome views, compute a "zoomed"
# view and load that on top of the existing table. The savings
# comes from the number of data table rows the browser needs to load
# for a full chromosome view. Without the zoomed view there are
# over 43,000 data rows for chrom 1. With the zoomed view there are
# only 222 rows needed for the display. If your original data was
# at 1 value per base the savings would be even greater.
# Pretty much the same data calculation
# situation as above, although this time note the use of the
# 'wigZoom -dataSpan=1000 stdin' in the pipeline. This will average
# together the data points coming out of the awk print statement over
# a span of 1000 bases. Thus each <position> coming out of wigZoom
# will represent the measurement of GC in the next 1000 bases. Note
# the use of -dataSpan=1000 on the wigAsciiToBinary to account for
# this type of data. You want your dataSpan here to be an exact
# multiple of your original dataSpan (5*200=1000) and on the order
# of at least 1000, doesn't need to go too high. For data that is
# originally at 1 base per value, a convenient span is: -dataSpan=1024
# A new set of result files ends up in ./wigData5_1K/*.wi[gb]
cat << '_EOF_' > /cluster/data/ce2/jkStuff/runZoom.sh
#!/bin/sh
mkdir -p wigData5_1K
mkdir -p dataLimits5_1K
for n in /cluster/data/ce2/nib/*.nib
do
c=`basename ${n} | sed -e "s/.nib//"`
C=`echo $c | sed -e "s/chr//"`
echo -n "working on ${c} - ${C} ... "
hgGcPercent -chr=${c} -doGaps \
-file=stdout -win=5 ce2 /cluster/data/ce2/nib | grep -w GC | \
awk '{printf "%d\t%.1f\n", $2+1, $5/10.0}' | \
wigZoom -dataSpan=1000 stdin | wigAsciiToBinary \
-dataSpan=1000 -chrom=${c} -wibFile=wigData5_1K/gc5Base_${C}_1K \
-name=${C} stdin 2> dataLimits5_1K/${c}
echo "done"
done
'_EOF_'
chmod +x /cluster/data/ce2/jkStuff/runZoom.sh
# This is going to take even longer than above, certainly do this
# on the fileserver
ssh eieio
cd /cluster/data/ce2/bed/gc5Base
time /cluster/data/ce2/jkStuff/runZoom.sh
# 520.000u 19.310s 6:15.09 143.7% 0+0k 0+0io 7875pf+0w
# Then load these .wig files into the same database as above
ssh hgwdev
hgLoadWiggle -oldTable ce2 gc5Base wigData5_1K/*.wig
# and symlink these .wib files into /gbdb
mkdir -p /gbdb/ce2/wib
ln -s `pwd`/wigData5_1K/*.wib /gbdb/ce2/wib
# The browser needs to be fixed so it can display the assembly track
# without the need for the gap tables to exist.
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR CE2 (DONE, 2004-04-05, hartera)
# next machine
ssh hgwdev
# Make trackDb table so browser knows what to expect:
cd $HOME/kent/src/hg/makeDb/trackDb
cvs up -d -P
# Edit that makefile to add ce2 in all the right places and do
make update
make alpha
cvs commit -m "Added ce2" makefile
# Add dbDb and defaultDb entries:
echo 'insert into dbDb (name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk) \
values("ce2", "March 2004", \
"/gbdb/ce2/nib", "Worm", "chrII:14642289-14671631", 1, \
60, "C. elegans", "Caenorhabditis elegans", \
"/gbdb/ce2/html/description.html", 0);' \
| hgsql -h genome-testdb hgcentraltest
echo 'update defaultDb set name = "ce2" where genome = "C. elegans"' \
| hgsql -h genome-testdb hgcentraltest
# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE, 2004-04-05, hartera)
ssh hgwdev
mkdir /cluster/data/ce2/html
cd /cluster/data/ce2/html
# make a symbolic link from /gbdb/ce2/html to /cluster/data/ce2/html
ln -s /cluster/data/ce2/html /gbdb/ce2/html
# Write a description.html - copy from /cluster/data/ce1/html/
# with a description of the assembly and some sample position queries.
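# e.g., a sketch of that copy-and-edit step:
cp -p /cluster/data/ce1/html/description.html /cluster/data/ce2/html/
# then edit description.html for the ce2 (WS120) assembly and its sample positions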
# create ce2 dir in trackDb/worm and commit to CVS
mkdir ~/kent/src/hg/makeDb/trackDb/worm/ce2
cd ~/kent/src/hg/makeDb/trackDb/worm
cvs add ce2
cvs commit ce2
# Copy description.html also to ~/kent/src/hg/makeDb/trackDb/worm/ce2/description.html
chmod a+r $HOME/kent/src/hg/makeDb/trackDb/worm/ce2/description.html
# Check it in and copy (ideally using "make alpha" in trackDb) to
# /gbdb/ce2/html
cvs commit description.html
# MAKE SANGER GENE (WORM BASE GENES) TRACK (DONE, 2004-04-09, hartera)
# ADD MISSING ORFS AND PROTEIN NAMES TO sangerLinks TABLE AND
# ADD proteinID COLUMN TO SANGERGENE TABLE FOR USE BY THE
# FAMILY BROWSER (DONE, 2004-05-20, hartera)
# next machine
ssh eieio
mkdir -p /cluster/data/ce2/bed/sangerGene
cd /cluster/data/ce2/bed/sangerGene
# the perl filter trims these files down to a reasonable size; unfiltered,
# they cause ldHgGene to hang due to memory usage.
foreach f (I II III IV V X M)
echo -n "${f} "
zcat ../../sanger/CHROMOSOME_${f}.gff.gz | \
sed -e "s/CHROMOSOME_III/chrIII/g" -e "s/CHROMOSOME_II/chrII/g" \
-e "s/CHROMOSOME_IV/chrIV/g" -e "s/CHROMOSOME_I/chrI/g" \
-e "s/CHROMOSOME_X/chrX/g" -e "s/CHROMOSOME_V/chrV/g" \
-e "s/CHROMOSOME_M/chrM/g" \
-e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' | \
perl -ne '@a = split; \
print if($a[1] =~ /curated|DNA|RNA/i && \
$a[2] =~ /intron|exon|cds|sequence|transcri/i);' > chr${f}.gff
end
echo
# remove CDS before id e.g. from CDS "Y74C9A.2" and remove "
perl -pi.bak -e 's/CDS "//' chr*.gff
perl -pi.bak -e 's/"//' chr*.gff
# check this worked ok then cleanup
rm *.bak
# check file sizes, should be reasonable
ls -ogrt
# total 23352
# -rw-rw-r-- 1 3430979 Apr 9 16:09 chrIII.gff
# -rw-rw-r-- 1 3885105 Apr 9 16:09 chrII.gff
# -rw-rw-r-- 1 3680753 Apr 9 16:09 chrI.gff
# -rw-rw-r-- 1 4926390 Apr 9 16:09 chrV.gff
# -rw-rw-r-- 1 3635 Apr 9 16:09 chrM.gff
# -rw-rw-r-- 1 3854560 Apr 9 16:09 chrIV.gff
# -rw-rw-r-- 1 4085489 Apr 9 16:09 chrX.gff
# now load database with those transformed gff files
# next machine
ssh hgwdev
cd /cluster/data/ce2/bed/sangerGene
# 2004-05-10, hartera, Reload sangerGene table using extended GenePred
# format. 2004-05-11, hartera. Extended format frame information does not
# look correct. Reload without the extended fields.
ldHgGene ce2 sangerGene *.gff
# Read 41259 transcripts in 423845 lines in 7 files
# 41259 groups 7 seqs 11 sources 6 feature types
# 23076 gene predictions
# if you need to delete that table:
echo 'drop table sangerGene' | hgsql ce2
# worm/ce2 trackDb.ra entry
# track sangerGene
# shortLabel WormBase Genes
# longLabel WormBase Gene Annotations
# group genes
# priority 48
# visibility pack
# color 0,0,200
# chromosomes chrI,chrII,chrIII,chrIV,chrV,chrX,chrM
# type genePred sangerPep
# url http://www.wormbase.org/db/gene/gene?name=$$
# hgGene on
# Add proteinID field to sangerGene table, used by Gene Sorter
ssh hgwdev
cd /cluster/data/ce2/bed/sangerGene
hgsql -e 'alter table sangerGene add proteinID varchar(40) NOT NULL;' ce2
# To add index on this column
hgsql -e 'alter table sangerGene add index(proteinID);' ce2
# Add Swiss-Prot protein IDs to this column
# There are 23076 entries in sangerGene and 21780 of the names
# are in sangerLinks as orfName
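# e.g., to verify the first count (a sketch):
hgsql -N -e 'select count(*) from sangerGene;' ce2
# should report 23076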
hgsql -N -e 'select count(*) from sangerGene as g,sangerLinks as l \
where g.name = l.orfName;' ce2
# 21780
# get names from sangerGene and sangerLinks tables
hgsql -N -e "select name from sangerGene;" ce2 | sort > name.sangerGene
hgsql -N -e "select orfName from sangerLinks;" ce2 | sort > orfName.sangerLinks
# get list of names in sangerGene not in sangerLinks
comm -23 name.sangerGene orfName.sangerLinks > geneNames.notin.sangerLinks
# Go to the WS120 WormBase mirror site and check SwissProt IDs
# http://ws120.wormbase.org/db/searches/info_dump
# Some of these genes are non coding RNAs or have no SwissProt ID BUT
# others in the geneNames.notin.sangerLinks list do have a Swiss-Prot ID
# Download IDs in batches of about 400 gene names from
# geneNames.notin.sangerLinks and add them to the file SPIds.wormPep.WS120
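# e.g., to make the ~400-name batches for pasting into that form (a sketch;
# the batch file prefix is arbitrary):
split -l 400 geneNames.notin.sangerLinks notinBatch.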
# Create a perl script to parse out Swiss-Prot IDs for those genes that
# have them and create the sql statements to insert them into sangerLinks
cat << '_EOF_' > getSPIDsandLoad.pl
#!/usr/bin/perl -w
use strict;
my %SPhash;
my $found = "false";
my $gene = "";
my $SP = "";
open(SP, ">addSPIds.sql") || die "Can not create addSPIds.sql: $!";
while(<STDIN>) {
chomp;
my @fields = split(/\t/);
if ($fields[0] =~ /^Gene:/) {
$gene = $fields[1];
}
elsif (($fields[0] =~ /^GenPep:/) && ($gene ne "n/a") ){
# if there are 2 fields then store second as Swiss-Prot ID
if ($#fields == 1) {
$SP = $fields[1];
}
# add gene and Swiss-Prot ID to hash
$SPhash{$gene} = $SP;
$gene = "";
$SP = "";
}
}
foreach my $g (keys(%SPhash) ) {
my $s = $SPhash{$g};
# print if there is a Swiss-Prot ID
if ($s =~ /^[A-Z]{1}[0-9]+/) {
print "Gene: $g\tSP ID: $s\n";
# create mySQL to add this to the sangerLinks table;
print SP "INSERT into sangerLinks (orfName, protName, description) VALUES(\"$g\",\"$s\",\"\");\n"
}
}
close SP;
'_EOF_'
# run perl script
perl getSPIDsandLoad.pl < SPIds.wormPep.WS120 > orfsandSPIds.out
# There are 111 of these genes with SwissProt IDs
# To insert new orfNames and SwissProt IDs into table:
hgsql ce2 < addSPIds.sql
# Now 21891 names in sangerGene that match orfNames in sangerLinks
# Use SwissProt IDs in sangerLinks table to fill proteinID
# column in the sangerGene table
hgsql -e 'update sangerGene as g, sangerLinks as l \
set g.proteinID = l.protName where g.name = l.orfName;' ce2
# check there are 21891 rows with the protName field filled
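# e.g., to check the proteinID column just filled (a sketch):
hgsql -N -e 'select count(*) from sangerGene where proteinID != "";' ce2
# should report 21891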
# MYTOUCH FIX - Jen - 2006-01-24
sudo mytouch ce2 orfToGene 0505031600.00
sudo mytouch ce2 sangerBlastTab 0505031600.00
sudo mytouch ce2 sangerCanonical 0505031600.00
sudo mytouch ce2 sangerIsoforms 0505031600.00
sudo mytouch ce2 sangerLinks 0505031600.00
sudo mytouch ce2 sangerPep 0505031600.00
sudo mytouch ce2 sangerToKim 0505031600.00
sudo mytouch ce2 sangerToPfam 0505031600.00
sudo mytouch ce2 sangerToRefSeq 0505031600.00
# MAKE WORMBASE GENEFINDER TRACKS (DONE, 2004-04-12, hartera)
# next machine
ssh eieio
mkdir -p /cluster/data/ce2/bed/sangerGenefinder
cd /cluster/data/ce2/bed/sangerGenefinder
# the perl filter trims these files down to a reasonable size; unfiltered,
# they cause ldHgGene to hang due to memory usage.
foreach f (I II III IV V X M)
echo -n "${f} "
zcat ../../sanger/CHROMOSOME_${f}.gff.gz | \
sed -e "s/CHROMOSOME_III/chrIII/g" -e "s/CHROMOSOME_II/chrII/g" \
-e "s/CHROMOSOME_IV/chrIV/g" -e "s/CHROMOSOME_I/chrI/g" \
-e "s/CHROMOSOME_X/chrX/g" -e "s/CHROMOSOME_V/chrV/g" \
-e "s/CHROMOSOME_M/chrM/g" \
-e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' | \
perl -ne '@a = split; \
print if($a[1] =~ /Genefinder/i && \
$a[2] =~ /intron|exon|cds|sequence|transcri/i);' > chr${f}.gff
end
echo
# remove CDS before id e.g. from CDS "Y74C9A.2" and remove "
perl -pi.bak -e 's/CDS "//' chr*.gff
perl -pi.bak -e 's/"//' chr*.gff
# check this worked ok then cleanup
rm *.bak
# check file sizes, should be reasonable
ls -ogrt
# total 23712
# -rw-rw-r-- 1 3984558 Apr 12 09:14 chrII.gff
# -rw-rw-r-- 1 3702836 Apr 12 09:14 chrI.gff
# -rw-rw-r-- 1 0 Apr 12 09:14 chrM.gff
# -rw-rw-r-- 1 3948579 Apr 12 09:14 chrIV.gff
# -rw-rw-r-- 1 3364561 Apr 12 09:14 chrIII.gff
# -rw-rw-r-- 1 5303266 Apr 12 09:14 chrV.gff
# -rw-rw-r-- 1 3929597 Apr 12 09:14 chrX.gff
# now load database with those transformed gff files
# next machine
ssh hgwdev
cd /cluster/data/ce2/bed/sangerGenefinder
# 2004-05-10, hartera, Reload sangerGene table using extended GenePred
# format. 2004-05-11, hartera. Extended format frame information does not
# look correct. Reload without the extended fields.
ldHgGene ce2 sangerGenefinder *.gff
# Read 36976 transcripts in 401688 lines in 7 files
# 36976 groups 6 seqs 1 sources 4 feature types
# 21180 gene predictions
# if you need to delete that table:
echo 'drop table sangerGenefinder' | hgsql ce2
# RUN Waba alignment with briggsae (WORKING - 2004-04-06 - Hiram)
# prepare contigs from C. briggsae
# Assumes C. briggsae data has been downloaded according to
# makeCb1.doc
# using briggsae contigs from previous work:
#   /iscratch/i/worms/Cbriggsae/contigs
# next machine
ssh kk
mkdir -p /cluster/data/ce2/bed/waba/out
cd /cluster/data/ce2/bed/waba
ls -1S /iscratch/i/worms/Cbriggsae/contigs/c*.fa > briggsae.lst
ls -1S /iscratch/i/worms/Celegans2/unmaskedFa/chr*.fa > elegans.lst
cat elegans.lst | while read FN
do
b=`basename ${FN}`
mkdir out/${b%%.fa}
done
# create scripts to be used here
cat << '_EOF_' > wabaRun
#!/bin/csh -fe
#
# $1 - full pathname to a briggsae contig
# $2 - file path to an elegans chrom.fa
# $3 - result file full pathname
#
set f = $1:t
set chr = $2:t
set d = $chr:r
mkdir -p /tmp/$d/$f
cp $1 /tmp/$d/$f
pushd .
cd /tmp/$d/$f
set t = $f:r
/cluster/home/hiram/bin/i386/waba 1 $f $2 $t.1
/cluster/home/hiram/bin/i386/waba 2 $t.1 $t.2
/cluster/home/hiram/bin/i386/waba 3 $t.2 $t.wab
cp $t.wab $3
popd
rm -f /tmp/$d/$f/$t.*
rmdir --ignore-fail-on-non-empty /tmp/$d/$f
rmdir --ignore-fail-on-non-empty /tmp/$d
'_EOF_'
chmod +x wabaRun
cat << '_EOF_' > jobTemplate
#LOOP
/cluster/store5/worm/ce2/bed/waba/wabaRun {check in exists+ $(path1)} {check in exists+ $(path2)} {check out exists /cluster/store5/worm/ce2/bed/waba/out/$(root2)/$(root1).wab}
#ENDLOOP
'_EOF_'
gensub2 briggsae.lst elegans.lst jobTemplate jobList
para create jobList
para try
para check
para push
# one of the jobs takes quite a while. Most of the others are OK:
# Completed: 6950 of 6951 jobs
# Crashed: 1 jobs
# CPU time in finished jobs: 19284472s 321407.87m 5356.80h 223.20d 0.612 y
# IO & Wait Time: 63556s 1059.26m 17.65h 0.74d 0.002 y
# Average job time: 2784s 46.40m 0.77h 0.03d
# Longest job: 47017s 783.62m 13.06h 0.54d
# Submission to last job: 58555s 975.92m 16.27h 0.68d
# The failed job is:
# /cluster/store5/worm/ce2/bed/waba/wabaRun \
# /iscratch/i/worms/Cbriggsae/contigs/c0907.fa \
# /iscratch/i/worms/Celegans2/unmaskedFa/chrI.fa \
# /cluster/store5/worm/ce2/bed/waba/out/chrI/c0907.wab
# XXXX - running 2004-04-07 - retrying this stand-alone on kkr1u00
# after about 1.5 hours fails with the error:
# waba: xenbig.c:550: mergeTwo: Assertion `c->qStart == qStart &&
# c->qEnd == qEnd' failed.
# Abort
# Ignoring that error, and moving on
# next machine
ssh hgwdev
cd /cluster/data/ce2/bed/waba
mkdir Load
cd Load
# The cat through the pipe to hgWaba will avoid making large files
# that are not needed.
cat << '_EOF_' > loadEm.sh
#!/bin/sh
#
for c in I II III IV V X M
do
echo -n "${c} "
cat /cluster/store5/worm/ce2/bed/waba/out/chr${c}/c*.wab |
/cluster/bin/i386/hgWaba ce2 Cbr chr${c} 0 stdin > proc${c}.out 2>&1
done
exit 0
'_EOF_'
chmod +x loadEm.sh
# run it to load the waba track:
./loadEm.sh
# remove garbage temp files:
rm full_waba.tab chrom_waba.tab
# worm/ce2/trackDb.ra entry:
# track cbrWaba
# shortLabel Briggsae Waba
# longLabel C. briggsae Waba Alignments
# group compGeno
# priority 20
# visibility dense
# color 140,0,200
# altColor 210,140,250
# MAKE 2BIT FILE FOR BLAT (DONE, 2004-04-07, hartera)
# Make one big 2bit file as well, and make a link to it in
# /gbdb/ce2/nib because hgBlat looks there:
ssh eieio
cd /cluster/data/ce2
faToTwoBit softMask/chr*.fa ce2.2bit
ssh hgwdev
ln -s /cluster/data/ce2/ce2.2bit /gbdb/ce2/nib/
# MAKE 11.OOC FILE FOR BLAT (DONE, 2004-04-07, hartera)
# Use -repMatch=40 (scaled by genome size; for human use 1024)
ssh kkr1u00
mkdir /cluster/bluearc/ce2
mkdir /cluster/data/ce2/bed/ooc
cd /cluster/data/ce2/bed/ooc
ls -1 /cluster/data/ce2/nib/chr*.nib > nib.lst
/cluster/bin/i386/blat nib.lst /dev/null /dev/null -tileSize=11 \
-makeOoc=/cluster/bluearc/ce2/11.ooc -repMatch=40
# Writing /cluster/bluearc/ce2/11.ooc
# Wrote 43676 overused 11-mers to /cluster/bluearc/ce2/11.ooc
cp -p /cluster/bluearc/ce2/11.ooc /iscratch/i/worms/Celegans2/
iSync
# AUTO UPDATE GENBANK MRNA AND EST RUN (DONE, 2004-04-07, hartera)
# next machine
ssh hgwdev
# Update genbank config and source in CVS:
cd ~/kent/src/hg/makeDb/genbank
cvsup .
# See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in
# edits, check them in if necessary:
diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf
# Edit etc/genbank.conf: default includes native genbank mRNAs and ESTs,
# genbank xeno mRNAs but no ESTs, native RefSeq mRNAs but not xeno RefSeq
# Add these lines:
# ce2 (C. elegans)
ce2.genome = /iscratch/i/worms/Celegans2/nib/chr*.nib
ce2.lift = no
ce2.downloadDir = ce2
cvs commit -m "Added ce2" etc/genbank.conf
make
# markd added -maxIntron=100000 for ce to genbank/src/align/gbBlat
# Edit src/align/gbBlat to add /iscratch/i/worms/Celegans2/11.ooc
cvs diff src/align/gbBlat
make
cvs commit -m "Added 11.ooc for ce2" src/align/gbBlat
# Install to /cluster/data/genbank:
make install-server
# next machine
ssh eieio
cd /cluster/data/genbank
# This is an -initial run, RefSeq mRNA only:
nice bin/gbAlignStep -srcDb=refseq -type=mrna -verbose=1 -initial ce2
# Load results for RefSeq mRNAs:
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad ce2
# To start next run, results need to be moved out of the way
cd /cluster/data/genbank/work
mv initial.ce2 initial.ce2.refseq.mrna
# -initial for GenBank mRNAs
ssh eieio
cd /cluster/data/genbank
nice bin/gbAlignStep -srcDb=genbank -type=mrna -verbose=1 -initial ce2
# Load results for GenBank mRNAs
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 ce2
cd /cluster/data/genbank/work
mv initial.ce2 initial.ce2.genbank.mrna
# -initial for ESTs
ssh eieio
cd /cluster/data/genbank
nice bin/gbAlignStep -srcDb=genbank -type=est -verbose=1 -initial ce2
# Load results for GenBank ESTs
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 ce2
cd /cluster/data/genbank/work
mv initial.ce2 initial.ce2.genbank.est
# Everything loaded ok, so clean up
rm -r /cluster/data/genbank/work/initial.ce2*
# REMOVE XENO mRNAs (DONE, 2004-04-09, hartera)
# Remove xeno mRNAs; too many are distantly related and do
# not produce meaningful alignments with Blat
ssh hgwdev
# Remove all mrna.xeno.* files from
# /cluster/data/genbank/data/aligned/genbank.140.0/ce2
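# e.g., a sketch of that removal:
find /cluster/data/genbank/data/aligned/genbank.140.0/ce2 \
    -name 'mrna.xeno.*' -exec rm {} \;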
# edit /cluster/data/genbank/etc/genbank.conf
cd ~/kent/src/hg/makeDb/genbank
# add line to ce2:
ce2.genbank.mrna.xeno.load = no
# so now it is:
# ce2 (C. elegans)
ce2.genome = /iscratch/i/worms/Celegans2/nib/chr*.nib
ce2.lift = no
ce2.genbank.mrna.xeno.load = no
ce2.downloadDir = ce2
make
cvs update etc/genbank.conf
cvs commit etc/genbank.conf
make install-server
cd /cluster/data/genbank/
# reload only the native mRNAs to the database
nice bin/gbDbLoadStep -reload -type=mrna -srcDb=genbank -verbose=1 ce2
# drop xenoMrna table
echo 'drop table xenoMrna' | hgsql ce2
# PRODUCING FUGU BLAT ALIGNMENTS (DONE, 2004-04-07, hartera)
# Assumes masked NIBs have been prepared as above
# and Fugu pieces are already on kluster /iscratch/i.
# next machine
ssh kk
mkdir /cluster/data/ce2/bed/blatFr1
cd /cluster/data/ce2/bed/blatFr1
ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
ls -1S /iscratch/i/worms/Celegans2/nib/*.nib > worm.lst
cat << '_EOF_' > gsub
#LOOP
blat -mask=lower -q=dnax -t=dnax {check in exists $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
mkdir psl
gensub2 worm.lst fugu.lst gsub spec
para create spec
para try, check, push, check, ...
# para time
# Completed: 4046 of 4046 jobs
# CPU time in finished jobs: 263359s 4389.31m 73.16h 3.05d 0.008 y
# IO & Wait Time: 13317s 221.95m 3.70h 0.15d 0.000 y
# Average job time: 68s 1.14m 0.02h 0.00d
# Longest job: 1103s 18.38m 0.31h 0.01d
# Submission to last job: 2170s 36.17m 0.60h 0.03d
# When cluster run is done, sort alignments
# next machine
ssh eieio
cd /cluster/data/ce2/bed/blatFr1
foreach d (I II III IV V X M)
echo -n "${d} "
pslCat psl/chr${d}_*.psl | \
pslSortAcc nohead chrom temp stdin
rm -f chrom/chr${d}_blatFr1.psl
mv chrom/chr${d}.psl chrom/chr${d}_blatFr1.psl
end
# next machine
ssh hgwdev
# Load Fugu Blat alignments
cd /cluster/data/ce2/bed/blatFr1/chrom
cat *.psl | hgLoadPsl -fastLoad -table=blatFr1 ce2 stdin
# Make fugu /gbdb/ symlink and load Fugu sequence data.
mkdir /gbdb/ce2/fuguSeq
ln -s /cluster/data/fr1/fugu_v3.masked.fa /gbdb/ce2/fuguSeq
cd /cluster/data/ce2/bed/blatFr1
hgLoadSeq ce2 /gbdb/ce2/fuguSeq/fugu_v3.masked.fa
# Adding /gbdb/ce2/fuguSeq/fugu_v3.masked.fa
# 20379 sequences
# worm/ce2 trackDb.ra entry
# track blatFr1
# shortLabel Fugu Blat
# longLabel Takifugu rubripes (fr1) Translated Blat Alignments
# group compGeno
# priority 110
# visibility dense
# color 0,60,120
# altColor 200,220,255
# spectrum on
# type psl xeno
# BLASTZ C. Briggsae (Cb1) (DONE, 2004-04-13, hartera)
# next machine
ssh kkr1u00
# blastz requires lineage-specific repeats but there are none for the worms
# so create empty files for each chromosome and iSync
mkdir -p /iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae
cd /iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae
# create chrI.out.spec and cp to chrN.out.spec for chrII, chrIII, chrIV, chrV, chrX, chrM
mkdir -p /iscratch/i/worms/Cbriggsae/linSpecRep.notinCelegans
# create empty chrUn.out.spec file
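# e.g., a sketch of the two steps above (empty placeholder files are
# sufficient here since BLASTZ_ABRIDGE_REPEATS=0 in the DEF file below):
foreach c (I II III IV V X M)
  touch /iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae/chr${c}.out.spec
end
touch /iscratch/i/worms/Cbriggsae/linSpecRep.notinCelegans/chrUn.out.spec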
iSync
ssh kk
mkdir -p /cluster/data/ce2/bed/blastz.cb1.2004-04-12
cd /cluster/data/ce2/bed
ln -s blastz.cb1.2004-04-12 blastz.cb1
cd blastz.cb1
cat << '_EOF_' > DEF
# C. elegans vs. C. briggsae
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/home/angie/schwartzbin:/cluster/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
#BLASTZ_ABRIDGE_REPEATS=1
# when SMSK=/dev/null
BLASTZ_ABRIDGE_REPEATS=0
# TARGET
SEQ1_DIR=/iscratch/i/worms/Celegans2/nib
# RMSK not currently used
SEQ1_RMSK=/iscratch/i/worms/Celegans2/rmsk
SEQ1_SMSK=/iscratch/i/worms/Celegans2/linSpecRep.notinCbriggsae
# FLAG not currently used
SEQ1_FLAG=-worm
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
SEQ2_DIR=/iscratch/i/worms/Cbriggsae/bothMasksNib
# RMSK not currently used
SEQ2_RMSK=/iscratch/i/worms/Cbriggsae/rmsk
# FLAG not currently used
SEQ2_SMSK=/iscratch/i/worms/Cbriggsae/linSpecRep.notinCelegans
SEQ2_FLAG=-worm
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=30000000
SEQ2_LAP=0
BASE=/cluster/data/ce2/bed/blastz.cb1
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
#DEBUG=1
'_EOF_'
# << this line keeps emacs coloring happy
# Save the DEF file in the current standard place
cp DEF ~angie/hummus/DEF.ce2-cb1.2004-04-12
# Need shell scripts from mm4 to do cluster runs
cp -p /cluster/data/mm4/jkStuff/BlastZ*.sh /cluster/data/ce2/jkStuff/
# prepare first cluster run
ssh kk
cd /cluster/data/ce2/bed/blastz.cb1
source DEF
/cluster/data/ce2/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
# para time
# Completed: 56 of 56 jobs
# CPU time in finished jobs: 155723s 2595.39m 43.26h 1.80d 0.005 y
# IO & Wait Time: 6387s 106.45m 1.77h 0.07d 0.000 y
# Average job time: 2895s 48.25m 0.80h 0.03d
# Longest job: 6137s 102.28m 1.70h 0.07d
# Submission to last job: 11078s 184.63m 3.08h 0.13d
# Second cluster run to convert the .out's to .lav's
cd /cluster/data/ce2/bed/blastz.cb1
source DEF
/cluster/data/ce2/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# para time
# Completed: 14 of 14 jobs
# CPU time in finished jobs: 762s 12.70m 0.21h 0.01d 0.000 y
# IO & Wait Time: 62s 1.03m 0.02h 0.00d 0.000 y
# Average job time: 59s 0.98m 0.02h 0.00d
# Longest job: 115s 1.92m 0.03h 0.00d
# Submission to last job: 293s 4.88m 0.08h 0.00d
# Third cluster run to convert lav's to axt's
cd /cluster/data/ce2/bed/blastz.cb1
source DEF
/cluster/data/ce2/jkStuff/BlastZ_run2.sh
cd run.2
para try
para check
# only 7 jobs so all completed by para try
# para time
# Completed: 7 of 7 jobs
# CPU time in finished jobs: 187s 3.11m 0.05h 0.00d 0.000 y
# IO & Wait Time: 169s 2.82m 0.05h 0.00d 0.000 y
# Average job time: 51s 0.85m 0.01h 0.00d
# Longest job: 77s 1.28m 0.02h 0.00d
# Submission to last job: 77s 1.28m 0.02h 0.00d
# translate sorted axt files into psl
ssh eieio
cd /cluster/data/ce2/bed/blastz.cb1
mkdir -p pslChrom
set tbl = "blastzCb1"
foreach f (axtChrom/chr*.axt)
set c=$f:t:r
echo "Processing chr $c"
/cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
end
# Load database tables
ssh hgwdev
cd /cluster/data/ce2/bed/blastz.cb1/pslChrom
foreach d (I II III IV V X M)
/cluster/bin/i386/hgLoadPsl -noTNameIx ce2 chr${d}_blastzCb1.psl
end
# CHAIN Cb1 BLASTZ (DONE, 2004-04-30, hartera)
# CHAINS ARE RE-DONE DUE TO BUG IN axtChain which has now been fixed by Jim
ssh kk
mkdir -p /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29/run1
cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29/run1
mkdir out chain
ls -1S /cluster/data/ce2/bed/blastz.cb1/axtChrom/*.axt > input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
cat << '_EOF_' > doChain
#!/bin/csh
axtFilter $1 | /cluster/home/hartera/bin/i386/axtChain stdin \
/iscratch/i/worms/Celegans2/nib \
/iscratch/i/worms/Cbriggsae/bothMasksNib $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
# only 7 jobs so all done by para try
para try
para check
# para time
# Completed: 7 of 7 jobs
# CPU time in finished jobs: 1265s 21.08m 0.35h 0.01d 0.000 y
# IO & Wait Time: 69s 1.16m 0.02h 0.00d 0.000 y
# Average job time: 191s 3.18m 0.05h 0.00d
# Longest job: 632s 10.53m 0.18h 0.01d
# Submission to last job: 632s 10.53m 0.18h 0.01d
# now on the file server, sort chains
ssh eieio
cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
time chainMergeSort run1/chain/*.chain > all.chain
# User 37.040u
# System 3.090s
# Elapsed Real 0:42.54
time chainSplit chain all.chain
# User 37.310u
# System 2.930s
# Elapsed Real 0:43.85
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29/chain
foreach i (*.chain)
set c = $i:r
hgLoadChain ce2 ${c}_chainCb1 $i
echo done $c
end
# NET Cb1 (DONE, 2004-04-30, hartera)
# REMAKE NET Cb1 WITH NEW CHAINS
ssh eieio
cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
mkdir preNet
mv /cluster/data/ce2/tmp/chrom.sizes /cluster/data/ce2/
# remove header line in file so chainPreNet will work
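# e.g. (a sketch, assuming the header is the "chrom size" line left by the
# hgsql query that created chrom.sizes above):
sed '1d' /cluster/data/ce2/chrom.sizes > /cluster/data/ce2/chrom.sizes.clean
mv /cluster/data/ce2/chrom.sizes.clean /cluster/data/ce2/chrom.sizes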
cd chain
foreach i (*.chain)
echo preNetting $i
/cluster/bin/i386/chainPreNet $i /cluster/data/ce2/chrom.sizes \
/cluster/data/cb1/chrom.sizes ../preNet/$i
end
cd ..
mkdir n1
cd preNet
foreach i (*.chain)
set n = $i:r.net
echo primary netting $i
/cluster/bin/i386/chainNet $i -minSpace=1 /cluster/data/ce2/chrom.sizes \
/cluster/data/cb1/chrom.sizes ../n1/$n /dev/null
end
cd ..
cat n1/*.net | /cluster/bin/i386/netSyntenic stdin hNoClass.net
# memory usage 84246528, utime 710 s/100, stime 70
ssh hgwdev
cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
# add classification info using db tables
# use -noAr option - don't look for ancient repeats
time netClass -noAr hNoClass.net ce2 cb1 cb1.net
# User 11.860u
# System 2.090s
# Elapsed Real 0:16.24
# Load the nets into database
ssh hgwdev
cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
netFilter -minGap=10 cb1.net | hgLoadNet ce2 netCb1 stdin
# MAKE axtNet TRACK (DONE, 2004-04-30, hartera)
# TRACK REMADE WITH NEW CHAINS
# Move back to the file server to create axt files corresponding to the net
ssh kksilo
cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
# try without netFilter as not working well
# netFilter -minGap=10 cb1.net > cb1Filt.net
mkdir /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29
netSplit cb1.net cb1NetSplit
cd cb1NetSplit
foreach i (*.net)
set c = $i:r
netToAxt -maxGap=300 $i \
/cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29/chain/$c.chain \
/iscratch/i/worms/Celegans2/nib \
/iscratch/i/worms/Cbriggsae/bothMasksNib \
/cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/$c.axt
echo done ../axtNet.2004-04-29/$c.axt
end
cd ..
rm -r cb1NetSplit
# sort axt files based on target position
mkdir /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet
cd /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29
foreach i (*.axt)
set c = $i:r
axtSort $c.axt ./sortedaxtNet/$c.axt
echo done sorting $c.axt
end
# Load up the axtNet (alignment score wiggle) track:
ssh hgwdev
mkdir /gbdb/ce2/axtNetCb1
foreach f (/cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet/chr*.axt)
ln -s $f /gbdb/ce2/axtNetCb1
end
cd /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet
${HOME}/bin/i386/hgLoadAxt ce2 axtNetCb1
# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR CE2 (DONE, 2004-04-13, hartera)
# next machine
ssh hgwdev
# DNA port is "0", trans prot port is "1"
# 8/26/04: set canPcr=1 for untranslated blat server.
echo 'insert into blatServers values("ce2", "blat1", 17781, 0, 1); \
insert into blatServers values("ce2", "blat1", 17780, 1, 0);' \
| hgsql -h genome-testdb hgcentraltest
# if you need to delete those entries
echo 'delete from blatServers where db="ce2";' \
| hgsql -h genome-testdb hgcentraltest
# to check the entries:
echo 'select * from blatServers where db="ce2";' \
| hgsql -h genome-testdb hgcentraltest
# CREATE ORFTOGENE TABLE (Used by hgGene and hgNear) (DONE, 2004-04-27, hartera)
mkdir /cluster/data/ce2/bed/orfToGene
cd !$
# gene_names.txt for WS120 was created and provided by todd.harris@cshl.edu
# ORF e.g. Y110A7A.10 gene e.g. aap-1
awk -F '\t' '$2 == "Caenorhabditis elegans" && $3 == "Gene" {printf("%s\t%s\n", $4, $1)}' gene_names.txt > orfToGene.txt
# reformat this for use in creating Sanger Links with hgWormLinks
awk 'NF == 2' orfToGene.txt > orfToGene.txt2
# use perl script to move ORFs with the same gene name onto separate lines
# and remove lines where there is a gene name with an alternate name
# in parentheses. this output file is used for Sanger Links
/cluster/data/ce2/jkStuff/formatorfToGene.pl < orfToGene.txt2 > orfToGene.tab2
hgCeOrfToGene ce2 gene_names.txt sangerGene orfToGene
# DOWNLOAD WORMBASE FOR WS120 RELEASE (DONE, 2004-04-23, hartera)
# REMOVED acedb DIRECTORY AS NO LONGER NEEDED (DONE, 2008-08-18, hartera)
ssh eieio
mkdir -p /cluster/store7/acedb
cd /cluster/store7/acedb
wget -o ce2.fetchws120.log -r -l1 --timestamp --no-directories \
ftp://ftp.sanger.ac.uk/pub/wormbase/FROZEN_RELEASES/WS120/
gunzip *.gz
tar xvf *.tar
# 2008-08-18, hartera. Remove the acedb directory.
cd /cluster/store7
rm -rf acedb
# CREATE SANGER LINKS TABLE FROM WORMBASE INFO (used by hgGene and hgNear)
# (DONE, 2004-05-03, hartera)
mkdir /cluster/data/ce2/bed/steinHelp
# Mail Lincoln Stein and ask for mappings from wormPep to
# SwissProt and from ORF to wormPep to description. Place
# these files in the steinHelp directory:
# swissprot_dump.txt - contains ORF/WormPep/Description
# and contains SwissProt Ids and Accessions
# Jorge downloaded and installed the software for using AceDB
# to /cluster/bin/acedb/
# To use
ssh hgwdev
cd /cluster/store7/acedb
# can use /cluster/bin/acedb/xace . to get X-Window query interface
# To use AceDB Query Language (AQL)
cd /cluster/store7/acedb
/cluster/bin/acedb/tace .
# At prompt, can type ? for help, type the following queries
# wbGeneClass.txt - associates 3 letter first part of worm gene
# name with a brief description.
acedb> AQL -o wbGeneClass.txt select l,l->Description from l in class Gene_Clas
# wbConcise.txt - associates worm gene name with a several
# sentence description.
acedb> AQL -o wbConcise.txt select l,g from l in class Locus, g in l->Gene_information[Provisional_description]
# remove "" in wbGeneClass.txt and wbConcise.txt
sed -e 's/"//g;' wbGeneClass.txt > wbGeneClass.new
sed -e 's/"//g;' wbConcise.txt > wbConcise.new
mv wbGeneClass.new wbGeneClass.txt
mv wbConcise.new wbConcise.txt
# first remove header lines
sed -e 's/Format//' wbConcise.txt | sed -e 's/Locus[:]*//' > tmp
sed -e 's/Text//' tmp > tmp2
mv tmp2 wbConcise.txt
rm tmp
sed -e 's/Format//' wbGeneClass.txt | sed -e 's/Gene_class[:]*//' > tmp
sed -e 's/Text//' tmp > tmp2
mv tmp2 wbGeneClass.txt
rm tmp
# get SwissProt and TrEMBL Ids and accessions from ftp site
# TrEMBL Ids missing from swissprot_dump.txt
# Need these for details pages to work for sangerGene track
wget -o ce2.fetch.log -r -l1 --no-directories --timestamping \
ftp://ftp.sanger.ac.uk/pub/databases/wormpep/old_wormpep120/wormpep.table120
# parse out CDS to Swiss-Prot IDs mappings
cat << '_EOF_' > /cluster/data/ce2/jkStuff/getCDSandSPId.pl
#!/usr/bin/perl -w
use strict;
my $found = "false";
while (<STDIN>) {
chomp;
my @fields = split(/\t/);
foreach my $f (@fields) {
# get CDS name
if ($f =~ /^>([0-9A-Za-z\.]+)/) {
print "$1\t";
$found = "false";
}
elsif ($f =~ /^(TR:|SP:)([0-9A-Z]{4,})$/) {
print $2;
$found = "true";
}
}
if ($found eq "false") {
print "NULL";
}
print "\n";
}
'_EOF_'
chmod a+x /cluster/data/ce2/jkStuff/getCDSandSPId.pl
perl /cluster/data/ce2/jkStuff/getCDSandSPId.pl < wormpep.table120 > spCdsToId.txt
# rewrote hgWormLinks to extract information from these files
hgWormLinks spCdsToId.txt swissprot_dump.txt \
wbConcise.txt wbGeneClass.txt \
/cluster/data/ce2/bed/orfToGene/orfToGene.tab2 sangerLinks.txt
# create table in ce2 database and load data
hgsql ce2 < ~/kent/src/hg/near/hgWormLinks/sangerLinks.sql
hgsql -e \
'load data local infile "sangerLinks.txt" into table sangerLinks' \
ce2
# CREATE SANGERPEP TABLE (DONE, 2004-04-29, hartera)
# RECREATED SANGERPEP TABLE BECAUSE all.joiner EXPECTS ITS CREATE TIME TO
# BE LATER THAN THAT OF THE SANGERGENE TABLE (DONE, 2004-05-21, hartera)
mkdir -p /cluster/data/ce2/bed/sangerPep
cd /cluster/data/ce2/bed/sangerPep
# Download peptide sequences from the Sanger Centre ftp site:
wget -o ce2.fetch.log -r -l1 --no-directories --timestamping \
ftp://ftp.sanger.ac.uk/pub/databases/wormpep/old_wormpep120/wormpep120
# Load into database
# 2004-05-10 and 2004-05-11 hartera dropped sangerPep table and recreated it
# all.joiner expects sangerGene table to have an earlier
# load time than sangerPep so recreate sangerPep after sangerGene
hgPepPred ce2 generic sangerPep wormpep120
# the sangerPep table is used by the sangerGene track
# TWINSCAN GENE PREDICTIONS (DONE, 2004-04-16, hartera)
# These are Iscan (new version of Twinscan) gene predictions
# e-mailed Michael Brent at WUSTL: brent@cse.wustl.edu to obtain data
ssh hgwdev
mkdir /cluster/data/ce2/bed/twinscan
cd /cluster/data/ce2/bed/twinscan
wget --timestamping -r \
http://genes.cs.wustl.edu/predictions/worm/C_elegans/WS120/4-13-2004/
mv ./genes.cs.wustl.edu/predictions/worm/C_elegans/WS120/4-13-2004/* \
/cluster/data/ce2/bed/twinscan/
rm -r genes.cs.wustl.edu
rm index* ./chr_gtf/index* ./chr_ptx/index* ./chr_tx/index*
# Clean up chrom field of GTF files
foreach c (I II III IV V X)
set f = chr_gtf/chr_${c}.iscan_pred_gtf
echo ${f}
sed -e "s/CHROMOSOME_/chr/g" ${f} | \
sed -e "s/_[0-9][0-9]*.seq\t/\t/" > \
chr_gtf/chr${c}-fixed.gtf
end
# pare down protein FASTA header to id and add missing .1:
foreach c (I II III IV V X)
set f = chr_ptx/CHROMOSOME_${c}.ptx
echo ${f}
perl -wpe 's/^\>CHROMOSOME_(\S+).*$/\>chr$1.1/;' < \
${f} > chr_ptx/chr${c}-fixed.fa
end
# load data into database, need -gtf option as no stop codons in
# these predictions. also use new -frame -id and -geneName options
ldHgGene -gtf -frame -id -geneName ce2 twinscan chr_gtf/chr*-fixed.gtf
# Read 20990 transcripts in 147535 lines in 6 files
# 20990 groups 6 seqs 1 sources 3 feature types
# 20990 gene predictions
# 2004-05-10, hartera dropped twinscanPep table and re-created it.
# twinscan table was reloaded using extended options after loading
# twinscanPep. all.joiner expects twinscan table to have an earlier
# load time than twinscanPep
hgPepPred ce2 generic twinscanPep chr_ptx/chr*-fixed.fa
# MAKING BLASTZ SELF (DONE, 2004-04-20, hartera)
# The procedure for lineage spec business with self is to simply
# use the actual repeat masker output for this C. elegans assembly as
# the lineage specific repeats for itself. Thus, merely make
# symlinks to the repeat masker out files and name them as expected
# for blastz. In this case they are called notinCelegans but they
# really mean inCelegans. Yes, it is confusing, but that's just the
# nature of the game in this case.
# next machine
ssh kkr1u00
mkdir -p /iscratch/i/worms/Celegans2/linSpecRep.notinCelegans
cd /iscratch/i/worms/Celegans2/linSpecRep.notinCelegans
foreach f (/iscratch/i/worms/Celegans2/bothMasksFa/*.fa)
set base = $f:t:r:r
echo $base.out.spec
ln -s $f $base.out.spec
end
iSync
# next machine
ssh hgwdev
mkdir -p /cluster/data/ce2/bed/blastzSelf
cd /cluster/data/ce2/bed/blastzSelf
cat << '_EOF_' > DEF
# C. elegans vs. C. elegans
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/home/angie/schwartzbin:/cluster/home/kent/bin/i386
ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=1
# TARGET
# C. elegans
SEQ1_DIR=/iscratch/i/worms/Celegans2/nib
# not used
SEQ1_RMSK=
# not used
SEQ1_FLAG=
SEQ1_SMSK=/iscratch/i/worms/Celegans2/linSpecRep.notinCelegans
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# C. elegans
SEQ2_DIR=/iscratch/i/worms/Celegans2/nib
# not currently used
SEQ2_RMSK=
# not currently used
SEQ2_FLAG=
SEQ2_SMSK=/iscratch/i/worms/Celegans2/linSpecRep.notinCelegans
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
BASE=/cluster/data/ce2/bed/blastzSelf
DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
# << this line makes emacs coloring happy
# Save the DEF file in the current standard place
cp DEF ~angie/hummus/DEF.ce2-ce2.2004-04-14
ssh kk
cd /cluster/data/ce2/bed/blastzSelf
# source the DEF file to establish environment for following commands
source DEF
/cluster/data/mm4/jkStuff/BlastZ_run0.sh
cd run.0
para try, check, push, check, ....
# para time
# Completed: 196 of 196 jobs
# CPU time in finished jobs: 418569s 6976.16m 116.27h 4.84d 0.013 y
# IO & Wait Time: 2416s 40.26m 0.67h 0.03d 0.000 y
# Average job time: 2148s 35.80m 0.60h 0.02d
# Longest job: 17917s 298.62m 4.98h 0.21d
# Submission to last job: 53181s 886.35m 14.77h 0.62d
cd /cluster/data/ce2/bed/blastzSelf
source DEF
/cluster/data/ce2/jkStuff/BlastZ_run1.sh
cd run.1
para try, check, push, etc ...
# para time
# Completed: 14 of 14 jobs
# CPU time in finished jobs: 3340s 55.67m 0.93h 0.04d 0.000 y
# IO & Wait Time: 98s 1.63m 0.03h 0.00d 0.000 y
# Average job time: 246s 4.09m 0.07h 0.00d
# Longest job: 439s 7.32m 0.12h 0.01d
# Submission to last job: 892s 14.87m 0.25h 0.01d
# Third cluster run to convert lav's to axt's
# lavToAxt has a new -dropSelf option that replaces axtDropOverlap
# to remove the alignment blocks on the diagonal
# When this was first run, it crashed for chrV with the
# error: breakUpIfOnDiagonal: Too many fragmented block lists!
# This means that when a blockList is broken up everywhere there is a
# block along the diagonal, and the pieces are added to an array of
# fragmented sub-lists, this chromosome exceeded the array size of 256
# for an alignment.  Change the line:
# struct block *bArr[256] to struct block *bArr[1024] in the
# parseIntoAxt function, make, and re-run
cd /cluster/data/ce2/bed/blastzSelf
mkdir axtChrom
mkdir run.2
cd run.2
cat << '_EOF_' > do.csh
#!/bin/csh
cd $1
cat `ls -1 *.lav | sort -g` \
| ~/bin/i386/lavToAxt -dropSelf stdin /iscratch/i/worms/Celegans2/nib \
/iscratch/i/worms/Celegans2/nib stdout \
| axtSort stdin $2
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x do.csh
cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/ce2/bed/blastzSelf/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
ls -1Sd ../lav/chr* > chrom.list
gensub2 chrom.list single gsub jobList
# check jobList
wc -l jobList
head jobList
para create jobList
para try, check ....
# only 7 jobs so all completed by para try
para try, para check ...etc.
# para time
# Completed: 7 of 7 jobs
# CPU time in finished jobs: 796s 13.27m 0.22h 0.01d 0.000 y
# IO & Wait Time: 551s 9.18m 0.15h 0.01d 0.000 y
# Average job time: 192s 3.21m 0.05h 0.00d
# Longest job: 302s 5.03m 0.08h 0.00d
# Submission to last job: 303s 5.05m 0.08h 0.00d
# Use Blastz alignments to create Self Chain. Do not create
# Self Blastz track since this is redundant with the Self Chain.
# CHAIN SELF BLASTZ (DONE, 2004-05-03, hartera)
# CHAINS ARE RE-DONE DUE TO BUG IN axtChain which has now been fixed by Jim
# The axtChain is best run on the small kluster, or the kki kluster
# in this case, it was run on the kk kluster
ssh kk
mkdir -p /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30/run1
cd /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30/run1
mkdir out chain
ls -1S /cluster/data/ce2/bed/blastzSelf/axtChrom/*.axt \
> input.lst
cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
# << this line makes emacs coloring happy
# Problems running this.  At first, all jobs crashed and para problems
# said that the ./out/*.out files were empty.  Then copied the nib files
# to a second directory, nib2, and changed doChain so the two nib
# arguments were not the same.  Then the jobs crashed saying that the
# ./chain/*.chain files were empty and that the files in nib2 could not
# be opened (permissions ok).  Changed doChain back to having the two
# nib directory arguments the same.  Removed and recreated the jobList
# but left ./chain/*.chain and ./out/*.out in place.
# This time the jobs ran ok.  It also runs ok if axtFilter followed by
# axtChain are run separately on the command line.
cat << '_EOF_' > doChain
#!/bin/csh
axtFilter $1 | /cluster/home/hartera/bin/i386/axtChain stdin \
/iscratch/i/worms/Celegans2/nib /iscratch/i/worms/Celegans2/nib $2 >& $3
'_EOF_'
# << this line makes emacs coloring happy
chmod a+x doChain
gensub2 input.lst single gsub jobList
para create jobList
# 7 jobs so all completed by para try
para try, check,....etc.
# after remaking the axt files using the -dropSelf option for lavToAxt,
# these jobs no longer crashed.
# para time
# Completed: 7 of 7 jobs
# CPU time in finished jobs: 2700s 45.00m 0.75h 0.03d 0.000 y
# IO & Wait Time: 620s 10.34m 0.17h 0.01d 0.000 y
# Average job time: 474s 7.90m 0.13h 0.01d
# Longest job: 844s 14.07m 0.23h 0.01d
# Submission to last job: 844s 14.07m 0.23h 0.01d
# now on the file server, sort chains
ssh eieio
cd /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30
chainMergeSort run1/chain/*.chain > all.chain
chainSplit chain all.chain
# take a look at score distr's
foreach f (chain/*.chain)
grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
echo $f:t:r
textHistogram -binSize=10000 /tmp/score.$f:t:r
echo ""
end
# for chrI
# chrI
# large values truncated: need 1414 bins or larger binSize than 10000
# 0 ************************************************************ 710660
# 10000 ******** 90988
# 20000 ** 25092
# 30000 * 10405
# 40000 5912
# 50000 2958
# 60000 2157
# 70000 1780
# 80000 1336
# 90000 900
# 100000 636
# 110000 579
# 120000 451
# 130000 403
# 140000 351
# 150000 337
# 160000 287
# 170000 210
# 180000 88
# 190000 129
# 200000 147
# 210000 80
# 220000 60
# 230000 73
# 240000 42
# trim to minScore=20000 to cut some of the fluff
mkdir chainFilt
foreach f (chain/*.chain)
chainFilter -minScore=20000 $f > chainFilt/$f:t
end
# Load chains into database
# next machine
ssh hgwdev
cd /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30/chainFilt
foreach i (*.chain)
set c = $i:r
echo loading $c
hgLoadChain ce2 ${c}_chainSelf $i
echo done $c
end
# MAKE DOWNLOADABLE SEQUENCE FILES (DONE, 2004-04-21, hartera)
ssh kksilo
cd /cluster/data/ce2
#- Build the .zip files
cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf zip
mkdir zip
# chrom AGP's
zip -j zip/chromAgp.zip ./sangerFa/[0-9A-Z]*/chr*.agp
# chrom RepeatMasker out files
zip -j zip/chromOut.zip ./RMRun/chr*.fa.out
# soft masked chrom fasta
zip -j zip/chromFa.zip ./softMask/chr*.fa
# hard masked chrom fasta
zip -j zip/chromFaMasked.zip hardMask/chr*.fa
# chrom TRF output files
cd bed/simpleRepeat
zip ../../zip/chromTrf.zip trfMask/chr*.bed
cd ../..
# get GenBank native mRNAs
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=ce2 -native GenBank mrna \
/cluster/data/ce2/zip/mrna.fa
# get GenBank xeno mRNAs
./bin/i386/gbGetSeqs -db=ce2 -xeno GenBank mrna \
/cluster/data/ce2/zip/xenoMrna.fa
# get native RefSeq mRNAs
./bin/i386/gbGetSeqs -db=ce2 -native refseq mrna \
/cluster/data/ce2/zip/refMrna.fa
# get native GenBank ESTs
./bin/i386/gbGetSeqs -db=ce2 -native GenBank est \
/cluster/data/ce2/zip/est.fa
cd /cluster/data/ce2/zip
# zip GenBank native and xeno mRNAs, native ESTs and RefSeq mRNAs
zip -j mrna.zip mrna.fa
zip -j xenoMrna.zip xenoMrna.fa
zip -j refMrna.zip refMrna.fa
zip -j est.zip est.fa
'_EOF_'
# << this line makes emacs coloring happy
csh ./jkStuff/zipAll.csh |& tee zipAll.log
cd zip
#- Look at zipAll.log to make sure all file lists look reasonable.
# Make upstream files and Copy the .zip files to
# hgwdev:/usr/local/apache/...
ssh hgwdev
cd /cluster/data/ce2/zip
# make upstream files for C. elegans RefSeq
featureBits ce2 refGene:upstream:1000 -fa=upstream1000.fa
zip upstream1000.zip upstream1000.fa
featureBits ce2 refGene:upstream:2000 -fa=upstream2000.fa
zip upstream2000.zip upstream2000.fa
featureBits ce2 refGene:upstream:5000 -fa=upstream5000.fa
zip upstream5000.zip upstream5000.fa
#- Check zip file integrity:
foreach f (*.zip)
unzip -t $f > $f.test
tail -1 $f.test
end
wc -l *.zip.test
set gp = /usr/local/apache/htdocs/goldenPath/ce2
mkdir -p $gp/bigZips
cp -p *.zip $gp/bigZips
mkdir -p $gp/chromosomes
foreach f ( ../sangerFa/chr*.fa )
zip -j $gp/chromosomes/$f:t.zip $f
end
cd $gp/bigZips
md5sum *.zip > md5sum.txt
cd $gp/chromosomes
md5sum *.zip > md5sum.txt
# Take a look at bigZips/* and chromosomes/*, update their README.txt's
# MAKE VSCB1 DOWNLOADS (DONE, 2004-05-11, hartera)
# DO THIS LATER
ssh kksilo
cd /cluster/data/ce2/bed/blastz.cb1
zip -j /cluster/data/ce2/zip/Cb1axtChrom.zip axtChrom/chr*.axt
cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
cp all.chain cb1.chain
zip -j /cluster/data/ce2/zip/cb1.chain.zip cb1.chain
rm cb1.chain
zip -j /cluster/data/ce2/zip/cb1.net.zip cb1.net
# add axtNets
cd /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet
foreach f (I II III IV V X M)
cp chr${f}.axt chr${f}.axtNet.axt
end
zip -j /cluster/data/ce2/zip/axtNetCb1.zip chr*.axtNet.axt
rm *.axtNet.axt
ssh hgwdev
mkdir -p /usr/local/apache/htdocs/goldenPath/ce2/vsCb1
cd /usr/local/apache/htdocs/goldenPath/ce2/vsCb1
mv /cluster/data/ce2/zip/Cb1axtChrom.zip axtChrom.zip
mv /cluster/data/ce2/zip/cb1*.zip .
mv /cluster/data/ce2/zip/axtNetCb1.zip .
md5sum *.zip > md5sum.txt
# Copy over & edit README.txt w/pointers to chain, net formats.
# MAKE VSSELF DOWNLOADS (DONE, 2004-05-11, hartera)
ssh kksilo
cd /cluster/data/ce2/bed/blastzSelf
zip -j /cluster/data/ce2/zip/SelfaxtChrom.zip axtChrom/chr*.axt
cd /cluster/data/ce2/bed/blastzSelf/axtChain.2004-04-30
cp all.chain self.chain
zip -j /cluster/data/ce2/zip/self.chain.zip self.chain
rm self.chain
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/ce2/vsSelf
cd /usr/local/apache/htdocs/goldenPath/ce2/vsSelf
mv /cluster/data/ce2/zip/SelfaxtChrom.zip axtChrom.zip
mv /cluster/data/ce2/zip/self.chain.zip .
md5sum *.zip > md5sum.txt
# Copy over & edit README.txt w/pointers to chain formats.
# miRNA track (DONE - 2004-05-04 - Hiram)
# data from: Sam Griffiths-Jones <sgj@sanger.ac.uk>
# and Michel.Weber@ibcg.biotoul.fr
# notify them if this assembly updates to renew this track
ssh hgwdev
mkdir /cluster/data/ce2/bed/miRNA
cd /cluster/data/ce2/bed/miRNA
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/databases/Rfam/miRNA/genomes/cel_ws120.*"
grep -v "^track " cel_ws120.bed | sed -e "s/ /\t/g" > ce2.bed
hgLoadBed ce2 miRNA ce2.bed
# entry in trackDb/trackDb.ra already there
# Compare with Ce1:
# featureBits ce1 miRNA
# 10329 bases of 100277784 (0.010%) in intersection
# featureBits ce2 miRNA
# 11382 bases of 100291761 (0.011%) in intersection
# MAKE ALT. SPLICING TRACK (DONE, 2004-05-13, hartera)
# create rnaCluster table and Gene Bounds track
ssh hgwdev
mkdir /cluster/data/ce2/bed/rnaCluster
cd /cluster/data/ce2/bed/rnaCluster
foreach f (I II III IV V X)
echo "clusterRna ce2 /dev/null chr${f}.bed -chrom=${f}"
clusterRna ce2 /dev/null chr${f}.bed -chrom=chr${f}
end
hgLoadBed ce2 rnaCluster *.bed
hgsql -A -e "select chrom from chromInfo" ce2 | egrep -v chrom > chrom.tab
(perl -e 'print "#\!/bin/sh\n"; while(<>) {chomp($_); printf "hgsql -A -e =select * from rnaCluster where chrom like \"$_\" order by chromStart, name= ce2 | egrep -v chromStart | cut -f2\-13 > $_.ce2.rnaCluster.bed\n";}' | sed -e "s/=/'/g") < chrom.tab > rnaCluster.sh
chmod a+x rnaCluster.sh
./rnaCluster.sh
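# For readability, the same per-chromosome dump can also be written directly
# as a loop (a sketch equivalent to the generated rnaCluster.sh above):
foreach c (`cat chrom.tab`)
    hgsql -A -e "select * from rnaCluster where chrom like '${c}' order by chromStart, name" ce2 \
        | egrep -v chromStart | cut -f2-13 > ${c}.ce2.rnaCluster.bed
end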
mkdir -p /cluster/data/ce2/bed/altSplice/agx
cd /cluster/data/ce2/bed/altSplice
hgsql -A -e "select chrom from chromInfo" ce2 | egrep -v chrom > chrom.tab
perl -e 'while(<>) {chomp($_); print "~/bin/i386/altSplice -db=ce2 -beds=../rnaCluster/$_.ce2.rnaCluster.bed -agxOut=agx/$_.ce2.rnaCluster.agx -consensus -weightMrna\n";}' < chrom.tab > altSplice.para.spec
# these jobs are no big deal on the worm, just run this file:
chmod +x altSplice.para.spec
mkdir agx
./altSplice.para.spec
# 0 genePredictions with 2610 clusters, 0 cassette exons, 0 of are not mod 3.
cat agx/*.agx > altSplice.agx
# agxToBed had to be built here; use the locally built copy:
~/bin/i386/agxToBed altSplice.agx altSplice.bed
cat << '_EOF_' >> altGraphX.sql
CREATE TABLE altGraphX (
bin smallint unsigned not null,
tName varchar(255) NOT NULL default '',
tStart int(11) NOT NULL default '0',
tEnd int(11) NOT NULL default '0',
name varchar(255) NOT NULL default '',
id int(10) unsigned NOT NULL auto_increment,
strand char(2) NOT NULL default '',
vertexCount int(10) unsigned NOT NULL default '0',
vTypes longblob NOT NULL,
vPositions longblob NOT NULL,
edgeCount int(10) unsigned NOT NULL default '0',
edgeStarts longblob NOT NULL,
edgeEnds longblob NOT NULL,
evidence longblob NOT NULL,
edgeTypes longblob NOT NULL,
mrnaRefCount int(11) NOT NULL default '0',
mrnaRefs longblob NOT NULL,
mrnaTissues longblob NOT NULL,
mrnaLibs longblob NOT NULL,
PRIMARY KEY (id),
KEY tName (tName(16),tStart,tEnd)
);
'_EOF_'
hgLoadBed -sqlTable=altGraphX.sql ce2 altGraphX altSplice.agx
# trackDb entry:
track altGraphX
shortLabel Alt-Splice
longLabel Alternative Splicing
group rna
priority 74.1
visibility pack
type bed 3
canPack off
##################################################################
# Create frames based page views of alt splice areas of interest
# (DONE - 2004-05-16 - Hiram)
ssh hgwdev
mkdir /cluster/data/ce2/bed/altSplice/altAnalysis
cd /cluster/data/ce2/bed/altSplice/altAnalysis
altAnalysis ce2 ../altSplice.agx altSummary.txt links.html \
frame.html -bedName=all
# creates files, (as well as several .bed files):
# -rw-rw-r-- 1 282 May 16 09:19 frame.html
# -rw-rw-r-- 1 391029 May 16 09:19 links.html
# Copy these to the Intronerator WS120 location, change
# default opening location:
cp -p links.html /usr/local/apache/htdocs/IntronWS120
sed -e \
"s/Human Alt Splicing Conserved in Mouse/Alt Splicing in C. elegans/" \
-e "s/NM_005487/chrII:14689493-14690232/" \
-e "s/hgwdev-sugnet/genome-test/" \
frame.html > /usr/local/apache/htdocs/IntronWS120/frame.html
# Better location is in the goldenPath itself 2006-01-12:
mkdir /usr/local/apache/htdocs/goldenPath/ce2/altGraphX
sed -e "s#http://hgwdev-sugnet.cse.ucsc.edu##" links.html \
> /usr/local/apache/htdocs/goldenPath/ce2/altGraphX/links.html
sed -e \
"s/Human Alt Splicing Conserved in Mouse/Alt Splicing in C. elegans/" \
-e "s/NM_005487/chrII:14689493-14690232/" \
-e "s#http://hgwdev-sugnet.cse.ucsc.edu##" \
frame.html \
> /usr/local/apache/htdocs/goldenPath/ce2/altGraphX/frame.html
# MAKE single coverage BEST track from blastz axt's (DONE - 2004-05-11 - Hiram)
# This data is for use with the phyloHMM construction, below.
ssh eieio
cd /cluster/data/ce2
cat << '_EOF_' > jkStuff/mkBest.sh
#!/bin/sh
mkdir axtBest
mkdir pslBest
for c in I II III IV V X M
do
echo "axtBesting chr$c"
/cluster/bin/i386/axtBest axtChrom/chr${c}.axt chr${c} \
axtBest/chr${c}.axt -minScore=300
echo "processing chr${c}.axt -> chr${c}_blastzBestCb1.psl"
/cluster/bin/i386/axtToPsl axtBest/chr${c}.axt \
S1.len S2.len pslBest/chr${c}_blastzBestCb1.psl
echo "Done: chr${c}_blastzBestRn3.psl"
done
'_EOF_'
chmod +x jkStuff/mkBest.sh
cd /cluster/data/ce2/bed/blastz.cb1
time ../../jkStuff/mkBest.sh
# real 1m47.069s
# user 1m1.880s
# sys 0m14.240s
# Load this table
ssh hgwdev
cd /cluster/data/ce2/bed/blastz.cb1/pslBest
time /cluster/bin/i386/hgLoadPsl ce2 -table=blastzBestCb1 \
chr*_blastzBestCb1.psl
# real 0m21.775s
# user 0m8.510s
# sys 0m1.500s
# MAKE phyloHMM data
# (redone using new phastCons, acs 9/14/04)
# (redone using nets rather than axtBest, acs 10/10/04)
ssh hgwdev
mkdir /cluster/data/ce2/bed/pHMM
cd /cluster/data/ce2/bed/pHMM
mkdir axt
cd axt
ln -s /cluster/data/ce2/bed/blastz.cb1/axtNet.2004-04-29/sortedaxtNet/*.axt .
cd ..
mkdir maf
# create maf files from the axtNet axt files linked above; the seds add
# the db prefixes (ce2., cb1.) to the sequence names that the maf tools expect
for c in I II III IV M V X
do
axtToMaf axt/chr${c}.axt tSizes qSizes maf/chr${c}.Tmaf
sed -e "s/chrUn/cb1.chrUn/" maf/chr${c}.Tmaf | sed -e "s/ chr/ ce2.chr/" > \
maf/chr${c}.maf
rm -f maf/chr${c}.Tmaf
echo "done chr${c}"
done
# split up alignments; no need to use cluster with worm
mkdir -p /cluster/bluearc/ce2/phastCons/WINDOWS
mkdir -p /scratch/phastCons.ce2.09-14-04.WINDOWS
for i in I II III IV V X M; do \
msa_split maf/chr${i}.maf -i MAF -M /cluster/data/ce2/sangerFa/chr${i}.fa \
-w 1000000,0 -r /scratch/phastCons.ce2.09-14-04.WINDOWS/chr${i} -o SS \
-I 1000 -B 5000 ;\
done
for file in /scratch/phastCons.ce2.09-14-04.WINDOWS/* ; do \
echo $file ;\
gzip -c $file > /cluster/bluearc/ce2/phastCons/WINDOWS/`basename $file`.gz ;\
done
rm -rf /scratch/phastCons.ce2.09-14-04.WINDOWS
# produce rough starting model
msa_view -i MAF --aggregate ce2,cb1 -o SS -z maf/chr*.maf > all.ss
phyloFit -i SS all.ss -o starting-tree
# also get average GC content of aligned regions
msa_view -i SS all.ss --summary-only
# descrip. A C G T G+C length all_gaps some_gaps
# all.ss 0.3130 0.1872 0.1869 0.3129 0.3741 58909311 0 11236473
# now set up cluster job to estimate model parameters. See
# makeHg17.doc for add'l details
cat << 'EOF' > doEstimate.sh
#!/bin/sh
zcat $1 | /cluster/bin/phast/phastCons - starting-tree.mod --gc 0.3741 --nrates 1,1 --no-post-probs --ignore-missing --expected-lengths 25 --target-coverage 0.45 --quiet --log $2 --estimate-trees $3
EOF
chmod u+x doEstimate.sh
# (the --target-coverage of 0.45 is based on a desired overall coverage of
# 0.2 and about 45% alignment coverage: 0.2/0.45 is approx. 0.45)
rm -fr LOG TREES
mkdir -p LOG TREES
rm -f jobs.lst
for f in /cluster/bluearc/ce2/phastCons/WINDOWS/*.ss.gz ; do
root=`basename $f .ss.gz` ;\
echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobs.lst ;\
done
# run cluster job
ssh kk... para create ...
# (takes about 2 minutes)
# Now combine parameter estimates.
ls TREES/*.cons.mod > cons.txt
phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
ls TREES/*.noncons.mod > noncons.txt
phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt
# Now set up cluster job for computing conservation scores and predicted elements
cat << 'EOF' > doPhastCons.sh
#!/bin/sh
mkdir -p /cluster/bluearc/ce2/phastCons/POSTPROBS /cluster/bluearc/ce2/phastCons/ELEMENTS
pref=`basename $1 .ss.gz`
chr=`echo $pref | awk -F\. '{print $1}'`
tmpfile=/scratch/phastCons.$$
zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 25 --target-coverage 0.45 --quiet --seqname $chr --idpref $pref --viterbi /cluster/bluearc/ce2/phastCons/ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile
gzip -c $tmpfile > /cluster/bluearc/ce2/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
EOF
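# (phastCons writes the per-base conservation scores, i.e. posterior
# probabilities, to stdout, which end up in POSTPROBS; the --viterbi bed
# holds the predicted conserved elements used for phastConsElements below)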
chmod u+x doPhastCons.sh
rm -fr /cluster/bluearc/ce2/phastCons/POSTPROBS /cluster/bluearc/ce2/phastCons/ELEMENTS
rm -f jobs2.lst
for f in /cluster/bluearc/ce2/phastCons/WINDOWS/*.ss.gz ; do echo doPhastCons.sh $f >> jobs2.lst ; done
# run cluster job
ssh kk... para create, ...
# (takes about 30 sec)
# set up phastConsElements track
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' /cluster/bluearc/ce2/phastCons/ELEMENTS/*.bed > all.raw.bed
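# ($5 of the raw element bed is the phastCons log-odds score; it is kept in
# the name as lod=N and in the score column, which lodToBedScore rescales
# into the 0-1000 browser score range)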
/cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed
hgLoadBed ce2 phastConsElements all.bed
featureBits ce2 phastConsElements
# 20217282 bases of 100291761 (20.158%) in intersection
# set up wiggle
mkdir wib
for i in I II III IV V X M; do \
zcat `ls /cluster/bluearc/ce2/phastCons/POSTPROBS/chr${i}.*.pp.gz | sort -t\. -k2,2n` | \
wigAsciiToBinary -chrom=chr${i} -wibFile=wib/chr${i}_phastCons stdin ;\
done
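# (wigAsciiToBinary writes a .wig index file plus a .wib binary data file
# per chrom; hgLoadWiggle loads the .wig into the table and the .wib data
# is served from /gbdb via the -pathPrefix symlinks set up below)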
hgLoadWiggle ce2 phastCons -pathPrefix=/gbdb/ce2/phastCons/wib wib/chr*_phastCons.wig
mkdir -p /gbdb/ce2/phastCons/wib
rm -f /gbdb/ce2/phastCons/wib/chr*phastCons.wib
ln -s /cluster/data/ce2/bed/pHMM/wib/*.wib /gbdb/ce2/phastCons/wib
chmod 775 . wib /gbdb/ce2/phastCons /gbdb/ce2/phastCons/wib
chmod 664 wib/*.wib
# move postprobs over and clean up bluearc
rsync -av /cluster/bluearc/ce2/phastCons/POSTPROBS .
# (people sometimes want the raw scores)
rm -r /cluster/bluearc/ce2/phastCons/ELEMENTS /cluster/bluearc/ce2/phastCons/POSTPROBS
# set up full alignment/conservation track
# note that in this case the pairwise mafs are used both for the
# "multiple" alignment and the pairwise alignments
mkdir -p /gbdb/ce2/c_briggsae_pwMaf
cd maf
ln -s `pwd`/*.maf /gbdb/ce2/c_briggsae_pwMaf
hgLoadMaf ce2 -warn c_briggsae_pwMaf -pathPrefix=/gbdb/ce2/c_briggsae_pwMaf
chmod 775 /gbdb/ce2/c_briggsae_pwMaf .
chmod 664 *.maf
# trackDb entry:
# track c_briggsae_pwMaf
# shortLabel Conservation
# longLabel Blastz Alignments with C. Briggsae & Conservation
# group compGeno
# priority 100
# visibility pack
# type wigMaf 0.0 1.0
# maxHeightPixels 100:40:11
# wiggle phastCons
# yLineOnOff Off
-# autoScaleDefault Off
+# autoScale Off
# pairwise pwMaf
# speciesOrder c_briggsae
######## MAKING GENE SORTER TABLES #### (DONE - 2004-05-27 - hartera)
# These are instructions for building the
# neighborhood browser. Don't start these until
# there is a sangerGene table with a proteinID column containing Swiss-Prot
# protein IDs, and also Kim lab expression data is required (in hgFixed).
# Cluster together various alt-splicing isoforms.
# Creates the sangerIsoforms and sangerCanonical tables
# (DONE, 2004-05-21, hartera)
ssh hgwdev
hgClusterGenes ce2 sangerGene sangerIsoforms sangerCanonical
# You may need to build this binary in src/hg/near/hgClusterGenes
# Got 20713 clusters, from 23076 genes in 7 chromosomes
# featureBits ce2 sangerCanonical
# 54156601 bases of 100291761 (53.999%) in intersection
# featureBits ce1 sangerCanonical
# 53654286 bases of 100277784 (53.506%) in intersection
# Create Self blastp table - sangerBlastTab (DONE, 2004-05-24, hartera)
# Reload blastp data into sangerBlastTab and drop knownBlastTab
# - table given the wrong name (DONE, 2004-05-28, hartera)
# Get sangerPep peptide sequences fasta file from sangerPep dir
# and create a blast database out of them.
mkdir -p /cluster/data/ce2/bed/blastp
cd /cluster/data/ce2/bed/blastp
mv /cluster/data/ce2/bed/sangerPep/wormPep120 wormPep120.faa
formatdb -i wormPep120.faa -t wormPep120 -n wormPep120
# This command is in /projects/compbio/bin/$MACH/formatdb
# Copy database over to iscratch
ssh kkr1u00
mkdir -p /iscratch/i/ce2/blastp
cp /cluster/data/ce2/bed/blastp/wormPep120.* /iscratch/i/ce2/blastp
iSync
# Blastall and Data directory are already in /iscratch/i/blast/
# From the dates, this looks to be the same version as in
# /projects/compbio/bin/i686/
# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/ce2/bed/blastp
mkdir split
faSplit sequence wormPep120.faa 8000 split/wp
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/ce2/bed/blastp/self
cd /cluster/data/ce2/bed/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/ce2/blastp/wormPep120 \
-i $1 -o $2 -e 0.01 -m 8 -b 1000
'_EOF_'
chmod a+x blastSome
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# Create parasol batch
# 'ls ../../split/*.fa' is too long an argument list, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
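# (wordLine breaks the echoed words onto separate lines, one file name
# per line, for gensub2)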
gensub2 split.lst single gsub jobList
para create jobList
para try
# Wait a minute, and do a para check, if all is good
# then do a
para push, para check etc.
# para time
# Completed: 6684 of 6684 jobs
# CPU time in finished jobs: 35657s 594.28m 9.90h 0.41d 0.001 y
# IO & Wait Time: 27262s 454.37m 7.57h 0.32d 0.001 y
# Average job time: 9s 0.16m 0.00h 0.00d
# Longest job: 193s 3.22m 0.05h 0.00d
# Submission to last job: 302s 5.03m 0.08h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/ce2/bed/blastp/self/run/out
hgLoadBlastTab ce2 sangerBlastTab *.tab
# CREATE EXPRESSION DISTANCE TABLES FOR
# KIM LAB EXPRESSION DATA (DONE, 2004-05-24, hartera)
hgExpDistance ce2 hgFixed.kimWormLifeMedianRatio \
hgFixed.kimWormLifeMedianExps kimExpDistance
# CREATE TABLE TO MAP BETWEEN SANGERGENES AND REFSEQ GENES
# sangerToRefSeq (DONE, 2004-05-24, hartera)
hgMapToGene ce2 refGene sangerGene sangerToRefSeq
# CREATE TABLE TO MAP BETWEEN SANGER GENES AND PFAM DOMAINS
# sangerToPfam (DONE, 2004-05-24, hartera)
# Drop table knownToPfam - wrong table name.
# Reload data into sangerToPfam (DONE, 2004-05-28, hartera)
hgMapViaSwissProt ce2 sangerGene name proteinID Pfam sangerToPfam
# CREATE MAPPING OF SANGER GENES TO KIM LAB EXPRESSION DATA GENES
# sangerToKim (DONE, 2004-05-27, hartera)
# This is actually a mapping of the sangerGene table to itself -
# this is how this table was created for ce1.
# As a result, many gene names (4614/19134) in kimWormLifeAllRatio
# do not appear in sangerGene. Random checks of some of these found that
# they are not in WS120 WormBase and probably represent alternate names,
# names that no longer exist, or withdrawn sequences.
hgMapToGene ce2 sangerGene sangerGene sangerToKim
# SET dbDb TABLE ENTRY FOR GENE SORTER (DONE, 2004-05-25, hartera)
# set hgNearOk to 1 on hgcentraltest to make Ce2 Gene Sorter viewable
hgsql -h hgwbeta -e 'update dbDb set hgNearOk = 1 where name = "Ce2";' \
hgcentraltest
# MITOPRED DATA FOR HGGENE (DONE 8/10/04 angie)
ssh hgwdev
mkdir /cluster/data/ce2/bed/mitopred
cd /cluster/data/ce2/bed/mitopred
wget http://mitopred.sdsc.edu/data/cel_30.out
perl -wpe 's/^(\S+)\s+\S+\s+(.*)/$1\t$2/' cel_30.out > mitopred.tab
cat > mitopred.sql << '_EOF_'
# Prediction of nuclear-encoded mito. proteins from http://mitopred.sdsc.edu/
CREATE TABLE mitopred (
name varchar(10) not null, # SwissProt ID
confidence varchar(8) not null, # Confidence level
#Indices
PRIMARY KEY(name(6))
);
'_EOF_'
hgsql ce2 < mitopred.sql
hgsql ce2 -e 'load data local infile "mitopred.tab" into table mitopred'
# MAKE Human Proteins track
ssh eieio
mkdir -p /cluster/data/ce2/blastDb
cd /cluster/data/ce2/blastDb
for i in ../sangerFa/*.fa; do ln -s $i .; done
for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
rm *.fa *.log
ssh kkr1u00
mkdir -p /iscratch/i/ce2/blastDb
cp /cluster/data/ce2/blastDb/* /iscratch/i/ce2/blastDb
(~kent/bin/iSync) 2>&1 > sync.out
mkdir -p /cluster/data/ce2/bed/tblastn.hg16KG
cd /cluster/data/ce2/bed/tblastn.hg16KG
ls -1S /iscratch/i/ce2/blastDb/*.nsq | sed "s/\.nsq//" > worm.lst
exit
# back to eieio
cd /cluster/data/ce2/bed/tblastn.hg16KG
mkdir kgfa
# calculate a reasonable number of jobs
calc `wc /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl | awk "{print \\\$1}"`/\(150000/`wc worm.lst | awk "{print \\\$1}"`\)
# 41117/(150000/578) = 158.437507
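# In other words (same arithmetic with the values spelled out):
#   queries   = 41117   (lines in hg16KG.psl)
#   targetDbs = 578     (lines in worm.lst)
#   aiming for roughly 150000 cluster jobs total:
#   query chunks    = 150000 / targetDbs   ~ 259
#   lines per chunk = queries / 259        ~ 158   -> split -l 158 below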
split -l 158 /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl kgfa/kg
cd kgfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
f=/tmp/`basename $3`
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg16/bed/blat.hg16KG/protein.lft warn $f.2
mv $3.tmp $3
rm -f $f.1 $f.2
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'
chmod +x blastSome
gensub2 worm.lst kg.lst blastGsub blastSpec
ssh kk
cd /cluster/data/ce2/bed/tblastn.hg16KG
para create blastSpec
para shove # jobs will need to be restarted, but they should all finish
# Completed: 150858 of 150858 jobs
# CPU time in finished jobs: 10404860s 173414.33m 2890.24h 120.43d 0.330 y
# IO & Wait Time: 2468643s 41144.06m 685.73h 28.57d 0.078 y
# Average job time: 85s 1.42m 0.02h 0.00d
# Longest job: 411s 6.85m 0.11h 0.00d
# Submission to last job: 26364s 439.40m 7.32h 0.31d
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=10000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
ssh kki
cd /cluster/data/ce2/bed/tblastn.hg16KG
para create chainSpec
para push
# Completed: 261 of 261 jobs
# CPU time in finished jobs: 828s 13.81m 0.23h 0.01d 0.000 y
# IO & Wait Time: 19521s 325.34m 5.42h 0.23d 0.001 y
# Average job time: 78s 1.30m 0.02h 0.00d
# Longest job: 270s 4.50m 0.07h 0.00d
# Submission to last job: 3878s 64.63m 1.08h 0.04d
exit
# back to eieio
cd /cluster/data/ce2/bed/tblastn.hg16KG/blastOut
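# Filter the chained alignments: c60 keeps those spanning >60% of the query
# protein ((qEnd - qStart)/qSize, psl fields $13, $12, $11), u.* keeps a
# single alignment per query (pslUniq on the match-count-sorted file), and
# m60 keeps those whose matches ($1) cover >60% of the query length.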
for i in kg??
do
awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
cat u.*.psl m60* | liftUp -nosort -type=".psl" -nohead stdout /cluster/data/ce2/jkStuff/liftAll.lft warn stdin | \
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > ../preblastHg16KG.psl
cd ..
blatDir=/cluster/data/hg16/bed/blat.hg16KG
protDat -kg preblastHg16KG.psl $blatDir/hg16KG.psl $blatDir/kg.mapNames blastHg16KG.psl
ssh hgwdev
cd /cluster/data/ce2/bed/tblastn.hg16KG
hgLoadPsl ce2 blastHg16KG.psl
exit
# back to eieio
rm -rf blastOut
# End tblastn Human
# MAKE Drosophila Proteins track
ssh eieio
mkdir -p /cluster/data/ce2/blastDb
cd /cluster/data/ce2/blastDb
for i in ../sangerFa/*.fa; do ln -s $i .; done
for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
rm *.fa *.log
ssh kkr1u00
mkdir -p /iscratch/i/ce2/blastDb
cp /cluster/data/ce2/blastDb/* /iscratch/i/ce2/blastDb
(~kent/bin/iSync) 2>&1 > sync.out
mkdir -p /cluster/data/ce2/bed/tblastn.dm1FB
cd /cluster/data/ce2/bed/tblastn.dm1FB
ls -1S /iscratch/i/ce2/blastDb/*.nsq | sed "s/\.nsq//" > worm.lst
exit
# back to eieio
cd /cluster/data/ce2/bed/tblastn.dm1FB
mkdir fbfa
split -l 40 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb
cd fbfa
for i in *; do pslxToFa $i $i.fa; rm $i; done
cd ..
ls -1S fbfa/*.fa > fb.lst
mkdir blastOut
for i in `cat fb.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 1 0.1 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/dm1/protein.lft warn $f.2
mv $3.tmp $3
rm -f $f.1 $f.2
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp
exit 1
'_EOF_'
chmod +x blastSome
gensub2 worm.lst fb.lst blastGsub blastSpec
exit
ssh kk
cd /cluster/data/ce2/bed/tblastn.dm1FB
para create blastSpec
para push
# Completed: 3283 of 3283 jobs
# CPU time in finished jobs: 3063561s 51059.35m 850.99h 35.46d 0.097 y
# IO & Wait Time: 25333s 422.22m 7.04h 0.29d 0.001 y
# Average job time: 941s 15.68m 0.26h 0.01d
# Longest job: 3435s 57.25m 0.95h 0.04d
# Submission to last job: 9387s 156.45m 2.61h 0.11d
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
# << for emacs
chmod +x chainSome
ls -1dS `pwd`/blastOut/fb?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
ssh kki
cd /cluster/data/ce2/bed/tblastn.dm1FB
para create chainSpec
para push
# Completed: 469 of 469 jobs
# CPU time in finished jobs: 721s 12.02m 0.20h 0.01d 0.000 y
# IO & Wait Time: 1562s 26.03m 0.43h 0.02d 0.000 y
# Average job time: 5s 0.08m 0.00h 0.00d
# Longest job: 42s 0.70m 0.01h 0.00d
# Submission to last job: 5546s 92.43m 1.54h 0.06d
exit
# back to eieio
cd /cluster/data/ce2/bed/tblastn.dm1FB/blastOut
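# Same >60% query-coverage filtering as for the human protein chains above.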
for i in fb??
do
awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > ../blastDm1FB.psl
cd ..
ssh hgwdev
cd /cluster/data/ce2/bed/tblastn.dm1FB
hgLoadPsl ce2 blastDm1FB.psl
exit
# back to eieio
rm -rf blastOut
# End tblastn
# MAF COVERAGE FIGURES FOR ADAM (DONE 10/18/04 angie)
# First, get ranges of target coverage:
ssh eieio
mkdir /cluster/data/ce2/bed/pHMM/coverage
cd /cluster/data/ce2/bed/pHMM/maf.new.axtNet
cat *.maf | nice mafRanges -notAllOGap stdin ce2 \
/cluster/data/ce2/bed/pHMM/coverage/ce2.mafRanges.bed
ssh hgwdev
cd /cluster/data/ce2/bed/pHMM/coverage
# To make subsequent intersections a bit quicker, output a bed with
# duplicate/overlapping ranges collapsed:
nice featureBits ce2 ce2.mafRanges.bed \
-bed=ce2.mafRangesCollapsed.bed
#43967098 bases of 100291761 (43.839%) in intersection
# mafCoverage barfs currently, so pass on this for now:
#cat ../maf.new.axtNet/*.maf \
#| nice mafCoverage -count=2 ce2 stdin > ce2.mafCoverage
# Intersect maf target coverage with gene regions --
# use Adam's files which are a union of sangerGene and refGene:
nice featureBits ce2 -enrichment \
/cluster/data/hg17/bed/var_multiz.2004-08-12/phastCons/stats2/wormCds.bed \
ce2.mafRangesCollapsed.bed \
-bed=ce2.mafCds.bed
#wormCds.bed 25.701%, ce2.mafRangesCollapsed.bed 43.839%, both 20.701%, cover 80.55%, enrich 1.84x
nice featureBits ce2 -enrichment \
/cluster/data/hg17/bed/var_multiz.2004-08-12/phastCons/stats2/wormUtr3.bed \
ce2.mafRangesCollapsed.bed \
-bed=ce2.mafUtr3.bed
#wormUtr3.bed 0.323%, ce2.mafRangesCollapsed.bed 43.839%, both 0.216%, cover 66.86%, enrich 1.53x
nice featureBits ce2 -enrichment \
/cluster/data/hg17/bed/var_multiz.2004-08-12/phastCons/stats2/wormUtr5.bed \
ce2.mafRangesCollapsed.bed \
-bed=ce2.mafUtr5.bed
#wormUtr5.bed 0.057%, ce2.mafRangesCollapsed.bed 43.839%, both 0.041%, cover 72.37%, enrich 1.65x
# syntenic net with cb1 (acs, 11/18/04)
cd /cluster/data/ce2/bed/blastz.cb1/axtChain.2004-04-29
netFilter -syn cb1.net > cb1Syn.net
netFilter -minGap=10 cb1Syn.net | hgLoadNet ce2 syntenyNetCb1 stdin
# (add trackDb.ra entry)
# [acs@hgwdev axtChain.2004-04-29]$ featureBits ce2 netCb1
# 88355987 bases of 100291761 (88.099%) in intersection
# [acs@hgwdev axtChain.2004-04-29]$ featureBits ce2 syntenyNetCb1
# 73427463 bases of 100291761 (73.214%) in intersection
# Hmmm -- looking in browser, seems a bit stringent for worm...
# CLEANUP CB1 BLASTZ (DONE, 2004-02-24, hartera)
ssh eieio
cd /cluster/data/ce2/bed/blastz.cb1
nice rm axtChain/run1/chain/* &
nice rm -fr axtChain/n1 axtChain/hNoClass.net &
nice gzip axtChrom/* pslChrom/* axtChain/all.chain axtChain/cb1Filt.net axtChain/*.net &
nice rm -fr axtChain/chain axtChain/preNet &
nice rm -rf raw &
nice rm -rf lav &
# CLEANUP SELF BLASTZ (DONE, 2004-02-24, hartera)
ssh eieio
cd /cluster/data/ce2/bed/blastzSelf
# remove old chains
nice rm -fr axtChain &
nice rm axtChain.2004-04-30/run1/chain/* &
nice gzip axtChrom/* pslChrom/* axtChain.2004-04-30/all.chain axtChain.2004-04-30/chainFilt/* axtChain.2004-04-30/chain/* &
nice rm -rf raw &
nice rm -rf lav &
###########################################################################
# LOAD GENEID PREDICTIONS (DONE 3/15/05)
ssh eieio
mkdir /cluster/data/ce2/bed/geneid
cd /cluster/data/ce2/bed/geneid
foreach chr (chrI chrII chrIII chrIV chrM chrV chrX)
wget \
http://genome.imim.es/genepredictions/C.elegans/Ce200403/geneid_v1.2/$chr.gtf
wget \
http://genome.imim.es/genepredictions/C.elegans/Ce200403/geneid_v1.2/$chr.prot
end
# Add ".1" suffix to each item in .prot's, to match transcript_id's in gtf
perl -wpe 's/^(>\S+)/$1.1/' *.prot > geneid.fa
ssh hgwdev
cd /cluster/data/ce2/bed/geneid
ldHgGene -gtf -genePredExt ce2 geneid *.gtf
hgPepPred ce2 generic geneidPep geneid.fa
###########################################################################
# DONE - 2006-01-06 - Hiram
# Obtained a photograph from Professor Mark Blaxter
# mark.blaxter@ed.ac.uk - http://nema.cap.ed.ac.uk/index.html
# Saved image from email to:
# /cluster/data/ce2/html/C.elegans.jpg
# -rw-rw-r-- 1 227317 Jan 6 09:40 C.elegans.jpg
# Transform image to a size appropriate for our WEB pages:
ssh hgwdev
cd /cluster/data/ce2/html
convert -normalize -sharpen 0 -geometry 300x200 \
C.elegans.jpg Caenorhabditis_elegans.jpg
cp -p Caenorhabditis_elegans.jpg /usr/local/apache/htdocs/images
# Add IMG tag pointer to this image in the description.html page
# and for ce1 and ce3
########################################################################
### microRNA targets tracks (DONE - 2006-03-29 - 2006-04-26 - Hiram)
### from: http://pictar.bio.nyu.edu/ Rajewsky Lab
### Nikolaus Rajewsky nr@scarbo.bio.nyu.edu
### Yi-Lu Wang ylw205@nyu.edu
### dg@thp.Uni-Koeln.DE
ssh hgwdev
mkdir /cluster/data/ce2/bed/picTar
cd /cluster/data/ce2/bed/picTar
wget --timestamping \
'http://pictar.bio.nyu.edu/ucsc/new_single_elegans_bed' \
-O newSingleElegans.bed
grep -v "^track" newSingleElegans.bed \
| hgLoadBed -strict ce2 picTar stdin
# Loaded 11987 elements of size 9
nice -n +19 featureBits ce2 picTar
# 39233 bases of 100291761 (0.039%) in intersection
########################################################################
# CLEANUP OF CB1 BLASTZ (DONE, 2007-06-25, hartera)
ssh kkstore02
cd /cluster/store5/worm/ce2/bed/blastz.cb1.2004-04-12/axtChain
rm -r cb1NetSplit cb1NetSplit1 tmp1
cd ..
rm -r axtChain.2004-04-29test
# Looks like new chains were made on 2004-04-29
cd /cluster/store5/worm/ce2/bed/blastz.cb1.2004-04-12/axtChain.2004-04-29
# remove the split net and chain directories as these can easily be
# created from the all.chain and cb1.net files
rm -r chain cb1NetSplit run1/chain
# gzip files
gzip all.chain *.net
cd /cluster/store5/worm/ce2/bed/blastz.cb1.2004-04-12/
rm -r axtNet1
cd axtNet.2004-04-29
# remove *.axt files as these are also in the sortedaxtNet directory
rm *.axt sortedaxtNet/axtNetCb1.tab
cd /cluster/store5/worm/ce2/bed/blastz.cb1.2004-04-12/axtChrom/maf
gzip sortedaxtNet/*.axt
#######################################################################
## LIFTOVER To Ce4 (DONE - 2007-07-16 Hiram)
ssh kkr1u00
cd /cluster/data/ce4
mkdir nib
cd nib
for C in I II III IV V X M
do
faToNib -softMask ../goldenPath/chromosomes/chr${C}.fa.gz chr${C}.nib
done
# Writing 15072419 bases in 7536218 bytes
# Writing 15279316 bases in 7639666 bytes
# Writing 13783681 bases in 6891849 bytes
# Writing 17493784 bases in 8746900 bytes
# Writing 20919398 bases in 10459707 bytes
# Writing 17718852 bases in 8859434 bytes
# Writing 13794 bases in 6905 bytes
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \
ce4 /cluster/data/ce4/nib
# as it says, DO THIS NEXT:
ssh kk
# if bin/scripts is not in your PATH, add it for this command:
PATH=$PATH:/cluster/bin/scripts \
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \
ce2 /cluster/data/ce2/nib ce4 /iscratch/i/ce4/split10k \
/cluster/data/ce4/jkStuff/11.ooc
# as it says, DO THIS NEXT:
cd /cluster/data/ce2/bed/blat.ce4.2007-07-16/run
para try, check, push, check, ...
# Completed: 49 of 49 jobs
# CPU time in finished jobs: 14998s 249.97m 4.17h 0.17d 0.000 y
# IO & Wait Time: 405s 6.74m 0.11h 0.00d 0.000 y
# Average job time: 314s 5.24m 0.09h 0.00d
# Longest finished job: 706s 11.77m 0.20h 0.01d
# Submission to last job: 728s 12.13m 0.20h 0.01d
# as it says, DO THIS NEXT:
# this does the liftUp and makes the psl files
# kkr1u00 is down at this time; fix up this script to work on kkr3u00
ssh kkr3u00
cd /cluster/data/ce2/bed
ln -s blat.ce4.2007-07-16 ce4
time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh ce2 ce4
# real 1m1.780s
# as it says, DO THIS NEXT:
# this prepares the batch to run for the chaining
ssh kki
time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \
ce2 /cluster/data/ce2/nib ce4 /cluster/data/ce4/nib
# as it says, DO THIS NEXT:
# running the chain batch
cd /cluster/data/ce2/bed/blat.ce4.2007-07-16/chainRun
para try, check, push, check, ...
# Completed: 7 of 7 jobs
# CPU time in finished jobs: 19s 0.31m 0.01h 0.00d 0.000 y
# IO & Wait Time: 17s 0.29m 0.00h 0.00d 0.000 y
# Average job time: 5s 0.09m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 7s 0.12m 0.00h 0.00d
# Submission to last job: 7s 0.12m 0.00h 0.00d
ssh kkstore02
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh ce2 ce4
# Created /cluster/data/ce2/bed/liftOver/ce2ToCe4.over.chain.gz
# as it says, DO THIS NEXT:
ssh hgwdev
$HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh ce2 ce4
# It says this:
# Now, add link for
# /usr/local/apache/htdocs/goldenPath/ce2/liftOver/ce2ToCe4.over.chain
# to hgLiftOver
# But I believe that link was already done:
cd /gbdb/ce2/liftOver
ls -og ce2ToCe4*
# lrwxrwxrwx 1 53 Jun 5 16:35 ce2ToCe4.over.chain.gz ->
# /cluster/data/ce2/bed/liftOver/ce2ToCe4.over.chain.gz
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
# update genbank.conf, adding:
ce2.upstreamGeneTbl = refGene