src/hg/makeDb/doc/ce6.txt 1.19
1.19 2010/04/15 18:03:15 chinhli
CE6->CE4 LIFTOVER
Index: src/hg/makeDb/doc/ce6.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/ce6.txt,v
retrieving revision 1.18
retrieving revision 1.19
diff -b -B -U 1000000 -r1.18 -r1.19
--- src/hg/makeDb/doc/ce6.txt 19 Jan 2010 20:14:21 -0000 1.18
+++ src/hg/makeDb/doc/ce6.txt 15 Apr 2010 18:03:15 -0000 1.19
@@ -1,2808 +1,2823 @@
# for emacs: -*- mode: sh; -*-
# Caenorhabditis elegans
# Washington University School of Medicine GSC and Sanger Institute WS190
# $Id$
#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2008-05-30 - Hiram)
ssh kkstore06
mkdir /cluster/store3/worm/ce6
ln -s /cluster/store3/worm/ce6 /cluster/data/ce6
mkdir /cluster/data/ce6/downloads
cd /cluster/data/ce6/downloads
for C in I II III IV V X MtDNA
do
F=WS190/CHROMOSOMES/CHROMOSOME_${C}
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/wormbase/${F}.agp"
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/wormbase/${F}.dna.gz"
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/wormbase/${F}.gff.gz"
done
#########################################################################
# NORMALIZE SEQUENCE NAMES TO BEGIN WITH chr (DONE - 2008-05-30 - Hiram)
ssh kkstore06
cd /cluster/data/ce6/downloads
# Fix fasta names:
zcat CHR*.dna.gz \
| sed -e '/^$/ d; s/^>CHROMOSOME_MtDNA/>chrM/; s/^>CHROMOSOME_/>chr/;' \
| gzip -c > UCSC.fa.gz
faSize -detailed UCSC.fa.gz
# chrI 15072421
# chrII 15279323
# chrIII 13783681
# chrIV 17493785
# chrM 13794
# chrV 20919568
# chrX 17718854
# Make sure we get the same sizes from this command:
zcat CHR*.dna.gz | sed -e '/^$/ d;' | faSize -detailed stdin
# Fix AGP names:
sed -e 's/^/chr/' CHR*.agp > UCSC.agp
# And add a fake mitochondrial AGP entry for the sake of downstream
# tools (make sure the GenBank sequence is identical to given):
echo -e "chrM\t1\t13794\t1\tF\tNC_001328.1\t1\t13794\t+" >> UCSC.agp
#########################################################################
# run the makeGenomeDb procedure to create the db and unmasked sequence
# (DONE - 2008-05-30 - Hiram)
ssh kkstore06
cd /cluster/data/ce6
cat << '_EOF_' > ce6.config.ra
# Config parameters for makeGenomeDb.pl:
db ce6
clade worm
genomeCladePriority 10
scientificName Caenorhabditis elegans
commonName C. elegans
assemblyDate May 2008
assemblyLabel Washington University School of Medicine GSC and Sanger Institute WS190
orderKey 826
mitoAcc none
fastaFiles /cluster/data/ce6/downloads/UCSC.fa.gz
agpFiles /cluster/data/ce6/downloads/UCSC.agp
# qualFiles /dev/null
dbDbSpeciesDir worm
'_EOF_'
# << emacs
mkdir jkStuff
# run just to AGP to make sure things are sane first
nice makeGenomeDb.pl ce6.config.ra -stop agp \
> jkStuff/makeGenomeDb.agp 2>&1
# now, continuing to make the Db and all
time nice -n +19 makeGenomeDb.pl ce6.config.ra -continue db \
> jkStuff/makeGenomeDb.log 2>&1
# that takes 2m30s to run to completion
# take the trackDb business there and check it into the source tree
# fixup the description, gap and gold html page descriptions
#########################################################################
# REPEATMASKER (DONE - 2008-05-30 - Hiram)
ssh kkstore06
screen # use screen to control the job
mkdir /cluster/data/ce6/bed/repeatMasker
cd /cluster/data/ce6/bed/repeatMasker
time nice -n +19 doRepeatMasker.pl -bigClusterHub=pk \
-buildDir=/cluster/data/ce6/bed/repeatMasker ce6 > do.log 2>&1 &
# real 133m17.088s
# from the do.log:
# RepeatMasker version development-$Id:
# RepeatMasker,v 1.20 2008/01/16 18:15:45 angie Exp $
# CC RELEASE 20071204; *
#########################################################################
# SIMPLE REPEATS (DONE - 2008-05-30 - Hiram)
ssh kkstore06
screen # use screen to control the job
mkdir /cluster/data/ce6/bed/simpleRepeat
cd /cluster/data/ce6/bed/simpleRepeat
time nice -n +19 doSimpleRepeat.pl -smallClusterHub=kki \
-buildDir=/cluster/data/ce6/bed/simpleRepeat ce6 > do.log 2>&1 &
# about 30 minutes
#########################################################################
# MASK SEQUENCE WITH RM+TRF (DONE - 2008-05-30 - Hiram)
# Since both doRepeatMasker.pl and doSimpleRepeats.pl have completed,
# now it's time to combine the masking into the final ce6.2bit,
# following the instructions at the end of doSimpleRepeat's output.
ssh kolossus
cd /cluster/data/ce6
twoBitMask ce6.rmsk.2bit -add bed/simpleRepeat/trfMask.bed ce6.2bit
# You can safely ignore the warning about extra BED columns
twoBitToFa ce6.2bit stdout | faSize stdin
# 100281426 bases (0 N's 100281426 real 86981803 upper 13299623 lower)
# in 7 sequences in 1 files
# Total size: mean 14325918.0 sd 6728308.1 min 13794 (chrM)
# max 20919568 (chrV) median 15279323
# %13.26 masked total, %13.26 masked real
# set the symlink on hgwdev to /gbdb/ce6
cd /gbdb/ce6
ln -s /cluster/data/ce6/ce6.2bit .
#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2008-05-30 - Hiram)
# Use -repMatch=100 (based on size -- for human we use 1024, and
# worm size is ~3.4% of human judging by gapless ce4 vs. hg18 genome
# size from featureBits. So we would use 34, but that yields a very
# high number of tiles to ignore, especially for a small more compact
# genome. Bump that up a bit to be more conservative.
ssh kolossus
cd /cluster/data/ce6
blat ce6.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=jkStuff/11.ooc -repMatch=100
# Wrote 8523 overused 11-mers to jkStuff/11.ooc
# copy all of this stuff to the Iservers
ssh kkr1u00
mkdir /iscratch/i/worms/ce6
cd /iscratch/i/worms/ce6
cp -p /cluster/data/ce6/ce6.2bit .
cp -p /cluster/data/ce6/jkStuff/11.ooc .
cp -p /cluster/data/ce6/chrom.sizes .
-
cp -p jkStuff/11.ooc /san/sanvol1/scratch/worms/ce6
#########################################################################
# GENBANK AUTO UPDATE (DONE - 2008-05-30 - Hiram)
# align with latest genbank process.
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add ce6 just before ce4
# ce6 (C. elegans)
ce6.serverGenome = /cluster/data/ce6/ce6.2bit
ce6.clusterGenome = /iscratch/i/worms/ce6/ce6.2bit
ce6.ooc = /iscratch/i/worms/ce6/11.ooc
ce6.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter}
ce6.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter}
ce6.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
ce6.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter}
ce6.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter}
ce6.maxIntron = 100000
ce6.lift = no
ce6.genbank.mrna.xeno.load = yes
ce6.refseq.mrna.xeno.load = yes
ce6.downloadDir = ce6
cvs ci -m "Added ce6." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
ssh genbank
screen # use a screen to manage this job
cd /cluster/data/genbank
time nice -n +19 bin/gbAlignStep -initial ce6 &
# logFile: var/build/logs/2008.05.30-15:04:49.ce6.initalign.log
# real 137m58.675s
# real 61m33.930s
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad ce6
# logFile: var/dbload/hgwdev/logs/2008.05.30-20:29:01.dbload.log
# real 23m8.651s
# enable daily alignment and update of hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add ce6 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added ce6 - C. elegans WS190" \
etc/align.dbs etc/hgwdev.dbs
make etc-update
###############################################################################
## SANGER GENE TRACK (DONE - 2008-06-03 - Hiram)
## There is a tremendous amount of extraneous notations in the gff
## files. Filter them down to a manageable set, change the chrom
## names, eliminate duplicates, select only what will work in
## ldHgGene
ssh kkstore06
mkdir /cluster/data/ce6/bed/sangerGene
cd /cluster/data/ce6/bed/sangerGene
for C in I II III IV V X
do
echo -n "${C} "
zcat ../../downloads/CHROMOSOME_${C}.gff.gz | \
sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \
s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \
s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \
s/CHROMOSOME_M/chrM/g;" \
-e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \
-e 's/CDS "//' -e 's/"//' \
> chr${C}.gff
done
C=M
echo -n "${C} "
zcat ../../downloads/CHROMOSOME_MtDNA.gff.gz | \
sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \
s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \
s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \
s/CHROMOSOME_M/chrM/g; s/chrMtDNA/chrM/g;" \
-e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \
-e 's/CDS "//' -e 's/"//' \
> chr${C}.gff
for C in I II III IV V X M
do
echo "chr${C}.gff -> filtered.chr${C}.gff"
grep -v "^#" chr${C}.gff | awk -F'\t' '
BEGIN { IGNORECASE=1 }
{
if (match($2,"curated|Coding_transcript")) {
if (match($3,"intron|coding_exon|exon|cds|three_prime_UTR|five_prime_UTR")) {
gsub("coding_exon","CDS",$3)
gsub("Transcript ","",$9)
gsub(" .*","",$9)
gsub("three_prime_UTR","3utr",$3)
gsub("five_prime_UTR","5utr",$3)
for (i = 1; i < 9; ++i) {
printf "%s\t", $i
}
printf "%s\n", $9
}
}
}
' | sort -u | sort -k4n > filtered.chr${C}.gff
done
ssh hgwdev
cd /cluster/data/ce6/bed/sangerGene
nice -n +19 ldHgGene ce6 sangerGene filtered.*.gff
nice -n +19 ldHgGene -out=filteredGenePred.tab ce6 sangerGene filtered.*.gff
# Read 52512 transcripts in 997038 lines in 7 files
# 52512 groups 7 seqs 3 sources 5 feature types
# 30296 gene predictions
## this doesn't work with the bin column, so alter that table
hgsql -e "alter table sangerGene drop column bin;" ce6
## found when trying to click through on sequence links, mRNA or
## protein when on a gene details page
## Create orfToGene table. The tar file containing this information
# was obtained from Anthony Rogers via Email 2008-06-03 ar2@sanger.ac.uk
# They had one duplicate line in their file:
sort -u /cluster/data/ce6/downloads/tarFile/gene_name.txt \
| sed -e "s/ */\t/" > orfToGene.tab
## plus, they didn't include the self with self references, which are
## needed in the gene sorter so it can find its names. So, fixing
# this table:
hgsql -N -e "select name from sangerGene;" ce6 | sort > name.sangerGene.txt
cat << '_EOF_' > fixupOrf.pl
#!/usr/bin/env perl
use strict;
use warnings;
my %sangerGeneName;
open (FH, "<name.sangerGene.txt") or die "can not read name.sangerGene.txt";
while (my $line = <FH>) {
chomp $line;
$sangerGeneName{$line} = 1;
}
close (FH);
open (FH,"<orfToGene.tab") or die "can not read orfToGene.tab";
my %alias; # key is alias name, value is real name
my %realNames; # key is real name, value is 1 (unimportant)
my %twoDot; # key is two dot name, value is 1 (unimportant)
while (my $line = <FH>) {
chomp $line;
my ($alias, $real) = split('\s+', $line);
die "duplicate alias name found: $alias" if (exists($alias{$alias}));
if (exists($sangerGeneName{$alias})) {
$alias{$alias} = $real;
$realNames{$real} = 1;
my ($a, $b, $c, $d) = split('\.',$alias);
die "four dot name found $line" if (defined($d));
my $twoDotName = sprintf "%s.%s", $a, $b;
if (exists($sangerGeneName{$twoDotName})) {
$twoDot{$twoDotName} = $real;
}
}
}
close (FH);
my $count = 0;
my $noReal = 0;
my %needReal;
foreach my $key (keys %alias) {
++$count;
my $realName = $alias{$key};
if (!exists($alias{$realName})) {
if (!exists($needReal{$realName}) &&
exists($sangerGeneName{$realName})) {
++$noReal;
$needReal{$realName} = $key;
my ($a, $b, $c, $d) = split('\.',$realName);
die "four dot name found $realName" if (defined($d));
my $twoDotName = sprintf "%s.%s", $a, $b;
if (exists($sangerGeneName{$twoDotName})) {
$twoDot{$twoDotName} = $key;
}
}
}
}
print STDERR "found $count alias names\n";
print STDERR "can not find $noReal real names in alias list\n";
$count = 0;
foreach my $key (keys %realNames) {
++$count;
}
print STDERR "found $count real names\n";
$noReal = 0;
foreach my $key (keys %needReal) {
++$noReal;
print STDERR "$key $needReal{$key}\n" if ($noReal < 10);
}
print STDERR "need to add $noReal real names\n";
open (FH,">fixedOrfToGene.tab") or die "can not write to fixedOrfToGene.tab";
foreach my $key (keys %alias) {
printf FH "%s\t%s\n", $key, $alias{$key};
}
foreach my $key (keys %needReal) {
printf FH "%s\t%s\n", $key, $key;
}
foreach my $key (keys %twoDot) {
if ( (!exists($alias{$key})) && (!exists($needReal{$key})) ) {
print STDERR "need name for $key $twoDot{$key}\n";
printf FH "%s\t%s\n", $key, $twoDot{$key};
}
}
foreach my $key (keys %sangerGeneName) {
if ( (!exists($alias{$key})) && (!exists($needReal{$key})) ) {
if ( !exists($twoDot{$key}) ) {
print STDERR "last chance need name for $key $key\n";
printf FH "%s\t%s\n", $key, $key;
}
}
}
close (FH);
'_EOF_'
# << happy emacs
chmod +x fixupOrf.pl
./fixupOrf.pl > fixupOrf.out 2>&1
hgLoadSqlTab ce6 orfToGene ~/kent/src/hg/lib/orfToGene.sql \
fixedOrfToGene.tab
ln -s /cluster/data/ce6/downloads/tarFile/paragraph.txt ./paragraph.tab
ln -s /cluster/data/ce6/downloads/tarFile/uniprot.txt ./uniprot.tab
cat << '_EOF_' > mkSangerLinks.pl
#!/usr/bin/env perl
use strict;
use warnings;
my %geneToOrf; # key is uc(gene name), value is orf
my %orfToGene; # key is uc(orf), value is gene name
my $orfCount = 0;
open (FH, "<orfToGene.tab") or die "can not read orfToGene.tab";
while (my $line = <FH>) {
chomp $line;
my ($orf, $gene) = split('\t',$line);
die "gene name zero length" if (length($gene) < 1);
die "orf name zero length" if (length($orf) < 1);
$geneToOrf{uc($gene)} = $orf;
$orfToGene{uc($orf)} = $gene;
++$orfCount;
}
close (FH);
printf STDERR "read $orfCount orf designations from orfToGene.tab\n";
open (LS,">sangerLinks.tab") or die "can not write to sangerLinks.tab";
my $geneCount = 0;
open (FH, "sort -u paragraph.tab|") or die "can not read paragraph.tab";
while (my $line = <FH>) {
chomp $line;
my ($orf, $descr) = split('\t',$line);
die "gene name zero length" if (length($orf) < 1);
die "can not find orfToGene $orf" if (!exists($orfToGene{uc($orf)}));
my $orfGene = $orfToGene{uc($orf)};
die "orfGene name zero length" if (length($orfGene) < 1);
printf LS "%s\t%s\t%s\n", $orf, $orfGene, $descr;
++$geneCount;
}
close (FH);
printf STDERR "read $geneCount gene descriptions from paragraph.tab\n";
close (LS);
'_EOF_'
# << happy emacs
chmod +x mkSangerLinks.pl
./mkSangerLinks.pl
# read 33792 orf designations from orfToGene.tab
# read 33793 gene descriptions from paragraph.tab
# There is problem in paragraph.tab, it contains a duplicate entry:
F57C7.4 F57C7.4 contains similarity to Interpro domain IPR008972 ()
F57C7.4 F57C7.4 contains similarity to Plasmodium falciparum Hypothetical protein.\; TR:Q8I5U3
# manually fixup this one to elimiate the second definition
hgLoadSqlTab ce6 sangerLinks ~/kent/src/hg/lib/sangerLinks.sql \
sangerLinks.tab
# Add proteinID field to sangerGene table, used by Gene Sorter
hgsql -e 'alter table sangerGene add proteinID varchar(40) NOT NULL;' ce6
# To add index on this column
hgsql -e 'alter table sangerGene add index(proteinID);' ce6
# Use SwissProt IDs in sangerLinks table to fill proteinID
# column in the sangerGene table
hgsql -e 'update sangerGene as g, sangerLinks as l \
set g.proteinID = l.protName where g.name = l.orfName;' ce6
# check there are rows with the protName field filled
hgsql -N -e "select proteinId from sangerGene" ce6 | sort | uniq -c | wc -l
# 20127
hgLoadSqlTab ce6 sangerLinks ~/kent/src/hg/lib/sangerLinks.sql \
sangerLinks.tab
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/wormbase/WS190/wormpep190.tar.gz"
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/wormbase/WS190/wormrna190.tar.gz"
tar xvzf wormrna190.tar.gz
mv README README.wormrna190
tar xvzf wormpep190.tar.gz
# creates files in directory wormpep190:
# -rw-rw-r-- 1 32316202 Apr 7 06:38 wormpep.dna190
# -rw-rw-r-- 1 980303 Apr 7 06:41 wormpep.history190
# -rw-rw-r-- 1 4826 Apr 7 06:41 wormpep.diff190
# -rw-rw-r-- 1 19139454 Apr 7 06:41 wormpep.fasta190
# -rw-rw-r-- 1 398315 Apr 7 06:41 wormpep.accession190
# -rw-rw-r-- 1 13244499 Apr 15 06:23 wormpep190
# -rw-rw-r-- 1 2608936 Apr 15 06:23 wormpep.table190
hgPepPred ce6 generic sangerPep wormpep190/wormpep190
###############################################################################
######## MAKING GENE SORTER TABLES #### (DONE - 2008-06-04 - Hiram)
# These are instructions for building the
# Gene Sorter browser. Don't start these until
# there is a sangerGene table with a proteinID column containing Swiss-Prot
# protein IDs, and also Kim lab expression data is required (in hgFixed).
# Cluster together various alt-splicing isoforms.
# Creates the sangerIsoforms and sangerCanonical tables
ssh hgwdev
hgClusterGenes ce6 sangerGene sangerIsoforms sangerCanonical
# Got 20051 clusters, from 30296 genes in 7 chromosomes
featureBits ce6 sangerCanonical
# 57412493 bases of 100281426 (57.251%) in intersection
featureBits ce4 sangerCanonical
# 57092312 bases of 100281244 (56.932%) in intersection
featureBits ce2 sangerCanonical
# 54156601 bases of 100291761 (53.999%) in intersection
featureBits ce1 sangerCanonical
# 53654286 bases of 100277784 (53.506%) in intersection
# Create Self blastp table - sangerBlastTab
# Get sangerPep peptide sequences fasta file from sangerPep dir
# and create a blast database out of them.
mkdir -p /cluster/data/ce6/bed/blastp
cd /cluster/data/ce6/bed/blastp
cat << '_EOF_' > pepTabToFa.pl
#!/usr/bin/env perl
use strict;
use warnings;
open (FH,"<../sangerGene/sangerPep.tab") or
die "can not open ../sangerGene/sangerPep.tab";
while (my $line = <FH>) {
chomp $line;
my ($name,$seq) = split('\s+',$line);
printf ">%s\n%s\n", $name, $seq;
}
close (FH);
'_EOF_'
# << happy emacs
chmod +x ./pepTabToFa.pl
./pepTabToFa.pl > wormPep190.faa
/cluster/bluearc/blast229/formatdb -i wormPep190.faa \
-t wormPep190 -n wormPep190
# Copy database over to kluster storage
mkdir -p /san/sanvol1/scratch/worms/ce6/blastp
rsync -a ./ /san/sanvol1/scratch/worms/ce6/blastp/
cd /cluster/data/ce6/bed/blastp
mkdir split
faSplit sequence wormPep190.faa 1000 split/wp
# Make parasol run directory
ssh pk
mkdir -p /cluster/data/ce6/bed/blastp/self
cd /cluster/data/ce6/bed/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome.csh
#!/bin/csh -ef
setenv BLASTMAT /cluster/bluearc/blast229/data
/cluster/bluearc/blast229/blastall -p blastp \
-d /san/sanvol1/scratch/worms/ce6/blastp/wormPep190 \
-i $1 -o $2 -e 0.01 -m 8 -b 1000
'_EOF_'
# << happy emacs
chmod a+x blastSome.csh
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome.csh {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << Create parasol batch
ls ../../split/*.fa > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try.... etc ...
# Completed: 986 of 986 jobs
# CPU time in finished jobs: 18187s 303.12m 5.05h 0.21d 0.001 y
# IO & Wait Time: 2713s 45.22m 0.75h 0.03d 0.000 y
# Average job time: 21s 0.35m 0.01h 0.00d
# Longest finished job: 73s 1.22m 0.02h 0.00d
# Submission to last job: 302s 5.03m 0.08h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/ce6/bed/blastp/self/run/out
hgLoadBlastTab ce6 sangerBlastTab *.tab
# Scanning through 986 files
# Loading database with 1255312 rows
# CREATE EXPRESSION DISTANCE TABLES FOR
# KIM LAB EXPRESSION DATA (this appears to be an expensive command,
# about 10 to 15 minutes or so ?)
hgExpDistance ce6 hgFixed.kimWormLifeMedianRatio \
hgFixed.kimWormLifeMedianExps kimExpDistance
# Have 19134 elements in hgFixed.kimWormLifeMedianRatio
# Got 19134 unique elements in hgFixed.kimWormLifeMedianRatio
# CREATE TABLE TO MAP BETWEEN SANGERGENES AND REFSEQ GENES
# sangerToRefSeq
hgMapToGene ce6 refGene sangerGene sangerToRefSeq
# SET dbDb TABLE ENTRY FOR GENE SORTER (DONE, 2004-05-25, hartera)
# set hgNearOk to 1 on hgcentraltest to make Ce6 Gene Sorter viewable
hgsql -e 'update dbDb set hgNearOk = 1 where name = "ce6";' \
hgcentraltest
# Running joinerCheck to see what is sane:
cd ~/kent/src/hg/makeDb/schema
joinerCheck -identifier=wormBaseId -database=ce6 -times -keys ./all.joiner
# ce6.orfToGene.name - hits 30296 of 30296 ok
# ce6.sangerBlastTab.query - hits 1255312 of 1255312 ok
# ce6.sangerBlastTab.target - hits 1255312 of 1255312 ok
# ce6.sangerCanonical.transcript - hits 20051 of 20051 ok
# ce6.sangerIsoforms.transcript - hits 30296 of 30296 ok
# ce6.sangerLinks.orfName - hits 30115 of 30115 ok
# ce6.sangerPep.name - hits 23771 of 23771 ok
# ce6.sangerToRefSeq.name - hits 29995 of 29995 ok
## create a gene name to WBGene ID reference table
## to be used to construct URL
mkdir /cluster/data/ce6/bed/WBGeneID
cd /cluster/data/ce6/bed/WBGeneID
# There was one duplicate entry in the file, the sort -u eliminates it
sed -e "s#http://wormbase.org/db/gene/gene?name=##; s#;class=Gene##" \
../../downloads/tarFile/url.txt | sort -u > sangerGeneToWBGeneID.tab
# add in the one-dot names for those names that are only two-dot names
# in this file:
cat << '_EOF_' > addDotName.pl
#!/usr/bin/env perl
use strict;
use warnings;
my %sangerGeneName;
open (FH, "sangerGene.name") or die "can not read sangerGene.name";
while (my $line = <FH>) {
chomp $line;
$sangerGeneName{$line} = 1;
}
close (FH);
open (FH,"<sangerGeneToWBGeneID.tab") or
die "can not read sangerGeneToWBGeneID.tab";
my %twoDotNames;
while (my $line = <FH>) {
chomp $line;
my ($name, $wbId) = split('\s+',$line);
die "duplicate gene name $name $wbId" if (exists($twoDotNames{$name}));
if (exists($sangerGeneName{$name})) {
$twoDotNames{$name} = $wbId;
}
}
close (FH);
my %neededNames;
foreach my $key (keys %twoDotNames) {
my ($a, $b, $c, $d) = split('\.',$key);
my $wbId = $twoDotNames{$key};
die "three dot name found $key $wbId" if (defined($d));
if (defined($c)) {
my $oneDot = sprintf "%s.%s", $a, $b;
if (!exists($twoDotNames{$oneDot})) {
if (!exists($neededNames{$oneDot})) {
$neededNames{$oneDot} = $wbId;
}
}
}
}
foreach my $key (keys %twoDotNames) {
printf "%s\t%s\n", $key, $twoDotNames{$key};
}
foreach my $key (keys %neededNames) {
printf "%s\t%s\n", $key, $neededNames{$key};
}
'_EOF_'
# << happy emacs
chmod +x addDotName.pl
hgsql -N -e "select name from sangerGene;" ce6 | sort -u > sangerGene.name
./addDotName.pl > fixed.sangerGeneToWBGeneID.tab
hgLoadSqlTab ce6 sangerGeneToWBGeneID \
~/kent/src/hg/lib/sangerGeneToWBGeneID.sql \
fixed.sangerGeneToWBGeneID.tab
cd ~/kent/src/hg/makeDb/schema
joinerCheck -identifier=wormBaseId -database=ce6 -times -keys ./all.joiner
# ce6.orfToGene.name - hits 30296 of 30296 ok
# ce6.sangerBlastTab.query - hits 1255312 of 1255312 ok
# ce6.sangerBlastTab.target - hits 1255312 of 1255312 ok
# ce6.sangerCanonical.transcript - hits 20051 of 20051 ok
# ce6.sangerIsoforms.transcript - hits 30296 of 30296 ok
# ce6.sangerLinks.orfName - hits 30115 of 30115 ok
# ce6.sangerPep.name - hits 23771 of 23771 ok
# ce6.sangerToRefSeq.name - hits 29995 of 29995 ok
# ce6.sangerGeneToWBGeneID.sangerGene - hits 30115 of 30115 ok
############################################################################
## fixup sangerLinks table (DONE - 2008-06-03 - Hiram)
## it is missing descriptions for the one dot names where only two dot
## names existed
mkdir /cluster/data/ce6/bed/fixupSangerLinks
cd /cluster/data/ce6/bed/fixupSangerLinks
ln -s ../sangerGene/sangerLinks.tab .
cat << '_EOF_' > slFixup.pl
#!/usr/bin/env perl
use strict;
use warnings;
open (FH,"<sangerLinks.tab") or die "can not read sangerLinks.tab";
my %existing; # key is full gene symbol name, one or two dots
# value is the orfName and description text, tab separated
my %newNeeded; # key is shortened one dot name, value is orfName and
# description text, tab separated
my $newCount = 0;
while (my $line = <FH>) {
chomp $line;
my ($name, $orf, $desc) = split('\t',$line);
my ($a, $b, $c, $d) = split('\.', $name);
die "ERROR: found three dot name at $line" if (defined($d));
if (defined($c)) {
my $oneDotName = sprintf "%s.%s", $a, $b;
if (exists($newNeeded{$oneDotName})) {
my $newString = sprintf "%s\t%s", $orf, $desc;
die "ERROR: new one dot name, different descr $name"
if ($newNeeded{$oneDotName} ne $newString)
} else {
$newNeeded{$oneDotName} = sprintf "%s\t%s", $orf, $desc;
++$newCount;
}
}
die "ERROR: duplicate name $name at $line" if (exists($existing{$name}));
$existing{$name} = sprintf "%s\t%s", $orf, $desc;
}
close (FH);
foreach my $key (keys %newNeeded) {
die "ERROR: duplicate new name $key for $newNeeded{$key}"
if (exists($existing{$key}));
printf "%s\t%s\n", $key, $newNeeded{$key};
}
foreach my $key (keys %existing) {
die "ERROR: duplicate existing name $key for $existing{$key}"
if (exists($newNeeded{$key}));
printf "%s\t%s\n", $key, $existing{$key};
}
'_EOF_'
# << happy emacs
chmod +x ./slFixup.pl
./slFixup.pl > fixedSangerLinks.tab
hgLoadSqlTab ce6 sangerLinks ~/kent/src/hg/lib/sangerLinks.sql \
fixedSangerLinks.tab
# It still has a difficulty because it contains references to the RNA
# genes. joinerCheck complains. So, after loading sangerRnaGene
# below, remove the entries in sangerLinks that are in sangerRnaGene.
# I tried to figure out a delete statement that would do this, but
# to no avail. So, brain dead list management:
hgsql -N -e "select orfName from sangerLinks;" ce6 \
| sort -u > sangerLinks.orfName
hgsql -N -e "select name from sangerRnaGene;" ce6 \
| sort -u > sangerRnaGene.name
# before delete:
hgsql -e "select count(*) from sangerLinks;" ce6
# 36555
comm -12 sangerLinks.orfName sangerRnaGene.name | while read N
do
hgsql -e "delete from sangerLinks where orfName=\"$N\";" ce6
echo $N
done
# after delete:
hgsql -e "select count(*) from sangerLinks;" ce6
# 30115
# 36555 - 30115 = 6440
###############################################################################
## RNA sangerGenes (DONE - 2008-06-03 - Hiram)
ssh kkstore06
cd /cluster/data/ce6/bed/sangerGene
for C in I II III IV V X M
do
echo "chr${C}.gff -> rna.chr${C}.gff"
grep -v "^#" chr${C}.gff | awk -F'\t' '
BEGIN { IGNORECASE=1 }
{
if (match($2,"^miRNA|^ncRNA|^rRNA|^scRNA|^snRNA|^snlRNA|^snoRNA|^tRNA|^tRNAscan-SE-1.23")) {
if (match($3,"exon")) {
gsub("Transcript ","",$9)
gsub(" .*","",$9)
for (i = 1; i < 9; ++i) {
printf "%s\t", $i
}
printf "%s\n", $9
}
}
}
' | sort -u | sort -k4n > rna.chr${C}.gff
done
for C in I II III IV V X M
do
echo "chr${C}.gff -> pseudoGene.chr${C}.gff"
grep -v "^#" chr${C}.gff | awk -F'\t' '
BEGIN { IGNORECASE=1 }
{
if (match($2,"^Pseudogene")) {
if (match($3,"Pseudogene")) {
gsub("Pseudogene ","",$9)
gsub("Pseudogene","CDS",$3)
gsub(" .*","",$9)
gsub("\".*","",$9)
for (i = 1; i < 9; ++i) {
printf "%s\t", $i
}
printf "%s\n", $9
}
}
}
' | sort -u | sort -k4n > pseudoGene.chr${C}.gff
done
ssh hgwdev
cd /cluster/data/ce6/bed/sangerGene
time nice -n +19 ldHgGene ce6 sangerRnaGene rna.*.gff
# Read 6440 transcripts in 6474 lines in 7 files
# 6440 groups 7 seqs 9 sources 1 feature types
# 6440 gene predictions
time nice -n +19 ldHgGene ce6 sangerPseudoGene pseudoGene.*.gff
# Read 1462 transcripts in 1462 lines in 7 files
# 1462 groups 6 seqs 1 sources 1 feature types
# 1462 gene predictions
############################################################################
# BLATSERVERS ENTRY (DONE - 2008-06-04 - Hiram)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("ce6", "blat9", "17788", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("ce6", "blat9", "17789", "0", "1");' \
hgcentraltest
# test it with some sequence
############################################################################
# reset default position, same as ce4 on the ZC101 / unc-52 locus
ssh hgwdev
hgsql -e 'update dbDb set defaultPos="chrII:14646344-14667746"
where name="ce6";' hgcentraltest
##########################################################################
## Blastz SELF (DONE - 2008-06-04 - Hiram
ssh kkstore06
# screen
mkdir /cluster/data/ce6/bed/blastzCe6.2008-06-04
cd /cluster/data/ce6/bed
ln -s blastzCe6.2008-06-04 blastzSelf
cd blastzCe6.2008-06-04
cat << '_EOF_' > DEF
# ce6 vs ce6 - Self alignment
BLASTZ_H=2000
BLASTZ_M=200
# TARGET: elegans Ce6
SEQ1_DIR=/iscratch/i/worms/ce6/ce6.2bit
SEQ1_LEN=/iscratch/i/worms/ce6/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
# QUERY: elegans Ce6
SEQ2_DIR=/iscratch/i/worms/ce6/ce6.2bit
SEQ2_LEN=/iscratch/i/worms/ce6/chrom.sizes
SEQ2_CHUNK=1000000
SEQ2_LAP=10000
BASE=/cluster/data/ce6/bed/blastzCe6.2008-06-04
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl DEF -verbose=2 \
-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=kk \
-smallClusterHub=kki > do.log 2>&1 &
# real 79m26.707s
XXX - running 2008-06-04 12:20
# broke down during loadUp because /iscratch/i/worms/ce6/chrom.sizes
# does not exist on hgwdev. Finish the load manually by fixing
# the chrom.sizes reference in loadUp.csh. Then:
time nice -n +19 doBlastzChainNet.pl DEF -verbose=2 \
-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=kk \
-continue=download -smallClusterHub=kki > download.log 2>&1 &
i
# real 37m22.066s
ssh hgwdev
cd /cluster/data/ce6/bed/blastz.ce6.2008-06-04
nice -n +19 featureBits ce6 chainSelfLink > fb.ce6.chainSelfLink.txt 2>&1
# 31186593 bases of 100281244 (31.099%) in intersection
############################################################################
## BLASTZ caePb2 (DONE - 2008-06-04,09 - Hiram)
ssh kkstore06
screen # use screen to control the job
mkdir /cluster/data/ce6/bed/blastzCaePb2.2008-06-04
cd /cluster/data/ce6/bed/blastzCaePb2.2008-06-04
cat << '_EOF_' > DEF
# ce6 vs caePb2
BLASTZ_H=2000
BLASTZ_M=50
# TARGET: elegans Ce6
SEQ1_DIR=/scratch/data/ce6/ce6.2bit
SEQ1_LEN=/scratch/data/ce6/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
# QUERY: C. PB2801 caePb2
SEQ2_DIR=/scratch/data/caePb2/caePb2.2bit
SEQ2_LEN=/scratch/data/caePb2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/caePb2/caePb2.supercontigs.2bit
SEQ2_CTGLEN=/scratch/data/caePb2/caePb2.supercontigs.sizes
SEQ2_LIFT=/scratch/data/caePb2/caePb2.supercontigs.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/cluster/data/ce6/bed/blastzCaePb2.2008-06-04
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF -verbose=2 -bigClusterHub=kk \
-smallClusterHub=kki > do.log 2>&1 &
# real 32m29.730s
# trouble with chaining due to no copies of genomes on /scratch/data/
# for the Iservers
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF -verbose=2 -bigClusterHub=kk \
-continue=chainMerge -smallClusterHub=kki > chainMerge.log 2>&1 &
# real 26m55.624s
# forgot the -qRepeats=windowmaskerSdust
rm axtChain/ce6.caePb2.net
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF -verbose=2 -bigClusterHub=kk -qRepeats=windowmaskerSdust \
-continue=load -smallClusterHub=kki > load.log 2>&1 &
# real 1m18.270s
cat fb.ce6.chainCaePb2Link.txt
# 40805712 bases of 100281426 (40.691%) in intersection
# swap, this is also in caePb2.txt
mkdir /cluster/data/caePb2/bed/blastz.ce6.swap
cd /cluster/data/caePb2/bed/blastz.ce6.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-qRepeats=windowmaskerSdust \
/cluster/data/ce6/bed/blastzCaePb2.2008-06-04/DEF \
-bigClusterHub=kk -smallClusterHub=kki -swap > swap.log 2>&1 &
# a couple of minutes
cat fb.caePb2.chainCe6Link.txt
# 55092134 bases of 170473138 (32.317%) in intersection
############################################################################
## BLASTZ caeRem3 (DONE - 2008-06-04,09 - Hiram)
ssh kkstore06
screen # use screen to control the job
mkdir /cluster/data/ce6/bed/blastzCaeRem3.2008-06-04
cd /cluster/data/ce6/bed/blastzCaeRem3.2008-06-04
cat << '_EOF_' > DEF
# ce6 vs caeRem3
BLASTZ_H=2000
BLASTZ_M=50
# TARGET: elegans Ce6
SEQ1_DIR=/scratch/data/ce6/ce6.2bit
SEQ1_LEN=/scratch/data/ce6/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
# QUERY: C. PB2801 caeRem3
SEQ2_DIR=/scratch/data/caeRem3/caeRem3.2bit
SEQ2_LEN=/scratch/data/caeRem3/chrom.sizes
SEQ2_CTGDIR=/scratch/data/caeRem3/caeRem3.supercontigs.2bit
SEQ2_CTGLEN=/scratch/data/caeRem3/caeRem3.supercontigs.sizes
SEQ2_LIFT=/scratch/data/caeRem3/caeRem3.chrUn.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/cluster/data/ce6/bed/blastzCaeRem3.2008-06-04
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF -verbose=2 -bigClusterHub=kk \
-smallClusterHub=kki > do.log 2>&1 &
# real 28m40.865s
# trouble with chaining due to no copies of genomes on /scratch/data/
# for the Iservers
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF -verbose=2 -bigClusterHub=kk \
-continue=chainMerge -smallClusterHub=kki > chainMerge.log 2>&1 &
# real 19m30.231s
# forgot the -qRepeats=windowmaskerSdust, went back and fixed that
# manually in loadUp.csh and reran the load of the net track
cat fb.ce6.chainCaeRem3Link.txt
# 41853863 bases of 100281426 (41.736%) in intersection
# swap, this is also in caeRem3.txt
mkdir /cluster/data/caeRem3/bed/blastz.ce6.swap
cd /cluster/data/caeRem3/bed/blastz.ce6.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-qRepeats=windowmaskerSdust \
/cluster/data/ce6/bed/blastzCaeRem3.2008-06-04/DEF \
-bigClusterHub=kk -smallClusterHub=kki -swap > swap.log 2>&1 &
cat fb.caeRem3.chainCe6Link.txt
# 46326709 bases of 138406388 (33.472%) in intersection
# something unusual happened with ssh during installDownloads.csh,
# continuing after finishing that manually:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-continue=cleanup -qRepeats=windowmaskerSdust \
/cluster/data/ce6/bed/blastzCaeRem3.2008-06-04/DEF \
-bigClusterHub=kk -smallClusterHub=kki -swap > cleanUp.log 2>&1 &
############################################################################
## BLASTZ cb3 (DONE - 2008-06-04,09 - Hiram)
ssh kkstore06
screen # use screen to control the job
mkdir /cluster/data/ce6/bed/blastzCb3.2008-06-04
cd /cluster/data/ce6/bed/blastzCb3.2008-06-04
cat << '_EOF_' > DEF
# ce6 vs cb3
BLASTZ_H=2000
BLASTZ_M=50
# TARGET: elegans Ce6
SEQ1_DIR=/scratch/data/ce6/ce6.2bit
SEQ1_LEN=/scratch/data/ce6/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
# QUERY: C. briggsae cb3
SEQ2_DIR=/san/sanvol1/scratch/worms/cb3/cb3.rmskTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/worms/cb3/chrom.sizes
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/cluster/data/ce6/bed/blastzCb3.2008-06-04
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF -verbose=2 -bigClusterHub=kk \
-smallClusterHub=kki > do.log 2>&1 &
# real 33m24.103s
# 42483532 bases of 100281426 (42.364%) in intersection
# swap, this is also in cb3.txt
mkdir /cluster/data/cb3/bed/blastz.ce6.swap
cd /cluster/data/cb3/bed/blastz.ce6.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/ce6/bed/blastzCb3.2008-06-04/DEF \
-bigClusterHub=kk -smallClusterHub=kki -swap > swap.log 2>&1 &
# real 3m20.338s
# real 22m40.205s
cat fb.cb3.chainCe6Link.txt
# 43180969 bases of 108433446 (39.823%) in intersection
############################################################################
## BLASTZ priPac1 (DONE - 2008-06-09 - Hiram)
ssh kkstore06
screen # use screen to control the job
mkdir /cluster/data/ce6/bed/blastzPriPac1.2008-06-09
cd /cluster/data/ce6/bed/blastzPriPac1.2008-06-09
cat << '_EOF_' > DEF
# ce6 vs priPac1
BLASTZ_H=2000
BLASTZ_M=50
# TARGET: elegans Ce6
SEQ1_DIR=/scratch/data/ce6/ce6.2bit
SEQ1_LEN=/scratch/data/ce6/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
# QUERY: C. briggsae priPac1
SEQ2_DIR=/iscratch/i/worms/priPac1/priPac1.TrfWM.2bit
SEQ2_LEN=/iscratch/i/worms/priPac1/chrom.sizes
SEQ2_CTGDIR=/iscratch/i/worms/priPac1/priPac1.TrfWM.supers.2bit
SEQ2_CTGLEN=/iscratch/i/worms/priPac1/priPac1.TrfWM.supers.sizes
SEQ2_LIFT=/iscratch/i/worms/priPac1/priPac1.supercontigs.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/cluster/data/ce6/bed/blastzPriPac1.2008-06-09
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF -verbose=2 -bigClusterHub=kk \
-qRepeats=windowmaskerSdust -smallClusterHub=kki > do.log 2>&1 &
# after some kk parasol experiments, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-continue=cat `pwd`/DEF -verbose=2 -bigClusterHub=kk \
-qRepeats=windowmaskerSdust -smallClusterHub=kki > cat.log 2>&1 &
# real 9m46.300s
cat fb.ce6.chainPriPac1Link.txt
# 6256254 bases of 100281426 (6.239%) in intersection
# swap, this is also in priPac1.txt
mkdir /cluster/data/priPac1/bed/blastz.ce6.swap
cd /cluster/data/priPac1/bed/blastz.ce6.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-qRepeats=windowmaskerSdust \
/cluster/data/ce6/bed/blastzPriPac1.2008-06-09/DEF \
-bigClusterHub=kk -smallClusterHub=kki -swap > swap.log 2>&1 &
# real 0m33.836s
# failed during the loadUp.csh due to /iscratch/i/ missing from hgwdev
# fixup that manually in loadUp.csh and ran it on hgwdev
cat fb.priPac1.chainCe6Link.txt
# 6803322 bases of 145948246 (4.661%) in intersection
# and continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-qRepeats=windowmaskerSdust \
-continue=download /cluster/data/ce6/bed/blastzPriPac1.2008-06-09/DEF \
-bigClusterHub=kk -smallClusterHub=kki -swap > download.log 2>&1 &
############################################################################
## BLASTZ caeJap1 (DONE - 2008-06-09 - Hiram)
ssh kkstore06
screen # use screen to control the job
mkdir /cluster/data/ce6/bed/blastzCaeJap1.2008-06-09
cd /cluster/data/ce6/bed/blastzCaeJap1.2008-06-09
cat << '_EOF_' > DEF
# ce6 vs caeJap1
BLASTZ_H=2000
BLASTZ_M=50
# TARGET: elegans Ce6
SEQ1_DIR=/scratch/data/ce6/ce6.2bit
SEQ1_LEN=/scratch/data/ce6/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
# QUERY: C. japonica caeJap1
SEQ2_DIR=/cluster/bluearc/scratch/data/caeJap1/caeJap1.2bit
SEQ2_LEN=/cluster/bluearc/scratch/data/caeJap1/chrom.sizes
SEQ2_CTGDIR=/cluster/bluearc/scratch/data/caeJap1/caeJap1.TrfWM.supers.2bit
SEQ2_CTGLEN=/cluster/bluearc/scratch/data/caeJap1/caeJap1.TrfWM.supers.sizes
SEQ2_LIFT=/cluster/bluearc/scratch/data/caeJap1/caeJap1.chrUn.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/cluster/data/ce6/bed/blastzCaeJap1.2008-06-09
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF -verbose=2 -bigClusterHub=pk \
-qRepeats=windowmaskerSdust -smallClusterHub=kki > do.log 2>&1 &
# real 35m7.220s
cat fb.ce6.chainCaeJap1Link.txt
# 26888483 bases of 100281426 (26.813%) in intersection
# swap, this is also in caeJap1.txt
mkdir /cluster/data/caeJap1/bed/blastz.ce6.swap
cd /cluster/data/caeJap1/bed/blastz.ce6.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-qRepeats=windowmaskerSdust \
/cluster/data/ce6/bed/blastzCaeJap1.2008-06-09/DEF \
-bigClusterHub=pk -smallClusterHub=kki -swap > swap.log 2>&1 &
# real 1m10.794s
# failed to load because:
# loadUp: looks like previous stage was not
# successful
# (can't find /cluster/data/caeJap1/bed/blastz.ce6.swap/mafNet/).
# continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-qRepeats=windowmaskerSdust \
-continue=load /cluster/data/ce6/bed/blastzCaeJap1.2008-06-09/DEF \
-bigClusterHub=pk -smallClusterHub=kki -swap > load.log 2>&1 &
# real 0m33.440s
cat fb.caeJap1.chainCe6Link.txt
# 26209442 bases of 129347181 (20.263%) in intersection
############################################################################
## 6-Way multiple alignment (DONE - 2008-06-09 - Hiram)
mkdir /cluster/data/ce6/bed/multiz6way
cd /cluster/data/ce6/bed/multiz6way
# An additional nematode tree:
# http://nematol.unh.edu/tree/tree_des.php
# Constructing a tree from the document:
# http://www.wormbook.org/chapters/www_phylogrhabditids/phylorhab.html
# http://www.wormbook.org/chapters/www_phylogrhabditids/phylofig6B.jpg
# purely artifical for the last one: :0.2,pristiochus_priPac1:0.8
# since it is not on the graph
cat << '_EOF_' > nematodes.nh
((((((((((briggsae_cb3:0.005,remanei_caeRem3:0.003):0.004,
brenneri_caePb2:0.013):0.002,elegans_ce6:0.003):0.001,
japonica_caeJap1:0.023):0.024,ps1010:0.027):0.014,
df5070:0.056):0.009, plicata:0.102):0.099,sb341:0.060):0.038,
(df5074:0.351,df5055:0.090):0.221):0.2,pristiochus_priPac1:0.8)
'_EOF_'
# << happy emacs
# filter that for the six species in use here
/cluster/bin/phast/x86_64/tree_doctor \
--prune-all-but briggsae_cb3,remanei_caeRem3,brenneri_caePb2,elegans_ce6,japonica_caeJap1,pristiochus_priPac1 \
nematodes.nh > 6way.nh
# rearrange to get Ce6 on top for the construction of the tree image
# in the 6-way description page. This results in something like:
(((C._elegans_ce6:0.003000,
(C._brenneri_caePb2:0.013000,
(C._remanei_caeRem3:0.003000,C._briggsae_cb3:0.005000):0.004000)
:0.002000):0.001000,
C._japonica_caeJap1:0.023000):0.384000,
P._pristiochus_priPac1:0.800000);
/cluster/bin/phast/x86_64/all_dists 6way.nh > 6way.distances.txt
grep -i ce6 6way.distances.txt | sort -k3,3n
# Use this output for reference, and use the calculated
# distances in the table below to order the organisms and check
# the button order on the browser.
# And if you can fill in the table below entirely, you have
# succeeded in finishing all the alignments required.
#
# featureBits chainLink measures
# chaince6Link chain linearGap
# distance on Ce6 on other minScore
# 1 0.0120 - remanei_caeRem3 (% 41.736) (% 33.472) 1000 loose
# 2 0.0140 - briggsae_cb3 (% 42.364) (% 39.823) 1000 loose
# 3 0.0180 - brenneri_caePb2 (% 40.691) (% 32.317) 1000 loose
# 3 0.0270 - japonica_caeJap1 (% 26.813) (% 20.263) 1000 loose
# 4 1.1880 - pristiochus_priPac1 (% 6.239) (% 4.661) 1000 loose
cd /cluster/data/ce6/bed/multiz6way
# bash shell syntax here ...
export H=/cluster/data/ce6/bed
mkdir mafLinks
for G in caeRem3 cb3 caePb2 priPac1 caeJap1
do
mkdir mafLinks/$G
if [ ! -d ${H}/blastz.${G}/mafNet ]; then
echo "missing directory blastz.${G}/mafNet"
exit 255
fi
ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G
done
# Copy MAFs to some appropriate NFS server for kluster run
ssh kkstore06
mkdir /san/sanvol1/scratch/worms/ce6/multiz6way
cd /san/sanvol1/scratch/worms/ce6/multiz6way
time rsync -a --copy-links --progress --stats \
/cluster/data/ce6/bed/multiz6way/mafLinks/ .
# a few seconds to copy 126 Mb
# these are x86_64 binaries
mkdir penn
# use latest penn utilities
P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
cp -p $P/{autoMZ,multiz,maf_project} penn
# the autoMultiz cluster run
ssh memk
cd /cluster/data/ce6/bed/multiz6way/
# create species list and stripped down tree for autoMZ
sed 's/[a-z][a-z0-9]*_//ig; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
6way.nh > tmp.nh
echo `cat tmp.nh` > tree-commas.nh
echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
mkdir maf run
cd run
# NOTE: you need to set the db properly in this script
cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = ce6
set c = $1
set maf = $2
set binDir = /san/sanvol1/scratch/worms/$db/multiz6way/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/worms/$db/multiz6way
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz
cat << '_EOF_' > template
#LOOP
autoMultiz $(root1) {check out line+ /cluster/data/ce6/bed/multiz6way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
awk '{print $1}' /cluster/data/ce6/chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
para -maxNode=1 push
para check ... push ... etc ...
# Completed: 7 of 7 jobs
# CPU time in finished jobs: 2302s 38.37m 0.64h 0.03d 0.000 y
# IO & Wait Time: 38s 0.63m 0.01h 0.00d 0.000 y
# Average job time: 334s 5.57m 0.09h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 478s 7.97m 0.13h 0.01d
# Submission to last job: 954s 15.90m 0.27h 0.01d
ssh kkstore06
cd /cluster/data/ce6/bed/multiz6way
time nice -n +19 catDir maf > multiz6way.maf
# a few seconds to produce:
# -rw-rw-r-- 1 343585668 Jun 9 16:25 multiz6way.maf
# before getting to the annotation, load this up so we can get
# a first view of the track. This will be replaced with the annotated
# mafs
ssh hgwdev
cd /cluster/data/ce6/bed/multiz6way
mkdir /gbdb/ce6/multiz6way
ln -s /cluster/data/ce6/bed/multiz6way/multiz6way.maf /gbdb/ce6/multiz6way
# this load creates a large file, do that on local disk:
cd /scratch/tmp
time nice -n +19 hgLoadMaf ce6 multiz6way
# a few seconds to load:
# Loaded 367046 mafs in 1 files from /gbdb/ce6/multiz6way
time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
-maxSize=50000 ce6 multiz6waySummary /gbdb/ce6/multiz6way/multiz6way.maf
# a few seconds to load:
# Created 125113 summary blocks from 1051000 components
# and 366975 mafs from /gbdb/ce6/multiz6way/multiz6way.maf
# remove the temporary .tab files:
rm multiz6*.tab
############################################################################
#### Blat sangerGene proteins to determine exons
# (DONE - 2008-06-11 - hiram)
ssh hgwdev
cd /cluster/data/ce6/bed
mkdir blat.ce6SG.2008-06-11
ln -s blat.ce6SG.2008-06-11 blat.ce6SG
cd blat.ce6SG
pepPredToFa ce6 sangerPep sangerPep.fa
ssh kkstore06
# create san nib directory from goldenPath chromosomes fa files
mkdir /san/sanvol1/scratch/worms/ce6/nib
cd /san/sanvol1/scratch/worms/ce6/nib
for C in I II III IV V X M
do
twoBitToFa -seq=chr${C} /cluster/data/ce6/ce6.2bit stdout \
| faToNib -softMask stdin chr${C}.nib
done
# The kluster run
ssh pk
cd /cluster/data/ce6/bed/blat.ce6SG
cat << '_EOF_' > blatSome
#!/bin/csh -fe
blat -t=dnax -q=prot -out=pslx /san/sanvol1/scratch/worms/ce6/nib/$1.nib sgfa/$2.fa $3
'_EOF_'
# << happy emacs
chmod +x blatSome
ls -1S /san/sanvol1/scratch/worms/ce6/nib > ce6.lst
mkdir sgfa
cd sgfa
faSplit sequence ../sangerPep.fa 3000 sg
ls -1S *.fa > ../sg.lst
cd ..
cat << '_EOF_' > template
#LOOP
blatSome $(root1) $(root2) {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ce6.lst sg.lst template jobList
mkdir psl
cd psl
sed -e "s/.nib//" ../ce6.lst | xargs mkdir
cd ..
para create jobList
para try ... check ... push ... etc
# Completed: 20335 of 20335 jobs
# CPU time in finished jobs: 69885s 1164.74m 19.41h 0.81d 0.002 y
# IO & Wait Time: 51640s 860.67m 14.34h 0.60d 0.002 y
# Average job time: 6s 0.10m 0.00h 0.00d
# Longest finished job: 348s 5.80m 0.10h 0.00d
# Submission to last job: 943s 15.72m 0.26h 0.01d
ssh kkstore06
cd /cluster/data/ce6/bed/blat.ce6SG.2008-06-11
pslSort dirs raw.psl /tmp psl/*
# -rw-rw-r-- 1 70627970 Jun 11 10:35 raw.psl
pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
# Processed 155625 alignments
# -rw-rw-r-- 1 26423327 Jun 11 10:35 cooked.psl
pslUniq cooked.psl ce6SG.psl
# -rw-rw-r-- 1 25332830 Jun 11 10:36 ce6SG.psl
cut -f 10 ce6SG.psl > sgName.lst
faSomeRecords sangerPep.fa sgName.lst ce6SG.fa
# -rw-rw-r-- 1 10903181 Jun 11 10:11 sangerPep.fa
# -rw-rw-r-- 1 10899914 Jun 11 10:37 ce6SG.fa
grep "^>" ce6SG.fa | wc -l
# 23741
grep "^>" sangerPep.fa | wc -l
# 23771
pslxToFa ce6SG.psl ce6SG_ex.fa -liftTarget=genome.lft \
-liftQuery=protein.lft
# -rw-rw-r-- 1 5288587 Jun 11 10:39 protein.lft
# -rw-rw-r-- 1 6741328 Jun 11 10:39 genome.lft
# -rw-rw-r-- 1 12743074 Jun 11 10:39 ce6SG_ex.fa
wc -l *.psl *.lft *.fa sgName.lst
# 23741 ce6SG.psl
# 25926 cooked.psl
# 155630 raw.psl
# 151196 genome.lft
# 151196 protein.lft
# 244239 ce6SG.fa
# 302392 ce6SG_ex.fa
# 244343 sangerPep.fa
# 23741 sgName.lst
# 1322404 total
# back on hgwdev
ssh hgwdev
cd /cluster/data/ce6/bed/blat.ce6SG
# After about an hour, it exited with this message:
# sqlFreeConnection called on cache (ce6) that doesn't contain
# the given connection
# This may be a lurking error in this program, because the
# resulting file seems to have the correct number of lines:
# Once upon a time, long long ago, this table was made by the kgName
# command, but this doesn't work on C. elegans because there is no
# kgXref table. So, the following perl script was developed to do the
# same thing
# kgName ce6 ce6SG.psl blastSGRef01
# Need a reference table loaded, perl script makes up relationships
cat << '_EOF_' >> mkRef.pl
#!/usr/bin/env perl
use warnings;
use strict;
my $argc = scalar(@ARGV);
sub usage() {
print STDERR "usage: ./mkRef.pl blastSGRef01.tab\n";
print STDERR "reads ../sangerGene/genePred.tab and ",
"../sangerGene/orfToGene.tab\n";
print STDERR "writes references to the given input file, ",
"e.g. blastSGRef01.tab\n";
}
if (1 != $argc) {
usage;
exit 255;
}
my $outFile = shift;
my %orfToGene; # key is all lowercase orf name, value is gene name
my $inFile="../sangerGene/orfToGene.tab";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $line=<FH>) {
chomp $line;
my ($orf, $gene) = split('\s+',$line);
$orfToGene{lc($orf)} = $gene
}
close (FH);
my %links; # key is all lowercase orf name, value is SwissProt name
$inFile="../sangerGene/sangerLinks.tab";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $line=<FH>) {
chomp $line;
my ($orf, $SP, $comments) = split('\s+',$line);
$links{lc($orf)} = $SP
}
close (FH);
my %ce6Position; # key is all lowercase orf name, value is ce6 position
$inFile="../sangerGene/genePred.tab";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $line=<FH>) {
chomp $line;
my ($bin, $orf, $chr, $strand, $start, $end, $other) = split('\s+',$line);
$ce6Position{lc($orf)} = sprintf("%s:%d-%d", $chr, $start, $end);
}
close (FH);
open (OF,">$outFile") or die "can not write to $outFile";
$inFile="sgName.lst";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $orf=<FH>) {
chomp $orf;
if (!exists($orfToGene{lc($orf)})) {
$orfToGene{lc($orf)} = "";
}
if (!exists($links{lc($orf)})) {
$links{lc($orf)} = "";
}
if (!exists($ce6Position{lc($orf)})) {
$ce6Position{lc($orf)} = "";
}
printf OF "%s\t%s\t%s\t%s\t%s\n", $orf, $orfToGene{lc($orf)},
$ce6Position{lc($orf)}, $links{lc($orf)}, "";
}
close (FH);
close (OF);
'_EOF_'
# << happy emacs
chmod +x ./mkRef.pl
./mkRef.pl blastSGRef01.tab
wc -l blastSGRef01.tab
# 23741 blastSGRef.01.tab
hgLoadSqlTab ce6 blastSGRef01 ~/kent/src/hg/lib/blastRef.sql \
blastSGRef01.tab
wc -l sgName.lst blastSGRef01.tab ce6SG.psl
# 23741 sgName.lst
# 23741 blastSGRef01.tab
# 23741 ce6SG.psl
# And load the protein sequences
hgPepPred ce6 generic blastSGPep01 ce6SG.fa
############################################################################
# CREATE CONSERVATION WIGGLE WITH PHASTCONS - 6-WAY
# (DONE - 2008-06-18 - Hiram)
# Estimate phastCons parameters
ssh kkstore06
# We need the fasta sequence for this business
cd /cluster/data/ce6
for C in `cut -f1 chrom.sizes | sed -e s/chr//`
do
echo "twoBitToFa -seq=chr${C} ce6.2bit ${C}/chr${C}.fa"
twoBitToFa -seq=chr${C} ce6.2bit ${C}/chr${C}.fa
done
mkdir /cluster/data/ce6/bed/multiz6way/cons
cd /cluster/data/ce6/bed/multiz6way/cons
# Create a starting-tree.mod based on chrV (the largest one)
# Set -windows large enough so it only makes one piece
faSize ../../../V/chrV.fa
# 20919568 bases (0 N's 20919568 real 18018198 upper 2901370 lower)
# in 1 sequences in 1 files
# %13.87 masked total, %13.87 masked real
time nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chrV.maf \
--refseq ../../../V/chrV.fa --in-format MAF \
--windows 30000000,1000 --out-format SS \
--between-blocks 5000 --out-root s1
# Creating partition 1 (column 1 to column 20919568)...
# Writing partition 1 to s1.1-20919568.ss...
# 13 seconds
# the tree in use
cat ../tree-commas.nh
# (((((cb3,caeRem3),caePb2),ce6),caeJap1),priPac1)
# this s1.*.ss should match only one item
time nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
--tree "(((((cb3,caeRem3),caePb2),ce6),caeJap1),priPac1)" \
--out-root starting-tree
# real 0m14.408s
rm s1.*.ss
# add up the C and G:
grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
# 0.383
# This 0.383 is used in the --gc argument below
# Create SS files on san filesystem
# Increasing their size this time from 1,000,000 to 4,000,000 to
# slow down the phastCons pk jobs
ssh kkstore06
mkdir -p /san/sanvol1/scratch/worms/ce6/cons/ss
cd /san/sanvol1/scratch/worms/ce6/cons/ss
time for C in `cut -f1 /cluster/data/ce6/chrom.sizes`
do
if [ -s /cluster/data/ce6/bed/multiz6way/maf/${C}.maf ]; then
mkdir ${C}
echo msa_split $C
chrN=${C/chr/}
chrN=${chrN/_random/}
/cluster/bin/phast/$MACHTYPE/msa_split \
/cluster/data/ce6/bed/multiz6way/maf/${C}.maf \
--refseq /cluster/data/ce6/${chrN}/${C}.fa \
--in-format MAF --windows 4000000,0 --between-blocks 5000 \
--out-format SS --out-root ${C}/${C}
fi
done
# real 1m7.685s
# Create a random list of 50 1 mb regions (do not use the _randoms)
cd /san/sanvol1/scratch/worms/ce6/cons/ss
ls -1l chr*/chr*.ss | grep -v random | \
awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list
# Only 28 lines in stdin (i.e. all are used)
# Set up parasol directory to calculate trees on these regions
ssh pk
mkdir /san/sanvol1/scratch/worms/ce6/cons/treeRun2
cd /san/sanvol1/scratch/worms/ce6/cons/treeRun2
mkdir tree log
# Tuning this loop should come back to here to recalculate
# Create script that calls phastCons with right arguments
# expected-lengths and target-coverage taken from ce4 history
# for treeRun2 --expected-lengths 15 --target-coverage 0.55
# for treeRun1 --expected-lengths 15 --target-coverage 0.60
cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set C=$1:h
mkdir -p log/${C} tree/${C}
/cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \
/cluster/data/ce6/bed/multiz6way/cons/starting-tree.mod \
--gc 0.383 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-lengths 15 --target-coverage 0.55 \
--quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
# << happy emacs
chmod +x makeTree.csh
cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1)
#ENDLOOP
'_EOF_'
# << happy emacs
# Make cluster job and run it
gensub2 ../randomSs.list single template jobList
para create jobList
para try/push/check/etc
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 5941s 99.01m 1.65h 0.07d 0.000 y
# IO & Wait Time: 64s 1.07m 0.02h 0.00d 0.000 y
# Average job time: 214s 3.57m 0.06h 0.00d
# Longest finished job: 309s 5.15m 0.09h 0.00d
# Submission to last job: 326s 5.43m 0.09h 0.00d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ssh kkstore06
cd /san/sanvol1/scratch/worms/ce6/cons/treeRun2
ls -1 tree/chr*/*.cons.mod > cons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
--output-average ../ave.cons.mod > cons_summary.txt 2>&1
ls -1 tree/chr*/*.noncons.mod > noncons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
--output-average ../ave.noncons.mod > noncons_summary.txt
cd ..
cp -p ave.*.mod /cluster/data/ce6/bed/multiz6way/cons
# measuring entropy
# consEntopy <target coverage> <expected lengths>
# ave.cons.mod ave.noncons.mod --NH 9.78
/cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.78 .55 15 \
ave.cons.mod ave.noncons.mod
### for treeRun2: --expected-lengths 15 --target-coverage 0.55
# ( Solving for new omega: 15.000000 76.253180 42.469155 35.389619
# 34.810297 34.805819 )
# Transition parameters: gamma=0.550000, omega=15.000000, mu=0.066667,
# nu=0.081481
# Relative entropy: H=1.172307 bits/site
# Expected min. length: L_min=6.108564 sites
# Expected max. length: L_max=5.516260 sites
# Phylogenetic information threshold: PIT=L_min*H=7.161112 bits
# Recommended expected length: omega=34.805819 sites (for L_min*H=9.780000)
### for treeRun1: --expected-lengths 15 --target-coverage 0.60
# ( Solving for new omega: 15.000000 140.706073 61.140093 42.355284
# 39.561113 39.474822 )
# Transition parameters: gamma=0.600000, omega=15.000000, mu=0.066667,
# nu=0.100000
# Relative entropy: H=1.248356 bits/site
# Expected min. length: L_min=5.363742 sites
# Expected max. length: L_max=5.097039 sites
# Phylogenetic information threshold: PIT=L_min*H=6.695858 bits
# Recommended expected length: omega=39.474822 sites (for L_min*H=9.780000)
ssh pk
# Create cluster dir to do main phastCons run
mkdir /san/sanvol1/scratch/worms/ce6/cons/consRun2
cd /san/sanvol1/scratch/worms/ce6/cons/consRun2
mkdir ppRaw bed
# using the ave.cons.mod and ave.noncons.mod from treeRun2 above:
ALPHABET: A C G T
ORDER: 0
SUBST_MOD: REV
BACKGROUND: 0.308500 0.191500 0.191500 0.308500
RATE_MAT:
-0.875904 0.175256 0.458101 0.242547
0.282331 -1.200805 0.180928 0.737546
0.737985 0.180928 -1.200354 0.281440
0.242547 0.457829 0.174703 -0.875079
TREE: (((((cb3:0.087550,caeRem3:0.073515):0.020860,caePb2:0.090766):0.029808,ce6:0.094770):0.038388,caeJap1:0.109779):0.126548,priPac1:0.126548);
ALPHABET: A C G T
ORDER: 0
SUBST_MOD: REV
BACKGROUND: 0.308500 0.191500 0.191500 0.308500
RATE_MAT:
-0.875904 0.175256 0.458101 0.242547
0.282331 -1.200805 0.180928 0.737546
0.737985 0.180928 -1.200354 0.281440
0.242547 0.457829 0.174703 -0.875079
TREE: (((((cb3:0.328951,caeRem3:0.273909):0.078262,caePb2:0.339349):0.111927,ce6:0.355503):0.144928,caeJap1:0.413391):0.478831,priPac1:0.478831);
# Create script to run phastCons with right parameters
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
cat > doPhast << '_EOF_'
#!/bin/csh -fe
mkdir /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ../ave.cons.mod ../ave.noncons.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
--expected-lengths 15 --target-coverage 0.55 --quiet \
--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/ave.{cons,noncons}.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
# << happy emacs
chmod +x doPhast
# root1 == chrom name, file1 == ss file name without .ss suffix
cat > template << '_EOF_'
#LOOP
doPhast $(root1) $(file1)
#ENDLOOP
'_EOF_'
# << happy emacs
# Create parasol batch and run it
ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list
gensub2 in.list single template jobList
para create jobList
para try/check/push/etc.
# These jobs are very fast and very I/O intensive
# Completed: 29 of 29 jobs
# CPU time in finished jobs: 386s 6.43m 0.11h 0.00d 0.000 y
# IO & Wait Time: 85s 1.42m 0.02h 0.00d 0.000 y
# Average job time: 16s 0.27m 0.00h 0.00d
# Longest finished job: 22s 0.37m 0.01h 0.00d
# Submission to last job: 49s 0.82m 0.01h 0.00d
# combine predictions and transform scores to be in 0-1000 interval
# it uses a lot of memory, so on kolossus:
ssh kolossus
cd /san/sanvol1/scratch/worms/ce6/cons/consRun2
# The sed's and the sort get the file names in chrom,start order
# You might like to verify it is correct by first looking at the
# list it produces:
find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g"
# if that looks right (== in chrom and numerical order), then let it run:
find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
| /cluster/bin/scripts/lodToBedScore /dev/stdin \
> phastConsElements6way.bed
# way too many negative
# scores with --expected-length 15 --target-coverage 0.60
# OK with--expected-length 15 --target-coverage 0.55
cp -p phastConsElements6way.bed /cluster/data/ce6/bed/multiz6way
# Figure out how much is actually covered by the bed file as so:
# Get the non-n genome size from faSize on all chroms:
ssh kkstore06
cd /cluster/data/ce6
faSize */chr*.fa
# 100281426 bases (0 N's 100281426 real 86981803 upper
# 13299623 lower) in 7 sequences in 7 files
# %13.26 masked total, %13.26 masked real
# 100281244 bases (0 N's 100281244 real 87008909 upper
# 13272335 lower) in 7 sequences in 7 files
# %13.24 masked total, %13.24 masked real
cd /san/sanvol1/scratch/worms/ce6/cons/consRun2
# The 100281426 comes from the non-n genome as counted above.
awk '
{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/100281426\n",100.0*sum/100281426,sum}' \
phastConsElements6way.bed
# --expected-length 15 --target-coverage 0.55
# % 29.56 = 100.0*29639749/100281426
# Aiming for %70 coverage in
# the following featureBits measurement on CDS:
# Beware of negative scores when too high. The logToBedScore
# will output an error on any negative scores.
HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits ce6 \
-enrichment sangerGene:cds phastConsElements6way.bed
# --expected-length 15 --target-coverage 0.55
# sangerGene:cds 25.329%, phastConsElements6way.bed 29.557%,
# both 15.469%, cover 61.07%, enrich 2.07x
# in Ce4, this was:
# sangerGene:cds 25.286%, phastConsElements5way.bed 32.368%,
# both 15.733%, cover 62.22%, enrich 1.92x
# Load most conserved track into database
ssh hgwdev
cd /cluster/data/ce6/bed/multiz6way
# the copy was already done above
# cp -p /san/sanvol1/scratch/worms/ce6/cons/consRun2/phastConsElements6way.bed .
time nice -n +19 hgLoadBed ce6 phastConsElements6way \
phastConsElements6way.bed
# Loaded 595094 elements of size 5
# should measure the same as above
time nice -n +19 featureBits ce6 -enrichment sangerGene:cds \
phastConsElements6way
# --expected-length 15 --target-coverage 0.55
# sangerGene:cds 25.329%, phastConsElements6way 29.557%, both 15.469%,
# cover 61.07%, enrich 2.07x
# on Ce4, this was:
time nice -n +19 featureBits ce4 -enrichment sangerGene:cds \
phastConsElements5way
# sangerGene:cds 25.286%, phastConsElements5way 32.368%, both 15.733%,
# cover 62.22%, enrich 1.92x
# Create merged posterier probability file and wiggle track data files
ssh kkstore06
cd /san/sanvol1/scratch/worms/ce6/cons/consRun2
# prepare compressed copy of ascii data values for downloads
ssh pk
cd /san/sanvol1/scratch/worms/ce6/cons/consRun2
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
mkdir -p phastCons6wayScores
for D in ppRaw/chr*
do
C=${D/ppRaw\/}
out=phastCons6wayScores/${C}.data.gz
echo "========================== ${C} ${D}"
find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
gzip > ${out}
done
'_EOF_'
# << happy emacs
chmod +x gzipAscii.sh
time nice -n +19 ./gzipAscii.sh
# real 1m14.178s
# use those file to encode the data for the track:
for C in I II III IV V X M
do
zcat phastCons6wayScores/chr${C}.data.gz
done | wigEncode stdin phastCons6way.wig phastCons6way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# -rw-rw-r-- 1 56813758 Jun 18 12:05 phastCons6way.wib
# -rw-rw-r-- 1 8880383 Jun 18 12:05 phastCons6way.wig
nice -n +19 cp -p phastCons6way.wi? /cluster/data/ce6/bed/multiz6way/
# copy them for downloads
ssh kkstore06
mkdir /cluster/data/ce6/bed/multiz6way/phastCons6wayScores
cd /cluster/data/ce6/bed/multiz6way/phastCons6wayScores
cp -p /san/sanvol1/scratch/worms/ce6/cons/consRun2/phastCons6wayScores/* .
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/ce6
ln -s /cluster/data/ce6/bed/multiz6way/phastCons6wayScores .
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/ce6/bed/multiz6way
ln -s `pwd`/phastCons6way.wib /gbdb/ce6/wib/phastCons6way.wib
nice -n +19 hgLoadWiggle ce6 phastCons6way phastCons6way.wig
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/ce6/bed/multiz6way
nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=ce6 phastCons6way > histogram.data 2>&1
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Worm Ce6 Histogram phastCons6way track"
set xlabel " phastCons6way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
# enable this line in the ce6/trackDb.ra multiz6way track definition:
# wiggle phastCons6way
# to get the graph to display in the genome browser
############################################################################
# ANNOTATE MULTIZ6WAY MAF AND LOAD TABLES (DONE - 2008-06-19 - Hiram)
ssh kolossus
mkdir /cluster/data/ce6/bed/multiz6way/anno
cd /cluster/data/ce6/bed/multiz6way/anno
# these actually already all exist from previous multiple alignments
# if there are done you will see an ls of them
# or else the twoBitInfo command will be echoed, run it if you want
for DB in `cat ../species.lst`
do
CDIR="/cluster/data/${DB}"
if [ ! -f ${CDIR}/${DB}.N.bed ]; then
echo "creating ${DB}.N.bed"
echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
else
ls -og ${CDIR}/${DB}.N.bed
fi
done
mkdir maf run
cd run
rm -f sizes nBeds
for DB in `sed -e "s/ce6 //" ../../species.lst`
do
echo "${DB} "
ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
echo ${DB}.len >> sizes
done
cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set dir = /cluster/data/ce6/bed/multiz6way
set c = $1
cat $dir/maf/${c}.maf | nice \
mafAddIRows -nBeds=nBeds stdin /cluster/data/ce6/ce6.2bit $2
'_EOF_'
# << happy emacs
chmod +x doAnno.csh
cat << '_EOF_' > template
#LOOP
./doAnno.csh $(root1) {check out line+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 /cluster/data/ce6/chrom.sizes > chrom.list
gensub2 chrom.list single template jobList
# Completed: 7 of 7 jobs
# CPU time in finished jobs: 217s 3.61m 0.06h 0.00d 0.000 y
# IO & Wait Time: 60s 1.01m 0.02h 0.00d 0.000 y
# Average job time: 40s 0.66m 0.01h 0.00d
# Longest finished job: 55s 0.92m 0.02h 0.00d
# Submission to last job: 58s 0.97m 0.02h 0.00d
# Load anno/maf
ssh hgwdev
cd /cluster/data/ce6/bed/multiz6way/anno/maf
mkdir -p /gbdb/ce6/multiz6way/anno/maf
ln -s /cluster/data/ce6/bed/multiz6way/anno/maf/*.maf \
/gbdb/ce6/multiz6way/anno/maf
time nice -n +19 /cluster/bin/x86_64/hgLoadMaf \
-pathPrefix=/gbdb/ce6/multiz6way/anno/maf ce6 multiz6way
# XXX problem: Can't hSetDb(ce6) after an hAllocConn(hg18), sorry.
# Loaded 420764 mafs in 7 files from /gbdb/ce6/multiz6way/anno/maf
# real 0m9.567s
# Do the computation-intensive part of hgLoadMafSummary on a workhorse
# machine and then load on hgwdev:
ssh kolossus
cd /cluster/data/ce6/bed/multiz6way/anno/maf
time cat *.maf | \
nice -n +19 hgLoadMafSummary ce6 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 -test multiz6waySummary stdin
# Created 45771 summary blocks from 1051000 components
# and 420693 mafs from stdin
# real 0m16.138s
# -rw-rw-r-- 1 2111948 Jun 20 10:25 multiz6waySummary.tab
ssh hgwdev
cd /cluster/data/ce6/bed/multiz6way/anno/maf
time nice -n +19 hgLoadSqlTab ce6 multiz6waySummary \
~/kent/src/hg/lib/mafSummary.sql multiz6waySummary.tab
# real 0m0.645
# Create per-chrom individual maf files for downloads
ssh kkstore02
cd /cluster/data/ce6/bed/multiz6way
mkdir mafDownloads
time for M in anno/maf/chr*.maf
do
B=`basename $M`
nice -n +19 cp -p ${M} mafDownloads/${B}
nice -n +19 gzip mafDownloads/${B}
echo ${B} done
done
ssh hgwdev
cd /cluster/data/ce6/bed/multiz6way
# redone 2007-12-21 to fix difficulty in mafFrags when ce6 was not the
# first species listed
# upstream mafs
# There isn't any refGene table, trying sangerGene instead
#!/bin/sh
for S in 1000 2000 5000
do
echo "making upstream${S}.maf"
nice -n +19 $HOME/bin/$MACHTYPE/featureBits -verbose=2 ce6 \
sangerGene:upstream:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
| $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ce6 multiz6way \
stdin stdout -orgs=species.lst \
| gzip -c > mafDownloads/sangerGene.upstream${S}.maf.gz
echo "done sangerGene.upstream${S}.maf.gz"
done
cd mafDownloads
md5sum *.gz > md5sum.txt
# deliver to downloads
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/ce6/multiz6way
cd /usr/local/apache/htdocs/goldenPath/ce6/multiz6way
ln -s /cluster/data/ce6/bed/multiz6way/mafDownloads/* .
#######################################################################
# MULTIZ5WAY MAF FRAMES (DONE - 2007-05-03 - Hiram)
ssh hgwdev
mkdir /cluster/data/ce6/bed/multiz6way/frames
cd /cluster/data/ce6/bed/multiz6way/frames
mkdir genes
# The following is adapted from the ce4 5-way sequence
#------------------------------------------------------------------------
# get the genes for all genomes
# using sangerGene for ce6
# using blastCe6SG for cb3, caePb2, caeRem3, caeJap1 priPac1
for qDB in ce6 cb3 caePb2 caeRem3 caeJap1 priPac1
do
if [ $qDB = "cb3" -o $qDB = "caePb2" -o $qDB = "caeRem3" -o $qDB = "caeJap1" -o $qDB = "priPac1" ]; then
geneTbl=blastCe6SG
echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB}
hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-100 \
> /scratch/tmp/$qDB.tmp.psl
mrnaToGene -allCds /scratch/tmp/$qDB.tmp.psl stdout \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$qDB.tmp.gz
rm -f /scratch/tmp/$qDB.tmp.psl
mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
elif [ $qDB = "ce6" ]; then
geneTbl=sangerGene
echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB}
hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$qDB.tmp.gz
mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
fi
done
ssh kkstore06
cd /cluster/data/ce6/bed/multiz6way/frames
cat ../maf/*.maf | nice -n +19 genePredToMafFrames ce6 \
stdin stdout ce6 genes/ce6.gp.gz cb3 \
genes/cb3.gp.gz caePb2 genes/caePb2.gp.gz caeRem3 \
genes/caeRem3.gp.gz caeJap1 genes/caeJap1.gp.gz \
priPac1 genes/priPac1.gp.gz \
| gzip > multiz6way.mafFrames.gz
ssh hgwdev
cd /cluster/data/ce6/bed/multiz6way/frames
nice -n +19 hgLoadMafFrames ce6 multiz6wayFrames multiz6way.mafFrames.gz
##########################################################################
# verify all.joiner entries (DONE - 2008-06-20 - Hiram)
# try to get joinerCheck tests clean output on these commands
ssh hgwdev
cd ~/kent/src/hg/makeDb/schema
joinerCheck -database=ce6 -tableCoverage all.joiner
joinerCheck -database=ce6 -keys all.joiner
joinerCheck -database=ce6 -times all.joiner
##########################################################################
# make downloads (DONE - 2008-06-20 - Hiram)
# verify all tracks have html pages for their trackDb entries
cd /cluster/data/ce6
/cluster/bin/scripts/makeDownloads.pl ce6
# fix the README files
# re-build the upstream files:
cd /cluster/data/ce6/goldenPath/bigZips
for size in 1000 2000 5000
do
echo $size
featureBits ce6 sangerGene:upstream:$size -fa=stdout \
| gzip -c > sangerGene.upstream$size.fa.gz
done
##########################################################################
# create pushQ entry (DONE - 2008-06-20 - Hiram)
# verify all tracks have html pages for their trackDb entries
ssh hgwdev
cd /cluster/data/ce6/jkStuff
/cluster/bin/scripts/makePushQSql.pl ce6 > ce6.pushQ.sql
ce6 does not have seq
Could not tell (from trackDb, all.joiner and hardcoded lists of supporting
and genbank tables) which tracks to assign these tables to:
blastSGPep01
blastSGRef01
multiz6wayFrames
#############################################################################
# Create 6-way downloads (DONE - 2008-06-23 - Hiram)
ssh hgwdev
mkdir /cluster/data/ce6/goldenPath/multiz6way
mkdir /cluster/data/ce6/goldenPath/phastCons6way
cd /cluster/data/ce6/goldenPath/phastCons6way
ln -s ../../bed/multiz6way/phastCons6wayScores/*.gz .
ln -s ../../bed/multiz6way/cons/ave.cons.mod 6way.conserved.mod
ln -s ../../bed/multiz6way/cons/ave.noncons.mod 6way.non-conserved.mod
md5sum *.gz *.mod > md5sum.txt
# copy a README and fix it up for here:
cp /usr/local/apache/htdocs/goldenPath/calJac1/phastCons9way/README.txt .
mkdir /cluster/data/ce6/goldenPath/multiz6way
cd /cluster/data/ce6/goldenPath/multiz6way
cp /usr/local/apache/htdocs/goldenPath/calJac1/multiz9way/README.txt .
# edit that README.txt to be correct for this 6-way alignment
ssh kkstore06
cd /cluster/data/ce6/goldenPath/multiz6way
ln -s ../../bed/multiz6way/6way.nh ./ce6.6way.nh
cp -p ../../bed/multiz6way//anno/maf/*.maf .
gzip *.maf
ssh hgwdev
cd /cluster/data/ce6/goldenPath/multiz6way
# creating upstream files from sangerGene, bash script:
cat << '_EOF_' > mkUpstream.sh
#!/bin/bash
DB=ce6
GENE=sangerGene
NWAY=multiz6way
export DB GENE
for S in 1000 2000 5000
do
echo "making upstream${S}.maf"
featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
| $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \
stdin stdout \
-orgs=/cluster/data/${DB}/bed/${NWAY}/species.lst \
| gzip -c > upstream${S}.maf.gz
echo "done upstream${S}.maf.gz"
done
'_EOF_'
# << happy emacs
chmod +x ./mkUpstream.sh
time nice -n +19 ./mkUpstream.sh
# real 119m5.562s
# -rw-rw-r-- 1 42975041 Mar 28 14:27 upstream1000.maf.gz
# -rw-rw-r-- 1 76363192 Mar 28 15:03 upstream2000.maf.gz
# -rw-rw-r-- 1 303870318 Mar 28 15:42 upstream5000.maf.gz
# check the names in these upstream files to ensure sanity:
zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \
| sort | uniq -c | sort -rn | less
# should be a list of the other 4 species with a high count,
# then sangerGene names, e.g.:
# 10134 priPac1
# 10134 cb3
# 10134 caeRem3
# 10134 caePb2
# 10134 caeJap1
# 1 cTel55X.1a.1
# 1 ZK993.1
# 1 ZK973.9
# 1 ZK973.8
# 1 ZK973.3
# ... etc ...
ssh kkstore06
cd /cluster/data/ce6/goldenPath/multiz6way
md5sum *.gz *.nh > md5sum.txt
ssh hgwdev
cd /cluster/data/ce6/goldenPath/phastCons6way
mkdir /usr/local/apache/htdocs/goldenPath/ce6/phastCons6way
ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/ce6/phastCons6way
cd /cluster/data/ce6/goldenPath/multiz6way
mkdir /usr/local/apache/htdocs/goldenPath/ce6/multiz6way
ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/ce6/multiz6way
# if your ln -s `pwd`/* made extra links to files you don't want there,
# check the goldenPath locations and remove those extra links
##########################################################################
# Make protein blast runs for Gene Sorter and Known Genes on Hg18
ssh hgwdev
mkdir /cluster/data/ce6/bed/hgNearBlastp
cd /cluster/data/ce6/bed/hgNearBlastp
# Make sure there are no joinerCheck errors at this point, because
# if left here they can spread via doHgNearBlastp:
runJoiner.csh ce6 sangerPep
# ce6.kimExpDistance.query - hits 13356000 of 19134000 ok
# ce6.kimExpDistance.target - hits 12973822 of 19134000 ok
# ce6.orfToGene.name - hits 30296 of 30296 ok
# ce6.sangerBlastTab.query - hits 1255312 of 1255312 ok
# ce6.sangerBlastTab.target - hits 1255312 of 1255312 ok
# ce6.sangerCanonical.transcript - hits 20051 of 20051 ok
# ce6.sangerIsoforms.transcript - hits 30296 of 30296 ok
# ce6.sangerLinks.orfName - hits 30115 of 30115 ok
# ce6.sangerPep.name - hits 23771 of 23771 ok
# ce6.sangerToRefSeq.name - hits 29995 of 29995 ok
# ce6.sangerGeneToWBGeneID.sangerGene - hits 30115 of 30115 ok
pepPredToFa ce6 sangerPep ce6.sangerPep.faa
cat << '_EOF_' > config.ra
# Latest human vs. this ce6 worm for Human Gene Sorter
targetGenesetPrefix known
targetDb hg18
queryDbs ce6
hg18Fa /cluster/data/hg18/bed/ucsc.10/ucscGenes.faa
ce6Fa /cluster/data/ce6/bed/hgNearBlastp/ce6.sangerPep.faa
buildDir /cluster/data/ce6/bed/hgNearBlastp
scratchDir /cluster/data/ce6/bed/hgNearBlastp/tmpHgNearBlastp
'_EOF_'
time nice -n +19 /cluster/bin/scripts/doHgNearBlastp.pl -workhorse=hgwdev \
-noLoad -verbose=2 config.ra > do.log 2>&1 &
##########################################################################
## creating xxBlastTab tables (DONE - 2008-07-11 - Hiram)
# Make sure there are no joinerCheck errors at this point, because
# if left here they can spread via doHgNearBlastp:
ssh hgwdev
runJoiner.csh ce6 sangerPep
# ce6.kimExpDistance.query - hits 13356000 of 19134000 ok
# ce6.kimExpDistance.target - hits 12973822 of 19134000 ok
# ce6.orfToGene.name - hits 30296 of 30296 ok
# ce6.sangerBlastTab.query - hits 1255312 of 1255312 ok
# ce6.sangerBlastTab.target - hits 1255312 of 1255312 ok
# ce6.sangerCanonical.transcript - hits 20051 of 20051 ok
# ce6.sangerIsoforms.transcript - hits 30296 of 30296 ok
# ce6.sangerLinks.orfName - hits 30115 of 30115 ok
# ce6.sangerPep.name - hits 23771 of 23771 ok
# ce6.sangerToRefSeq.name - hits 29995 of 29995 ok
# ce6.sangerGeneToWBGeneID.sangerGene - hits 30115 of 30115 ok
ssh hgwdev
mkdir /cluster/data/ce6/bed/hgNearBlastp/080711
cd /cluster/data/ce6/bed/hgNearBlastp/080711
# Get the proteins used by the other hgNear organisms:
pepPredToFa hg18 knownGenePep hg18.known.faa
pepPredToFa mm9 knownGenePep mm9.known.faa
pepPredToFa rn4 knownGenePep rn4.known.faa
pepPredToFa danRer5 ensPep danRer5.ensPep.faa
pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa
pepPredToFa ce6 sangerPep ce6.sangerPep.faa
pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa
# sanity check, number of lines in each faa file
wc -l *.faa
# 244343 ce6.sangerPep.faa
# 345537 danRer5.ensPep.faa
# 268201 dm3.flyBasePep.faa
# 526563 hg18.known.faa
# 453565 mm9.known.faa
# 86220 rn4.known.faa
# 64956 sacCer1.sgdPep.faa
# sanity check, count the number of protein sequences in each
for F in *.faa
do
echo -n "${F} "
grep "^>" ${F} | wc -l
done
# ce6.sangerPep.faa 23771
# danRer5.ensPep.faa 31743
# dm3.flyBasePep.faa 20736
# hg18.known.faa 45480
# mm9.known.faa 39489
# rn4.known.faa 8160
# sacCer1.sgdPep.faa 5766
cd /cluster/data/ce6/bed/hgNearBlastp
cat << '_EOF_' > config.ra
# Latest worm vs. other Gene Sorter orgs:
# human, mouse, rat, zebrafish, fly, yeast
targetGenesetPrefix sanger
targetDb ce6
queryDbs hg18 mm9 rn4 danRer5 dm3 sacCer1
recipBest hg18 mm9 rn4 danRer5 dm3 sacCer1
ce6Fa /cluster/data/ce6/bed/hgNearBlastp/080711/ce6.sangerPep.faa
hg18Fa /cluster/data/ce6/bed/hgNearBlastp/080711/hg18.known.faa
mm9Fa /cluster/data/ce6/bed/hgNearBlastp/080711/mm9.known.faa
rn4Fa /cluster/data/ce6/bed/hgNearBlastp/080711/rn4.known.faa
danRer5Fa /cluster/data/ce6/bed/hgNearBlastp/080711/danRer5.ensPep.faa
dm3Fa /cluster/data/ce6/bed/hgNearBlastp/080711/dm3.flyBasePep.faa
sacCer1Fa /cluster/data/ce6/bed/hgNearBlastp/080711/sacCer1.sgdPep.faa
buildDir /cluster/data/ce6/bed/hgNearBlastp/080711
scratchDir /san/sanvol1/scratch/ce6HgNearBlastp
'_EOF_'
# << happy emacs
# Run this in a screen since it is going to take awhile
# Run with -noLoad so we can eyeball files.
# Later overload other databases' dmBlastTab tables.
ssh kkstore06
screen
cd
time nice -n +19 doHgNearBlastp.pl -workhorse=hgwdev -clusterHub=pk \
-noLoad config.ra > do.log 2>&1 &
# watch progress in do.log
# load the blast tabs into ce6
ssh hgwdev
cd /cluster/data/ce6/bed/hgNearBlastp/080711
for D in danRer5 dm3 hg18 mm9 rn4 sacCer1
do
cd run.ce6.$D
grep hgLoadBlastTab loadPairwise.csh
./loadPairwise.csh
cd ..
done
#############################################################################
# Lift nucleosome data from ce4 to ce6 (DONE - 2008-09-28,30 - Hiram)
mkdir /hive/data/genomes/ce6/bed/monoNucleosomes
cd /hive/data/genomes/ce6/bed/monoNucleosomes
ln -s ../../../ce4/bed/monoNucleosomes/chrI-M.for.paired.bed.gz \
ce4.nucleosomeControl.fwd.bed.gz
ln -s ../../../ce4/bed/monoNucleosomes/chrI-M.rev.paired.bed.gz \
ce4.nucleosomeControl.rev.bed.gz
ln -s \
/usr/local/apache/htdocs/goldenPath/ce4/liftOver/ce4ToCe6.over.chain.gz \
ce4ToCe6.over.chain.gz
# The name filed isn't being parsed correctly because of the blanks
# take out the blanks so liftOver can read them, put them back
# in after lift
zcat ce4.nucleosomeControl.fwd.bed.gz | sed -e "s/n = /n=/" \
| liftOver stdin ce4ToCe6.over.chain.gz stdout \
ce6.nucleosomeControl.fwd.unMapped \
| sed -e "s/n=/n = /" | gzip > ce6.nucleosomeControl.fwd.bed.gz
# only 5 items do not map correctly
zcat ce4.nucleosomeControl.rev.bed.gz | sed -e "s/n = /n=/" \
| liftOver stdin ce4ToCe6.over.chain.gz stdout \
ce6.nucleosomeControl.rev.unMapped \
| sed -e "s/n=/n = /" | gzip > ce6.nucleosomeControl.rev.bed.gz
# 71 items do not map correctly
LOAD="hgLoadBed -tab -noNameIx ce6"
zcat ce6.nucleosomeControl.fwd.bed.gz ce6.nucleosomeControl.rev.bed.gz \
| ${LOAD} nucleosomeControl stdin
# Loaded 28951421 elements of size 12
# Collapsed nucleosome track files by chromosome:
ln -s ../../../ce4/bed/monoNucleosomes/chrI-M.for.clean.bed.gz \
./ce4.nucleosomeFragmentsSense.bed.gz
ln -s ../../../ce4/bed/monoNucleosomes/chrI-M.rev.clean.bed.gz \
./ce4.nucleosomeFragmentsAntisense.bed.gz
zcat ce4.nucleosomeFragmentsSense.bed.gz | sed -e "s/n = /n=/" \
| liftOver stdin ce4ToCe6.over.chain.gz stdout \
ce6.nucleosomeControlFragmentsSense.unMapped \
| sed -e "s/n=/n = /" | gzip > ce6.nucleosomeFragmentsSense.bed.gz
# four items do not map
zcat ce4.nucleosomeFragmentsAntisense.bed.gz | sed -e "s/n = /n=/" \
| liftOver stdin ce4ToCe6.over.chain.gz stdout \
ce6.nucleosomeControlFragmentsAntisense.unMapped \
| sed -e "s/n=/n = /" | gzip > ce6.nucleosomeFragmentsAntisense.bed.gz
# 34 items do not map
zcat ce6.nucleosomeFragmentsSense.bed.gz \
| ${LOAD} nucleosomeFragmentsSense stdin
# Loaded 11279598 elements of size 12
zcat ce6.nucleosomeFragmentsAntisense.bed.gz \
| ${LOAD} nucleosomeFragmentsAntisense stdin
# Loaded 11269358 elements of size 12
# these three tracks grouped into the composite: "nucleosome"
# labels:
# composite container: Nucleosome predictions from SOLidD core alignments
# Control - : mononucleosome control
# fragments + : mononucleosomal fragments, sense strand reads
# fragments - : mononucleosomal fragments, antisense strand reads
# Positioning stringency wig track
ln -s \
../../../ce4/bed/monoNucleosomes/chrI-M_positioning_stringency.wig.gz \
./ce4.nucleosomeStringency.wigVariable.txt.gz
zcat ce4.nucleosomeStringency.wigVariable.txt.gz \
| varStepToBedGraph.pl stdin | liftOver stdin ce4ToCe6.over.chain.gz \
stdout ce6.nucleosomeStringency.unMapped \
| gzip > ce6.nucleosomeStringency.bedGraph.gz
# 11 items do not lift
cat << '_EOF_' > bedGraphToVarStep.sh
#!/bin/sh
SPAN=1
SRC0=ce4.nucleosomeStringency.wigVariable.txt.gz
SRC=ce6.nucleosomeStringency.bedGraph.gz
R=ce6.nucleosomeStringency.wigVariable.txt
rm -f ${R}
zcat ${SRC0} | egrep "^browser|^track" > ${R}
zcat ${SRC} | grep "^chr" | cut -f1 | sort -u > chr.list
cat chr.list | while read C
do
echo "variableStep chrom=${C} span=${SPAN}" >> ${R}
zcat "${SRC}" \
| awk '{if (match($1,"^'"${C}"'$")) { print } }' | sort -k2n | awk '
{
printf "%d\t%g\n", $2+1, $4
}
' >> ${R}
done
rm -f chr.list
'_EOF_'
# << happy emacs
time ./bedGraphToVarStep.sh
# real 35m15.201s
gzip ce6.nucleosomeStringency.wigVariable.txt
zcat ce6.nucleosomeStringency.wigVariable.txt.gz \
| wigEncode stdin nucleosomeStringency.wig nucleosomeStringency.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
ln -s `pwd`/nucleosomeStringency.wib /gbdb/ce6/wib
# Positioning stringency wig track
hgLoadWiggle ce6 nucleosomeStringency nucleosomeStringency.wig
# NSome Stringency
# Nucleosome Positioning Stringency from SOLiD core alignments
# Adjusted nucleosome coverage wig track:
ln -s \
../../../ce4/bed/monoNucleosomes/adjusted_coverage_chrI-M.wig.gz \
./ce4.nucleosomeAdjustedCoverage.wigVariable.txt.gz
zcat ce4.nucleosomeAdjustedCoverage.wigVariable.txt.gz \
| varStepToBedGraph.pl stdin | liftOver stdin ce4ToCe6.over.chain.gz \
stdout ce6.nucleosomeAdjustedCoverage.unMapped \
| gzip > ce6.nucleosomeAdjustedCoverage.bedGraph.gz
time ./bedGraphToVarStep.sh \
ce4.nucleosomeAdjustedCoverage.wigVariable.txt.gz \
ce6.nucleosomeAdjustedCoverage.bedGraph.gz \
ce6.nucleosomeAdjustedCoverage.wigVariable.txt
# real 27m40.408s
time gzip ce6.nucleosomeAdjustedCoverage.wigVariable.txt
# real 3m27.146s
time wigEncode ce6.nucleosomeAdjustedCoverage.wigVariable.txt.gz \
nucleosomeAdjustedCoverage.wig nucleosomeAdjustedCoverage.wib
# real 1m25.770s
# Converted ce6.nucleosomeAdjustedCoverage.wigVariable.txt.gz
# upper limit 8.76, lower limit -11.93
ln -s `pwd`/nucleosomeAdjustedCoverage.wib /gbdb/ce6/wib
hgLoadWiggle ce6 nucleosomeAdjustedCoverage nucleosomeAdjustedCoverage.wig
# Adj NSome Covrg
# Adjusted nucleosome coverage (25bp)
# MNase Coverage
# Micrococcal nuclease control coverage
# Cntl MNase +Covg
# Micrococcal nuclease control coverage, sense strand reads
# Cntl MNase -Covg
# Micrococcal nuclease control coverage, antisense strand reads
zcat ce6.nucleosomeControl.fwd.bed.gz | sort -k1,1 -k2,2n \
| bedItemOverlapCount ce6 stdin \
| wigEncode stdin nucleosomeControlSenseCoverage.wig \
nucleosomeControlSenseCoverage.wib
# Converted stdin, upper limit 142.00, lower limit 1.00
ln -s `pwd`/nucleosomeControlSenseCoverage.wib /gbdb/ce6/wib
zcat ce6.nucleosomeControl.rev.bed.gz | sort -k1,1 -k2,2n \
| bedItemOverlapCount ce6 stdin \
| wigEncode stdin nucleosomeControlAntisenseCoverage.wig \
nucleosomeControlAntisenseCoverage.wib
# Converted stdin, upper limit 145.00, lower limit 1.00
ln -s `pwd`/nucleosomeControlAntisenseCoverage.wib /gbdb/ce6/wib
hgLoadWiggle ce6 nucleosomeControlSenseCoverage \
nucleosomeControlSenseCoverage.wig
hgLoadWiggle ce6 nucleosomeControlAntisenseCoverage \
nucleosomeControlAntisenseCoverage.wig
mkdir ce4.monoNucleosomes
cd ce4.monoNucleosomes
ln -s ../../../../ce4/bed/monoNucleosomes/stanford/clean/* .
zcat ce4.monoNucleosomes/chr*.forward.bed.gz \
| liftOver stdin ce4ToCe6.over.chain.gz \
stdout ce6.monoNucleosomesSense.unMapped \
| gzip > ce6.monoNucleosomesSense.bed.gz
zcat ce4.monoNucleosomes/chr*.reverse.bed.gz \
| liftOver stdin ce4ToCe6.over.chain.gz \
stdout ce6.monoNucleosomesAntiSense.unMapped \
| gzip > ce6.monoNucleosomesAntiSense.bed.gz
bedItemOverlapCount ce6 ce6.monoNucleosomesSense.bed.gz \
| wigEncode stdin monoNucleosomesSense.wig monoNucleosomesSense.wib
# Converted stdin, upper limit 16195.00, lower limit 1.00
bedItemOverlapCount ce6 ce6.monoNucleosomesAntiSense.bed.gz \
| wigEncode stdin monoNucleosomesAntiSense.wig \
monoNucleosomesAntiSense.wib
# Converted stdin, upper limit 19137.00, lower limit 1.00
ln -s `pwd`/monoNucleosomesSense.wib /gbdb/ce6/wib
ln -s `pwd`/monoNucleosomesAntiSense.wib /gbdb/ce6/wib
# for downloads:
bedItemOverlapCount ce6 ce6.monoNucleosomesSense.bed.gz \
| gzip --rsyncable > monoNucleosomesSense.wigVariable.gz
bedItemOverlapCount ce6 ce6.monoNucleosomesAntiSense.bed.gz \
| gzip --rsyncable > monoNucleosomesAntiSense.wigVariable.gz
# NSome Coverage
# Coverage of nucleosome predictions from SOLiD core alignments
# Control
# mononucleosome control
hgLoadWiggle ce6 monoNucleosomesSense monoNucleosomesSense.wig
# NSome Core +Covg
# Coverage of mononucleosomal fragments, sense strand reads
hgLoadWiggle ce6 monoNucleosomesAntiSense monoNucleosomesAntiSense.wig
# NSome Core -Covg
# Coverage of mononucleosomal fragments, antisense strand reads
#########################################################################
# lift 25mers repeat track from ce4 (DONE - 2008-09-30 - Hiram)
mkdir /cluster/data/ce6/bed/25mersRepeats
cd /cluster/data/ce6/bed/25mersRepeats
ln -s /cluster/data/ce4/bed/25mersRepeats/withScore.bed \
./ce4.25mersRepeats.bed
ln -s \
/usr/local/apache/htdocs/goldenPath/ce4/liftOver/ce4ToCe6.over.chain.gz \
ce4ToCe6.over.chain.gz
liftOver ce4.25mersRepeats.bed ce4ToCe6.over.chain.gz \
ce6.25mersRepeats.bed ce6.unMapped
# no elements failed the eliftOver
hgLoadBed ce6 25mersRepeats ce6.25mersRepeats.bed
# Loaded 757742 elements of size 9
# clean up
rm bed.tab
gzip ce6.25mersRepeats.bed
#########################################################################
# hgPal downloads (DONE braney 2008-09-30)
ssh hgwdev
screen
bash
rm -rf /cluster/data/ce6/bed/multiz6way/pal
mkdir /cluster/data/ce6/bed/multiz6way/pal
cd /cluster/data/ce6/bed/multiz6way/pal
mz=multiz6way
gp=refGene
db=ce6
echo $db > order.lst
echo "select settings from trackDb where tableName='$mz'" | \
hgsql $db | sed 's/\\n/\n/g'| grep speciesOrder | tr ' ' '\n' | \
tail -n +2 >> order.lst
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > refGene.jobs
time sh -x refGene.jobs > refGene.job.log 2>&1 &
sleep 1
tail -f refGene.job.log
# real 14m44.986s
# user 2m26.113s
# sys 1m41.837s
zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
# we're only distributing exons at the moment
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
rm -rf $pd/$gp.exonAA.fa.gz $pd/$gp.exonNuc.fa.gz
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
mz=multiz6way
gp=sangerGene
db=ce6
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > $gp.$mz.jobs
time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
sleep 1
tail -f $gp.$mz.job.log
# real 16m2.266s
# user 2m43.764s
# sys 2m7.173s
zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
# we're only distributing exons at the moment
pd=/usr/local/apache/htdocs/goldenPath/ce6/multiz6way
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
#########################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
ce6.upstreamGeneTbl = refGene
ce6.upstreamMaf = multiz6way /hive/data/genomes/ce6/bed/multiz6way/species.lst
############################################################################
# CE6->CE7 LIFTOVER (DONE - 2008-06-24 - Hiram)
cd /hive/data/genomes/ce6
# test procedure with -debug first
doSameSpeciesLiftOver.pl -bigClusterHub=swarm -workhorse=hgwdev \
-ooc /hive/data/genomes/ce6/jkStuff/11.ooc ce6 ce7 -debug
cd bed/blat.ce7.2009-07-28
time nice -n +19 doSameSpeciesLiftOver.pl -bigClusterHub=swarm \
-workhorse=hgwdev \
-ooc /hive/data/genomes/ce6/jkStuff/11.ooc ce6 ce7 > do.log 2>&1 &
# real 8m7.833s
#########################################################################
# ALG Binding sites (DONE - 2010-01-14 - Hiram)
# From the Gene Yeo lab:
# fetch files
T="http://www.snl.salk.edu/~lovci/Amy/ce6/alltheexperiments/for_ucsc"
one() {
wget --timestamping \
"${T}/$1" -O $1
}
one all_L3_L4.L3_L4.ce6.ingenes.both_strands.reannotated.BED.gz
one all_L3_L4.L3_L4.ce6.ingenes.both_strands.reannotated.BED.gz
one MT1.neg.off.WIG.gz
one MT1.pos.off.WIG.gz
one MT2.neg.off.WIG.gz
one MT2.pos.off.WIG.gz
one MT3.neg.off.WIG.gz
one MT3.pos.off.WIG.gz
one WT1.neg.off.WIG.gz
one WT1.pos.off.WIG.gz
one WT2.neg.off.WIG.gz
one WT2.pos.off.WIG.gz
one WT3.neg.off.WIG.gz
one WT3.pos.off.WIG.gz
one WT_region_specific_clusters.weight_normal.3.rp.bed.WT.good_clusters.25.gz
# encode and load wiggle tracks
one() {
ID=$1
NP=$2
RF=$3
UC=$4
LC=`echo $ID | tr '[A-Z]' '[a-z]'`
zcat $ID.$NP.off.WIG.gz \
| wigEncode stdin ${LC}${UC}.wig ${LC}${UC}.wib > ${LC}$UC.log 2>&1
ln -s `pwd`/${LC}${UC}.wib /gbdb/ce6/wib
hgLoadWiggle ce6 ${LC}${UC} ${LC}${UC}.wig
}
one MT1 neg reverse Reverse
one MT1 pos forward Forward
one MT2 neg reverse Reverse
one MT2 pos forward Forward
one MT3 neg reverse Reverse
one MT3 pos forward Forward
one WT1 neg reverse Reverse
one WT1 pos forward Forward
one WT2 neg reverse Reverse
one WT2 pos forward Forward
one WT3 neg reverse Reverse
one WT3 pos forward Forward
# load BED tables:
zcat WT_region_specific_clusters.weight_normal.3.rp.bed.WT.good_clusters.25.gz \
| grep -v track \
| awk -F'\t' '{printf "%s %d %d %s 1 %s\n", $1, $2, $3, $4, $6}' \
| hgLoadBed -noNameIx ce6 algBindSites stdin
zcat all_L3_L4.L3_L4.ce6.ingenes.both_strands.reannotated.BED.gz \
| grep -v track | awk -F'\t' '{printf "%s %d %d %s 0 %s\n", $1, $2, $3, $4, $6}' | hgLoadBed -noNameIx ce6 algBindGenic stdin
+############################################################################
+# CE6->CE4 LIFTOVER (working - 2010-04-15 - chin)
+
+ mkdir /hive/data/genomes/ce6/bed/blat.ce4.2010-04-15
+ cd /hive/data/genomes/ce6/bed/blat.ce4.2010-04-15
+ # -debug run to create run dir, preview scripts...
+ doSameSpeciesLiftOver.pl -debug -ooc /scratch/data/ce6/11.ooc \
+ ce6 ce4
+ # Real run:
+ time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
+ -ooc /scratch/data/ce6/11.ooc \
+ -bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
+ ce6 ce4 > do.log 2>&1 &
+ # real 6m22.653s
+
#########################################################################