src/hg/makeDb/doc/ce6.txt 1.18
1.18 2010/01/19 20:14:21 hiram
adding algBinding tracks
Index: src/hg/makeDb/doc/ce6.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/ce6.txt,v
retrieving revision 1.17
retrieving revision 1.18
diff -b -B -U 1000000 -r1.17 -r1.18
--- src/hg/makeDb/doc/ce6.txt 28 Jul 2009 21:55:25 -0000 1.17
+++ src/hg/makeDb/doc/ce6.txt 19 Jan 2010 20:14:21 -0000 1.18
@@ -1,2746 +1,2808 @@
# for emacs: -*- mode: sh; -*-
# Caenorhabditis elegans
# Washington University School of Medicine GSC and Sanger Institute WS190
# $Id$
#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2008-05-30 - Hiram)
ssh kkstore06
mkdir /cluster/store3/worm/ce6
ln -s /cluster/store3/worm/ce6 /cluster/data/ce6
mkdir /cluster/data/ce6/downloads
cd /cluster/data/ce6/downloads
for C in I II III IV V X MtDNA
do
F=WS190/CHROMOSOMES/CHROMOSOME_${C}
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/wormbase/${F}.agp"
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/wormbase/${F}.dna.gz"
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/wormbase/${F}.gff.gz"
done
#########################################################################
# NORMALIZE SEQUENCE NAMES TO BEGIN WITH chr (DONE - 2008-05-30 - Hiram)
ssh kkstore06
cd /cluster/data/ce6/downloads
# Fix fasta names:
zcat CHR*.dna.gz \
| sed -e '/^$/ d; s/^>CHROMOSOME_MtDNA/>chrM/; s/^>CHROMOSOME_/>chr/;' \
| gzip -c > UCSC.fa.gz
faSize -detailed UCSC.fa.gz
# chrI 15072421
# chrII 15279323
# chrIII 13783681
# chrIV 17493785
# chrM 13794
# chrV 20919568
# chrX 17718854
# Make sure we get the same sizes from this command:
zcat CHR*.dna.gz | sed -e '/^$/ d;' | faSize -detailed stdin
# Fix AGP names:
sed -e 's/^/chr/' CHR*.agp > UCSC.agp
# And add a fake mitochondrial AGP entry for the sake of downstream
# tools (make sure the GenBank sequence is identical to given):
echo -e "chrM\t1\t13794\t1\tF\tNC_001328.1\t1\t13794\t+" >> UCSC.agp
#########################################################################
# run the makeGenomeDb procedure to create the db and unmasked sequence
# (DONE - 2008-05-30 - Hiram)
ssh kkstore06
cd /cluster/data/ce6
cat << '_EOF_' > ce6.config.ra
# Config parameters for makeGenomeDb.pl:
db ce6
clade worm
genomeCladePriority 10
scientificName Caenorhabditis elegans
commonName C. elegans
assemblyDate May 2008
assemblyLabel Washington University School of Medicine GSC and Sanger Institute WS190
orderKey 826
mitoAcc none
fastaFiles /cluster/data/ce6/downloads/UCSC.fa.gz
agpFiles /cluster/data/ce6/downloads/UCSC.agp
# qualFiles /dev/null
dbDbSpeciesDir worm
'_EOF_'
# << emacs
mkdir jkStuff
# run just to AGP to make sure things are sane first
nice makeGenomeDb.pl ce6.config.ra -stop agp \
> jkStuff/makeGenomeDb.agp 2>&1
# now, continuing to make the Db and all
time nice -n +19 makeGenomeDb.pl ce6.config.ra -continue db \
> jkStuff/makeGenomeDb.log 2>&1
# that takes 2m30s to run to completion
# take the trackDb business there and check it into the source tree
# fixup the description, gap and gold html page descriptions
#########################################################################
# REPEATMASKER (DONE - 2008-05-30 - Hiram)
ssh kkstore06
screen # use screen to control the job
mkdir /cluster/data/ce6/bed/repeatMasker
cd /cluster/data/ce6/bed/repeatMasker
time nice -n +19 doRepeatMasker.pl -bigClusterHub=pk \
-buildDir=/cluster/data/ce6/bed/repeatMasker ce6 > do.log 2>&1 &
# real 133m17.088s
# from the do.log:
# RepeatMasker version development-$Id:
# RepeatMasker,v 1.20 2008/01/16 18:15:45 angie Exp $
# CC RELEASE 20071204; *
#########################################################################
# SIMPLE REPEATS (DONE - 2008-05-30 - Hiram)
ssh kkstore06
screen # use screen to control the job
mkdir /cluster/data/ce6/bed/simpleRepeat
cd /cluster/data/ce6/bed/simpleRepeat
time nice -n +19 doSimpleRepeat.pl -smallClusterHub=kki \
-buildDir=/cluster/data/ce6/bed/simpleRepeat ce6 > do.log 2>&1 &
# about 30 minutes
#########################################################################
# MASK SEQUENCE WITH RM+TRF (DONE - 2008-05-30 - Hiram)
# Since both doRepeatMasker.pl and doSimpleRepeats.pl have completed,
# now it's time to combine the masking into the final ce6.2bit,
# following the instructions at the end of doSimpleRepeat's output.
ssh kolossus
cd /cluster/data/ce6
twoBitMask ce6.rmsk.2bit -add bed/simpleRepeat/trfMask.bed ce6.2bit
# You can safely ignore the warning about extra BED columns
twoBitToFa ce6.2bit stdout | faSize stdin
# 100281426 bases (0 N's 100281426 real 86981803 upper 13299623 lower)
# in 7 sequences in 1 files
# Total size: mean 14325918.0 sd 6728308.1 min 13794 (chrM)
# max 20919568 (chrV) median 15279323
# %13.26 masked total, %13.26 masked real
# set the symlink on hgwdev to /gbdb/ce6
cd /gbdb/ce6
ln -s /cluster/data/ce6/ce6.2bit .
#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2008-05-30 - Hiram)
# Use -repMatch=100 (based on size -- for human we use 1024, and
# worm size is ~3.4% of human judging by gapless ce4 vs. hg18 genome
# size from featureBits. So we would use 34, but that yields a very
# high number of tiles to ignore, especially for a small more compact
# genome. Bump that up a bit to be more conservative.
ssh kolossus
cd /cluster/data/ce6
blat ce6.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=jkStuff/11.ooc -repMatch=100
# Wrote 8523 overused 11-mers to jkStuff/11.ooc
# copy all of this stuff to the Iservers
ssh kkr1u00
mkdir /iscratch/i/worms/ce6
cd /iscratch/i/worms/ce6
cp -p /cluster/data/ce6/ce6.2bit .
cp -p /cluster/data/ce6/jkStuff/11.ooc .
cp -p /cluster/data/ce6/chrom.sizes .
-
cp -p jkStuff/11.ooc /san/sanvol1/scratch/worms/ce6
#########################################################################
# GENBANK AUTO UPDATE (DONE - 2008-05-30 - Hiram)
# align with latest genbank process.
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add ce6 just before ce4
# ce6 (C. elegans)
ce6.serverGenome = /cluster/data/ce6/ce6.2bit
ce6.clusterGenome = /iscratch/i/worms/ce6/ce6.2bit
ce6.ooc = /iscratch/i/worms/ce6/11.ooc
ce6.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter}
ce6.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter}
ce6.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
ce6.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter}
ce6.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter}
ce6.maxIntron = 100000
ce6.lift = no
ce6.genbank.mrna.xeno.load = yes
ce6.refseq.mrna.xeno.load = yes
ce6.downloadDir = ce6
cvs ci -m "Added ce6." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
ssh genbank
screen # use a screen to manage this job
cd /cluster/data/genbank
time nice -n +19 bin/gbAlignStep -initial ce6 &
# logFile: var/build/logs/2008.05.30-15:04:49.ce6.initalign.log
# real 137m58.675s
# real 61m33.930s
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad ce6
# logFile: var/dbload/hgwdev/logs/2008.05.30-20:29:01.dbload.log
# real 23m8.651s
# enable daily alignment and update of hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add ce6 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added ce6 - C. elegans WS190" \
etc/align.dbs etc/hgwdev.dbs
make etc-update
###############################################################################
## SANGER GENE TRACK (DONE - 2008-06-03 - Hiram)
## There is a tremendous amount of extraneous notations in the gff
## files. Filter them down to a manageable set, change the chrom
## names, eliminate duplicates, select only what will work in
## ldHgGene
ssh kkstore06
mkdir /cluster/data/ce6/bed/sangerGene
cd /cluster/data/ce6/bed/sangerGene
for C in I II III IV V X
do
echo -n "${C} "
zcat ../../downloads/CHROMOSOME_${C}.gff.gz | \
sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \
s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \
s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \
s/CHROMOSOME_M/chrM/g;" \
-e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \
-e 's/CDS "//' -e 's/"//' \
> chr${C}.gff
done
C=M
echo -n "${C} "
zcat ../../downloads/CHROMOSOME_MtDNA.gff.gz | \
sed -e "s/CHROMOSOME_III/chrIII/g; s/CHROMOSOME_II/chrII/g; \
s/CHROMOSOME_IV/chrIV/g; s/CHROMOSOME_I/chrI/g; \
s/CHROMOSOME_X/chrX/g; s/CHROMOSOME_V/chrV/g; \
s/CHROMOSOME_M/chrM/g; s/chrMtDNA/chrM/g;" \
-e 's/Sequence "\(.*\)"$/\1/' -e 's/Transcript "\(.*\)"$/\1/' \
-e 's/CDS "//' -e 's/"//' \
> chr${C}.gff
for C in I II III IV V X M
do
echo "chr${C}.gff -> filtered.chr${C}.gff"
grep -v "^#" chr${C}.gff | awk -F'\t' '
BEGIN { IGNORECASE=1 }
{
if (match($2,"curated|Coding_transcript")) {
if (match($3,"intron|coding_exon|exon|cds|three_prime_UTR|five_prime_UTR")) {
gsub("coding_exon","CDS",$3)
gsub("Transcript ","",$9)
gsub(" .*","",$9)
gsub("three_prime_UTR","3utr",$3)
gsub("five_prime_UTR","5utr",$3)
for (i = 1; i < 9; ++i) {
printf "%s\t", $i
}
printf "%s\n", $9
}
}
}
' | sort -u | sort -k4n > filtered.chr${C}.gff
done
ssh hgwdev
cd /cluster/data/ce6/bed/sangerGene
nice -n +19 ldHgGene ce6 sangerGene filtered.*.gff
nice -n +19 ldHgGene -out=filteredGenePred.tab ce6 sangerGene filtered.*.gff
# Read 52512 transcripts in 997038 lines in 7 files
# 52512 groups 7 seqs 3 sources 5 feature types
# 30296 gene predictions
## this doesn't work with the bin column, so alter that table
hgsql -e "alter table sangerGene drop column bin;" ce6
## found when trying to click through on sequence links, mRNA or
## protein when on a gene details page
## Create orfToGene table. The tar file containing this information
# was obtained from Anthony Rogers via Email 2008-06-03 ar2@sanger.ac.uk
# They had one duplicate line in their file:
sort -u /cluster/data/ce6/downloads/tarFile/gene_name.txt \
| sed -e "s/ */\t/" > orfToGene.tab
## plus, they didn't include the self with self references, which are
## needed in the gene sorter so it can find its names. So, fixing
# this table:
hgsql -N -e "select name from sangerGene;" ce6 | sort > name.sangerGene.txt
cat << '_EOF_' > fixupOrf.pl
#!/usr/bin/env perl
use strict;
use warnings;
my %sangerGeneName;
open (FH, "<name.sangerGene.txt") or die "can not read name.sangerGene.txt";
while (my $line = <FH>) {
chomp $line;
$sangerGeneName{$line} = 1;
}
close (FH);
open (FH,"<orfToGene.tab") or die "can not read orfToGene.tab";
my %alias; # key is alias name, value is real name
my %realNames; # key is real name, value is 1 (unimportant)
my %twoDot; # key is two dot name, value is 1 (unimportant)
while (my $line = <FH>) {
chomp $line;
my ($alias, $real) = split('\s+', $line);
die "duplicate alias name found: $alias" if (exists($alias{$alias}));
if (exists($sangerGeneName{$alias})) {
$alias{$alias} = $real;
$realNames{$real} = 1;
my ($a, $b, $c, $d) = split('\.',$alias);
die "four dot name found $line" if (defined($d));
my $twoDotName = sprintf "%s.%s", $a, $b;
if (exists($sangerGeneName{$twoDotName})) {
$twoDot{$twoDotName} = $real;
}
}
}
close (FH);
my $count = 0;
my $noReal = 0;
my %needReal;
foreach my $key (keys %alias) {
++$count;
my $realName = $alias{$key};
if (!exists($alias{$realName})) {
if (!exists($needReal{$realName}) &&
exists($sangerGeneName{$realName})) {
++$noReal;
$needReal{$realName} = $key;
my ($a, $b, $c, $d) = split('\.',$realName);
die "four dot name found $realName" if (defined($d));
my $twoDotName = sprintf "%s.%s", $a, $b;
if (exists($sangerGeneName{$twoDotName})) {
$twoDot{$twoDotName} = $key;
}
}
}
}
print STDERR "found $count alias names\n";
print STDERR "can not find $noReal real names in alias list\n";
$count = 0;
foreach my $key (keys %realNames) {
++$count;
}
print STDERR "found $count real names\n";
$noReal = 0;
foreach my $key (keys %needReal) {
++$noReal;
print STDERR "$key $needReal{$key}\n" if ($noReal < 10);
}
print STDERR "need to add $noReal real names\n";
open (FH,">fixedOrfToGene.tab") or die "can not write to fixedOrfToGene.tab";
foreach my $key (keys %alias) {
printf FH "%s\t%s\n", $key, $alias{$key};
}
foreach my $key (keys %needReal) {
printf FH "%s\t%s\n", $key, $key;
}
foreach my $key (keys %twoDot) {
if ( (!exists($alias{$key})) && (!exists($needReal{$key})) ) {
print STDERR "need name for $key $twoDot{$key}\n";
printf FH "%s\t%s\n", $key, $twoDot{$key};
}
}
foreach my $key (keys %sangerGeneName) {
if ( (!exists($alias{$key})) && (!exists($needReal{$key})) ) {
if ( !exists($twoDot{$key}) ) {
print STDERR "last chance need name for $key $key\n";
printf FH "%s\t%s\n", $key, $key;
}
}
}
close (FH);
'_EOF_'
# << happy emacs
chmod +x fixupOrf.pl
./fixupOrf.pl > fixupOrf.out 2>&1
hgLoadSqlTab ce6 orfToGene ~/kent/src/hg/lib/orfToGene.sql \
fixedOrfToGene.tab
ln -s /cluster/data/ce6/downloads/tarFile/paragraph.txt ./paragraph.tab
ln -s /cluster/data/ce6/downloads/tarFile/uniprot.txt ./uniprot.tab
cat << '_EOF_' > mkSangerLinks.pl
#!/usr/bin/env perl
use strict;
use warnings;
my %geneToOrf; # key is uc(gene name), value is orf
my %orfToGene; # key is uc(orf), value is gene name
my $orfCount = 0;
open (FH, "<orfToGene.tab") or die "can not read orfToGene.tab";
while (my $line = <FH>) {
chomp $line;
my ($orf, $gene) = split('\t',$line);
die "gene name zero length" if (length($gene) < 1);
die "orf name zero length" if (length($orf) < 1);
$geneToOrf{uc($gene)} = $orf;
$orfToGene{uc($orf)} = $gene;
++$orfCount;
}
close (FH);
printf STDERR "read $orfCount orf designations from orfToGene.tab\n";
open (LS,">sangerLinks.tab") or die "can not write to sangerLinks.tab";
my $geneCount = 0;
open (FH, "sort -u paragraph.tab|") or die "can not read paragraph.tab";
while (my $line = <FH>) {
chomp $line;
my ($orf, $descr) = split('\t',$line);
die "gene name zero length" if (length($orf) < 1);
die "can not find orfToGene $orf" if (!exists($orfToGene{uc($orf)}));
my $orfGene = $orfToGene{uc($orf)};
die "orfGene name zero length" if (length($orfGene) < 1);
printf LS "%s\t%s\t%s\n", $orf, $orfGene, $descr;
++$geneCount;
}
close (FH);
printf STDERR "read $geneCount gene descriptions from paragraph.tab\n";
close (LS);
'_EOF_'
# << happy emacs
chmod +x mkSangerLinks.pl
./mkSangerLinks.pl
# read 33792 orf designations from orfToGene.tab
# read 33793 gene descriptions from paragraph.tab
# There is problem in paragraph.tab, it contains a duplicate entry:
F57C7.4 F57C7.4 contains similarity to Interpro domain IPR008972 ()
F57C7.4 F57C7.4 contains similarity to Plasmodium falciparum Hypothetical protein.\; TR:Q8I5U3
# manually fixup this one to elimiate the second definition
hgLoadSqlTab ce6 sangerLinks ~/kent/src/hg/lib/sangerLinks.sql \
sangerLinks.tab
# Add proteinID field to sangerGene table, used by Gene Sorter
hgsql -e 'alter table sangerGene add proteinID varchar(40) NOT NULL;' ce6
# To add index on this column
hgsql -e 'alter table sangerGene add index(proteinID);' ce6
# Use SwissProt IDs in sangerLinks table to fill proteinID
# column in the sangerGene table
hgsql -e 'update sangerGene as g, sangerLinks as l \
set g.proteinID = l.protName where g.name = l.orfName;' ce6
# check there are rows with the protName field filled
hgsql -N -e "select proteinId from sangerGene" ce6 | sort | uniq -c | wc -l
# 20127
hgLoadSqlTab ce6 sangerLinks ~/kent/src/hg/lib/sangerLinks.sql \
sangerLinks.tab
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/wormbase/WS190/wormpep190.tar.gz"
wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/wormbase/WS190/wormrna190.tar.gz"
tar xvzf wormrna190.tar.gz
mv README README.wormrna190
tar xvzf wormpep190.tar.gz
# creates files in directory wormpep190:
# -rw-rw-r-- 1 32316202 Apr 7 06:38 wormpep.dna190
# -rw-rw-r-- 1 980303 Apr 7 06:41 wormpep.history190
# -rw-rw-r-- 1 4826 Apr 7 06:41 wormpep.diff190
# -rw-rw-r-- 1 19139454 Apr 7 06:41 wormpep.fasta190
# -rw-rw-r-- 1 398315 Apr 7 06:41 wormpep.accession190
# -rw-rw-r-- 1 13244499 Apr 15 06:23 wormpep190
# -rw-rw-r-- 1 2608936 Apr 15 06:23 wormpep.table190
hgPepPred ce6 generic sangerPep wormpep190/wormpep190
###############################################################################
######## MAKING GENE SORTER TABLES #### (DONE - 2008-06-04 - Hiram)
# These are instructions for building the
# Gene Sorter browser. Don't start these until
# there is a sangerGene table with a proteinID column containing Swiss-Prot
# protein IDs, and also Kim lab expression data is required (in hgFixed).
# Cluster together various alt-splicing isoforms.
# Creates the sangerIsoforms and sangerCanonical tables
ssh hgwdev
hgClusterGenes ce6 sangerGene sangerIsoforms sangerCanonical
# Got 20051 clusters, from 30296 genes in 7 chromosomes
featureBits ce6 sangerCanonical
# 57412493 bases of 100281426 (57.251%) in intersection
featureBits ce4 sangerCanonical
# 57092312 bases of 100281244 (56.932%) in intersection
featureBits ce2 sangerCanonical
# 54156601 bases of 100291761 (53.999%) in intersection
featureBits ce1 sangerCanonical
# 53654286 bases of 100277784 (53.506%) in intersection
# Create Self blastp table - sangerBlastTab
# Get sangerPep peptide sequences fasta file from sangerPep dir
# and create a blast database out of them.
mkdir -p /cluster/data/ce6/bed/blastp
cd /cluster/data/ce6/bed/blastp
cat << '_EOF_' > pepTabToFa.pl
#!/usr/bin/env perl
use strict;
use warnings;
open (FH,"<../sangerGene/sangerPep.tab") or
die "can not open ../sangerGene/sangerPep.tab";
while (my $line = <FH>) {
chomp $line;
my ($name,$seq) = split('\s+',$line);
printf ">%s\n%s\n", $name, $seq;
}
close (FH);
'_EOF_'
# << happy emacs
chmod +x ./pepTabToFa.pl
./pepTabToFa.pl > wormPep190.faa
/cluster/bluearc/blast229/formatdb -i wormPep190.faa \
-t wormPep190 -n wormPep190
# Copy database over to kluster storage
mkdir -p /san/sanvol1/scratch/worms/ce6/blastp
rsync -a ./ /san/sanvol1/scratch/worms/ce6/blastp/
cd /cluster/data/ce6/bed/blastp
mkdir split
faSplit sequence wormPep190.faa 1000 split/wp
# Make parasol run directory
ssh pk
mkdir -p /cluster/data/ce6/bed/blastp/self
cd /cluster/data/ce6/bed/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat << '_EOF_' > blastSome.csh
#!/bin/csh -ef
setenv BLASTMAT /cluster/bluearc/blast229/data
/cluster/bluearc/blast229/blastall -p blastp \
-d /san/sanvol1/scratch/worms/ce6/blastp/wormPep190 \
-i $1 -o $2 -e 0.01 -m 8 -b 1000
'_EOF_'
# << happy emacs
chmod a+x blastSome.csh
# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome.csh {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'
# << Create parasol batch
ls ../../split/*.fa > split.lst
gensub2 split.lst single gsub jobList
para create jobList
para try.... etc ...
# Completed: 986 of 986 jobs
# CPU time in finished jobs: 18187s 303.12m 5.05h 0.21d 0.001 y
# IO & Wait Time: 2713s 45.22m 0.75h 0.03d 0.000 y
# Average job time: 21s 0.35m 0.01h 0.00d
# Longest finished job: 73s 1.22m 0.02h 0.00d
# Submission to last job: 302s 5.03m 0.08h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/ce6/bed/blastp/self/run/out
hgLoadBlastTab ce6 sangerBlastTab *.tab
# Scanning through 986 files
# Loading database with 1255312 rows
# CREATE EXPRESSION DISTANCE TABLES FOR
# KIM LAB EXPRESSION DATA (this appears to be an expensive command,
# about 10 to 15 minutes or so ?)
hgExpDistance ce6 hgFixed.kimWormLifeMedianRatio \
hgFixed.kimWormLifeMedianExps kimExpDistance
# Have 19134 elements in hgFixed.kimWormLifeMedianRatio
# Got 19134 unique elements in hgFixed.kimWormLifeMedianRatio
# CREATE TABLE TO MAP BETWEEN SANGERGENES AND REFSEQ GENES
# sangerToRefSeq
hgMapToGene ce6 refGene sangerGene sangerToRefSeq
# SET dbDb TABLE ENTRY FOR GENE SORTER (DONE, 2004-05-25, hartera)
# set hgNearOk to 1 on hgcentraltest to make Ce6 Gene Sorter viewable
hgsql -e 'update dbDb set hgNearOk = 1 where name = "ce6";' \
hgcentraltest
# Running joinerCheck to see what is sane:
cd ~/kent/src/hg/makeDb/schema
joinerCheck -identifier=wormBaseId -database=ce6 -times -keys ./all.joiner
# ce6.orfToGene.name - hits 30296 of 30296 ok
# ce6.sangerBlastTab.query - hits 1255312 of 1255312 ok
# ce6.sangerBlastTab.target - hits 1255312 of 1255312 ok
# ce6.sangerCanonical.transcript - hits 20051 of 20051 ok
# ce6.sangerIsoforms.transcript - hits 30296 of 30296 ok
# ce6.sangerLinks.orfName - hits 30115 of 30115 ok
# ce6.sangerPep.name - hits 23771 of 23771 ok
# ce6.sangerToRefSeq.name - hits 29995 of 29995 ok
## create a gene name to WBGene ID reference table
## to be used to construct URL
mkdir /cluster/data/ce6/bed/WBGeneID
cd /cluster/data/ce6/bed/WBGeneID
# There was one duplicate entry in the file, the sort -u eliminates it
sed -e "s#http://wormbase.org/db/gene/gene?name=##; s#;class=Gene##" \
../../downloads/tarFile/url.txt | sort -u > sangerGeneToWBGeneID.tab
# add in the one-dot names for those names that are only two-dot names
# in this file:
cat << '_EOF_' > addDotName.pl
#!/usr/bin/env perl
use strict;
use warnings;
my %sangerGeneName;
open (FH, "sangerGene.name") or die "can not read sangerGene.name";
while (my $line = <FH>) {
chomp $line;
$sangerGeneName{$line} = 1;
}
close (FH);
open (FH,"<sangerGeneToWBGeneID.tab") or
die "can not read sangerGeneToWBGeneID.tab";
my %twoDotNames;
while (my $line = <FH>) {
chomp $line;
my ($name, $wbId) = split('\s+',$line);
die "duplicate gene name $name $wbId" if (exists($twoDotNames{$name}));
if (exists($sangerGeneName{$name})) {
$twoDotNames{$name} = $wbId;
}
}
close (FH);
my %neededNames;
foreach my $key (keys %twoDotNames) {
my ($a, $b, $c, $d) = split('\.',$key);
my $wbId = $twoDotNames{$key};
die "three dot name found $key $wbId" if (defined($d));
if (defined($c)) {
my $oneDot = sprintf "%s.%s", $a, $b;
if (!exists($twoDotNames{$oneDot})) {
if (!exists($neededNames{$oneDot})) {
$neededNames{$oneDot} = $wbId;
}
}
}
}
foreach my $key (keys %twoDotNames) {
printf "%s\t%s\n", $key, $twoDotNames{$key};
}
foreach my $key (keys %neededNames) {
printf "%s\t%s\n", $key, $neededNames{$key};
}
'_EOF_'
# << happy emacs
chmod +x addDotName.pl
hgsql -N -e "select name from sangerGene;" ce6 | sort -u > sangerGene.name
./addDotName.pl > fixed.sangerGeneToWBGeneID.tab
hgLoadSqlTab ce6 sangerGeneToWBGeneID \
~/kent/src/hg/lib/sangerGeneToWBGeneID.sql \
fixed.sangerGeneToWBGeneID.tab
cd ~/kent/src/hg/makeDb/schema
joinerCheck -identifier=wormBaseId -database=ce6 -times -keys ./all.joiner
# ce6.orfToGene.name - hits 30296 of 30296 ok
# ce6.sangerBlastTab.query - hits 1255312 of 1255312 ok
# ce6.sangerBlastTab.target - hits 1255312 of 1255312 ok
# ce6.sangerCanonical.transcript - hits 20051 of 20051 ok
# ce6.sangerIsoforms.transcript - hits 30296 of 30296 ok
# ce6.sangerLinks.orfName - hits 30115 of 30115 ok
# ce6.sangerPep.name - hits 23771 of 23771 ok
# ce6.sangerToRefSeq.name - hits 29995 of 29995 ok
# ce6.sangerGeneToWBGeneID.sangerGene - hits 30115 of 30115 ok
############################################################################
## fixup sangerLinks table (DONE - 2008-06-03 - Hiram)
## it is missing descriptions for the one dot names where only two dot
## names existed
mkdir /cluster/data/ce6/bed/fixupSangerLinks
cd /cluster/data/ce6/bed/fixupSangerLinks
ln -s ../sangerGene/sangerLinks.tab .
cat << '_EOF_' > slFixup.pl
#!/usr/bin/env perl
use strict;
use warnings;
open (FH,"<sangerLinks.tab") or die "can not read sangerLinks.tab";
my %existing; # key is full gene symbol name, one or two dots
# value is the orfName and description text, tab separated
my %newNeeded; # key is shortened one dot name, value is orfName and
# description text, tab separated
my $newCount = 0;
while (my $line = <FH>) {
chomp $line;
my ($name, $orf, $desc) = split('\t',$line);
my ($a, $b, $c, $d) = split('\.', $name);
die "ERROR: found three dot name at $line" if (defined($d));
if (defined($c)) {
my $oneDotName = sprintf "%s.%s", $a, $b;
if (exists($newNeeded{$oneDotName})) {
my $newString = sprintf "%s\t%s", $orf, $desc;
die "ERROR: new one dot name, different descr $name"
if ($newNeeded{$oneDotName} ne $newString)
} else {
$newNeeded{$oneDotName} = sprintf "%s\t%s", $orf, $desc;
++$newCount;
}
}
die "ERROR: duplicate name $name at $line" if (exists($existing{$name}));
$existing{$name} = sprintf "%s\t%s", $orf, $desc;
}
close (FH);
foreach my $key (keys %newNeeded) {
die "ERROR: duplicate new name $key for $newNeeded{$key}"
if (exists($existing{$key}));
printf "%s\t%s\n", $key, $newNeeded{$key};
}
foreach my $key (keys %existing) {
die "ERROR: duplicate existing name $key for $existing{$key}"
if (exists($newNeeded{$key}));
printf "%s\t%s\n", $key, $existing{$key};
}
'_EOF_'
# << happy emacs
chmod +x ./slFixup.pl
./slFixup.pl > fixedSangerLinks.tab
hgLoadSqlTab ce6 sangerLinks ~/kent/src/hg/lib/sangerLinks.sql \
fixedSangerLinks.tab
# It still has a difficulty because it contains references to the RNA
# genes. joinerCheck complains. So, after loading sangerRnaGene
# below, remove the entries in sangerLinks that are in sangerRnaGene.
# I tried to figure out a delete statement that would do this, but
# to no avail. So, brain dead list management:
hgsql -N -e "select orfName from sangerLinks;" ce6 \
| sort -u > sangerLinks.orfName
hgsql -N -e "select name from sangerRnaGene;" ce6 \
| sort -u > sangerRnaGene.name
# before delete:
hgsql -e "select count(*) from sangerLinks;" ce6
# 36555
comm -12 sangerLinks.orfName sangerRnaGene.name | while read N
do
hgsql -e "delete from sangerLinks where orfName=\"$N\";" ce6
echo $N
done
# after delete:
hgsql -e "select count(*) from sangerLinks;" ce6
# 30115
# 36555 - 30115 = 6440
###############################################################################
## RNA sangerGenes (DONE - 2008-06-03 - Hiram)
ssh kkstore06
cd /cluster/data/ce6/bed/sangerGene
for C in I II III IV V X M
do
echo "chr${C}.gff -> rna.chr${C}.gff"
grep -v "^#" chr${C}.gff | awk -F'\t' '
BEGIN { IGNORECASE=1 }
{
if (match($2,"^miRNA|^ncRNA|^rRNA|^scRNA|^snRNA|^snlRNA|^snoRNA|^tRNA|^tRNAscan-SE-1.23")) {
if (match($3,"exon")) {
gsub("Transcript ","",$9)
gsub(" .*","",$9)
for (i = 1; i < 9; ++i) {
printf "%s\t", $i
}
printf "%s\n", $9
}
}
}
' | sort -u | sort -k4n > rna.chr${C}.gff
done
for C in I II III IV V X M
do
echo "chr${C}.gff -> pseudoGene.chr${C}.gff"
grep -v "^#" chr${C}.gff | awk -F'\t' '
BEGIN { IGNORECASE=1 }
{
if (match($2,"^Pseudogene")) {
if (match($3,"Pseudogene")) {
gsub("Pseudogene ","",$9)
gsub("Pseudogene","CDS",$3)
gsub(" .*","",$9)
gsub("\".*","",$9)
for (i = 1; i < 9; ++i) {
printf "%s\t", $i
}
printf "%s\n", $9
}
}
}
' | sort -u | sort -k4n > pseudoGene.chr${C}.gff
done
ssh hgwdev
cd /cluster/data/ce6/bed/sangerGene
time nice -n +19 ldHgGene ce6 sangerRnaGene rna.*.gff
# Read 6440 transcripts in 6474 lines in 7 files
# 6440 groups 7 seqs 9 sources 1 feature types
# 6440 gene predictions
time nice -n +19 ldHgGene ce6 sangerPseudoGene pseudoGene.*.gff
# Read 1462 transcripts in 1462 lines in 7 files
# 1462 groups 6 seqs 1 sources 1 feature types
# 1462 gene predictions
############################################################################
# BLATSERVERS ENTRY (DONE - 2008-06-04 - Hiram)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("ce6", "blat9", "17788", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("ce6", "blat9", "17789", "0", "1");' \
hgcentraltest
# test it with some sequence
############################################################################
# reset default position, same as ce4 on the ZC101 / unc-52 locus
ssh hgwdev
hgsql -e 'update dbDb set defaultPos="chrII:14646344-14667746"
where name="ce6";' hgcentraltest
##########################################################################
## Blastz SELF (DONE - 2008-06-04 - Hiram
ssh kkstore06
# screen
mkdir /cluster/data/ce6/bed/blastzCe6.2008-06-04
cd /cluster/data/ce6/bed
ln -s blastzCe6.2008-06-04 blastzSelf
cd blastzCe6.2008-06-04
cat << '_EOF_' > DEF
# ce6 vs ce6 - Self alignment
BLASTZ_H=2000
BLASTZ_M=200
# TARGET: elegans Ce6
SEQ1_DIR=/iscratch/i/worms/ce6/ce6.2bit
SEQ1_LEN=/iscratch/i/worms/ce6/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
# QUERY: elegans Ce6
SEQ2_DIR=/iscratch/i/worms/ce6/ce6.2bit
SEQ2_LEN=/iscratch/i/worms/ce6/chrom.sizes
SEQ2_CHUNK=1000000
SEQ2_LAP=10000
BASE=/cluster/data/ce6/bed/blastzCe6.2008-06-04
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl DEF -verbose=2 \
-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=kk \
-smallClusterHub=kki > do.log 2>&1 &
# real 79m26.707s
XXX - running 2008-06-04 12:20
# broke down during loadUp because /iscratch/i/worms/ce6/chrom.sizes
# does not exist on hgwdev. Finish the load manually by fixing
# the chrom.sizes reference in loadUp.csh. Then:
time nice -n +19 doBlastzChainNet.pl DEF -verbose=2 \
-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=kk \
-continue=download -smallClusterHub=kki > download.log 2>&1 &
i
# real 37m22.066s
ssh hgwdev
cd /cluster/data/ce6/bed/blastz.ce6.2008-06-04
nice -n +19 featureBits ce6 chainSelfLink > fb.ce6.chainSelfLink.txt 2>&1
# 31186593 bases of 100281244 (31.099%) in intersection
############################################################################
## BLASTZ caePb2 (DONE - 2008-06-04,09 - Hiram)
ssh kkstore06
screen # use screen to control the job
mkdir /cluster/data/ce6/bed/blastzCaePb2.2008-06-04
cd /cluster/data/ce6/bed/blastzCaePb2.2008-06-04
cat << '_EOF_' > DEF
# ce6 vs caePb2
BLASTZ_H=2000
BLASTZ_M=50
# TARGET: elegans Ce6
SEQ1_DIR=/scratch/data/ce6/ce6.2bit
SEQ1_LEN=/scratch/data/ce6/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
# QUERY: C. PB2801 caePb2
SEQ2_DIR=/scratch/data/caePb2/caePb2.2bit
SEQ2_LEN=/scratch/data/caePb2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/caePb2/caePb2.supercontigs.2bit
SEQ2_CTGLEN=/scratch/data/caePb2/caePb2.supercontigs.sizes
SEQ2_LIFT=/scratch/data/caePb2/caePb2.supercontigs.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/cluster/data/ce6/bed/blastzCaePb2.2008-06-04
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF -verbose=2 -bigClusterHub=kk \
-smallClusterHub=kki > do.log 2>&1 &
# real 32m29.730s
# trouble with chaining due to no copies of genomes on /scratch/data/
# for the Iservers
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF -verbose=2 -bigClusterHub=kk \
-continue=chainMerge -smallClusterHub=kki > chainMerge.log 2>&1 &
# real 26m55.624s
# forgot the -qRepeats=windowmaskerSdust
rm axtChain/ce6.caePb2.net
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF -verbose=2 -bigClusterHub=kk -qRepeats=windowmaskerSdust \
-continue=load -smallClusterHub=kki > load.log 2>&1 &
# real 1m18.270s
cat fb.ce6.chainCaePb2Link.txt
# 40805712 bases of 100281426 (40.691%) in intersection
# swap, this is also in caePb2.txt
mkdir /cluster/data/caePb2/bed/blastz.ce6.swap
cd /cluster/data/caePb2/bed/blastz.ce6.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-qRepeats=windowmaskerSdust \
/cluster/data/ce6/bed/blastzCaePb2.2008-06-04/DEF \
-bigClusterHub=kk -smallClusterHub=kki -swap > swap.log 2>&1 &
# a couple of minutes
cat fb.caePb2.chainCe6Link.txt
# 55092134 bases of 170473138 (32.317%) in intersection
############################################################################
## BLASTZ caeRem3 (DONE - 2008-06-04,09 - Hiram)
ssh kkstore06
screen # use screen to control the job
mkdir /cluster/data/ce6/bed/blastzCaeRem3.2008-06-04
cd /cluster/data/ce6/bed/blastzCaeRem3.2008-06-04
cat << '_EOF_' > DEF
# ce6 vs caeRem3
BLASTZ_H=2000
BLASTZ_M=50
# TARGET: elegans Ce6
SEQ1_DIR=/scratch/data/ce6/ce6.2bit
SEQ1_LEN=/scratch/data/ce6/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
# QUERY: C. PB2801 caeRem3
SEQ2_DIR=/scratch/data/caeRem3/caeRem3.2bit
SEQ2_LEN=/scratch/data/caeRem3/chrom.sizes
SEQ2_CTGDIR=/scratch/data/caeRem3/caeRem3.supercontigs.2bit
SEQ2_CTGLEN=/scratch/data/caeRem3/caeRem3.supercontigs.sizes
SEQ2_LIFT=/scratch/data/caeRem3/caeRem3.chrUn.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/cluster/data/ce6/bed/blastzCaeRem3.2008-06-04
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF -verbose=2 -bigClusterHub=kk \
-smallClusterHub=kki > do.log 2>&1 &
# real 28m40.865s
# trouble with chaining due to no copies of genomes on /scratch/data/
# for the Iservers
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF -verbose=2 -bigClusterHub=kk \
-continue=chainMerge -smallClusterHub=kki > chainMerge.log 2>&1 &
# real 19m30.231s
# forgot the -qRepeats=windowmaskerSdust, went back and fixed that
# manually in loadUp.csh and reran the load of the net track
cat fb.ce6.chainCaeRem3Link.txt
# 41853863 bases of 100281426 (41.736%) in intersection
# swap, this is also in caeRem3.txt
mkdir /cluster/data/caeRem3/bed/blastz.ce6.swap
cd /cluster/data/caeRem3/bed/blastz.ce6.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-qRepeats=windowmaskerSdust \
/cluster/data/ce6/bed/blastzCaeRem3.2008-06-04/DEF \
-bigClusterHub=kk -smallClusterHub=kki -swap > swap.log 2>&1 &
cat fb.caeRem3.chainCe6Link.txt
# 46326709 bases of 138406388 (33.472%) in intersection
# something unusual happened with ssh during installDownloads.csh,
# continuing after finishing that manually:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-continue=cleanup -qRepeats=windowmaskerSdust \
/cluster/data/ce6/bed/blastzCaeRem3.2008-06-04/DEF \
-bigClusterHub=kk -smallClusterHub=kki -swap > cleanUp.log 2>&1 &
############################################################################
## BLASTZ cb3 (DONE - 2008-06-04,09 - Hiram)
ssh kkstore06
screen # use screen to control the job
mkdir /cluster/data/ce6/bed/blastzCb3.2008-06-04
cd /cluster/data/ce6/bed/blastzCb3.2008-06-04
cat << '_EOF_' > DEF
# ce6 vs cb3
BLASTZ_H=2000
BLASTZ_M=50
# TARGET: elegans Ce6
SEQ1_DIR=/scratch/data/ce6/ce6.2bit
SEQ1_LEN=/scratch/data/ce6/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
# QUERY: C. briggsae cb3
SEQ2_DIR=/san/sanvol1/scratch/worms/cb3/cb3.rmskTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/worms/cb3/chrom.sizes
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/cluster/data/ce6/bed/blastzCb3.2008-06-04
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF -verbose=2 -bigClusterHub=kk \
-smallClusterHub=kki > do.log 2>&1 &
# real 33m24.103s
# 42483532 bases of 100281426 (42.364%) in intersection
# swap, this is also in cb3.txt
mkdir /cluster/data/cb3/bed/blastz.ce6.swap
cd /cluster/data/cb3/bed/blastz.ce6.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/cluster/data/ce6/bed/blastzCb3.2008-06-04/DEF \
-bigClusterHub=kk -smallClusterHub=kki -swap > swap.log 2>&1 &
# real 3m20.338s
# real 22m40.205s
cat fb.cb3.chainCe6Link.txt
# 43180969 bases of 108433446 (39.823%) in intersection
############################################################################
## BLASTZ priPac1 (DONE - 2008-06-09 - Hiram)
ssh kkstore06
screen # use screen to control the job
mkdir /cluster/data/ce6/bed/blastzPriPac1.2008-06-09
cd /cluster/data/ce6/bed/blastzPriPac1.2008-06-09
cat << '_EOF_' > DEF
# ce6 vs priPac1
BLASTZ_H=2000
BLASTZ_M=50
# TARGET: elegans Ce6
SEQ1_DIR=/scratch/data/ce6/ce6.2bit
SEQ1_LEN=/scratch/data/ce6/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
# QUERY: C. briggsae priPac1
SEQ2_DIR=/iscratch/i/worms/priPac1/priPac1.TrfWM.2bit
SEQ2_LEN=/iscratch/i/worms/priPac1/chrom.sizes
SEQ2_CTGDIR=/iscratch/i/worms/priPac1/priPac1.TrfWM.supers.2bit
SEQ2_CTGLEN=/iscratch/i/worms/priPac1/priPac1.TrfWM.supers.sizes
SEQ2_LIFT=/iscratch/i/worms/priPac1/priPac1.supercontigs.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/cluster/data/ce6/bed/blastzPriPac1.2008-06-09
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF -verbose=2 -bigClusterHub=kk \
-qRepeats=windowmaskerSdust -smallClusterHub=kki > do.log 2>&1 &
# after some kk parasol experiments, continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-continue=cat `pwd`/DEF -verbose=2 -bigClusterHub=kk \
-qRepeats=windowmaskerSdust -smallClusterHub=kki > cat.log 2>&1 &
# real 9m46.300s
cat fb.ce6.chainPriPac1Link.txt
# 6256254 bases of 100281426 (6.239%) in intersection
# swap, this is also in priPac1.txt
mkdir /cluster/data/priPac1/bed/blastz.ce6.swap
cd /cluster/data/priPac1/bed/blastz.ce6.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-qRepeats=windowmaskerSdust \
/cluster/data/ce6/bed/blastzPriPac1.2008-06-09/DEF \
-bigClusterHub=kk -smallClusterHub=kki -swap > swap.log 2>&1 &
# real 0m33.836s
# failed during the loadUp.csh due to /iscratch/i/ missing from hgwdev
# fixup that manually in loadUp.csh and ran it on hgwdev
cat fb.priPac1.chainCe6Link.txt
# 6803322 bases of 145948246 (4.661%) in intersection
# and continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-qRepeats=windowmaskerSdust \
-continue=download /cluster/data/ce6/bed/blastzPriPac1.2008-06-09/DEF \
-bigClusterHub=kk -smallClusterHub=kki -swap > download.log 2>&1 &
############################################################################
## BLASTZ caeJap1 (DONE - 2008-06-09 - Hiram)
ssh kkstore06
screen # use screen to control the job
mkdir /cluster/data/ce6/bed/blastzCaeJap1.2008-06-09
cd /cluster/data/ce6/bed/blastzCaeJap1.2008-06-09
cat << '_EOF_' > DEF
# ce6 vs caeJap1
BLASTZ_H=2000
BLASTZ_M=50
# TARGET: elegans Ce6
SEQ1_DIR=/scratch/data/ce6/ce6.2bit
SEQ1_LEN=/scratch/data/ce6/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
# QUERY: C. japonica caeJap1
SEQ2_DIR=/cluster/bluearc/scratch/data/caeJap1/caeJap1.2bit
SEQ2_LEN=/cluster/bluearc/scratch/data/caeJap1/chrom.sizes
SEQ2_CTGDIR=/cluster/bluearc/scratch/data/caeJap1/caeJap1.TrfWM.supers.2bit
SEQ2_CTGLEN=/cluster/bluearc/scratch/data/caeJap1/caeJap1.TrfWM.supers.sizes
SEQ2_LIFT=/cluster/bluearc/scratch/data/caeJap1/caeJap1.chrUn.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/cluster/data/ce6/bed/blastzCaeJap1.2008-06-09
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF -verbose=2 -bigClusterHub=pk \
-qRepeats=windowmaskerSdust -smallClusterHub=kki > do.log 2>&1 &
# real 35m7.220s
cat fb.ce6.chainCaeJap1Link.txt
# 26888483 bases of 100281426 (26.813%) in intersection
# swap, this is also in caeJap1.txt
mkdir /cluster/data/caeJap1/bed/blastz.ce6.swap
cd /cluster/data/caeJap1/bed/blastz.ce6.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-qRepeats=windowmaskerSdust \
/cluster/data/ce6/bed/blastzCaeJap1.2008-06-09/DEF \
-bigClusterHub=pk -smallClusterHub=kki -swap > swap.log 2>&1 &
# real 1m10.794s
# failed to load because:
# loadUp: looks like previous stage was not
# successful
# (can't find /cluster/data/caeJap1/bed/blastz.ce6.swap/mafNet/).
# continuing:
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
-qRepeats=windowmaskerSdust \
-continue=load /cluster/data/ce6/bed/blastzCaeJap1.2008-06-09/DEF \
-bigClusterHub=pk -smallClusterHub=kki -swap > load.log 2>&1 &
# real 0m33.440s
cat fb.caeJap1.chainCe6Link.txt
# 26209442 bases of 129347181 (20.263%) in intersection
############################################################################
## 6-Way multiple alignment (DONE - 2008-06-09 - Hiram)
mkdir /cluster/data/ce6/bed/multiz6way
cd /cluster/data/ce6/bed/multiz6way
# An additional nematode tree:
# http://nematol.unh.edu/tree/tree_des.php
# Constructing a tree from the document:
# http://www.wormbook.org/chapters/www_phylogrhabditids/phylorhab.html
# http://www.wormbook.org/chapters/www_phylogrhabditids/phylofig6B.jpg
# purely artifical for the last one: :0.2,pristiochus_priPac1:0.8
# since it is not on the graph
cat << '_EOF_' > nematodes.nh
((((((((((briggsae_cb3:0.005,remanei_caeRem3:0.003):0.004,
brenneri_caePb2:0.013):0.002,elegans_ce6:0.003):0.001,
japonica_caeJap1:0.023):0.024,ps1010:0.027):0.014,
df5070:0.056):0.009, plicata:0.102):0.099,sb341:0.060):0.038,
(df5074:0.351,df5055:0.090):0.221):0.2,pristiochus_priPac1:0.8)
'_EOF_'
# << happy emacs
# filter that for the six species in use here
/cluster/bin/phast/x86_64/tree_doctor \
--prune-all-but briggsae_cb3,remanei_caeRem3,brenneri_caePb2,elegans_ce6,japonica_caeJap1,pristiochus_priPac1 \
nematodes.nh > 6way.nh
# rearrange to get Ce6 on top for the construction of the tree image
# in the 6-way description page. This results in something like:
(((C._elegans_ce6:0.003000,
(C._brenneri_caePb2:0.013000,
(C._remanei_caeRem3:0.003000,C._briggsae_cb3:0.005000):0.004000)
:0.002000):0.001000,
C._japonica_caeJap1:0.023000):0.384000,
P._pristiochus_priPac1:0.800000);
/cluster/bin/phast/x86_64/all_dists 6way.nh > 6way.distances.txt
grep -i ce6 6way.distances.txt | sort -k3,3n
# Use this output for reference, and use the calculated
# distances in the table below to order the organisms and check
# the button order on the browser.
# And if you can fill in the table below entirely, you have
# succeeded in finishing all the alignments required.
#
# featureBits chainLink measures
# chaince6Link chain linearGap
# distance on Ce6 on other minScore
# 1 0.0120 - remanei_caeRem3 (% 41.736) (% 33.472) 1000 loose
# 2 0.0140 - briggsae_cb3 (% 42.364) (% 39.823) 1000 loose
# 3 0.0180 - brenneri_caePb2 (% 40.691) (% 32.317) 1000 loose
# 3 0.0270 - japonica_caeJap1 (% 26.813) (% 20.263) 1000 loose
# 4 1.1880 - pristiochus_priPac1 (% 6.239) (% 4.661) 1000 loose
cd /cluster/data/ce6/bed/multiz6way
# bash shell syntax here ...
export H=/cluster/data/ce6/bed
mkdir mafLinks
for G in caeRem3 cb3 caePb2 priPac1 caeJap1
do
mkdir mafLinks/$G
if [ ! -d ${H}/blastz.${G}/mafNet ]; then
echo "missing directory blastz.${G}/mafNet"
exit 255
fi
ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G
done
# Copy MAFs to some appropriate NFS server for kluster run
ssh kkstore06
mkdir /san/sanvol1/scratch/worms/ce6/multiz6way
cd /san/sanvol1/scratch/worms/ce6/multiz6way
time rsync -a --copy-links --progress --stats \
/cluster/data/ce6/bed/multiz6way/mafLinks/ .
# a few seconds to copy 126 Mb
# these are x86_64 binaries
mkdir penn
# use latest penn utilities
P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
cp -p $P/{autoMZ,multiz,maf_project} penn
# the autoMultiz cluster run
ssh memk
cd /cluster/data/ce6/bed/multiz6way/
# create species list and stripped down tree for autoMZ
sed 's/[a-z][a-z0-9]*_//ig; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
6way.nh > tmp.nh
echo `cat tmp.nh` > tree-commas.nh
echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
mkdir maf run
cd run
# NOTE: you need to set the db properly in this script
cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = ce6
set c = $1
set maf = $2
set binDir = /san/sanvol1/scratch/worms/$db/multiz6way/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/worms/$db/multiz6way
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz
cat << '_EOF_' > template
#LOOP
autoMultiz $(root1) {check out line+ /cluster/data/ce6/bed/multiz6way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
awk '{print $1}' /cluster/data/ce6/chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
para -maxNode=1 push
para check ... push ... etc ...
# Completed: 7 of 7 jobs
# CPU time in finished jobs: 2302s 38.37m 0.64h 0.03d 0.000 y
# IO & Wait Time: 38s 0.63m 0.01h 0.00d 0.000 y
# Average job time: 334s 5.57m 0.09h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 478s 7.97m 0.13h 0.01d
# Submission to last job: 954s 15.90m 0.27h 0.01d
ssh kkstore06
cd /cluster/data/ce6/bed/multiz6way
time nice -n +19 catDir maf > multiz6way.maf
# a few seconds to produce:
# -rw-rw-r-- 1 343585668 Jun 9 16:25 multiz6way.maf
# before getting to the annotation, load this up so we can get
# a first view of the track. This will be replaced with the annotated
# mafs
ssh hgwdev
cd /cluster/data/ce6/bed/multiz6way
mkdir /gbdb/ce6/multiz6way
ln -s /cluster/data/ce6/bed/multiz6way/multiz6way.maf /gbdb/ce6/multiz6way
# this load creates a large file, do that on local disk:
cd /scratch/tmp
time nice -n +19 hgLoadMaf ce6 multiz6way
# a few seconds to load:
# Loaded 367046 mafs in 1 files from /gbdb/ce6/multiz6way
time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
-maxSize=50000 ce6 multiz6waySummary /gbdb/ce6/multiz6way/multiz6way.maf
# a few seconds to load:
# Created 125113 summary blocks from 1051000 components
# and 366975 mafs from /gbdb/ce6/multiz6way/multiz6way.maf
# remove the temporary .tab files:
rm multiz6*.tab
############################################################################
#### Blat sangerGene proteins to determine exons
# (DONE - 2008-06-11 - hiram)
ssh hgwdev
cd /cluster/data/ce6/bed
mkdir blat.ce6SG.2008-06-11
ln -s blat.ce6SG.2008-06-11 blat.ce6SG
cd blat.ce6SG
pepPredToFa ce6 sangerPep sangerPep.fa
ssh kkstore06
# create san nib directory from goldenPath chromosomes fa files
mkdir /san/sanvol1/scratch/worms/ce6/nib
cd /san/sanvol1/scratch/worms/ce6/nib
for C in I II III IV V X M
do
twoBitToFa -seq=chr${C} /cluster/data/ce6/ce6.2bit stdout \
| faToNib -softMask stdin chr${C}.nib
done
# The kluster run
ssh pk
cd /cluster/data/ce6/bed/blat.ce6SG
cat << '_EOF_' > blatSome
#!/bin/csh -fe
blat -t=dnax -q=prot -out=pslx /san/sanvol1/scratch/worms/ce6/nib/$1.nib sgfa/$2.fa $3
'_EOF_'
# << happy emacs
chmod +x blatSome
ls -1S /san/sanvol1/scratch/worms/ce6/nib > ce6.lst
mkdir sgfa
cd sgfa
faSplit sequence ../sangerPep.fa 3000 sg
ls -1S *.fa > ../sg.lst
cd ..
cat << '_EOF_' > template
#LOOP
blatSome $(root1) $(root2) {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ce6.lst sg.lst template jobList
mkdir psl
cd psl
sed -e "s/.nib//" ../ce6.lst | xargs mkdir
cd ..
para create jobList
para try ... check ... push ... etc
# Completed: 20335 of 20335 jobs
# CPU time in finished jobs: 69885s 1164.74m 19.41h 0.81d 0.002 y
# IO & Wait Time: 51640s 860.67m 14.34h 0.60d 0.002 y
# Average job time: 6s 0.10m 0.00h 0.00d
# Longest finished job: 348s 5.80m 0.10h 0.00d
# Submission to last job: 943s 15.72m 0.26h 0.01d
ssh kkstore06
cd /cluster/data/ce6/bed/blat.ce6SG.2008-06-11
pslSort dirs raw.psl /tmp psl/*
# -rw-rw-r-- 1 70627970 Jun 11 10:35 raw.psl
pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cooked.psl /dev/null
# Processed 155625 alignments
# -rw-rw-r-- 1 26423327 Jun 11 10:35 cooked.psl
pslUniq cooked.psl ce6SG.psl
# -rw-rw-r-- 1 25332830 Jun 11 10:36 ce6SG.psl
cut -f 10 ce6SG.psl > sgName.lst
faSomeRecords sangerPep.fa sgName.lst ce6SG.fa
# -rw-rw-r-- 1 10903181 Jun 11 10:11 sangerPep.fa
# -rw-rw-r-- 1 10899914 Jun 11 10:37 ce6SG.fa
grep "^>" ce6SG.fa | wc -l
# 23741
grep "^>" sangerPep.fa | wc -l
# 23771
pslxToFa ce6SG.psl ce6SG_ex.fa -liftTarget=genome.lft \
-liftQuery=protein.lft
# -rw-rw-r-- 1 5288587 Jun 11 10:39 protein.lft
# -rw-rw-r-- 1 6741328 Jun 11 10:39 genome.lft
# -rw-rw-r-- 1 12743074 Jun 11 10:39 ce6SG_ex.fa
wc -l *.psl *.lft *.fa sgName.lst
# 23741 ce6SG.psl
# 25926 cooked.psl
# 155630 raw.psl
# 151196 genome.lft
# 151196 protein.lft
# 244239 ce6SG.fa
# 302392 ce6SG_ex.fa
# 244343 sangerPep.fa
# 23741 sgName.lst
# 1322404 total
# back on hgwdev
ssh hgwdev
cd /cluster/data/ce6/bed/blat.ce6SG
# After about an hour, it exited with this message:
# sqlFreeConnection called on cache (ce6) that doesn't contain
# the given connection
# This may be a lurking error in this program, because the
# resulting file seems to have the correct number of lines:
# Once upon a time, long long ago, this table was made by the kgName
# command, but this doesn't work on C. elegans because there is no
# kgXref table. So, the following perl script was developed to do the
# same thing
# kgName ce6 ce6SG.psl blastSGRef01
# Need a reference table loaded, perl script makes up relationships
cat << '_EOF_' >> mkRef.pl
#!/usr/bin/env perl
use warnings;
use strict;
my $argc = scalar(@ARGV);
sub usage() {
print STDERR "usage: ./mkRef.pl blastSGRef01.tab\n";
print STDERR "reads ../sangerGene/genePred.tab and ",
"../sangerGene/orfToGene.tab\n";
print STDERR "writes references to the given input file, ",
"e.g. blastSGRef01.tab\n";
}
if (1 != $argc) {
usage;
exit 255;
}
my $outFile = shift;
my %orfToGene; # key is all lowercase orf name, value is gene name
my $inFile="../sangerGene/orfToGene.tab";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $line=<FH>) {
chomp $line;
my ($orf, $gene) = split('\s+',$line);
$orfToGene{lc($orf)} = $gene
}
close (FH);
my %links; # key is all lowercase orf name, value is SwissProt name
$inFile="../sangerGene/sangerLinks.tab";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $line=<FH>) {
chomp $line;
my ($orf, $SP, $comments) = split('\s+',$line);
$links{lc($orf)} = $SP
}
close (FH);
my %ce6Position; # key is all lowercase orf name, value is ce6 position
$inFile="../sangerGene/genePred.tab";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $line=<FH>) {
chomp $line;
my ($bin, $orf, $chr, $strand, $start, $end, $other) = split('\s+',$line);
$ce6Position{lc($orf)} = sprintf("%s:%d-%d", $chr, $start, $end);
}
close (FH);
open (OF,">$outFile") or die "can not write to $outFile";
$inFile="sgName.lst";
open (FH,"<$inFile") or die "can not read $inFile";
while (my $orf=<FH>) {
chomp $orf;
if (!exists($orfToGene{lc($orf)})) {
$orfToGene{lc($orf)} = "";
}
if (!exists($links{lc($orf)})) {
$links{lc($orf)} = "";
}
if (!exists($ce6Position{lc($orf)})) {
$ce6Position{lc($orf)} = "";
}
printf OF "%s\t%s\t%s\t%s\t%s\n", $orf, $orfToGene{lc($orf)},
$ce6Position{lc($orf)}, $links{lc($orf)}, "";
}
close (FH);
close (OF);
'_EOF_'
# << happy emacs
chmod +x ./mkRef.pl
./mkRef.pl blastSGRef01.tab
wc -l blastSGRef01.tab
# 23741 blastSGRef.01.tab
hgLoadSqlTab ce6 blastSGRef01 ~/kent/src/hg/lib/blastRef.sql \
blastSGRef01.tab
wc -l sgName.lst blastSGRef01.tab ce6SG.psl
# 23741 sgName.lst
# 23741 blastSGRef01.tab
# 23741 ce6SG.psl
# And load the protein sequences
hgPepPred ce6 generic blastSGPep01 ce6SG.fa
############################################################################
# CREATE CONSERVATION WIGGLE WITH PHASTCONS - 6-WAY
# (DONE - 2008-06-18 - Hiram)
# Estimate phastCons parameters
ssh kkstore06
# We need the fasta sequence for this business
cd /cluster/data/ce6
for C in `cut -f1 chrom.sizes | sed -e s/chr//`
do
echo "twoBitToFa -seq=chr${C} ce6.2bit ${C}/chr${C}.fa"
twoBitToFa -seq=chr${C} ce6.2bit ${C}/chr${C}.fa
done
mkdir /cluster/data/ce6/bed/multiz6way/cons
cd /cluster/data/ce6/bed/multiz6way/cons
# Create a starting-tree.mod based on chrV (the largest one)
# Set -windows large enough so it only makes one piece
faSize ../../../V/chrV.fa
# 20919568 bases (0 N's 20919568 real 18018198 upper 2901370 lower)
# in 1 sequences in 1 files
# %13.87 masked total, %13.87 masked real
time nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chrV.maf \
--refseq ../../../V/chrV.fa --in-format MAF \
--windows 30000000,1000 --out-format SS \
--between-blocks 5000 --out-root s1
# Creating partition 1 (column 1 to column 20919568)...
# Writing partition 1 to s1.1-20919568.ss...
# 13 seconds
# the tree in use
cat ../tree-commas.nh
# (((((cb3,caeRem3),caePb2),ce6),caeJap1),priPac1)
# this s1.*.ss should match only one item
time nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
--tree "(((((cb3,caeRem3),caePb2),ce6),caeJap1),priPac1)" \
--out-root starting-tree
# real 0m14.408s
rm s1.*.ss
# add up the C and G:
grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
# 0.383
# This 0.383 is used in the --gc argument below
# Create SS files on san filesystem
# Increasing their size this time from 1,000,000 to 4,000,000 to
# slow down the phastCons pk jobs
ssh kkstore06
mkdir -p /san/sanvol1/scratch/worms/ce6/cons/ss
cd /san/sanvol1/scratch/worms/ce6/cons/ss
time for C in `cut -f1 /cluster/data/ce6/chrom.sizes`
do
if [ -s /cluster/data/ce6/bed/multiz6way/maf/${C}.maf ]; then
mkdir ${C}
echo msa_split $C
chrN=${C/chr/}
chrN=${chrN/_random/}
/cluster/bin/phast/$MACHTYPE/msa_split \
/cluster/data/ce6/bed/multiz6way/maf/${C}.maf \
--refseq /cluster/data/ce6/${chrN}/${C}.fa \
--in-format MAF --windows 4000000,0 --between-blocks 5000 \
--out-format SS --out-root ${C}/${C}
fi
done
# real 1m7.685s
# Create a random list of 50 1 mb regions (do not use the _randoms)
cd /san/sanvol1/scratch/worms/ce6/cons/ss
ls -1l chr*/chr*.ss | grep -v random | \
awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list
# Only 28 lines in stdin (i.e. all are used)
# Set up parasol directory to calculate trees on these regions
ssh pk
mkdir /san/sanvol1/scratch/worms/ce6/cons/treeRun2
cd /san/sanvol1/scratch/worms/ce6/cons/treeRun2
mkdir tree log
# Tuning this loop should come back to here to recalculate
# Create script that calls phastCons with right arguments
# expected-lengths and target-coverage taken from ce4 history
# for treeRun2 --expected-lengths 15 --target-coverage 0.55
# for treeRun1 --expected-lengths 15 --target-coverage 0.60
cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set C=$1:h
mkdir -p log/${C} tree/${C}
/cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \
/cluster/data/ce6/bed/multiz6way/cons/starting-tree.mod \
--gc 0.383 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-lengths 15 --target-coverage 0.55 \
--quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
# << happy emacs
chmod +x makeTree.csh
cat > template << '_EOF_'
#LOOP
makeTree.csh $(path1)
#ENDLOOP
'_EOF_'
# << happy emacs
# Make cluster job and run it
gensub2 ../randomSs.list single template jobList
para create jobList
para try/push/check/etc
# Completed: 28 of 28 jobs
# CPU time in finished jobs: 5941s 99.01m 1.65h 0.07d 0.000 y
# IO & Wait Time: 64s 1.07m 0.02h 0.00d 0.000 y
# Average job time: 214s 3.57m 0.06h 0.00d
# Longest finished job: 309s 5.15m 0.09h 0.00d
# Submission to last job: 326s 5.43m 0.09h 0.00d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ssh kkstore06
cd /san/sanvol1/scratch/worms/ce6/cons/treeRun2
ls -1 tree/chr*/*.cons.mod > cons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
--output-average ../ave.cons.mod > cons_summary.txt 2>&1
ls -1 tree/chr*/*.noncons.mod > noncons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
--output-average ../ave.noncons.mod > noncons_summary.txt
cd ..
cp -p ave.*.mod /cluster/data/ce6/bed/multiz6way/cons
# measuring entropy
# consEntopy <target coverage> <expected lengths>
# ave.cons.mod ave.noncons.mod --NH 9.78
/cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.78 .55 15 \
ave.cons.mod ave.noncons.mod
### for treeRun2: --expected-lengths 15 --target-coverage 0.55
# ( Solving for new omega: 15.000000 76.253180 42.469155 35.389619
# 34.810297 34.805819 )
# Transition parameters: gamma=0.550000, omega=15.000000, mu=0.066667,
# nu=0.081481
# Relative entropy: H=1.172307 bits/site
# Expected min. length: L_min=6.108564 sites
# Expected max. length: L_max=5.516260 sites
# Phylogenetic information threshold: PIT=L_min*H=7.161112 bits
# Recommended expected length: omega=34.805819 sites (for L_min*H=9.780000)
### for treeRun1: --expected-lengths 15 --target-coverage 0.60
# ( Solving for new omega: 15.000000 140.706073 61.140093 42.355284
# 39.561113 39.474822 )
# Transition parameters: gamma=0.600000, omega=15.000000, mu=0.066667,
# nu=0.100000
# Relative entropy: H=1.248356 bits/site
# Expected min. length: L_min=5.363742 sites
# Expected max. length: L_max=5.097039 sites
# Phylogenetic information threshold: PIT=L_min*H=6.695858 bits
# Recommended expected length: omega=39.474822 sites (for L_min*H=9.780000)
ssh pk
# Create cluster dir to do main phastCons run
mkdir /san/sanvol1/scratch/worms/ce6/cons/consRun2
cd /san/sanvol1/scratch/worms/ce6/cons/consRun2
mkdir ppRaw bed
# using the ave.cons.mod and ave.noncons.mod from treeRun2 above:
ALPHABET: A C G T
ORDER: 0
SUBST_MOD: REV
BACKGROUND: 0.308500 0.191500 0.191500 0.308500
RATE_MAT:
-0.875904 0.175256 0.458101 0.242547
0.282331 -1.200805 0.180928 0.737546
0.737985 0.180928 -1.200354 0.281440
0.242547 0.457829 0.174703 -0.875079
TREE: (((((cb3:0.087550,caeRem3:0.073515):0.020860,caePb2:0.090766):0.029808,ce6:0.094770):0.038388,caeJap1:0.109779):0.126548,priPac1:0.126548);
ALPHABET: A C G T
ORDER: 0
SUBST_MOD: REV
BACKGROUND: 0.308500 0.191500 0.191500 0.308500
RATE_MAT:
-0.875904 0.175256 0.458101 0.242547
0.282331 -1.200805 0.180928 0.737546
0.737985 0.180928 -1.200354 0.281440
0.242547 0.457829 0.174703 -0.875079
TREE: (((((cb3:0.328951,caeRem3:0.273909):0.078262,caePb2:0.339349):0.111927,ce6:0.355503):0.144928,caeJap1:0.413391):0.478831,priPac1:0.478831);
# Create script to run phastCons with right parameters
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
cat > doPhast << '_EOF_'
#!/bin/csh -fe
mkdir /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ../ave.cons.mod ../ave.noncons.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
--expected-lengths 15 --target-coverage 0.55 --quiet \
--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/ave.{cons,noncons}.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
# << happy emacs
chmod +x doPhast
# root1 == chrom name, file1 == ss file name without .ss suffix
cat > template << '_EOF_'
#LOOP
doPhast $(root1) $(file1)
#ENDLOOP
'_EOF_'
# << happy emacs
# Create parasol batch and run it
ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list
gensub2 in.list single template jobList
para create jobList
para try/check/push/etc.
# These jobs are very fast and very I/O intensive
# Completed: 29 of 29 jobs
# CPU time in finished jobs: 386s 6.43m 0.11h 0.00d 0.000 y
# IO & Wait Time: 85s 1.42m 0.02h 0.00d 0.000 y
# Average job time: 16s 0.27m 0.00h 0.00d
# Longest finished job: 22s 0.37m 0.01h 0.00d
# Submission to last job: 49s 0.82m 0.01h 0.00d
# combine predictions and transform scores to be in 0-1000 interval
# it uses a lot of memory, so on kolossus:
ssh kolossus
cd /san/sanvol1/scratch/worms/ce6/cons/consRun2
# The sed's and the sort get the file names in chrom,start order
# You might like to verify it is correct by first looking at the
# list it produces:
find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g"
# if that looks right (== in chrom and numerical order), then let it run:
find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
| /cluster/bin/scripts/lodToBedScore /dev/stdin \
> phastConsElements6way.bed
# way too many negative
# scores with --expected-length 15 --target-coverage 0.60
# OK with--expected-length 15 --target-coverage 0.55
cp -p phastConsElements6way.bed /cluster/data/ce6/bed/multiz6way
# Figure out how much is actually covered by the bed file as so:
# Get the non-n genome size from faSize on all chroms:
ssh kkstore06
cd /cluster/data/ce6
faSize */chr*.fa
# 100281426 bases (0 N's 100281426 real 86981803 upper
# 13299623 lower) in 7 sequences in 7 files
# %13.26 masked total, %13.26 masked real
# 100281244 bases (0 N's 100281244 real 87008909 upper
# 13272335 lower) in 7 sequences in 7 files
# %13.24 masked total, %13.24 masked real
cd /san/sanvol1/scratch/worms/ce6/cons/consRun2
# The 100281426 comes from the non-n genome as counted above.
awk '
{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/100281426\n",100.0*sum/100281426,sum}' \
phastConsElements6way.bed
# --expected-length 15 --target-coverage 0.55
# % 29.56 = 100.0*29639749/100281426
# Aiming for %70 coverage in
# the following featureBits measurement on CDS:
# Beware of negative scores when too high. The logToBedScore
# will output an error on any negative scores.
HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits ce6 \
-enrichment sangerGene:cds phastConsElements6way.bed
# --expected-length 15 --target-coverage 0.55
# sangerGene:cds 25.329%, phastConsElements6way.bed 29.557%,
# both 15.469%, cover 61.07%, enrich 2.07x
# in Ce4, this was:
# sangerGene:cds 25.286%, phastConsElements5way.bed 32.368%,
# both 15.733%, cover 62.22%, enrich 1.92x
# Load most conserved track into database
ssh hgwdev
cd /cluster/data/ce6/bed/multiz6way
# the copy was already done above
# cp -p /san/sanvol1/scratch/worms/ce6/cons/consRun2/phastConsElements6way.bed .
time nice -n +19 hgLoadBed ce6 phastConsElements6way \
phastConsElements6way.bed
# Loaded 595094 elements of size 5
# should measure the same as above
time nice -n +19 featureBits ce6 -enrichment sangerGene:cds \
phastConsElements6way
# --expected-length 15 --target-coverage 0.55
# sangerGene:cds 25.329%, phastConsElements6way 29.557%, both 15.469%,
# cover 61.07%, enrich 2.07x
# on Ce4, this was:
time nice -n +19 featureBits ce4 -enrichment sangerGene:cds \
phastConsElements5way
# sangerGene:cds 25.286%, phastConsElements5way 32.368%, both 15.733%,
# cover 62.22%, enrich 1.92x
# Create merged posterier probability file and wiggle track data files
ssh kkstore06
cd /san/sanvol1/scratch/worms/ce6/cons/consRun2
# prepare compressed copy of ascii data values for downloads
ssh pk
cd /san/sanvol1/scratch/worms/ce6/cons/consRun2
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
mkdir -p phastCons6wayScores
for D in ppRaw/chr*
do
C=${D/ppRaw\/}
out=phastCons6wayScores/${C}.data.gz
echo "========================== ${C} ${D}"
find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
gzip > ${out}
done
'_EOF_'
# << happy emacs
chmod +x gzipAscii.sh
time nice -n +19 ./gzipAscii.sh
# real 1m14.178s
# use those file to encode the data for the track:
for C in I II III IV V X M
do
zcat phastCons6wayScores/chr${C}.data.gz
done | wigEncode stdin phastCons6way.wig phastCons6way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# -rw-rw-r-- 1 56813758 Jun 18 12:05 phastCons6way.wib
# -rw-rw-r-- 1 8880383 Jun 18 12:05 phastCons6way.wig
nice -n +19 cp -p phastCons6way.wi? /cluster/data/ce6/bed/multiz6way/
# copy them for downloads
ssh kkstore06
mkdir /cluster/data/ce6/bed/multiz6way/phastCons6wayScores
cd /cluster/data/ce6/bed/multiz6way/phastCons6wayScores
cp -p /san/sanvol1/scratch/worms/ce6/cons/consRun2/phastCons6wayScores/* .
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/ce6
ln -s /cluster/data/ce6/bed/multiz6way/phastCons6wayScores .
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/ce6/bed/multiz6way
ln -s `pwd`/phastCons6way.wib /gbdb/ce6/wib/phastCons6way.wib
nice -n +19 hgLoadWiggle ce6 phastCons6way phastCons6way.wig
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/ce6/bed/multiz6way
nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=ce6 phastCons6way > histogram.data 2>&1
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Worm Ce6 Histogram phastCons6way track"
set xlabel " phastCons6way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
# enable this line in the ce6/trackDb.ra multiz6way track definition:
# wiggle phastCons6way
# to get the graph to display in the genome browser
############################################################################
# ANNOTATE MULTIZ6WAY MAF AND LOAD TABLES (DONE - 2008-06-19 - Hiram)
ssh kolossus
mkdir /cluster/data/ce6/bed/multiz6way/anno
cd /cluster/data/ce6/bed/multiz6way/anno
# these actually already all exist from previous multiple alignments
# if there are done you will see an ls of them
# or else the twoBitInfo command will be echoed, run it if you want
for DB in `cat ../species.lst`
do
CDIR="/cluster/data/${DB}"
if [ ! -f ${CDIR}/${DB}.N.bed ]; then
echo "creating ${DB}.N.bed"
echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
else
ls -og ${CDIR}/${DB}.N.bed
fi
done
mkdir maf run
cd run
rm -f sizes nBeds
for DB in `sed -e "s/ce6 //" ../../species.lst`
do
echo "${DB} "
ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
echo ${DB}.len >> sizes
done
cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set dir = /cluster/data/ce6/bed/multiz6way
set c = $1
cat $dir/maf/${c}.maf | nice \
mafAddIRows -nBeds=nBeds stdin /cluster/data/ce6/ce6.2bit $2
'_EOF_'
# << happy emacs
chmod +x doAnno.csh
cat << '_EOF_' > template
#LOOP
./doAnno.csh $(root1) {check out line+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 /cluster/data/ce6/chrom.sizes > chrom.list
gensub2 chrom.list single template jobList
# Completed: 7 of 7 jobs
# CPU time in finished jobs: 217s 3.61m 0.06h 0.00d 0.000 y
# IO & Wait Time: 60s 1.01m 0.02h 0.00d 0.000 y
# Average job time: 40s 0.66m 0.01h 0.00d
# Longest finished job: 55s 0.92m 0.02h 0.00d
# Submission to last job: 58s 0.97m 0.02h 0.00d
# Load anno/maf
ssh hgwdev
cd /cluster/data/ce6/bed/multiz6way/anno/maf
mkdir -p /gbdb/ce6/multiz6way/anno/maf
ln -s /cluster/data/ce6/bed/multiz6way/anno/maf/*.maf \
/gbdb/ce6/multiz6way/anno/maf
time nice -n +19 /cluster/bin/x86_64/hgLoadMaf \
-pathPrefix=/gbdb/ce6/multiz6way/anno/maf ce6 multiz6way
# XXX problem: Can't hSetDb(ce6) after an hAllocConn(hg18), sorry.
# Loaded 420764 mafs in 7 files from /gbdb/ce6/multiz6way/anno/maf
# real 0m9.567s
# Do the computation-intensive part of hgLoadMafSummary on a workhorse
# machine and then load on hgwdev:
ssh kolossus
cd /cluster/data/ce6/bed/multiz6way/anno/maf
time cat *.maf | \
nice -n +19 hgLoadMafSummary ce6 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 -test multiz6waySummary stdin
# Created 45771 summary blocks from 1051000 components
# and 420693 mafs from stdin
# real 0m16.138s
# -rw-rw-r-- 1 2111948 Jun 20 10:25 multiz6waySummary.tab
ssh hgwdev
cd /cluster/data/ce6/bed/multiz6way/anno/maf
time nice -n +19 hgLoadSqlTab ce6 multiz6waySummary \
~/kent/src/hg/lib/mafSummary.sql multiz6waySummary.tab
# real 0m0.645
# Create per-chrom individual maf files for downloads
ssh kkstore02
cd /cluster/data/ce6/bed/multiz6way
mkdir mafDownloads
time for M in anno/maf/chr*.maf
do
B=`basename $M`
nice -n +19 cp -p ${M} mafDownloads/${B}
nice -n +19 gzip mafDownloads/${B}
echo ${B} done
done
ssh hgwdev
cd /cluster/data/ce6/bed/multiz6way
# redone 2007-12-21 to fix difficulty in mafFrags when ce6 was not the
# first species listed
# upstream mafs
# There isn't any refGene table, trying sangerGene instead
#!/bin/sh
for S in 1000 2000 5000
do
echo "making upstream${S}.maf"
nice -n +19 $HOME/bin/$MACHTYPE/featureBits -verbose=2 ce6 \
sangerGene:upstream:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
| $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ce6 multiz6way \
stdin stdout -orgs=species.lst \
| gzip -c > mafDownloads/sangerGene.upstream${S}.maf.gz
echo "done sangerGene.upstream${S}.maf.gz"
done
cd mafDownloads
md5sum *.gz > md5sum.txt
# deliver to downloads
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/ce6/multiz6way
cd /usr/local/apache/htdocs/goldenPath/ce6/multiz6way
ln -s /cluster/data/ce6/bed/multiz6way/mafDownloads/* .
#######################################################################
# MULTIZ5WAY MAF FRAMES (DONE - 2007-05-03 - Hiram)
ssh hgwdev
mkdir /cluster/data/ce6/bed/multiz6way/frames
cd /cluster/data/ce6/bed/multiz6way/frames
mkdir genes
# The following is adapted from the ce4 5-way sequence
#------------------------------------------------------------------------
# get the genes for all genomes
# using sangerGene for ce6
# using blastCe6SG for cb3, caePb2, caeRem3, caeJap1 priPac1
for qDB in ce6 cb3 caePb2 caeRem3 caeJap1 priPac1
do
if [ $qDB = "cb3" -o $qDB = "caePb2" -o $qDB = "caeRem3" -o $qDB = "caeJap1" -o $qDB = "priPac1" ]; then
geneTbl=blastCe6SG
echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB}
hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-100 \
> /scratch/tmp/$qDB.tmp.psl
mrnaToGene -allCds /scratch/tmp/$qDB.tmp.psl stdout \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$qDB.tmp.gz
rm -f /scratch/tmp/$qDB.tmp.psl
mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
elif [ $qDB = "ce6" ]; then
geneTbl=sangerGene
echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB}
hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$qDB.tmp.gz
mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
fi
done
ssh kkstore06
cd /cluster/data/ce6/bed/multiz6way/frames
cat ../maf/*.maf | nice -n +19 genePredToMafFrames ce6 \
stdin stdout ce6 genes/ce6.gp.gz cb3 \
genes/cb3.gp.gz caePb2 genes/caePb2.gp.gz caeRem3 \
genes/caeRem3.gp.gz caeJap1 genes/caeJap1.gp.gz \
priPac1 genes/priPac1.gp.gz \
| gzip > multiz6way.mafFrames.gz
ssh hgwdev
cd /cluster/data/ce6/bed/multiz6way/frames
nice -n +19 hgLoadMafFrames ce6 multiz6wayFrames multiz6way.mafFrames.gz
##########################################################################
# verify all.joiner entries (DONE - 2008-06-20 - Hiram)
# try to get joinerCheck tests clean output on these commands
ssh hgwdev
cd ~/kent/src/hg/makeDb/schema
joinerCheck -database=ce6 -tableCoverage all.joiner
joinerCheck -database=ce6 -keys all.joiner
joinerCheck -database=ce6 -times all.joiner
##########################################################################
# make downloads (DONE - 2008-06-20 - Hiram)
# verify all tracks have html pages for their trackDb entries
cd /cluster/data/ce6
/cluster/bin/scripts/makeDownloads.pl ce6
# fix the README files
# re-build the upstream files:
cd /cluster/data/ce6/goldenPath/bigZips
for size in 1000 2000 5000
do
echo $size
featureBits ce6 sangerGene:upstream:$size -fa=stdout \
| gzip -c > sangerGene.upstream$size.fa.gz
done
##########################################################################
# create pushQ entry (DONE - 2008-06-20 - Hiram)
# verify all tracks have html pages for their trackDb entries
ssh hgwdev
cd /cluster/data/ce6/jkStuff
/cluster/bin/scripts/makePushQSql.pl ce6 > ce6.pushQ.sql
ce6 does not have seq
Could not tell (from trackDb, all.joiner and hardcoded lists of supporting
and genbank tables) which tracks to assign these tables to:
blastSGPep01
blastSGRef01
multiz6wayFrames
#############################################################################
# Create 6-way downloads (DONE - 2008-06-23 - Hiram)
ssh hgwdev
mkdir /cluster/data/ce6/goldenPath/multiz6way
mkdir /cluster/data/ce6/goldenPath/phastCons6way
cd /cluster/data/ce6/goldenPath/phastCons6way
ln -s ../../bed/multiz6way/phastCons6wayScores/*.gz .
ln -s ../../bed/multiz6way/cons/ave.cons.mod 6way.conserved.mod
ln -s ../../bed/multiz6way/cons/ave.noncons.mod 6way.non-conserved.mod
md5sum *.gz *.mod > md5sum.txt
# copy a README and fix it up for here:
cp /usr/local/apache/htdocs/goldenPath/calJac1/phastCons9way/README.txt .
mkdir /cluster/data/ce6/goldenPath/multiz6way
cd /cluster/data/ce6/goldenPath/multiz6way
cp /usr/local/apache/htdocs/goldenPath/calJac1/multiz9way/README.txt .
# edit that README.txt to be correct for this 6-way alignment
ssh kkstore06
cd /cluster/data/ce6/goldenPath/multiz6way
ln -s ../../bed/multiz6way/6way.nh ./ce6.6way.nh
cp -p ../../bed/multiz6way//anno/maf/*.maf .
gzip *.maf
ssh hgwdev
cd /cluster/data/ce6/goldenPath/multiz6way
# creating upstream files from sangerGene, bash script:
cat << '_EOF_' > mkUpstream.sh
#!/bin/bash
DB=ce6
GENE=sangerGene
NWAY=multiz6way
export DB GENE
for S in 1000 2000 5000
do
echo "making upstream${S}.maf"
featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
| $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \
stdin stdout \
-orgs=/cluster/data/${DB}/bed/${NWAY}/species.lst \
| gzip -c > upstream${S}.maf.gz
echo "done upstream${S}.maf.gz"
done
'_EOF_'
# << happy emacs
chmod +x ./mkUpstream.sh
time nice -n +19 ./mkUpstream.sh
# real 119m5.562s
# -rw-rw-r-- 1 42975041 Mar 28 14:27 upstream1000.maf.gz
# -rw-rw-r-- 1 76363192 Mar 28 15:03 upstream2000.maf.gz
# -rw-rw-r-- 1 303870318 Mar 28 15:42 upstream5000.maf.gz
# check the names in these upstream files to ensure sanity:
zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \
| sort | uniq -c | sort -rn | less
# should be a list of the other 4 species with a high count,
# then sangerGene names, e.g.:
# 10134 priPac1
# 10134 cb3
# 10134 caeRem3
# 10134 caePb2
# 10134 caeJap1
# 1 cTel55X.1a.1
# 1 ZK993.1
# 1 ZK973.9
# 1 ZK973.8
# 1 ZK973.3
# ... etc ...
ssh kkstore06
cd /cluster/data/ce6/goldenPath/multiz6way
md5sum *.gz *.nh > md5sum.txt
ssh hgwdev
cd /cluster/data/ce6/goldenPath/phastCons6way
mkdir /usr/local/apache/htdocs/goldenPath/ce6/phastCons6way
ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/ce6/phastCons6way
cd /cluster/data/ce6/goldenPath/multiz6way
mkdir /usr/local/apache/htdocs/goldenPath/ce6/multiz6way
ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/ce6/multiz6way
# if your ln -s `pwd`/* made extra links to files you don't want there,
# check the goldenPath locations and remove those extra links
##########################################################################
# Make protein blast runs for Gene Sorter and Known Genes on Hg18
ssh hgwdev
mkdir /cluster/data/ce6/bed/hgNearBlastp
cd /cluster/data/ce6/bed/hgNearBlastp
# Make sure there are no joinerCheck errors at this point, because
# if left here they can spread via doHgNearBlastp:
runJoiner.csh ce6 sangerPep
# ce6.kimExpDistance.query - hits 13356000 of 19134000 ok
# ce6.kimExpDistance.target - hits 12973822 of 19134000 ok
# ce6.orfToGene.name - hits 30296 of 30296 ok
# ce6.sangerBlastTab.query - hits 1255312 of 1255312 ok
# ce6.sangerBlastTab.target - hits 1255312 of 1255312 ok
# ce6.sangerCanonical.transcript - hits 20051 of 20051 ok
# ce6.sangerIsoforms.transcript - hits 30296 of 30296 ok
# ce6.sangerLinks.orfName - hits 30115 of 30115 ok
# ce6.sangerPep.name - hits 23771 of 23771 ok
# ce6.sangerToRefSeq.name - hits 29995 of 29995 ok
# ce6.sangerGeneToWBGeneID.sangerGene - hits 30115 of 30115 ok
pepPredToFa ce6 sangerPep ce6.sangerPep.faa
cat << '_EOF_' > config.ra
# Latest human vs. this ce6 worm for Human Gene Sorter
targetGenesetPrefix known
targetDb hg18
queryDbs ce6
hg18Fa /cluster/data/hg18/bed/ucsc.10/ucscGenes.faa
ce6Fa /cluster/data/ce6/bed/hgNearBlastp/ce6.sangerPep.faa
buildDir /cluster/data/ce6/bed/hgNearBlastp
scratchDir /cluster/data/ce6/bed/hgNearBlastp/tmpHgNearBlastp
'_EOF_'
time nice -n +19 /cluster/bin/scripts/doHgNearBlastp.pl -workhorse=hgwdev \
-noLoad -verbose=2 config.ra > do.log 2>&1 &
##########################################################################
## creating xxBlastTab tables (DONE - 2008-07-11 - Hiram)
# Make sure there are no joinerCheck errors at this point, because
# if left here they can spread via doHgNearBlastp:
ssh hgwdev
runJoiner.csh ce6 sangerPep
# ce6.kimExpDistance.query - hits 13356000 of 19134000 ok
# ce6.kimExpDistance.target - hits 12973822 of 19134000 ok
# ce6.orfToGene.name - hits 30296 of 30296 ok
# ce6.sangerBlastTab.query - hits 1255312 of 1255312 ok
# ce6.sangerBlastTab.target - hits 1255312 of 1255312 ok
# ce6.sangerCanonical.transcript - hits 20051 of 20051 ok
# ce6.sangerIsoforms.transcript - hits 30296 of 30296 ok
# ce6.sangerLinks.orfName - hits 30115 of 30115 ok
# ce6.sangerPep.name - hits 23771 of 23771 ok
# ce6.sangerToRefSeq.name - hits 29995 of 29995 ok
# ce6.sangerGeneToWBGeneID.sangerGene - hits 30115 of 30115 ok
ssh hgwdev
mkdir /cluster/data/ce6/bed/hgNearBlastp/080711
cd /cluster/data/ce6/bed/hgNearBlastp/080711
# Get the proteins used by the other hgNear organisms:
pepPredToFa hg18 knownGenePep hg18.known.faa
pepPredToFa mm9 knownGenePep mm9.known.faa
pepPredToFa rn4 knownGenePep rn4.known.faa
pepPredToFa danRer5 ensPep danRer5.ensPep.faa
pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa
pepPredToFa ce6 sangerPep ce6.sangerPep.faa
pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa
# sanity check, number of lines in each faa file
wc -l *.faa
# 244343 ce6.sangerPep.faa
# 345537 danRer5.ensPep.faa
# 268201 dm3.flyBasePep.faa
# 526563 hg18.known.faa
# 453565 mm9.known.faa
# 86220 rn4.known.faa
# 64956 sacCer1.sgdPep.faa
# sanity check, count the number of protein sequences in each
for F in *.faa
do
echo -n "${F} "
grep "^>" ${F} | wc -l
done
# ce6.sangerPep.faa 23771
# danRer5.ensPep.faa 31743
# dm3.flyBasePep.faa 20736
# hg18.known.faa 45480
# mm9.known.faa 39489
# rn4.known.faa 8160
# sacCer1.sgdPep.faa 5766
cd /cluster/data/ce6/bed/hgNearBlastp
cat << '_EOF_' > config.ra
# Latest worm vs. other Gene Sorter orgs:
# human, mouse, rat, zebrafish, fly, yeast
targetGenesetPrefix sanger
targetDb ce6
queryDbs hg18 mm9 rn4 danRer5 dm3 sacCer1
recipBest hg18 mm9 rn4 danRer5 dm3 sacCer1
ce6Fa /cluster/data/ce6/bed/hgNearBlastp/080711/ce6.sangerPep.faa
hg18Fa /cluster/data/ce6/bed/hgNearBlastp/080711/hg18.known.faa
mm9Fa /cluster/data/ce6/bed/hgNearBlastp/080711/mm9.known.faa
rn4Fa /cluster/data/ce6/bed/hgNearBlastp/080711/rn4.known.faa
danRer5Fa /cluster/data/ce6/bed/hgNearBlastp/080711/danRer5.ensPep.faa
dm3Fa /cluster/data/ce6/bed/hgNearBlastp/080711/dm3.flyBasePep.faa
sacCer1Fa /cluster/data/ce6/bed/hgNearBlastp/080711/sacCer1.sgdPep.faa
buildDir /cluster/data/ce6/bed/hgNearBlastp/080711
scratchDir /san/sanvol1/scratch/ce6HgNearBlastp
'_EOF_'
# << happy emacs
# Run this in a screen since it is going to take awhile
# Run with -noLoad so we can eyeball files.
# Later overload other databases' dmBlastTab tables.
ssh kkstore06
screen
cd
time nice -n +19 doHgNearBlastp.pl -workhorse=hgwdev -clusterHub=pk \
-noLoad config.ra > do.log 2>&1 &
# watch progress in do.log
# load the blast tabs into ce6
ssh hgwdev
cd /cluster/data/ce6/bed/hgNearBlastp/080711
for D in danRer5 dm3 hg18 mm9 rn4 sacCer1
do
cd run.ce6.$D
grep hgLoadBlastTab loadPairwise.csh
./loadPairwise.csh
cd ..
done
#############################################################################
# Lift nucleosome data from ce4 to ce6 (DONE - 2008-09-28,30 - Hiram)
mkdir /hive/data/genomes/ce6/bed/monoNucleosomes
cd /hive/data/genomes/ce6/bed/monoNucleosomes
ln -s ../../../ce4/bed/monoNucleosomes/chrI-M.for.paired.bed.gz \
ce4.nucleosomeControl.fwd.bed.gz
ln -s ../../../ce4/bed/monoNucleosomes/chrI-M.rev.paired.bed.gz \
ce4.nucleosomeControl.rev.bed.gz
ln -s \
/usr/local/apache/htdocs/goldenPath/ce4/liftOver/ce4ToCe6.over.chain.gz \
ce4ToCe6.over.chain.gz
# The name filed isn't being parsed correctly because of the blanks
# take out the blanks so liftOver can read them, put them back
# in after lift
zcat ce4.nucleosomeControl.fwd.bed.gz | sed -e "s/n = /n=/" \
| liftOver stdin ce4ToCe6.over.chain.gz stdout \
ce6.nucleosomeControl.fwd.unMapped \
| sed -e "s/n=/n = /" | gzip > ce6.nucleosomeControl.fwd.bed.gz
# only 5 items do not map correctly
zcat ce4.nucleosomeControl.rev.bed.gz | sed -e "s/n = /n=/" \
| liftOver stdin ce4ToCe6.over.chain.gz stdout \
ce6.nucleosomeControl.rev.unMapped \
| sed -e "s/n=/n = /" | gzip > ce6.nucleosomeControl.rev.bed.gz
# 71 items do not map correctly
LOAD="hgLoadBed -tab -noNameIx ce6"
zcat ce6.nucleosomeControl.fwd.bed.gz ce6.nucleosomeControl.rev.bed.gz \
| ${LOAD} nucleosomeControl stdin
# Loaded 28951421 elements of size 12
# Collapsed nucleosome track files by chromosome:
ln -s ../../../ce4/bed/monoNucleosomes/chrI-M.for.clean.bed.gz \
./ce4.nucleosomeFragmentsSense.bed.gz
ln -s ../../../ce4/bed/monoNucleosomes/chrI-M.rev.clean.bed.gz \
./ce4.nucleosomeFragmentsAntisense.bed.gz
zcat ce4.nucleosomeFragmentsSense.bed.gz | sed -e "s/n = /n=/" \
| liftOver stdin ce4ToCe6.over.chain.gz stdout \
ce6.nucleosomeControlFragmentsSense.unMapped \
| sed -e "s/n=/n = /" | gzip > ce6.nucleosomeFragmentsSense.bed.gz
# four items do not map
zcat ce4.nucleosomeFragmentsAntisense.bed.gz | sed -e "s/n = /n=/" \
| liftOver stdin ce4ToCe6.over.chain.gz stdout \
ce6.nucleosomeControlFragmentsAntisense.unMapped \
| sed -e "s/n=/n = /" | gzip > ce6.nucleosomeFragmentsAntisense.bed.gz
# 34 items do not map
zcat ce6.nucleosomeFragmentsSense.bed.gz \
| ${LOAD} nucleosomeFragmentsSense stdin
# Loaded 11279598 elements of size 12
zcat ce6.nucleosomeFragmentsAntisense.bed.gz \
| ${LOAD} nucleosomeFragmentsAntisense stdin
# Loaded 11269358 elements of size 12
# these three tracks grouped into the composite: "nucleosome"
# labels:
# composite container: Nucleosome predictions from SOLidD core alignments
# Control - : mononucleosome control
# fragments + : mononucleosomal fragments, sense strand reads
# fragments - : mononucleosomal fragments, antisense strand reads
# Positioning stringency wig track
ln -s \
../../../ce4/bed/monoNucleosomes/chrI-M_positioning_stringency.wig.gz \
./ce4.nucleosomeStringency.wigVariable.txt.gz
zcat ce4.nucleosomeStringency.wigVariable.txt.gz \
| varStepToBedGraph.pl stdin | liftOver stdin ce4ToCe6.over.chain.gz \
stdout ce6.nucleosomeStringency.unMapped \
| gzip > ce6.nucleosomeStringency.bedGraph.gz
# 11 items do not lift
cat << '_EOF_' > bedGraphToVarStep.sh
#!/bin/sh
SPAN=1
SRC0=ce4.nucleosomeStringency.wigVariable.txt.gz
SRC=ce6.nucleosomeStringency.bedGraph.gz
R=ce6.nucleosomeStringency.wigVariable.txt
rm -f ${R}
zcat ${SRC0} | egrep "^browser|^track" > ${R}
zcat ${SRC} | grep "^chr" | cut -f1 | sort -u > chr.list
cat chr.list | while read C
do
echo "variableStep chrom=${C} span=${SPAN}" >> ${R}
zcat "${SRC}" \
| awk '{if (match($1,"^'"${C}"'$")) { print } }' | sort -k2n | awk '
{
printf "%d\t%g\n", $2+1, $4
}
' >> ${R}
done
rm -f chr.list
'_EOF_'
# << happy emacs
time ./bedGraphToVarStep.sh
# real 35m15.201s
gzip ce6.nucleosomeStringency.wigVariable.txt
zcat ce6.nucleosomeStringency.wigVariable.txt.gz \
| wigEncode stdin nucleosomeStringency.wig nucleosomeStringency.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
ln -s `pwd`/nucleosomeStringency.wib /gbdb/ce6/wib
# Positioning stringency wig track
hgLoadWiggle ce6 nucleosomeStringency nucleosomeStringency.wig
# NSome Stringency
# Nucleosome Positioning Stringency from SOLiD core alignments
# Adjusted nucleosome coverage wig track:
ln -s \
../../../ce4/bed/monoNucleosomes/adjusted_coverage_chrI-M.wig.gz \
./ce4.nucleosomeAdjustedCoverage.wigVariable.txt.gz
zcat ce4.nucleosomeAdjustedCoverage.wigVariable.txt.gz \
| varStepToBedGraph.pl stdin | liftOver stdin ce4ToCe6.over.chain.gz \
stdout ce6.nucleosomeAdjustedCoverage.unMapped \
| gzip > ce6.nucleosomeAdjustedCoverage.bedGraph.gz
time ./bedGraphToVarStep.sh \
ce4.nucleosomeAdjustedCoverage.wigVariable.txt.gz \
ce6.nucleosomeAdjustedCoverage.bedGraph.gz \
ce6.nucleosomeAdjustedCoverage.wigVariable.txt
# real 27m40.408s
time gzip ce6.nucleosomeAdjustedCoverage.wigVariable.txt
# real 3m27.146s
time wigEncode ce6.nucleosomeAdjustedCoverage.wigVariable.txt.gz \
nucleosomeAdjustedCoverage.wig nucleosomeAdjustedCoverage.wib
# real 1m25.770s
# Converted ce6.nucleosomeAdjustedCoverage.wigVariable.txt.gz
# upper limit 8.76, lower limit -11.93
ln -s `pwd`/nucleosomeAdjustedCoverage.wib /gbdb/ce6/wib
hgLoadWiggle ce6 nucleosomeAdjustedCoverage nucleosomeAdjustedCoverage.wig
# Adj NSome Covrg
# Adjusted nucleosome coverage (25bp)
# MNase Coverage
# Micrococcal nuclease control coverage
# Cntl MNase +Covg
# Micrococcal nuclease control coverage, sense strand reads
# Cntl MNase -Covg
# Micrococcal nuclease control coverage, antisense strand reads
zcat ce6.nucleosomeControl.fwd.bed.gz | sort -k1,1 -k2,2n \
| bedItemOverlapCount ce6 stdin \
| wigEncode stdin nucleosomeControlSenseCoverage.wig \
nucleosomeControlSenseCoverage.wib
# Converted stdin, upper limit 142.00, lower limit 1.00
ln -s `pwd`/nucleosomeControlSenseCoverage.wib /gbdb/ce6/wib
zcat ce6.nucleosomeControl.rev.bed.gz | sort -k1,1 -k2,2n \
| bedItemOverlapCount ce6 stdin \
| wigEncode stdin nucleosomeControlAntisenseCoverage.wig \
nucleosomeControlAntisenseCoverage.wib
# Converted stdin, upper limit 145.00, lower limit 1.00
ln -s `pwd`/nucleosomeControlAntisenseCoverage.wib /gbdb/ce6/wib
hgLoadWiggle ce6 nucleosomeControlSenseCoverage \
nucleosomeControlSenseCoverage.wig
hgLoadWiggle ce6 nucleosomeControlAntisenseCoverage \
nucleosomeControlAntisenseCoverage.wig
mkdir ce4.monoNucleosomes
cd ce4.monoNucleosomes
ln -s ../../../../ce4/bed/monoNucleosomes/stanford/clean/* .
zcat ce4.monoNucleosomes/chr*.forward.bed.gz \
| liftOver stdin ce4ToCe6.over.chain.gz \
stdout ce6.monoNucleosomesSense.unMapped \
| gzip > ce6.monoNucleosomesSense.bed.gz
zcat ce4.monoNucleosomes/chr*.reverse.bed.gz \
| liftOver stdin ce4ToCe6.over.chain.gz \
stdout ce6.monoNucleosomesAntiSense.unMapped \
| gzip > ce6.monoNucleosomesAntiSense.bed.gz
bedItemOverlapCount ce6 ce6.monoNucleosomesSense.bed.gz \
| wigEncode stdin monoNucleosomesSense.wig monoNucleosomesSense.wib
# Converted stdin, upper limit 16195.00, lower limit 1.00
bedItemOverlapCount ce6 ce6.monoNucleosomesAntiSense.bed.gz \
| wigEncode stdin monoNucleosomesAntiSense.wig \
monoNucleosomesAntiSense.wib
# Converted stdin, upper limit 19137.00, lower limit 1.00
ln -s `pwd`/monoNucleosomesSense.wib /gbdb/ce6/wib
ln -s `pwd`/monoNucleosomesAntiSense.wib /gbdb/ce6/wib
# for downloads:
bedItemOverlapCount ce6 ce6.monoNucleosomesSense.bed.gz \
| gzip --rsyncable > monoNucleosomesSense.wigVariable.gz
bedItemOverlapCount ce6 ce6.monoNucleosomesAntiSense.bed.gz \
| gzip --rsyncable > monoNucleosomesAntiSense.wigVariable.gz
# NSome Coverage
# Coverage of nucleosome predictions from SOLiD core alignments
# Control
# mononucleosome control
hgLoadWiggle ce6 monoNucleosomesSense monoNucleosomesSense.wig
# NSome Core +Covg
# Coverage of mononucleosomal fragments, sense strand reads
hgLoadWiggle ce6 monoNucleosomesAntiSense monoNucleosomesAntiSense.wig
# NSome Core -Covg
# Coverage of mononucleosomal fragments, antisense strand reads
#########################################################################
# lift 25mers repeat track from ce4 (DONE - 2008-09-30 - Hiram)
mkdir /cluster/data/ce6/bed/25mersRepeats
cd /cluster/data/ce6/bed/25mersRepeats
ln -s /cluster/data/ce4/bed/25mersRepeats/withScore.bed \
./ce4.25mersRepeats.bed
ln -s \
/usr/local/apache/htdocs/goldenPath/ce4/liftOver/ce4ToCe6.over.chain.gz \
ce4ToCe6.over.chain.gz
liftOver ce4.25mersRepeats.bed ce4ToCe6.over.chain.gz \
ce6.25mersRepeats.bed ce6.unMapped
# no elements failed the eliftOver
hgLoadBed ce6 25mersRepeats ce6.25mersRepeats.bed
# Loaded 757742 elements of size 9
# clean up
rm bed.tab
gzip ce6.25mersRepeats.bed
#########################################################################
# hgPal downloads (DONE braney 2008-09-30)
ssh hgwdev
screen
bash
rm -rf /cluster/data/ce6/bed/multiz6way/pal
mkdir /cluster/data/ce6/bed/multiz6way/pal
cd /cluster/data/ce6/bed/multiz6way/pal
mz=multiz6way
gp=refGene
db=ce6
echo $db > order.lst
echo "select settings from trackDb where tableName='$mz'" | \
hgsql $db | sed 's/\\n/\n/g'| grep speciesOrder | tr ' ' '\n' | \
tail -n +2 >> order.lst
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > refGene.jobs
time sh -x refGene.jobs > refGene.job.log 2>&1 &
sleep 1
tail -f refGene.job.log
# real 14m44.986s
# user 2m26.113s
# sys 1m41.837s
zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
# we're only distributing exons at the moment
pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
rm -rf $pd/$gp.exonAA.fa.gz $pd/$gp.exonNuc.fa.gz
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
mz=multiz6way
gp=sangerGene
db=ce6
mkdir exonAA exonNuc ppredAA ppredNuc
for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
do
echo "date"
echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \
gzip -c > ppredAA/$j.ppredAA.fa.gz"
echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
gzip -c > exonNuc/$j.exonNuc.fa.gz"
echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
gzip -c > exonAA/$j.exonAA.fa.gz"
done > $gp.$mz.jobs
time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 &
sleep 1
tail -f $gp.$mz.job.log
# real 16m2.266s
# user 2m43.764s
# sys 2m7.173s
zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
rm -rf exonAA exonNuc ppredAA ppredNuc
# we're only distributing exons at the moment
pd=/usr/local/apache/htdocs/goldenPath/ce6/multiz6way
ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
#########################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
ce6.upstreamGeneTbl = refGene
ce6.upstreamMaf = multiz6way /hive/data/genomes/ce6/bed/multiz6way/species.lst
############################################################################
# CE6->CE7 LIFTOVER (DONE - 2008-06-24 - Hiram)
cd /hive/data/genomes/ce6
# test procedure with -debug first
doSameSpeciesLiftOver.pl -bigClusterHub=swarm -workhorse=hgwdev \
-ooc /hive/data/genomes/ce6/jkStuff/11.ooc ce6 ce7 -debug
cd bed/blat.ce7.2009-07-28
time nice -n +19 doSameSpeciesLiftOver.pl -bigClusterHub=swarm \
-workhorse=hgwdev \
-ooc /hive/data/genomes/ce6/jkStuff/11.ooc ce6 ce7 > do.log 2>&1 &
# real 8m7.833s
#########################################################################
+# ALG Binding sites (DONE - 2010-01-14 - Hiram)
+ # From the Gene Yeo lab:
+ # fetch files
+T="http://www.snl.salk.edu/~lovci/Amy/ce6/alltheexperiments/for_ucsc"
+
+one() {
+wget --timestamping \
+"${T}/$1" -O $1
+}
+
+one all_L3_L4.L3_L4.ce6.ingenes.both_strands.reannotated.BED.gz
+one all_L3_L4.L3_L4.ce6.ingenes.both_strands.reannotated.BED.gz
+one MT1.neg.off.WIG.gz
+one MT1.pos.off.WIG.gz
+one MT2.neg.off.WIG.gz
+one MT2.pos.off.WIG.gz
+one MT3.neg.off.WIG.gz
+one MT3.pos.off.WIG.gz
+one WT1.neg.off.WIG.gz
+one WT1.pos.off.WIG.gz
+one WT2.neg.off.WIG.gz
+one WT2.pos.off.WIG.gz
+one WT3.neg.off.WIG.gz
+one WT3.pos.off.WIG.gz
+one WT_region_specific_clusters.weight_normal.3.rp.bed.WT.good_clusters.25.gz
+
+ # encode and load wiggle tracks
+
+one() {
+ID=$1
+NP=$2
+RF=$3
+UC=$4
+LC=`echo $ID | tr '[A-Z]' '[a-z]'`
+zcat $ID.$NP.off.WIG.gz \
+ | wigEncode stdin ${LC}${UC}.wig ${LC}${UC}.wib > ${LC}$UC.log 2>&1
+ln -s `pwd`/${LC}${UC}.wib /gbdb/ce6/wib
+hgLoadWiggle ce6 ${LC}${UC} ${LC}${UC}.wig
+}
+
+one MT1 neg reverse Reverse
+one MT1 pos forward Forward
+one MT2 neg reverse Reverse
+one MT2 pos forward Forward
+one MT3 neg reverse Reverse
+one MT3 pos forward Forward
+one WT1 neg reverse Reverse
+one WT1 pos forward Forward
+one WT2 neg reverse Reverse
+one WT2 pos forward Forward
+one WT3 neg reverse Reverse
+one WT3 pos forward Forward
+
+ # load BED tables:
+zcat WT_region_specific_clusters.weight_normal.3.rp.bed.WT.good_clusters.25.gz \
+ | grep -v track \
+ | awk -F'\t' '{printf "%s %d %d %s 1 %s\n", $1, $2, $3, $4, $6}' \
+ | hgLoadBed -noNameIx ce6 algBindSites stdin
+zcat all_L3_L4.L3_L4.ce6.ingenes.both_strands.reannotated.BED.gz \
+ | grep -v track | awk -F'\t' '{printf "%s %d %d %s 0 %s\n", $1, $2, $3, $4, $6}' | hgLoadBed -noNameIx ce6 algBindGenic stdin
+
+#########################################################################