src/hg/makeDb/doc/sacCer2.txt 1.4
1.4 2009/02/12 19:50:19 hiram
Finished 7-way conservation business
Index: src/hg/makeDb/doc/sacCer2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/sacCer2.txt,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 1000000 -r1.3 -r1.4
--- src/hg/makeDb/doc/sacCer2.txt 10 Feb 2009 23:56:21 -0000 1.3
+++ src/hg/makeDb/doc/sacCer2.txt 12 Feb 2009 19:50:19 -0000 1.4
@@ -1,637 +1,926 @@
# for emacs: -*- mode: sh; -*-
# Saccharomyces cerevisiae
# Saccharomyces Genome Database, Stanford
# $Id$
#######################################################################
# Download data (DONE - 2009-01-30 - Hiram)
mkdir -p /hive/data/genomes/sacCer2/download
cd /hive/data/genomes/sacCer2/download
# convert chrom names to something reasonable, they are now roman nums
mkdir -p chromosomes
# bash scripting follows:
runOne() {
N=$1
C=$2
echo ">chr${C}" > chromosomes/chr${C}.fa
grep -v "^>" sgd.chromosomes/chr${N}.fsa >> chromosomes/chr${C}.fa
echo "done chr${N} -> ${C}"
}
runOne 01 I
runOne 02 II
runOne 03 III
runOne 04 IV
runOne 05 V
runOne 06 VI
runOne 07 VII
runOne 08 VIII
runOne 09 IX
runOne 10 X
runOne 11 XI
runOne 12 XII
runOne 13 XIII
runOne 14 XIV
runOne 15 XV
runOne 16 XVI
runOne mt M
# the same sequence again, with the following runOne()
# will product an AGP file:
runOne() {
N=$1
C=$2
CTG=`head -1 sgd.chromosomes/chr${N}.fsa | sed -e 's/>ref|//; s/|.*//'`
size=`faSize sgd.chromosomes/chr${N}.fsa | head -1 | awk '{print $1}'`
echo -e "chr${C}\t1\t$size\t$SEQ\tF\t$CTG\t1\t$size\t+"
SEQ=`echo $SEQ | awk '{print $1+1}'`
}
# plus this echo statement for 2micron:
echo -e "2micron\t1\t6318\t${SEQ}\tF\tNC_001398\t1\t6318\t+"
# trim the SGD gff file and get UCSC chrom names
awk '
BEGIN { keepGoing = 1 }
{
if (match($1, "^##FASTA")) { keepGoing = 0; }
if (keepGoing) print;
}
' chromosomal_feature/saccharomyces_cerevisiae.gff \
| sed -e "s/^2-micron/2micron/; s/^chrMito/chrM/" > S.cerevisiae.gff
# this program needed to be fixed for this newer gff file format
checkSgdSync -verbose=4 -gffRowCount=9 -faExtn=fa \
-gffFile=S.cerevisiae.gff \
download > sgd.feature.scan.txt 2>&1
# this used to have only a couple, now there are many non-coding
# annotations
# good 6702, bad 371, total 7073
#########################################################################
cd /hive/data/genomes/sacCer2
cat << '_EOF_' > sacCer2.config.ra
# Config parameters for makeGenomeDb.pl:
db sacCer2
scientificName Saccharomyces cerevisiae
commonName S. cerevisiae
assemblyDate June 2008
assemblyLabel SGD June 2008
orderKey 998
mitoAcc NC_001224
# override the check for max size of 25,000
mitoSize 90000
fastaFiles /hive/data/genomes/sacCer2/download/chromosomes/*.fa
agpFiles /hive/data/genomes/sacCer2/download/sacCer2.agp
# qualFiles /dev/null
dbDbSpeciesDir sacCer
taxId 4932
'_EOF_'
# << happy emacs
makeGenomeDb.pl sacCer2.config.ra > makeGenomeDb.log 2>&1
# take the trackDb stuff and check it in
# no need for masked sequence:
ln -s sacCer2.unmasked.2bit sacCer2.2bit
ln -s `pwd`/sacCer2.2bit /gbdb/sacCer2
#########################################################################
# sacCer2 - S. cerevisiae - Ensembl Genes version 52 (DONE - 2009-02-04 - hiram)
ssh swarm
cd /hive/data/genomes/sacCer2
cat << '_EOF_' > sacCer2.ensGene.ra
# required db variable
db sacCer2
# optional nameTranslation, the sed command that will transform
# Ensemble names to UCSC names. With quotes just to make sure.
nameTranslation "s/^VIII/chrVIII/; s/^VII/chrVII/; s/^VI/chrVI/; s/^V/chrV/; s/^XIII/chrXIII/; s/^XII/chrXII/; s/^XIV/chrXIV/; s/^XI/chrXI/; s/^XVI/chrXVI/; s/^XV/chrXV/; s/^X/chrX/; s/^III/chrIII/; s/^IV/chrIV/; s/^II/chrII/; s/^IX/chrIX/; s/^I/chrI/; s/^MT/chrM/; s/2-micron/2micron/"
'_EOF_'
# << happy emacs
doEnsGeneUpdate.pl -ensVersion=52 sacCer2.ensGene.ra
ssh hgwdev
cd /hive/data/genomes/sacCer2/bed/ensGene.52
featureBits sacCer2 ensGene
# 8912793 bases of 12162995 (73.278%) in intersection
#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2009-02-04 - Hiram)
# repMatch = 1024 * sizeof(sacCer2)/sizeof(hg18)
# 4.32 = 1024 * (12162995/2881515245)
# use 10 to be very conservative
ssh hgwdev
cd /hive/data/genomes/sacCer2
blat sacCer2.2bit /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/11.ooc \
-repMatch=10
# Wrote 3137 overused 11-mers to 11.ooc
# copy this to scratch data
cp -p jkStuff/11.ooc /hive/data/staging/data/sacCer2/sacCer2.11.ooc
#########################################################################
# work with the SGD annotations (WORKING - 2009-02-04 - Hiram)
mkdir /hive/data/genomes/sacCer2/bed/sgdAnnotations
cd /hive/data/genomes/sacCer2/bed/sgdAnnotations
# get rid of the FASTA section from their gff file
awk '
BEGIN { keepGoing = 1 }
{
if (match($1, "^##FASTA")) { keepGoing = 0; }
if (keepGoing) print;
}
' ../../download/chromosomal_feature/saccharomyces_cerevisiae.gff \
| sed -e "s/^2-micron/2micron/; s/^chrMito/chrM/" > S.cerevisiae.gff
#########################################################################
# CREATING SGD-BASED KNOWN GENES AND OTHER FEATURES (DONE - 2009-02-10 - Hiram)
mkdir /hive/data/sacCer2/bed/sgdAnnotations
cd /hive/data/sacCer2/bed/sgdAnnotations
# trim the delivered S.cerevisiae.gff file to get rid of the FASTA section
# and fixup the chrM and 2-micron chrom names:
awk '
BEGIN { keepGoing = 1 }
{
if (match($1, "^##FASTA")) { keepGoing = 0; }
if (keepGoing) print;
}
' ../../download/chromosomal_feature/saccharomyces_cerevisiae.gff \
| sed -e "s/^2-micron/2micron/; s/^chrMito/chrM/" > S.cerevisiae.gff
# wrote this perl script to specifically parse this particular instance
# of the sgd gff file.
$HOME/kent/src/hg/makeDb/outside/yeast/hgSgdGff3/sgdGffToGtf.pl \
S.cerevisiae.gff > sacCer2.sgdGene.gtf 2> sacCer2.gtf.stderr
# This also writes the files:
# -rw-rw-r-- 1 640177 Feb 6 15:01 leftOvers.gff
# -rw-rw-r-- 1 259712 Feb 6 15:01 otherFeatures.bed
# -rw-rw-r-- 1 254996 Feb 6 15:01 notes.txt
# -rw-rw-r-- 1 1214875 Feb 6 15:01 descriptions.txt
ldHgGene -gtf sacCer2 sgdGene sacCer2.sgdGene.gtf
hgLoadBed sacCer2 sgdOther otherFeatures.bed \
-tab -sqlTable=$HOME/kent/src/hg/lib/sgdOther.sql
# this perl script will fixup the fasta header lines for
# the chr*.peptides.fsa files to run into hgSgdPep
cat << '_EOF_' > filter.pl
#!/usr/bin/env perl
use strict;
use warnings;
open (FH,">symbol.txt") or die "can not write to symbol.txt";
my $inAnnotation = 0;
while (my $line=<>) {
if ($line =~ m/^>Annotated\|/) {
$inAnnotation = 1;
my (@words) = split('\s+', $line);
die "can not find four fields in\n'$line'" if (scalar(@words) < 4);
my $name = $words[3];
$name =~ s/;.*//;
$name =~ s#/.*##;
printf ">ORFP:%s\n", $name;
printf FH "%s\t", $name;
if ("${name};" ne $words[3]) {
my $otherName = $words[3];
$otherName =~ s#.*/##;
$otherName =~ s/;//;
printf FH "%s", $otherName;
# printf FH " %s", $words[2];
} else {
printf FH "%s", $name;
}
# for (my $i = 4; $i < scalar(@words); ++$i) {
# printf FH " %s", $words[$i];
# }
print FH "\n";
} elsif ($line =~ m/^>/) {
$inAnnotation = 0;
} else {
if ($inAnnotation) { print $line; }
}
}
close (FH);
'_EOF_'
# << happy emacs
chmod +x filter.pl
zcat \
../../download/protein_info/hypothetical_peptides/chr*.peptides.20040928.fsa.gz\
| ./filter.pl | hgSgdPep stdin sgdPep.fa symbol.txt
hgPepPred sacCer2 generic sgdPep sgdPep.fa
hgsql sacCer2 -e 'create table sgdToName ( \
name varchar(10) not null, \
value varchar(10) not null, \
PRIMARY KEY(name), \
INDEX (value));'
hgsql sacCer2 -e 'load data local infile "symbol.txt" \
into table sgdToName;'
hgsql sacCer2 < $HOME/kent/src/hg/lib/sgdDescription.sql
hgsql sacCer2 -e 'load data local infile "descriptions.txt" \
into table sgdDescription;'
hgsql sacCer2 < $HOME/kent/src/hg/lib/sgdOtherDescription.sql
hgsql sacCer2 -e 'load data local infile "notes.txt" \
into table sgdOtherDescription;'
############################################################################
# ADDING SWISSPROT ACCESSION TO KNOWN GENES (DONE - 2009-02-10 - Hiram)
cd /hive/data/sacCer2/bed/sgdAnnotation
grep "Swiss-Prot" ../../download/chromosomal_feature/dbxref.tab \
| awk '{printf("%s\t%s\n", $5, $1);}' > sgdToSwissProt.txt
hgsql sacCer2 -e 'create table sgdToSwissProt ( \
name varchar(10) not null, \
value varchar(10) not null, \
PRIMARY KEY(name), \
INDEX (value));'
hgsql sacCer2 -e 'load data local infile "sgdToSwissProt.txt" \
into table sgdToSwissProt;'
hgProtIdToGenePred sacCer2 sgdGene sgdToSwissProt name value
############################################################################
# CREATE SGD-BASED CLONE TRACK (DONE - 2009-02-10 - Hiram)
mkdir /hive/data/genomes/sacCer2/bed/sgdClone
cd /hive/data/genomes/sacCer2/bed/sgdClone
# since sacCer1, these coordinates have become 0-relative ?
awk -F '\t' '{printf("%d\t%d\t%d\t%s\t%s\n", $3, $4, $5, $2, $1);}' \
../../download/chromosomal_feature/clone.tab \
| sed -e \
"s/^8/chrVIII/; s/^7/chrVII/; s/^6/chrVI/; s/^5/chrV/;
s/^13/chrXIII/; s/^12/chrXII/; s/^14/chrXIV/; s/^11/chrXI/;
s/^16/chrXVI/; s/^15/chrXV/; s/^10/chrX/; s/^3/chrIII/;
s/^4/chrIV/; s/^2/chrII/; s/^9/chrIX/; s/^1/chrI/;" > sgdClone.bed
hgLoadBed sacCer2 sgdClone sgdClone.bed -tab \
-sqlTable=$HOME/kent/src/hg/lib/sgdClone.sql
############################################################################
# AUTO UPDATE GENBANK RUN (Done - 2009-02-10 - Hiram)
# align with latest genbank process.
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add sacCer2 just before sacCer1
# sacCer2 S. cerevisiae
sacCer2.serverGenome = /hive/data/genomes/sacCer2/sacCer2.2bit
sacCer2.clusterGenome = /scratch/data/sacCer2/sacCer2.2bit
sacCer2.ooc = no
sacCer2.maxIntron = 5000
sacCer2.lift = no
sacCer2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter}
sacCer2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
sacCer2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
sacCer2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
sacCer2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter}
#sacCer2.perChromTables = no # doesn't work in the browser
sacCer2.genbank.mrna.xeno.load = no
sacCer2.refseq.mrna.native.load = no
sacCer2.downloadDir = sacCer2
cvs ci -m "Added sacCer2." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
ssh genbank
screen # use a screen to manage this job
cd /cluster/data/genbank
time nice -n +19 bin/gbAlignStep -initial sacCer2 &
# logFile: var/build/logs/2009.02.10-11:27:42.sacCer2.initalign.log
# real 173m1.570s
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad sacCer2
# logFile: var/dbload/hgwdev/logs/2009.02.10-14:25:48.dbload.log
# real 18m21.436s
XXXX TBD
# enable daily alignment and update of hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add ce6 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added ce6 - C. elegans WS190" \
etc/align.dbs etc/hgwdev.dbs
make etc-update
############################################################################
# DOING MULTIPLE ALIGNMENT WITH OTHER YEAST (DONE - 2009-02-10 - Hiram)
# the sequences are the same set from 2003 that were used in sacCer1
# so, just link to them and rerun the alignment
# in the directory /cluster/data/sacCer1/bed/otherYeast/align
mkdir /hive/data/genomes/sacCer2/bed/otherYeast
cd /hive/data/genomes/sacCer2/bed/otherYeast
for D in sacBay sacCas sacKlu sacKud sacMik sacPar
do
ln -s ../../../sacCer1/bed/otherYeast/${D} .
done
mkdir align
cd align
ln -s ../../../../sacCer1/bed/otherYeast/align/sac??? .
# helpful to have 2bit files for chainAntiRepeat
for G in sac???
do
faToTwoBit ${G} ${G}.2bit
done
# Create directory full of size info
mkdir sizes
for F in sac???
do
faSize $F -detailed > sizes/$F
done
# Create some working directories
mkdir lav axtAll axtBest mafIn mafRawOut mafOut
# Create little shell script to do blastz alignments
cat << '_EOF_' > lastz.csh
#!/bin/csh -ef
/cluster/bin/penn/$MACHTYPE/lastz $1 $2 > $3
'_EOF_'
# << happy emacs
chmod +x lastz.csh
# create sacCer2 individual chr.fa files:
mkdir sacCer2
twoBitInfo ../../../sacCer2.2bit stdout | cut -f1 | while read C
do
twoBitToFa ../../../sacCer2.2bit:${C} sacCer2/${C}.fa
echo "${C} done"
done
-
# Create job spec to do all blastz alignments
ls -1 sacCer2/*.fa > sacCer.list
ls -1 sac??? > other.list
cat << '_EOF_' > template
#LOOP
lastz.csh $(path1) $(path2) {check out line+ lav/$(root1).$(root2)}
#ENDLOOP
'_EOF_'
# << happy emacs
ssh swarm
cd /hive/data/genomes/sacCer2/bed/otherYeast/align
gensub2 sacCer.list other.list template jobList
# Convert from lav to psl
for L in lav/*
do
B=`basename $L`
echo $B
lavToPsl $L stdout | gzip > psl/$B.psl.gz
done
# chaining
mkdir -p axtChain/run
cd axtChain/run
ls -d ../../sac??? | xargs -L 1 basename > species.list
cut -f1 ../../../../../chrom.sizes > chrom.list
cat << '_EOF_' > template
#LOOP
chain.csh $(path1) $(path2) {check out line+ $(root1)/$(root2).chain}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 species.list chrom.list template jobList
para create jobList
para push
para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs: 73s 1.21m 0.02h 0.00d 0.000 y
# IO & Wait Time: 1100s 18.34m 0.31h 0.01d 0.000 y
# Average job time: 11s 0.18m 0.00h 0.00d
# Longest finished job: 12s 0.20m 0.00h 0.00d
# Submission to last job: 38s 0.63m 0.01h 0.00d
cd ..
# merge the individual results
foreach S (`cat run/species.list`)
echo $S
find ./run/${S} -name "*.chain" \
| chainMergeSort -inputList=stdin | gzip -c > sacCer2.${S}.all.chain.gz
end
# split them again to get consistent numbering
foreach S (`cat run/species.list`)
echo $S
chainSplit chain.${S} sacCer2.${S}.all.chain.gz
end
# netting the chains
cat << '_EOF_' > netChains.csh
#!/bin/csh -ef
set S0 = "../../../../chrom.sizes"
foreach S (`cat run/species.list`)
echo $S
chainPreNet sacCer2.${S}.all.chain.gz ${S0} ../sizes/${S} stdout \
| chainNet stdin -minSpace=1 ${S0} ../sizes/${S} stdout /dev/null \
| netSyntenic stdin noClass.${S}.net
end
'_EOF_'
# << happy emacs
chmod +x netChains.csh
./netChains.csh
cat << '_EOF_' > mkMafs.csh
#!/bin/csh -ef
set TOP = `pwd`
set S0 = "/hive/data/genomes/sacCer2/chrom.sizes"
foreach S (`cat run/species.list`)
# Make axtNet for download: one .axt per sacCer2 seq.
netSplit noClass.${S}.net net.${S}
cd ..
rm -fr axtNet.${S}
mkdir axtNet.${S}
foreach f (axtChain/net.${S}/*.net)
netToAxt $f axtChain/chain.${S}/$f:t:r.chain \
/hive/data/genomes/sacCer2/sacCer2.2bit ${S}.2bit stdout \
| axtSort stdin stdout \
| gzip -c > axtNet.${S}/$f:t:r.sacCer2.${S}.net.axt.gz
end
# Make mafNet for multiz: one .maf per sacCer2 seq.
rm -fr mkdir mafNet.${S}
mkdir mafNet.${S}
foreach f (axtNet.${S}/*.sacCer2.${S}.net.axt.gz)
axtToMaf -tPrefix=sacCer2. -qPrefix=${S}. $f \
${S0} sizes/${S} stdout \
| gzip -c > mafNet.${S}/$f:t:r:r:r:r:r.maf.gz
end
cd ${TOP}
end
'_EOF_'
# << happy emacs
chmod +x mkMafs.csh
./mkMafs.csh
###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE 2009-02-10 braney )
# bash if not using bash shell already
cd /cluster/data/sacCer2
mkdir /cluster/data/sacCer2/blastDb
awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst
twoBitToFa -seqList=1meg.lst sacCer2.2bit temp.fa
faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
rm temp.fa 1meg.lst
awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst
twoBitToFa -seqList=less1meg.lst sacCer2.2bit temp.fa
faSplit about temp.fa 1000000 blastDb/y
rm temp.fa less1meg.lst
cd blastDb
for i in *.fa
do
/hive/data/outside/blast229/formatdb -i $i -p F
done
rm *.fa
ls *.nsq | wc -l
# 14
mkdir -p /cluster/data/sacCer2/bed/tblastn.hg18KG
cd /cluster/data/sacCer2/bed/tblastn.hg18KG
echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 14 query.lst
# we want around 50000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(50000/`wc query.lst | awk '{print $1}'`\)
# 36727/(50000/14) = 10.283560
# chose 100 because 10 is too small
mkdir -p kgfa
split -l 100 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
wc kg.lst
# 368 368 4784 kg.lst
mkdir -p blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/sacCer2/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/hive/data/outside/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/sacCer2/blastDb.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
exit
ssh swarm
cd /cluster/data/sacCer2/bed/tblastn.hg18KG
gensub2 query.lst kg.lst blastGsub blastSpec
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 5152 of 5152 jobs
# CPU time in finished jobs: 71064s 1184.40m 19.74h 0.82d 0.002 y
# IO & Wait Time: 16915s 281.92m 4.70h 0.20d 0.001 y
# Average job time: 17s 0.28m 0.00h 0.00d
# Longest finished job: 69s 1.15m 0.02h 0.00d
# Submission to last job: 156s 2.60m 0.04h 0.00d
ssh swarm
cd /cluster/data/sacCer2/bed/tblastn.hg18KG
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=12000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS ../blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
para create chainSpec
para try, check, push, check etc.
# Completed: 368 of 368 jobs
# CPU time in finished jobs: 5s 0.09m 0.00h 0.00d 0.000 y
# IO & Wait Time: 1487s 24.78m 0.41h 0.02d 0.000 y
# Average job time: 4s 0.07m 0.00h 0.00d
# Longest finished job: 10s 0.17m 0.00h 0.00d
# Submission to last job: 37s 0.62m 0.01h 0.00d
cd /cluster/data/sacCer2/bed/tblastn.hg18KG/blastOut
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl
cd ..
pslCheck blastHg18KG.psl
# checked: 5050 failed: 0 errors: 0
# load table
ssh hgwdev
cd /cluster/data/sacCer2/bed/tblastn.hg18KG
hgLoadPsl sacCer2 blastHg18KG.psl
# check coverage
featureBits sacCer2 blastHg18KG
# 1261725 bases of 12162995 (10.373%) in intersection
featureBits sacCer2 blastHg18KG sgdGene -enrichment
# blastHg18KG 10.373%, sgdGene 72.883%, both 10.320%, cover 99.49%, enrich 1.37x
rm -rf blastOut
#end tblastn
###########################################################################
+## 7-Way Multiz (DONE - 2009-02-11 - Hiram)
+ mkdir /hive/data/genomes/sacCer2/bed/multiz7way
+ cd /hive/data/genomes/sacCer2/bed/multiz7way
+
+ # using the phylogenetic tree as mentioned at:
+ # http://www.genetics.wustl.edu/saccharomycesgenomes/yeast_phylogeny.html
+ # and from Jim's note: genomes/sacCer1/bed/otherYeast/README
+ # ((((((sacCer2 sacPar) sacMik) sacKud) sacBay) sacCas) sacKlu)
+ echo "((((((sacCer2 sacPar) sacMik) sacKud) sacBay) sacCas) sacKlu)" \
+ > tree.nh
+ echo sacCer2 sacPar sacMik sacKud sacBay sacCas sacKlu > species.list
+ mkdir mafLinks
+ for S in sacPar sacMik sacKud sacBay sacCas sacKlu
+do
+ mkdir mafLinks/${S}
+ cd mafLinks/${S}
+ ln -s ../../../otherYeast/align/mafNet.${S}/*.maf.gz .
+ cd ../..
+done
+ find -L ./mafLinks -type f | xargs -L 1 basename \
+ | sed -e "s/.maf.gz//" | sort -u > maf.list
+
+ mkdir maf run
+ cd run
+ mkdir penn
+ cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn
+ cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn
+ cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn
+
+ # set the db and pairs directories here
+ cat > autoMultiz.csh << '_EOF_'
+#!/bin/csh -ef
+set db = sacCer2
+set c = $1
+set result = $2
+set run = `pwd`
+set tmp = $run/tmp/$db/multiz.$c
+set pairs = /hive/data/genomes/sacCer2/bed/multiz7way/mafLinks
+/bin/rm -fr $tmp
+/bin/mkdir -p $tmp
+/bin/cp -p ../tree.nh ../species.list $tmp
+pushd $tmp
+foreach s (`sed -e "s/ $db//" species.list`)
+ set in = $pairs/$s/$c.maf
+ set out = $db.$s.sing.maf
+ if (-e $in.gz) then
+ /bin/zcat $in.gz > $out
+ else if (-e $in) then
+ ln -s $in $out
+ else
+ echo "##maf version=1 scoring=autoMZ" > $out
+ endif
+end
+set path = ($run/penn $path); rehash
+$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
+popd
+/bin/rm -f $result
+/bin/cp -p $tmp/$c.maf $result
+/bin/rm -fr $tmp
+/bin/rmdir --ignore-fail-on-non-empty $run/tmp/$db
+/bin/rmdir --ignore-fail-on-non-empty $run/tmp
+'_EOF_'
+# << happy emacs
+ chmod +x autoMultiz.csh
+
+ cat << '_EOF_' > template
+#LOOP
+./autoMultiz.csh $(root1) {check out line+ ../maf/$(root1).maf}
+#ENDLOOP
+'_EOF_'
+# << happy emacs
+
+ gensub2 ../maf.list single template jobList
+ para create jobList
+# Completed: 18 of 18 jobs
+# CPU time in finished jobs: 710s 11.83m 0.20h 0.01d 0.000 y
+# IO & Wait Time: 127s 2.12m 0.04h 0.00d 0.000 y
+# Average job time: 47s 0.78m 0.01h 0.00d
+# Longest finished job: 105s 1.75m 0.03h 0.00d
+# Submission to last job: 144s 2.40m 0.04h 0.00d
+# Estimated complete: 0s 0.00m 0.00h 0.00d
+
+ # load MAF tables
+ ssh hgwdev
+ mkdir -p /gbdb/sacCer2/multiz7way/maf
+ cd /hive/data/genomes/sacCer2/bed/multiz7way/maf
+ ln -s `pwd`/*.maf /gbdb/sacCer2/multiz7way/maf
+
+ # allow the temporary multiz7way.tab file to be created in tmp
+ cd /data/tmp
+ time nice -n +19 hgLoadMaf \
+ -pathPrefix=/gbdb/sacCer2/multiz7way/maf sacCer2 multiz7way
+ # real 0m4.903s
+ # Loaded 38795 mafs in 18 files from /gbdb/sacCer2/multiz7way/maf
+ # load summary table
+ time nice -n +19 cat /gbdb/sacCer2/multiz7way/maf/*.maf \
+ | hgLoadMafSummary sacCer2 -mergeGap=1500 multiz7waySummary stdin
+ # real 0m5.536
+ # Created 3037 summary blocks from 73806 components and 38795 mafs
+
+#########################################################################
+## phastCons conservation (DONE - 2009-02-11 - Hiram)
+ # Create SS files for each chromosome:
+ mkdir /hive/data/genomes/sacCer2/bed/multiz7way/phastCons
+ cd /hive/data/genomes/sacCer2/bed/multiz7way/phastCons
+ mkdir SS
+
+ # split up alignments; no need to use cluster here
+ MAF=/cluster/data/sacCer1/bed/otherYeast/align/mafOut
+ FA=/cluster/data/sacCer1
+ CHROMS="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M"
+ mkdir -p SS
+ for C in `(cd ../maf; ls *.maf | sed -e "s/.maf//")`
+do
+ echo $C
+ /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/msa_view \
+ ../maf/${C}.maf -M ../../otherYeast/align/sacCer2/${C}.fa -i MAF \
+ -O sacCer2,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu \
+ -o SS > SS/${C}.ss
+done
+
+ # produce rough starting model
+ /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/msa_view \
+ -i SS --aggregate sacCer2,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu \
+ -o SS -z SS/*.ss > all.ss
+
+ /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phyloFit \
+ -i SS all.ss \
+ --tree "((((((sacCer2,sacPar),sacMik),sacKud),sacBay),sacCas),sacKlu)" \
+ -o starting-tree
+
+ # (cheat the branch lengths up a bit, since this represents an
+ # average of conserved and nonconserved sites; we want to
+ # initialize for nonconserved)
+ /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/tree_doctor \
+ --scale 2 starting-tree.mod > tmp.mod
+ mv tmp.mod starting-tree.mod
+
+ # also get average GC content of aligned regions
+ /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/msa_view \
+ -i SS all.ss --summary-only
+#descrip. A C G T G+C length all_gaps some_gaps
+#all.ss 0.3056 0.1950 0.1948 0.3047 0.3898 13130139 0 2122279
+
+ # save some I/O and space
+ gzip SS/*.ss
+
+ # now set up cluster job to estimate model parameters. See
+ # other make{hg18|mm9}.doc for add'l details
+
+ cat << '_EOF_' > doEstimate.sh
+#!/bin/sh
+zcat SS/$1.ss.gz | \
+ /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phastCons \
+ - starting-tree.mod --gc 0.3898 --nrates 1,1 --no-post-probs \
+ --ignore-missing --expected-lengths 75 --target-coverage 0.5 \
+ --quiet --log $2 --estimate-trees $3
+'_EOF_'
+ # << happy emacs
+ chmod +x doEstimate.sh
+
+ rm -fr LOG TREES
+ mkdir LOG TREES
+ cat << '_EOF_' > template
+#LOOP
+doEstimate.sh $(root1) {check out line+ LOG/$(root1).log} TREES/$(root1)
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ (cd SS; ls *.ss.gz) | sed -e "s/.ss.gz//" > chr.list
+ gensub2 chr.list single template jobList
+ para create jobList
+ para push
+ para time
+# Completed: 18 of 18 jobs
+# CPU time in finished jobs: 8152s 135.86m 2.26h 0.09d 0.000 y
+# IO & Wait Time: 323s 5.39m 0.09h 0.00d 0.000 y
+# Average job time: 471s 7.85m 0.13h 0.01d
+# Longest finished job: 889s 14.82m 0.25h 0.01d
+# Submission to last job: 895s 14.92m 0.25h 0.01d
+
+ # Now combine parameter estimates.
+ ls TREES/*.cons.mod > cons.txt
+ /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phyloBoot \
+ --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
+ ls TREES/*.noncons.mod > noncons.txt
+ /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phyloBoot \
+ --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt
+
+ # Now set up cluster job for computing consevation scores and
+ # predicted elements
+ cat << '_EOF_' > doPhastCons.sh
+#!/bin/sh
+
+mkdir -p POSTPROBS ELEMENTS
+chr=$1
+out=$2
+tmpFile=/scratch/tmp/phastCons.$chr
+rm -f $tmpFile
+zcat SS/$chr.ss.gz \
+ | /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phastCons - \
+ ave.cons.mod,ave.noncons.mod --expected-lengths 75 \
+ --target-coverage 0.5 --quiet --seqname $chr --idpref $chr \
+ --viterbi ELEMENTS/$pref.bed --score --require-informative 0 > $tmpFile
+gzip -c $tmpFile > POSTPROBS/$chr.pp.gz
+rm $tmpFile
+'_EOF_'
+ # << happy emacs
+ chmod u+x doPhastCons.sh
+
+ cat << '_EOF_' > run.template
+#LOOP
+doPhastCons.sh $(root1) {check out exists+ POSTPROBS/$(root1).pp.gz} {check out line+ ELEMENTS/$(root1).bed}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ gensub2 chr.list single run.template run.jobList
+ rm -fr POSTPROBS ELEMENTS
+ para create run.jobList
+ para push
+ para time
+# Completed: 18 of 18 jobs
+# CPU time in finished jobs: 78s 1.30m 0.02h 0.00d 0.000 y
+# IO & Wait Time: 658s 10.97m 0.18h 0.01d 0.000 y
+# Average job time: 41s 0.68m 0.01h 0.00d
+# Longest finished job: 53s 0.88m 0.01h 0.00d
+# Submission to last job: 57s 0.95m 0.02h 0.00d
+
+ # set up phastConsElements track
+ cat ELEMENTS/*.bed | sort -k1,1 -k2,2n | \
+ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
+ /cluster/bin/scripts/lodToBedScore /dev/stdin \
+ > phastConsElements7way.bed
+
+ hgLoadBed sacCer2 phastConsElements7way phastConsElements7way.bed
+ featureBits sacCer2 phastConsElements7way
+# 7762587 bases of 12162995 (63.821%) in intersection
+ # previously this was:
+ featureBits sacCer1 phastConsElements
+# 7517116 bases of 12156302 (61.837%) in intersection
+
+ mkdir ../downloads
+ mkdir ../downloads/phastCons7way
+ for C in `cat chr.list`
+do
+ cp -p POSTPROBS/${C}.pp.gz \
+ ../downloads/phastCons7way/sacCer2.${C}.wigFixed.gz
+done
+
+ # encode those files into wiggle data
+ zcat ../downloads/phastCons7way/*.wigFixed.gz \
+ | wigEncode stdin phastCons7way.wig phastCons7way.wib
+ # Converted stdin, upper limit 1.00, lower limit 0.00
+
+ # Load gbdb and database with wiggle.
+ cd /hive/data/genomes/sacCer2/bed/multiz7way/phastCons
+ ln -s `pwd`/phastCons7way.wib /gbdb/sacCer2/multiz7way/phastCons7way.wib
+ hgLoadWiggle -pathPrefix=/gbdb/sacCer2/multiz7way sacCer2 \
+ phastCons7way phastCons7way.wig
+
+ # Create histogram to get an overview of all the data
+ hgWiggle -doHistogram \
+ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
+ -db=sacCer2 phastCons7way > histogram.data 2>&1
+
+ # create plot of histogram:
+
+ cat << '_EOF_' | gnuplot > histo.png
+set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
+set size 1.4, 0.8
+set key left box
+set grid noxtics
+set grid ytics
+set title " Yeast sacCer2 Histogram phastCons7way track"
+set xlabel " phastCons7way score"
+set ylabel " Relative Frequency"
+set y2label " Cumulative Relative Frequency (CRF)"
+set y2range [0:1]
+set y2tics
+set yrange [0:0.02]
+
+plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
+ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
+'_EOF_'
+ # << happy emacs
+
+ display histo.png &