src/hg/makeDb/doc/sacCer2.txt 1.17
1.17 2010/02/05 06:54:06 kuhn
grammar
Index: src/hg/makeDb/doc/sacCer2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/sacCer2.txt,v
retrieving revision 1.16
retrieving revision 1.17
diff -b -B -U 1000000 -r1.16 -r1.17
--- src/hg/makeDb/doc/sacCer2.txt 2 Feb 2010 18:56:32 -0000 1.16
+++ src/hg/makeDb/doc/sacCer2.txt 5 Feb 2010 06:54:06 -0000 1.17
@@ -1,1392 +1,1395 @@
# for emacs: -*- mode: sh; -*-
# Saccharomyces cerevisiae
# Saccharomyces Genome Database, Stanford
# $Id$
#######################################################################
# Download data (DONE - 2009-01-30 - Hiram)
mkdir -p /hive/data/genomes/sacCer2/download
cd /hive/data/genomes/sacCer2/download
TOP=/hive/data/genomes/sacCer2/download
for D in gene_registry literature_curation oracle_schema protein_info \
protein_info/hypothetical_peptides
do
mkdir -p ${D}
cd ${D}
wget -l 1 --timestamping -np -nd --cut-dirs=1 -r -X "archive" \
"http://downloads.yeastgenome.org/${D}/"
rm -f index.* robots.txt
cd ${TOP}
done
mkdir sgd.chromosomes
for C in 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 mt
do
wget --timestamping \
"http://downloads.yeastgenome.org/sequence/genomic_sequence/chromosomes/fasta/ch
r${C}.fsa" -O sgd.chromosomes/chr${C}.fsa
done
# convert chrom names to something reasonable, they are now roman nums
mkdir -p chromosomes
# bash scripting follows:
runOne() {
N=$1
C=$2
echo ">chr${C}" > chromosomes/chr${C}.fa
grep -v "^>" sgd.chromosomes/chr${N}.fsa >> chromosomes/chr${C}.fa
echo "done chr${N} -> ${C}"
}
runOne 01 I
runOne 02 II
runOne 03 III
runOne 04 IV
runOne 05 V
runOne 06 VI
runOne 07 VII
runOne 08 VIII
runOne 09 IX
runOne 10 X
runOne 11 XI
runOne 12 XII
runOne 13 XIII
runOne 14 XIV
runOne 15 XV
runOne 16 XVI
runOne mt M
# the same sequence again, with the following runOne()
# will product an AGP file:
runOne() {
N=$1
C=$2
CTG=`head -1 sgd.chromosomes/chr${N}.fsa | sed -e 's/>ref|//; s/|.*//'`
size=`faSize sgd.chromosomes/chr${N}.fsa | head -1 | awk '{print $1}'`
echo -e "chr${C}\t1\t$size\t$SEQ\tF\t$CTG\t1\t$size\t+"
SEQ=`echo $SEQ | awk '{print $1+1}'`
}
# plus this echo statement for 2micron:
echo -e "2micron\t1\t6318\t${SEQ}\tF\tNC_001398\t1\t6318\t+"
# trim the SGD gff file and get UCSC chrom names
awk '
BEGIN { keepGoing = 1 }
{
if (match($1, "^##FASTA")) { keepGoing = 0; }
if (keepGoing) print;
}
' chromosomal_feature/saccharomyces_cerevisiae.gff \
| sed -e "s/^2-micron/2micron/; s/^chrMito/chrM/" > S.cerevisiae.gff
# this program needed to be fixed for this newer gff file format
checkSgdSync -verbose=4 -gffRowCount=9 -faExtn=fa \
-gffFile=S.cerevisiae.gff \
download > sgd.feature.scan.txt 2>&1
# this used to have only a couple, now there are many non-coding
# annotations
# good 6702, bad 371, total 7073
#########################################################################
cd /hive/data/genomes/sacCer2
cat << '_EOF_' > sacCer2.config.ra
# Config parameters for makeGenomeDb.pl:
db sacCer2
scientificName Saccharomyces cerevisiae
commonName S. cerevisiae
assemblyDate June 2008
assemblyLabel SGD June 2008
orderKey 998
mitoAcc NC_001224
# override the check for max size of 25,000
mitoSize 90000
fastaFiles /hive/data/genomes/sacCer2/download/chromosomes/*.fa
agpFiles /hive/data/genomes/sacCer2/download/sacCer2.agp
# qualFiles /dev/null
dbDbSpeciesDir sacCer
taxId 4932
'_EOF_'
# << happy emacs
makeGenomeDb.pl sacCer2.config.ra > makeGenomeDb.log 2>&1
# take the trackDb stuff and check it in
# no need for masked sequence:
ln -s sacCer2.unmasked.2bit sacCer2.2bit
ln -s `pwd`/sacCer2.2bit /gbdb/sacCer2
#########################################################################
# sacCer2 - S. cerevisiae - Ensembl Genes version 52 (DONE - 2009-02-04 - hiram)
ssh swarm
cd /hive/data/genomes/sacCer2
cat << '_EOF_' > sacCer2.ensGene.ra
# required db variable
db sacCer2
# optional nameTranslation, the sed command that will transform
# Ensemble names to UCSC names. With quotes just to make sure.
nameTranslation "s/^VIII/chrVIII/; s/^VII/chrVII/; s/^VI/chrVI/; s/^V/chrV/; s/^XIII/chrXIII/; s/^XII/chrXII/; s/^XIV/chrXIV/; s/^XI/chrXI/; s/^XVI/chrXVI/; s/^XV/chrXV/; s/^X/chrX/; s/^III/chrIII/; s/^IV/chrIV/; s/^II/chrII/; s/^IX/chrIX/; s/^I/chrI/; s/^MT/chrM/; s/2-micron/2micron/"
'_EOF_'
# << happy emacs
doEnsGeneUpdate.pl -ensVersion=52 sacCer2.ensGene.ra
ssh hgwdev
cd /hive/data/genomes/sacCer2/bed/ensGene.52
featureBits sacCer2 ensGene
# 8912793 bases of 12162995 (73.278%) in intersection
#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2009-02-04 - Hiram)
# repMatch = 1024 * sizeof(sacCer2)/sizeof(hg18)
# 4.32 = 1024 * (12162995/2881515245)
# use 10 to be very conservative
ssh hgwdev
cd /hive/data/genomes/sacCer2
blat sacCer2.2bit /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/11.ooc \
-repMatch=10
# Wrote 3137 overused 11-mers to 11.ooc
# copy this to scratch data
cp -p jkStuff/11.ooc /hive/data/staging/data/sacCer2/sacCer2.11.ooc
#########################################################################
# work with the SGD annotations (WORKING - 2009-02-04 - Hiram)
mkdir /hive/data/genomes/sacCer2/bed/sgdAnnotations
cd /hive/data/genomes/sacCer2/bed/sgdAnnotations
# get rid of the FASTA section from their gff file
awk '
BEGIN { keepGoing = 1 }
{
if (match($1, "^##FASTA")) { keepGoing = 0; }
if (keepGoing) print;
}
' ../../download/chromosomal_feature/saccharomyces_cerevisiae.gff \
| sed -e "s/^2-micron/2micron/; s/^chrMito/chrM/" > S.cerevisiae.gff
#########################################################################
# CREATING SGD-BASED KNOWN GENES AND OTHER FEATURES (DONE - 2009-02-10 - Hiram)
mkdir /hive/data/genomes/sacCer2/bed/sgdAnnotations
cd /hive/data/genomes/sacCer2/bed/sgdAnnotations
# trim the delivered S.cerevisiae.gff file to get rid of the FASTA section
# and fixup the chrM and 2-micron chrom names:
awk '
BEGIN { keepGoing = 1 }
{
if (match($1, "^##FASTA")) { keepGoing = 0; }
if (keepGoing) print;
}
' ../../download/chromosomal_feature/saccharomyces_cerevisiae.gff \
| sed -e "s/^2-micron/2micron/; s/^chrMito/chrM/" > S.cerevisiae.gff
# wrote this perl script to specifically parse this particular instance
# of the sgd gff file.
$HOME/kent/src/hg/makeDb/outside/yeast/hgSgdGff3/sgdGffToGtf.pl \
S.cerevisiae.gff > sacCer2.sgdGene.gtf 2> sacCer2.gtf.stderr
# This also writes the files:
# -rw-rw-r-- 1 640177 Feb 6 15:01 leftOvers.gff
# -rw-rw-r-- 1 259712 Feb 6 15:01 otherFeatures.bed
# -rw-rw-r-- 1 254996 Feb 6 15:01 notes.txt
# -rw-rw-r-- 1 1214875 Feb 6 15:01 descriptions.txt
ldHgGene -gtf sacCer2 sgdGene sacCer2.sgdGene.gtf
hgLoadBed sacCer2 sgdOther otherFeatures.bed \
-tab -sqlTable=$HOME/kent/src/hg/lib/sgdOther.sql
- # this perl script will fixup the fasta header lines for
+ # this perl script will fix up the fasta header lines for
# the chr*.peptides.fsa files to run into hgSgdPep
cat << '_EOF_' > filter.pl
#!/usr/bin/env perl
use strict;
use warnings;
open (FH,">symbol.txt") or die "can not write to symbol.txt";
my $inAnnotation = 0;
while (my $line=<>) {
if ($line =~ m/^>Annotated\|/) {
$inAnnotation = 1;
my (@words) = split('\s+', $line);
- die "can not find four fields in\n'$line'" if (scalar(@words) < 4);
+ die "cannot find four fields in\n'$line'" if (scalar(@words) < 4);
my $name = $words[3];
$name =~ s/;.*//;
$name =~ s#/.*##;
printf ">ORFP:%s\n", $name;
printf FH "%s\t", $name;
if ("${name};" ne $words[3]) {
my $otherName = $words[3];
$otherName =~ s#.*/##;
$otherName =~ s/;//;
printf FH "%s", $otherName;
# printf FH " %s", $words[2];
} else {
printf FH "%s", $name;
}
# for (my $i = 4; $i < scalar(@words); ++$i) {
# printf FH " %s", $words[$i];
# }
print FH "\n";
} elsif ($line =~ m/^>/) {
$inAnnotation = 0;
} else {
if ($inAnnotation) { print $line; }
}
}
close (FH);
'_EOF_'
# << happy emacs
chmod +x filter.pl
zcat \
../../download/protein_info/hypothetical_peptides/chr*.peptides.20040928.fsa.gz\
| ./filter.pl | hgSgdPep stdin sgdPep.fa symbol.txt
hgPepPred sacCer2 generic sgdPep sgdPep.fa
hgsql sacCer2 -e 'create table sgdToName ( \
name varchar(10) not null, \
value varchar(10) not null, \
PRIMARY KEY(name), \
INDEX (value));'
hgsql sacCer2 -e 'load data local infile "symbol.txt" \
into table sgdToName;'
hgsql sacCer2 < $HOME/kent/src/hg/lib/sgdDescription.sql
hgsql sacCer2 -e 'load data local infile "descriptions.txt" \
into table sgdDescription;'
hgsql sacCer2 < $HOME/kent/src/hg/lib/sgdOtherDescription.sql
hgsql sacCer2 -e 'load data local infile "notes.txt" \
into table sgdOtherDescription;'
## Clean up some stray names:
cd /hive/data/genomes/sacCer2/bed/sgdAnnotations
hgsql -N -e "select name from sgdGene;" sacCer2 \
| sort -u > sacCer2.sgdGene.name.txt
hgsql -N -e "select name from sgdPep;" sacCer2 \
| sort -u > sacCer2.sgdPep.name.txt
comm -23 sacCer2.sgdPep.name.txt sacCer2.sgdGene.name.txt | while read N
do
hgsql -e "delete from sgdPep where name=\"$N\";" sacCer2
done
hgsql -N -e "select name from sgdDescription;" sacCer2 \
| sort -u > sacCer2.sgdDescription.name.txt
comm -23 sacCer2.sgdDescription.name.txt sacCer2.sgdGene.name.txt \
| while read N
do
hgsql -e "delete from sgdDescription where name=\"${N}\";" sacCer2
done
############################################################################
# catch up to other tables in sacCer1 - (DONE - 2009-02-24 - Hiram)
# can simply transfer these tables across from sacCer1:
hgsqldump --all -c --tab=. sacCer1 sgdAbundance sgdLocalization sgdToPfam
# hgLoadSqlTab doesn't like the comment characters:
grep -v "^--" sgdToPfam.sql \
| hgLoadSqlTab sacCer2 sgdToPfam stdin sgdToPfam.txt
grep -v "^--" sgdAbundance.sql \
| hgLoadSqlTab sacCer2 sgdAbundance stdin sgdAbundance.txt
grep -v "^--" sgdLocalization.sql \
| hgLoadSqlTab sacCer2 sgdLocalization stdin sgdLocalization.txt
hgsqldump --all -c --tab=. sacCer1 yeastP2P
grep -v "^--" yeastP2P.sql \
| hgLoadSqlTab sacCer2 yeastP2P stdin yeastP2P.txt
############################################################################
# ADDING SWISSPROT ACCESSION TO KNOWN GENES (DONE - 2009-02-10 - Hiram)
cd /hive/data/sacCer2/bed/sgdAnnotation
grep "Swiss-Prot" ../../download/chromosomal_feature/dbxref.tab \
| awk '{printf("%s\t%s\n", $5, $1);}' > sgdToSwissProt.txt
hgsql sacCer2 -e 'create table sgdToSwissProt ( \
name varchar(10) not null, \
value varchar(10) not null, \
PRIMARY KEY(name), \
INDEX (value));'
hgsql sacCer2 -e 'load data local infile "sgdToSwissProt.txt" \
into table sgdToSwissProt;'
hgProtIdToGenePred sacCer2 sgdGene sgdToSwissProt name value
+# interesting to note that sgdTOSwissProt has one accession not
+# in sgdGene.name: KHS1 (BK 2009-07-14)
+
############################################################################
# CREATE SGD-BASED CLONE TRACK (DONE - 2009-02-10 - Hiram)
mkdir /hive/data/genomes/sacCer2/bed/sgdClone
cd /hive/data/genomes/sacCer2/bed/sgdClone
# since sacCer1, these coordinates have become 0-relative ?
awk -F '\t' '{printf("%d\t%d\t%d\t%s\t%s\n", $3, $4, $5, $2, $1);}' \
../../download/chromosomal_feature/clone.tab \
| sed -e \
"s/^8/chrVIII/; s/^7/chrVII/; s/^6/chrVI/; s/^5/chrV/;
s/^13/chrXIII/; s/^12/chrXII/; s/^14/chrXIV/; s/^11/chrXI/;
s/^16/chrXVI/; s/^15/chrXV/; s/^10/chrX/; s/^3/chrIII/;
s/^4/chrIV/; s/^2/chrII/; s/^9/chrIX/; s/^1/chrI/;" > sgdClone.bed
hgLoadBed sacCer2 sgdClone sgdClone.bed -tab \
-sqlTable=$HOME/kent/src/hg/lib/sgdClone.sql
############################################################################
# AUTO UPDATE GENBANK RUN (Done - 2009-02-10 - Hiram)
# align with latest genbank process.
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add sacCer2 just before sacCer1
# sacCer2 S. cerevisiae
sacCer2.serverGenome = /hive/data/genomes/sacCer2/sacCer2.2bit
sacCer2.clusterGenome = /scratch/data/sacCer2/sacCer2.2bit
sacCer2.ooc = no
sacCer2.maxIntron = 5000
sacCer2.lift = no
sacCer2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter}
sacCer2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
sacCer2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
sacCer2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
sacCer2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter}
#sacCer2.perChromTables = no # doesn't work in the browser
sacCer2.genbank.mrna.xeno.load = no
sacCer2.refseq.mrna.native.load = no
sacCer2.downloadDir = sacCer2
cvs ci -m "Added sacCer2." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
ssh genbank
screen # use a screen to manage this job
cd /cluster/data/genbank
time nice -n +19 bin/gbAlignStep -initial sacCer2 &
# logFile: var/build/logs/2009.02.10-11:27:42.sacCer2.initalign.log
# real 173m1.570s
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad sacCer2
# logFile: var/dbload/hgwdev/logs/2009.02.10-14:25:48.dbload.log
# real 18m21.436s
# enable daily alignment and update of hgwdev (DONE - 2009-02-24 - Hiram)
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add sacCer2 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added sacCer2 - S. cerevisiae" \
etc/align.dbs etc/hgwdev.dbs
make etc-update
############################################################################
# DOING MULTIPLE ALIGNMENT WITH OTHER YEAST (DONE - 2009-02-10 - Hiram)
# the sequences are the same set from 2003 that were used in sacCer1
# so, just link to them and rerun the alignment
# in the directory /cluster/data/sacCer1/bed/otherYeast/align
mkdir /hive/data/genomes/sacCer2/bed/otherYeast
cd /hive/data/genomes/sacCer2/bed/otherYeast
for D in sacBay sacCas sacKlu sacKud sacMik sacPar
do
ln -s ../../../sacCer1/bed/otherYeast/${D} .
done
mkdir align
cd align
ln -s ../../../../sacCer1/bed/otherYeast/align/sac??? .
# helpful to have 2bit files for chainAntiRepeat
for G in sac???
do
faToTwoBit ${G} ${G}.2bit
done
# Create directory full of size info
mkdir sizes
for F in sac???
do
faSize $F -detailed > sizes/$F
done
# Create some working directories
mkdir lav axtAll axtBest mafIn mafRawOut mafOut
# Create little shell script to do blastz alignments
cat << '_EOF_' > lastz.csh
#!/bin/csh -ef
/cluster/bin/penn/$MACHTYPE/lastz $1 $2 > $3
'_EOF_'
# << happy emacs
chmod +x lastz.csh
# create sacCer2 individual chr.fa files:
mkdir sacCer2
twoBitInfo ../../../sacCer2.2bit stdout | cut -f1 | while read C
do
twoBitToFa ../../../sacCer2.2bit:${C} sacCer2/${C}.fa
echo "${C} done"
done
-
# Create job spec to do all blastz alignments
ls -1 sacCer2/*.fa > sacCer.list
ls -1 sac??? > other.list
cat << '_EOF_' > template
#LOOP
lastz.csh $(path1) $(path2) {check out line+ lav/$(root1).$(root2)}
#ENDLOOP
'_EOF_'
# << happy emacs
ssh swarm
cd /hive/data/genomes/sacCer2/bed/otherYeast/align
gensub2 sacCer.list other.list template jobList
# Convert from lav to psl
for L in lav/*
do
B=`basename $L`
echo $B
lavToPsl $L stdout | gzip > psl/$B.psl.gz
done
# chaining
mkdir -p axtChain/run
cd axtChain/run
ls -d ../../sac??? | xargs -L 1 basename > species.list
cut -f1 ../../../../../chrom.sizes > chrom.list
cat << '_EOF_' > chain.csh
#!/bin/csh -ef
set S = $1
set C = $2
set IN = ../../psl/${C}.${S}.psl.gz
set OUT = ${S}/${C}.chain
mkdir -p ${S}
zcat ${IN} \
| axtChain -psl -verbose=0 -minScore=1000 -linearGap=medium stdin \
/hive/data/genomes/sacCer2/sacCer2.2bit \
-faQ ../../${S} stdout \
| chainAntiRepeat /hive/data/genomes/sacCer2/sacCer2.2bit \
../../${S}.2bit stdin ${OUT}
'_EOF_'
# << happy emacs
chmod +x chain.csh
cat << '_EOF_' > template
#LOOP
chain.csh $(path1) $(path2) {check out line+ $(root1)/$(root2).chain}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 species.list chrom.list template jobList
para create jobList
para push
para time
# Completed: 108 of 108 jobs
# CPU time in finished jobs: 73s 1.21m 0.02h 0.00d 0.000 y
# IO & Wait Time: 1100s 18.34m 0.31h 0.01d 0.000 y
# Average job time: 11s 0.18m 0.00h 0.00d
# Longest finished job: 12s 0.20m 0.00h 0.00d
# Submission to last job: 38s 0.63m 0.01h 0.00d
cd ..
# merge the individual results
foreach S (`cat run/species.list`)
echo $S
find ./run/${S} -name "*.chain" \
| chainMergeSort -inputList=stdin | gzip -c > sacCer2.${S}.all.chain.gz
end
# split them again to get consistent numbering
foreach S (`cat run/species.list`)
echo $S
chainSplit chain.${S} sacCer2.${S}.all.chain.gz
end
# netting the chains
cat << '_EOF_' > netChains.csh
#!/bin/csh -ef
set S0 = "../../../../chrom.sizes"
foreach S (`cat run/species.list`)
echo $S
chainPreNet sacCer2.${S}.all.chain.gz ${S0} ../sizes/${S} stdout \
| chainNet stdin -minSpace=1 ${S0} ../sizes/${S} stdout /dev/null \
| netSyntenic stdin noClass.${S}.net
end
'_EOF_'
# << happy emacs
chmod +x netChains.csh
./netChains.csh
cat << '_EOF_' > mkMafs.csh
#!/bin/csh -ef
set TOP = `pwd`
set S0 = "/hive/data/genomes/sacCer2/chrom.sizes"
foreach S (`cat run/species.list`)
# Make axtNet for download: one .axt per sacCer2 seq.
netSplit noClass.${S}.net net.${S}
cd ..
rm -fr axtNet.${S}
mkdir axtNet.${S}
foreach f (axtChain/net.${S}/*.net)
netToAxt $f axtChain/chain.${S}/$f:t:r.chain \
/hive/data/genomes/sacCer2/sacCer2.2bit ${S}.2bit stdout \
| axtSort stdin stdout \
| gzip -c > axtNet.${S}/$f:t:r.sacCer2.${S}.net.axt.gz
end
# Make mafNet for multiz: one .maf per sacCer2 seq.
rm -fr mkdir mafNet.${S}
mkdir mafNet.${S}
foreach f (axtNet.${S}/*.sacCer2.${S}.net.axt.gz)
axtToMaf -tPrefix=sacCer2. -qPrefix=${S}. $f \
${S0} sizes/${S} stdout \
| gzip -c > mafNet.${S}/$f:t:r:r:r:r:r.maf.gz
end
cd ${TOP}
end
'_EOF_'
# << happy emacs
chmod +x mkMafs.csh
./mkMafs.csh
###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE 2009-02-10 braney )
# bash if not using bash shell already
cd /cluster/data/sacCer2
mkdir /cluster/data/sacCer2/blastDb
awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst
twoBitToFa -seqList=1meg.lst sacCer2.2bit temp.fa
faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
rm temp.fa 1meg.lst
awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst
twoBitToFa -seqList=less1meg.lst sacCer2.2bit temp.fa
faSplit about temp.fa 1000000 blastDb/y
rm temp.fa less1meg.lst
cd blastDb
for i in *.fa
do
/hive/data/outside/blast229/formatdb -i $i -p F
done
rm *.fa
ls *.nsq | wc -l
# 14
mkdir -p /cluster/data/sacCer2/bed/tblastn.hg18KG
cd /cluster/data/sacCer2/bed/tblastn.hg18KG
echo ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 14 query.lst
# we want around 50000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(50000/`wc query.lst | awk '{print $1}'`\)
# 36727/(50000/14) = 10.283560
# chose 100 because 10 is too small
mkdir -p kgfa
split -l 100 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl kgfa/kg
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
wc kg.lst
# 368 368 4784 kg.lst
mkdir -p blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/sacCer2/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/hive/data/outside/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/sacCer2/blastDb.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
exit
ssh swarm
cd /cluster/data/sacCer2/bed/tblastn.hg18KG
gensub2 query.lst kg.lst blastGsub blastSpec
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 5152 of 5152 jobs
# CPU time in finished jobs: 71064s 1184.40m 19.74h 0.82d 0.002 y
# IO & Wait Time: 16915s 281.92m 4.70h 0.20d 0.001 y
# Average job time: 17s 0.28m 0.00h 0.00d
# Longest finished job: 69s 1.15m 0.02h 0.00d
# Submission to last job: 156s 2.60m 0.04h 0.00d
ssh swarm
cd /cluster/data/sacCer2/bed/tblastn.hg18KG
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=12000 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS ../blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
para create chainSpec
para try, check, push, check etc.
# Completed: 368 of 368 jobs
# CPU time in finished jobs: 5s 0.09m 0.00h 0.00d 0.000 y
# IO & Wait Time: 1487s 24.78m 0.41h 0.02d 0.000 y
# Average job time: 4s 0.07m 0.00h 0.00d
# Longest finished job: 10s 0.17m 0.00h 0.00d
# Submission to last job: 37s 0.62m 0.01h 0.00d
cd /cluster/data/sacCer2/bed/tblastn.hg18KG/blastOut
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl
cd ..
pslCheck blastHg18KG.psl
# checked: 5050 failed: 0 errors: 0
# load table
ssh hgwdev
cd /cluster/data/sacCer2/bed/tblastn.hg18KG
hgLoadPsl sacCer2 blastHg18KG.psl
# check coverage
featureBits sacCer2 blastHg18KG
# 1261725 bases of 12162995 (10.373%) in intersection
featureBits sacCer2 blastHg18KG sgdGene -enrichment
# blastHg18KG 10.373%, sgdGene 72.883%, both 10.320%, cover 99.49%, enrich 1.37x
rm -rf blastOut
#end tblastn
###########################################################################
## 7-Way Multiz (DONE - 2009-02-11 - Hiram)
mkdir /hive/data/genomes/sacCer2/bed/multiz7way
cd /hive/data/genomes/sacCer2/bed/multiz7way
# using the phylogenetic tree as mentioned at:
# http://www.genetics.wustl.edu/saccharomycesgenomes/yeast_phylogeny.html
# and from Jim's note: genomes/sacCer1/bed/otherYeast/README
# ((((((sacCer2 sacPar) sacMik) sacKud) sacBay) sacCas) sacKlu)
echo "((((((sacCer2 sacPar) sacMik) sacKud) sacBay) sacCas) sacKlu)" \
> tree.nh
echo sacCer2 sacPar sacMik sacKud sacBay sacCas sacKlu > species.list
mkdir mafLinks
for S in sacPar sacMik sacKud sacBay sacCas sacKlu
do
mkdir mafLinks/${S}
cd mafLinks/${S}
ln -s ../../../otherYeast/align/mafNet.${S}/*.maf.gz .
cd ../..
done
find -L ./mafLinks -type f | xargs -L 1 basename \
| sed -e "s/.maf.gz//" | sort -u > maf.list
mkdir maf run
cd run
mkdir penn
cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn
cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn
cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn
# set the db and pairs directories here
cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = sacCer2
set c = $1
set result = $2
set run = `pwd`
set tmp = $run/tmp/$db/multiz.$c
set pairs = /hive/data/genomes/sacCer2/bed/multiz7way/mafLinks
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../tree.nh ../species.list $tmp
pushd $tmp
foreach s (`sed -e "s/ $db//" species.list`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if (-e $in.gz) then
/bin/zcat $in.gz > $out
else if (-e $in) then
ln -s $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
/bin/rm -f $result
/bin/cp -p $tmp/$c.maf $result
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty $run/tmp/$db
/bin/rmdir --ignore-fail-on-non-empty $run/tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz.csh
cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../maf.list single template jobList
para create jobList
# Completed: 18 of 18 jobs
# CPU time in finished jobs: 710s 11.83m 0.20h 0.01d 0.000 y
# IO & Wait Time: 127s 2.12m 0.04h 0.00d 0.000 y
# Average job time: 47s 0.78m 0.01h 0.00d
# Longest finished job: 105s 1.75m 0.03h 0.00d
# Submission to last job: 144s 2.40m 0.04h 0.00d
# Estimated complete: 0s 0.00m 0.00h 0.00d
# load MAF tables
ssh hgwdev
mkdir -p /gbdb/sacCer2/multiz7way/maf
cd /hive/data/genomes/sacCer2/bed/multiz7way/maf
ln -s `pwd`/*.maf /gbdb/sacCer2/multiz7way/maf
# allow the temporary multiz7way.tab file to be created in tmp
cd /data/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/sacCer2/multiz7way/maf sacCer2 multiz7way
# real 0m4.903s
# Loaded 38795 mafs in 18 files from /gbdb/sacCer2/multiz7way/maf
# load summary table
time nice -n +19 cat /gbdb/sacCer2/multiz7way/maf/*.maf \
| hgLoadMafSummary sacCer2 -mergeGap=1500 multiz7waySummary stdin
# real 0m5.536
# Created 3037 summary blocks from 73806 components and 38795 mafs
#########################################################################
## phastCons conservation (DONE - 2009-02-11 - Hiram)
# Create SS files for each chromosome:
mkdir /hive/data/genomes/sacCer2/bed/multiz7way/phastCons
cd /hive/data/genomes/sacCer2/bed/multiz7way/phastCons
mkdir SS
# split up alignments; no need to use cluster here
MAF=/cluster/data/sacCer1/bed/otherYeast/align/mafOut
FA=/cluster/data/sacCer1
CHROMS="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M"
mkdir -p SS
for C in `(cd ../maf; ls *.maf | sed -e "s/.maf//")`
do
echo $C
/cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/msa_view \
../maf/${C}.maf -M ../../otherYeast/align/sacCer2/${C}.fa -i MAF \
-O sacCer2,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu \
-o SS > SS/${C}.ss
done
# produce rough starting model
/cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/msa_view \
-i SS --aggregate sacCer2,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu \
-o SS -z SS/*.ss > all.ss
/cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phyloFit \
-i SS all.ss \
--tree "((((((sacCer2,sacPar),sacMik),sacKud),sacBay),sacCas),sacKlu)" \
-o starting-tree
# (cheat the branch lengths up a bit, since this represents an
# average of conserved and nonconserved sites; we want to
# initialize for nonconserved)
/cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/tree_doctor \
--scale 2 starting-tree.mod > tmp.mod
mv tmp.mod starting-tree.mod
# also get average GC content of aligned regions
/cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/msa_view \
-i SS all.ss --summary-only
#descrip. A C G T G+C length all_gaps some_gaps
#all.ss 0.3056 0.1950 0.1948 0.3047 0.3898 13130139 0 2122279
# save some I/O and space
gzip SS/*.ss
# now set up cluster job to estimate model parameters. See
# other make{hg18|mm9}.doc for add'l details
cat << '_EOF_' > doEstimate.sh
#!/bin/sh
zcat SS/$1.ss.gz | \
/cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phastCons \
- starting-tree.mod --gc 0.3898 --nrates 1,1 --no-post-probs \
--ignore-missing --expected-lengths 75 --target-coverage 0.5 \
--quiet --log $2 --estimate-trees $3
'_EOF_'
# << happy emacs
chmod +x doEstimate.sh
rm -fr LOG TREES
mkdir LOG TREES
cat << '_EOF_' > template
#LOOP
doEstimate.sh $(root1) {check out line+ LOG/$(root1).log} TREES/$(root1)
#ENDLOOP
'_EOF_'
# << happy emacs
(cd SS; ls *.ss.gz) | sed -e "s/.ss.gz//" > chr.list
gensub2 chr.list single template jobList
para create jobList
para push
para time
# Completed: 18 of 18 jobs
# CPU time in finished jobs: 8152s 135.86m 2.26h 0.09d 0.000 y
# IO & Wait Time: 323s 5.39m 0.09h 0.00d 0.000 y
# Average job time: 471s 7.85m 0.13h 0.01d
# Longest finished job: 889s 14.82m 0.25h 0.01d
# Submission to last job: 895s 14.92m 0.25h 0.01d
# Now combine parameter estimates.
ls TREES/*.cons.mod > cons.txt
/cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phyloBoot \
--read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
ls TREES/*.noncons.mod > noncons.txt
/cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phyloBoot \
--read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt
# Now set up cluster job for computing consevation scores and
# predicted elements
cat << '_EOF_' > doPhastCons.sh
#!/bin/sh
mkdir -p POSTPROBS ELEMENTS
chr=$1
out=$2
tmpFile=/scratch/tmp/phastCons.$chr
rm -f $tmpFile
zcat SS/$chr.ss.gz \
| /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phastCons - \
ave.cons.mod,ave.noncons.mod --expected-lengths 75 \
--target-coverage 0.5 --quiet --seqname $chr --idpref $chr \
--viterbi ELEMENTS/$chr.bed --score --require-informative 0 > $tmpFile
gzip -c $tmpFile > POSTPROBS/$chr.pp.gz
rm $tmpFile
'_EOF_'
# << happy emacs
chmod u+x doPhastCons.sh
cat << '_EOF_' > run.template
#LOOP
doPhastCons.sh $(root1) {check out exists+ POSTPROBS/$(root1).pp.gz} {check out line+ ELEMENTS/$(root1).bed}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 chr.list single run.template run.jobList
rm -fr POSTPROBS ELEMENTS
para create run.jobList
para push
para time
# Completed: 18 of 18 jobs
# CPU time in finished jobs: 78s 1.30m 0.02h 0.00d 0.000 y
# IO & Wait Time: 658s 10.97m 0.18h 0.01d 0.000 y
# Average job time: 41s 0.68m 0.01h 0.00d
# Longest finished job: 53s 0.88m 0.01h 0.00d
# Submission to last job: 57s 0.95m 0.02h 0.00d
# set up phastConsElements track
cat ELEMENTS/*.bed | sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin \
> phastConsElements7way.bed
hgLoadBed sacCer2 phastConsElements7way phastConsElements7way.bed
featureBits sacCer2 phastConsElements7way
# 7762587 bases of 12162995 (63.821%) in intersection
# previously this was:
featureBits sacCer1 phastConsElements
# 7517116 bases of 12156302 (61.837%) in intersection
mkdir ../downloads
mkdir ../downloads/phastCons7way
for C in `cat chr.list`
do
cp -p POSTPROBS/${C}.pp.gz \
../downloads/phastCons7way/sacCer2.${C}.wigFixed.gz
done
# encode those files into wiggle data
zcat ../downloads/phastCons7way/*.wigFixed.gz \
| wigEncode stdin phastCons7way.wig phastCons7way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# Load gbdb and database with wiggle.
cd /hive/data/genomes/sacCer2/bed/multiz7way/phastCons
ln -s `pwd`/phastCons7way.wib /gbdb/sacCer2/multiz7way/phastCons7way.wib
hgLoadWiggle -pathPrefix=/gbdb/sacCer2/multiz7way sacCer2 \
phastCons7way phastCons7way.wig
# Create histogram to get an overview of all the data
hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=sacCer2 phastCons7way > histogram.data 2>&1
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Yeast sacCer2 Histogram phastCons7way track"
set xlabel " phastCons7way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
# To create the tree diagram for the details page, use this tree
# definition in http://genome.ucsc.edu/cgi-bin/phyloGif
((((((S._cerevisiae,S._paradoxus),S._mikatae),S._kudriavzevii),S._bayanus),S._castelli),S._kluyveri)
#########################################################################
## Annotate the sacCer2 7-way sequence with genes
mkdir /hive/data/genomes/sacCer2/bed/multiz7way/anno
cd /hive/data/genomes/sacCer2/bed/multiz7way/anno
mkdir genes
# using sgdGene
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from sgdGene" sacCer2 \
| genePredSingleCover stdin stdout | gzip -2c \
> genes/sacCer2.sgdGene.gz
(cat ../maf/*.maf | genePredToMafFrames sacCer2 stdin stdout \
sacCer2 genes/sacCer2.sgdGene.gz \
| gzip > multiz7way.mafFrames.gz) > frames.log 2>&1
zcat multiz7way.mafFrames.gz \
| sort -k1,1 -k2,2n | hgLoadMafFrames sacCer2 multiz7wayFrames stdin
############################################################################
# creating upstream mafs (DONE - 2009-06-26 - Hiram)
ssh hgwdev
cd /hive/data/genomes/sacCer2/goldenPath/multiz7way
# run this bash script
cat << '_EOF_' > mkUpstream.sh
DB=sacCer2
GENE=sgdGene
NWAY=multiz7way
export DB GENE
for S in 1000 2000 5000
do
echo "making upstream${S}.${GENE}.maf"
echo "featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout"
featureBits ${DB} ${GENE}:upstreamAll:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
| mafFrags ${DB} ${NWAY} stdin stdout \
-orgs=/hive/data/genomes/${DB}/bed/${NWAY}/species.list \
| gzip -c > upstream${S}.${GENE}.maf.gz
echo "done upstream${S}.${GENE}.maf.gz"
done
'_EOF_'
# << happy emacs
chmod +x mkUpstream.sh
./mkUpstream.sh
#########################################################################
## simpleRepeats (DONE - 2009-02-12 - Hiram)
mkdir /hive/data/genomes/sacCer2/bed/simpleRepeat
cd /hive/data/genomes/sacCer2/bed/simpleRepeat
doSimpleRepeat.pl -buildDir=`pwd` -smallClusterHub=swarm \
-workhorse=hgwdev sacCer2 > do.log 2>&1
#########################################################################
## Regulatory Code (DONE - 2009-02-17 - Hiram)
# liftOver from sacCer1
mkdir /hive/data/genomes/sacCer2/bed/transRegCode
cd /hive/data/genomes/sacCer2/bed/transRegCode
# lifting sacCer1 data to this assembly
hgsql -N -e "select chrom,chromStart,chromEnd,name,score,chipEvidence,consSpecies from transRegCode" sacCer1 \
| liftOver -bedPlus=5 -tab stdin \
/usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \
transRegCode.lifted.bed transRegCode.unMapped.bed
hgsql -N -e "select chrom,chromStart,chromEnd,name,tfCount,tfList,bindVals fromtransRegCodeProbe;" sacCer1 \
| liftOver -bedPlus=4 -tab stdin \
/usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \
transRegCodeProbe.lifted.bed transRegCodeProbe.unMapped.bed
hgLoadBed sacCer2 transRegCode transRegCode.lifted.bed \
-sqlTable=$HOME/kent/src/hg/lib/transRegCode.sql
# Loaded 206672 elements of size 7
hgLoadBed sacCer2 transRegCodeProbe transRegCodeProbe.lifted.bed \
-sqlTable=$HOME/kent/src/hg/lib/transRegCodeProbe.sql -tab
# Loaded 6178 elements of size 7
hgsql sacCer2 < $HOME/kent/src/hg/lib/transRegCodeCondition.sql
hgsql sacCer2 < $HOME/kent/src/hg/lib/transRegCodeMotif.sql
hgsql sacCer2 < $HOME/kent/src/hg/lib/growthCondition.sql
hgsql -N -e "select * from transRegCodeCondition;" sacCer1 \
| hgsql sacCer2 -e \
'load data local infile "/dev/stdin" into table transRegCodeCondition'
hgsql -N -e "select * from transRegCodeMotif;" sacCer1 \
| hgsql sacCer2 -e \
'load data local infile "/dev/stdin" into table transRegCodeMotif'
hgsql -N -e "select * from growthCondition;" sacCer1 \
| hgsql sacCer2 -e \
'load data local infile "/dev/stdin" into table growthCondition'
#########################################################################
# Oreganno track - (DONE - 2009-02-17 - Hiram)
# liftOver from sacCer1 database
hgsql -N -e \
"select chrom,chromStart,chromEnd,id,strand,name from oreganno;" sacCer1 \
| liftOver -bedPlus=4 -tab stdin \
/usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \
oreganno.lifted.bed oreganno.unMapped.bed
hgsql sacCer2 < $HOME/kent/src/hg/lib/oreganno.sql
hgLoadBed -oldTable sacCer2 oreganno oreganno.lifted.bed -tab
# Loaded 7302 elements of size 6
# and load non-positional tracks from sacCer1:
hgsql -N -e "select * from oregannoAttr;" sacCer1 \
| hgLoadSqlTab -oldTable sacCer2 oregannoAttr \
~/humPhen/kent/src/hg/lib/oreganno.sql stdin
hgsql -N -e "select * from oregannoLink;" sacCer1 \
| hgLoadSqlTab -oldTable sacCer2 oregannoLink \
~/humPhen/kent/src/hg/lib/oreganno.sql stdin
#########################################################################
# Regulatory Module - (DONE - 2009-02-17 - Hiram)
mkdir /hive/data/genomes/sacCer2/bed/regModule
cd /hive/data/genomes/sacCer2/bed/regModule
# liftOver data from sacCer1
hgsql -N -e \
"select chrom,chromStart,chromEnd,name,score,strand from esRegUpstreamRegion;" \
sacCer1 | liftOver -bedPlus=6 -tab stdin \
/usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \
esRegUpstreamRegion.lifted.bed esRegUpstreamRegion.unMapped.bed
hgsql -N -e \
"select chrom,chromStart,chromEnd,name,score,strand,gene from esRegGeneToMotif;" \
sacCer1 | liftOver -bedPlus=6 -tab stdin \
/usr/local/apache/htdocs/goldenPath/sacCer1/liftOver/sacCer1ToSacCer2.over.chain.gz \
esRegGeneToMotif.lifted.bed esRegGeneToMotif.unMapped.bed
# I do not see instructions in sacCer1 to create these tables,
# so, dump their schemas:
hgsqldump --all -c -d --tab=. sacCer1 esRegUpstreamRegion esRegGeneToMotif
# and data for these other two:
hgsqldump --all -c --tab=. sacCer1 esRegGeneToModule esRegMotif
hgLoadBed sacCer2 esRegGeneToMotif -sqlTable=esRegGeneToMotif.sql -tab \
esRegGeneToMotif.lifted.bed
# Loaded 4002 elements of size 7
hgLoadBed sacCer2 esRegUpstreamRegion -sqlTable=esRegUpstreamRegion.sql \
-tab esRegUpstreamRegion.lifted.bed
# Loaded 1670 elements of size 6
hgsql sacCer2 < esRegMotif.sql
hgsql sacCer2 -e \
'load data local infile "esRegMotif.txt" into table esRegMotif;'
hgsql sacCer2 < esRegGeneToModule.sql
hgsql sacCer2 -e \
'load data local infile "esRegGeneToModule.txt" into table esRegGeneToModule;'
#########################################################################
# creating tables for Gene Sorter (DONE - 2009-02-17 - Hiram)
mkdir /hive/data/genomes/sacCer2/bed/hgNear
cd /hive/data/genomes/sacCer2/bed/hgNear
hgClusterGenes sacCer2 sgdGene sgdIsoforms sgdCanonical
# Got 6550 clusters, from 6717 genes in 18 chromosomes
# Make self mapping table for expression.
hgsql -N -e 'select name from sgdGene;' sacCer2 \
| awk '{printf("%s\t%s\n", $1, $1);}' > sgdToSgd.tab
hgsql sacCer2 -e 'create table sgdToSgd ( \
name varchar(10) not null, \
value varchar(10) not null, \
PRIMARY KEY(name), \
UNIQUE (value));'
hgsql sacCer2 \
-e 'load data local infile "sgdToSgd.tab" into table sgdToSgd'
# Removed sgdToSgd table because it wasn't being used anywhere.
# Katrina 7/14/09
# Make expression similarity table.
hgExpDistance sacCer2 hgFixed.yeastChoCellCycle \
hgFixed.yeastChoCellCycleExps choExpDistance
# Have 6259 elements in hgFixed.yeastChoCellCycle
# Got 6259 unique elements in hgFixed.yeastChoCellCycle
# Made choExpDistance.tab
#########################################################################
# running the blastP operation to the other genomes for the gene sorter
# (DONE - 2009-02-18 - Hiram)
mkdir /hive/data/genomes/sacCer2/bed/hgNearBlastp
cd /hive/data/genomes/sacCer2/bed/hgNearBlastp
mkdir tmp 090218
pepPredToFa sacCer2 sgdPep 090218/sgdPep.faa
pepPredToFa hg18 knownGenePep 090218/hg18.known.faa
pepPredToFa mm9 knownGenePep 090218/mm9.known.faa
pepPredToFa rn4 knownGenePep 090218/rn4.known.faa
pepPredToFa danRer5 ensPep 090218/danRer5.ensPep.faa
pepPredToFa dm3 flyBasePep 090218/dm3.flyBasePep.faa
pepPredToFa ce6 sangerPep 090218/ce6.sangerPep.faa
# sanity check, number of lines in each faa file
cd 090218
cat << '_EOF_' > config.ra
# Latest Yeast vs. other Gene Sorter orgs:
# human, mouse, rat, zebrafish, fly, worm
targetGenesetPrefix known
targetDb sacCer2
queryDbs hg18 mm9 rn4 danRer5 dm3 ce6
sacCer2Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/sgdPep.faa
hg18Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/hg18.known.faa
mm9Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/mm9.known.faa
rn4Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/rn4.known.faa
danRer5Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/danRer5.ensPep.faa
dm3Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/dm3.flyBasePep.faa
ce6Fa /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218/ce6.sangerPep.faa
buildDir /hive/data/genomes/sacCer2/bed/hgNearBlastp/090218
scratchDir /hive/data/genomes/sacCer2/bed/hgNearBlastp/tmp
'_EOF_'
# << happy emacs
# takes about an hour
time nice -n +19 $HOME/kent/src/hg/utils/automation/doHgNearBlastp.pl \
config.ra > do.log 2>&1 &
# real 21m32.343s
# one name seems to have snuck in here:
cd /hive/data/genomes/sacCer2/bed/hgNearBlastp
hgsql -N -e "select query from mmBlastTab;" sacCer2 \
| sort -u > sacCer2.mmBlastTab.query.txt
hgsql -N -e "select name from sgdGene;" sacCer2 \
| sort -u > sacCer2.sgdGene.name.txt
# the single one is:
comm -23 sacCer2.mmBlastTab.query.txt sacCer2.sgdGene.name.txt
# YDL038C
# it was the same in all of them:
hgsql -e "delete from mmBlastTab where query=\"YDL038C\";" sacCer2
hgsql -e "delete from drBlastTab where query=\"YDL038C\";" sacCer2
hgsql -e "delete from dmBlastTab where query=\"YDL038C\";" sacCer2
hgsql -e "delete from ceBlastTab where query=\"YDL038C\";" sacCer2
hgsql -N -e "select query from knownBlastTab;" sacCer2 \
| sort -u > sacCer2.knownBlastTab.query.txt
comm -23 sacCer2.knownBlastTab.query.txt sacCer2.sgdGene.name.txt \
| while read N
do
hgsql -e "delete from knownBlastTab where query=\"${N}\";" sacCer2
done
# knownBlastTab does not run any columns in yeast GS.
RENAME TABLE knownBlastTab TO sgdBlastTab;
# kuhn and katrina 06-18-2009
#########################################################################
# creating download files and pushQ (DONE - 2009-02-24 - Hiram)
cd /hive/data/genoems/sacCer2
# there aren't any repeats on 2micron
touch bed/simpleRepeat/trfMaskChrom/2micron.bed
# and, there are no RM files:
makeDownloads.pl -ignoreRepeatMasker sacCer2
# edit the README files in:
# ./goldenPath/bigZips/README.txt
# ./goldenPath/database/README.txt
# ./goldenPath/liftOver/README.txt
# ./goldenPath/chromosomes/README.txt
mkdir pushQ
makePushQSql.pl sacCer2 > sacCer2.pushQ.sql
# one warning:
# sacCer2 does not have seq
# it could not identify the following tables:
# 2micron_est
# 2micron_gap
# 2micron_gold
# 2micron_intronEst
# 2micron_mrna
# growthCondition
# sgdToPfam
# yeastP2P
scp -p sacCer2.pushQ.sql hiram@hgwbeta:/tmp
ssh hgwbeta
hgsql qapushq < sacCer2.pushQ.sql
#########################################################################
# BLATSERVERS ENTRY (DONE - 2008-06-04 - Hiram)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("sacCer2", "blat10", "17792", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("sacCer2", "blat10", "17793", "0", "1");' \
hgcentraltest
# test it with some sequence
############################################################################
# uwFootprints: (2009-04-04 markd)
# lifted sacCer1 tag count data and set to Nobel lab
# William Stafford Noble <noble@gs.washington.edu>
# Xiaoyu Chen <xchen@cs.washington.edu>
# lift counts to sacSer2 and give back to UW
cd /hive/data/genomes/sacCer1/bed/uwFootprintsxi
wget http://noble.gs.washington.edu/proj/footprinting/yeast.dnaseI.tagCounts.bed
liftOver yeast.dnaseI.tagCounts.bed /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04/sacCer1ToSacCer2.over.chain.gz /cluster/home/markd/public_html/tmp/yeast/yeast.dnaseI.tagCounts.sacCer2.bed /cluster/home/markd/public_html/tmp/yeast/yeast.dnaseI.tagCounts.sacCer2.nolift.bed
mkdir /hive/data/genomes/sacCer2/bed/uwFootprints
cd /hive/data/genomes/sacCer2/bed/uwFootprints
# get back wig
wget http://noble.gs.washington.edu/proj/footprinting/yeast.dnaseI.tagCounts.sacCer2.lifted.wig
# lift other BEDs
liftOver /hive/data/genomes/sacCer1/bed/uwFootprints/yeast.mappability.bed /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04/sacCer1ToSacCer2.over.chain.gz yeast.mappability.bed yeast.mappability.nolift.bed
# drop yeast.footprints.bed and truncate name to three decimal places
tawk 'NR>1{print $1,$2,$3,sprintf("%0.3f",$4)}' /hive/data/genomes/sacCer1/bed/uwFootprints/yeast.footprints.bed | liftOver stdin /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04/sacCer1ToSacCer2.over.chain.gz yeast.footprints.bed yeast.footprints.nolift.bed
wigEncode yeast.dnaseI.tagCounts.sacCer2.lifted.wig uwFootprintsTagCounts.wig uwFootprintsTagCounts.wib
# Converted yeast.dnaseI.tagCounts.sacCer2.lifted.wig, upper limit 13798.00, lower limit 1.00
ln -sf /hive/data/genomes/sacCer2/bed/uwFootprints/uwFootprintsTagCounts.wib /gbdb/sacCer2/wib
hgLoadWiggle -tmpDir=/data/tmp sacCer2 uwFootprintsTagCounts uwFootprintsTagCounts.wig
hgLoadBed -tab -tmpDir=/data/tmp sacCer2 uwFootprintsMappability yeast.mappability.bed
hgLoadBed -tmpDir=/data/tmp sacCer2 uwFootprintsPrints yeast.footprints.bed
############################################################################
# fixup sgdToName table (DONE - 2009-07-09 - Hiram)
# this table is missing a name correspondence for some of
# the gene names in sgdGene.name
# to fixup, any names in sgdGene.name that are not in sgdToName,
# simply add those names and reference themselves
mkdir /hive/data/genomes/sacCer2/bed/fixSgdToName
cd /hive/data/genomes/sacCer2/bed/fixSgdToName
hgsql -N -e "select name from sgdGene;" sacCer2 | sort -u > sgdGene.name
hgsql -N -e "select name from sgdToName;" sacCer2 > sgdToName.tab
# convert the two columns of names to a single list of unique names
cat sgdToName.tab | tr '[\t]' '[\n]' | sort -u > all.sgdToName.name
comm -12 sgdGene.name all.sgdToName.name > commonToBoth
comm -23 sgdGene.name all.sgdToName.name > uniqueToSgdGene
comm -13 sgdGene.name all.sgdToName.name > uniqueToSgdToName
awk '{printf "%s\t%s\n", $1, $1}' uniqueToSgdGene > addSgdGeneNames.tab
# count before load
hgsql -e "select count(*) from sgdToName;" sacCer2
# 6254
# adding names:
wc -l addSgdGeneNames.tab
# 472
# 6254 + 472 = 6726
hgsql sacCer2 \
-e 'load data local infile "addSgdGeneNames.tab" into table sgdToName;'
hgsql -e "select count(*) from sgdToName;" sacCer2
# 6726
# Removed extraneous records from sgdToName (DONE - 2009-07-10 - Katrina)
#sgdGene has 6717 records (all of which have a distinct value in the
#name field) and sgdToName has 6726 (all of which have a distinct value
#in the name field).I searched for these 9 gene names in the SGD web
#site (yeastgenome.org). Most are alias' of another gene or something
#that was originally characterized as a separate gene but later found
#to be a part of another gene.
mysql> delete from sgdToName where name= "YBL039W-A";
mysql> delete from sgdToName where name= "YBL101W-A";
mysql> delete from sgdToName where name= "YBL101W-C";
mysql> delete from sgdToName where name= "YDL038C";
mysql> delete from sgdToName where name= "YDR524W-A";
mysql> delete from sgdToName where name= "YGR272C";
mysql> delete from sgdToName where name= "YLR157W-A";
mysql> delete from sgdToName where name= "YLR157W-C";
mysql> delete from sgdToName where name= "YNL097C-A";
############################################################################
# for some unknown reason, indexes missing on chrM_gold and chrM_gap
# maybe because they have only one and zero items ?
# (DONE - 2009-07-21 - Hiram)
hgsql -e "alter table chrM_gold add index chromStart (chromStart);" sacCer2
hgsql -e "alter table chrM_gold add index bin (bin);" sacCer2
hgsql -e "alter table chrM_gold add index frag (frag(9));" sacCer2
hgsql -e "alter table chrM_gap add index chromStart (chromStart);" sacCer2
hgsql -e "alter table chrM_gap add index bin (bin);" sacCer2
############################################################################
# LIFTOVER TO SacCer1 (DONE - 2010-01-14 - Hiram )
mkdir /hive/data/genomes/sacCer2/bed/blat.sacCer1.2010-01-14
cd /hive/data/genomes/sacCer2/bed/blat.sacCer1.2010-01-14
# -debug run to create run dir, preview scripts...
doSameSpeciesLiftOver.pl -debug sacCer2 sacCer1
# Real run:
time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
-bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
sacCer2 sacCer1 > do.log 2>&1
# real 3m42.700s
#############################################################################