src/hg/makeDb/doc/sacCer1.txt 1.10
1.10 2009/11/25 21:48:42 hiram
change autoScaleDefault to autoScale
Index: src/hg/makeDb/doc/sacCer1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/sacCer1.txt,v
retrieving revision 1.9
retrieving revision 1.10
diff -b -B -U 1000000 -r1.9 -r1.10
--- src/hg/makeDb/doc/sacCer1.txt 6 Apr 2009 05:42:22 -0000 1.9
+++ src/hg/makeDb/doc/sacCer1.txt 25 Nov 2009 21:48:42 -0000 1.10
@@ -1,1459 +1,1459 @@
# for emacs: -*- mode: sh; -*-
# This describes how to make the sacCer1 browser database.
#
# The genomic sequence was downloaded Nov. 17, 2003, and is dated
# 10/1/2003 on the Stanford FTP site. The previous version of the
# genomic sequence is dated Nov. 2002. I don't see version numbers.
# NOTE: this doc may have genePred loads that fail to include
# the bin column. Please correct that for the next build by adding
# a bin column when you make any of these tables:
#
# mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
# +-----------+-----------------+
# | tableName | type |
# +-----------+-----------------+
# | sgdGene | genePred sgdPep |
# +-----------+-----------------+
# Create the directory structure and download sequence from the SGD
# site at Stanford.
mkdir /cluster/store6/sacCer1
ln -s /cluster/store6/sacCer1 /cluster/data/sacCer1
cd /cluster/data/sacCer1
mkdir 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M bed download
cd download
mkdir chromosomes
cd chromosomes
foreach i (01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 mt)
wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/chromosomes/fasta/chr$i.fsa
end
foreach i (1 2 3 4 5 6 7 8 9)
echo ">chr$i" > chr$i.fa
grep -v '^>' chr0$i.fsa >> chr$i.fa
end
foreach i (chr1?.fsa)
echo ">$i:r" > $i:r.fa
grep -v '^>' $i >> $i:r.fa
end
# Build chrM.fa: write a UCSC-style header, then APPEND the sequence
# lines from SGD's chrmt.fsa. The original transcript used '>' on the
# grep line, which truncated the file and clobbered the ">chrM" header.
echo ">chrM" > chrM.fa
grep -v '^>' chrmt.fsa >> chrM.fa
foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M)
mv chr$i.fa ../../$i
end
gzip -4 *.fsa
# Download other info from the FTP site.
wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/chromosomal_feature
wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/gene_registry
wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/literature_curation
wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/oracle_schema
wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/protein_info
wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence_similarity
wget -nH --cut-dirs=3 -r ftp://genome-ftp.stanford.edu/pub/yeast/data_download/systematic_results
wget -nH --cut-dirs=5 -r -l 2 ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein
# Check that the genome is in sync with the annotations
cd /cluster/data/sacCer1
checkSgdSync download
# This showed 5 of 5788 with no ATG. I sent these to Mike Cherry.
# CREATING DATABASE, MAKE AND LOAD NIBS (DONE 2003-11-24 - Jim)
# NOTE FOR YEAST WE DO NOT REPEAT MASK SEQUENCE.
ssh hgwdev
echo 'create database sacCer1' | hgsql ''
cd /cluster/data/sacCer1
hgNibSeq sacCer1 /cluster/data/sacCer1/nib */*.fa
faSize -detailed */*.fa > chrom.sizes
mkdir -p /gbdb/sacCer1/nib
ln -s /cluster/data/sacCer1/nib/*.nib /gbdb/sacCer1/nib
# Create a read-only alias in your .cshrc or .profile
alias sacCer1 mysql -u hguser -phguserstuff -A sacCer1
# Use df to make sure there is at least 5 gig of free mySql table space
df -h /var/lib/mysql
# CREATING GRP TABLE FOR TRACK GROUPING (DONE 2003-11-21 - Jim)
ssh hgwdev
# the following command copies all the data from the table
# grp in the database hg16 to our new database sacCer1
echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \
| hgsql sacCer1
# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR YEAST (DONE 2003-11-24 Jim)
# Warning: the genome and organism fields must correspond
# with defaultDb values
echo 'INSERT INTO dbDb \
(name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk) values \
("sacCer1", "Oct. 2003", "/gbdb/sacCer1/nib", "Yeast", \
"chr2:827700-845800", 1, 65, "Yeast", \
"Saccharomyces cerevisiae", "/gbdb/sacCer1/html/description.html", \
0);' \
| hgsql -h genome-testdb hgcentraltest
echo 'INSERT INTO defaultDb (genome, name) values ("Yeast", "sacCer1");' \
| hgsql -h genome-testdb hgcentraltest
# Make trackDb table so browser knows what tracks to expect:
ssh hgwdev
cd ~/src/hg/makeDb/trackDb
cvs up -d -P
# Edit that makefile to add sacCer1 in all the right places and do
make update
# go public on genome-test
#make alpha
cvs commit makefile
# Add trackDb directories
mkdir sacCer
mkdir sacCer/sacCer1
cvs add sacCer
cvs add sacCer/sacCer1
cvs commit sacCer
# MAKE GCPERCENT (DONE 2003-11-24 - Jim)
ssh hgwdev
mkdir /cluster/data/sacCer1/bed/gcPercent
cd /cluster/data/sacCer1/bed/gcPercent
# create and load gcPercent table
hgsql sacCer1 < ~/src/hg/lib/gcPercent.sql
hgGcPercent sacCer1 ../../nib
# RUN TANDEM REPEAT MASKER (DONE 2003-11-24 - Jim)
# Do tandem repeat masking - this takes about 2 minutes.
ssh hgwdev
mkdir -p /cluster/data/sacCer1/bed/simpleRepeat
cd /cluster/data/sacCer1
foreach i (? ??)
trfBig $i/chr$i.fa /dev/null \
-bedAt=/cluster/data/sacCer1/bed/simpleRepeat/chr$i.bed
end
# Load into the database
cd /cluster/data/sacCer1/bed/simpleRepeat
hgLoadBed sacCer1 simpleRepeat *.bed \
-sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
# Loaded 1316 elements of size 16
featureBits sacCer1 simpleRepeat
# 82600648 bases of 2627444668 (3.144%) in intersection
# NOTE(review): those counts (~2.6 Gb total) look copy-pasted from a
# human assembly; sacCer1 is ~12 Mb. Re-run featureBits to confirm.
# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (done)
ssh hgwdev
# Write ~/kent/src/hg/makeDb/trackDb/sacCer/sacCer1/description.html
# with a description of the assembly and some sample position queries.
chmod a+r ~/kent/src/hg/makeDb/trackDb/sacCer/sacCer1/description.html
# Check it in
mkdir -p /gbdb/sacCer1/html
ln -s /cluster/data/sacCer1/html/description.html /gbdb/sacCer1/html/
# Create line in trackDb/makefile that copies description.html into
# /cluster/data/sacCer1/html/description.html
# Note, you definitely want to make the symbolic link in /gbdb/sacCer1/html
# before doing this.
# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 2003-11-24 Jim)
# AND SET UP BLAT SERVERS
ssh blat10
cd /scratch
mkdir sacCer1Nib
scp 'hgwdev:/cluster/data/sacCer1/nib/*.nib' sacCer1Nib
# Ask admins to set up blat servers and ask which ports they assign.
# 8/26/04: set canPcr=1 for untranslated blat server.
ssh hgwdev
echo 'insert into blatServers values("sacCer1", "blat10", 17788, 0, 1); \
insert into blatServers values("sacCer1", "blat10", 17789, 1, 0);' \
| hgsql -h genome-testdb hgcentraltest
# CREATING SGD-BASED KNOWN GENES AND OTHER FEATURES (DONE 2003-12-02 Jim)
# Note initially the s_cerevisiae.gff3 file ended up being out of
# sync with the genome. Mike Cherry (cherry@genome.stanford.edu)
# regenerated it. The format may end up changing in the future though.
ssh hgwdev
cd /cluster/data/sacCer1/bed
mkdir sgdGene
hgSgdGff3 ../download/chromosomal_feature/s_cerevisiae.gff3 sgdGene
cd sgdGene
ldHgGene sacCer1 sgdGene codingGenes.gff
hgLoadBed sacCer1 sgdOther otherFeatures.bed \
-tab -sqlTable=$HOME/kent/src/hg/lib/sgdOther.sql
zcat ../../download/orf_protein/*.fasta.gz \
| hgSgdPep -restrict=genePred.tab stdin sgdPep.fa symbol.txt
hgPepPred sacCer1 generic sgdPep sgdPep.fa
echo 'create table sgdToName ( \
name varchar(10) not null, \
value varchar(10) not null, \
PRIMARY KEY(name), \
INDEX (value));' | hgsql sacCer1
echo 'load data local infile "symbol.txt" \
into table sgdToName;' | hgsql sacCer1
hgsql sacCer1 < $HOME/kent/src/hg/lib/sgdDescription.sql
echo 'load data local infile "descriptions.txt" \
into table sgdDescription;' | hgsql sacCer1
hgsql sacCer1 < $HOME/kent/src/hg/lib/sgdOtherDescription.sql
echo 'load data local infile "notes.txt" \
into table sgdOtherDescription;' | hgsql sacCer1
# ADDING SWISSPROT ACCESSION TO KNOWN GENES (DONE 2003-11-25 Jim)
ssh hgwdev
cd /cluster/data/sacCer1/bed/sgdGene
awk '$2 == "SwissProt" {printf("%s\t%s\n", $3, $1);}' \
../../download/chromosomal_feature/external_id.tab \
> sgdToSwissProt.txt
echo 'create table sgdToSwissProt ( \
name varchar(10) not null, \
value varchar(10) not null, \
PRIMARY KEY(name), \
INDEX (value));' | hgsql sacCer1
echo 'load data local infile "sgdToSwissProt.txt" \
into table sgdToSwissProt;' | hgsql sacCer1
hgProtIdToGenePred sacCer1 sgdGene sgdToSwissProt name value
# CREATE SGD-BASED CLONE TRACK (DONE 2003-11-25 Jim)
ssh hgwdev
cd /cluster/data/sacCer1/bed
mkdir sgdClone
cd sgdClone
awk -F '\t' '{printf("chr%s\t%d\t%d\t%s\t%s\n", $3, $4-1, $5, $2, $1);}' \
../../download/chromosomal_feature/clone.tab > sgdClone.bed
hgLoadBed sacCer1 sgdClone sgdClone.bed -tab \
-sqlTable=$HOME/kent/src/hg/lib/sgdClone.sql
# AUTO UPDATE GENBANK MRNA RUN (Done 2003-11-24 Jim)
# Put the nib's on /cluster/bluearc:
ssh eieio
mkdir -p /cluster/bluearc/sacCer/sacCer1/nib
cp -pR /cluster/data/sacCer1/nib/*.nib /cluster/bluearc/sacCer/sacCer1/nib
# Instructions for setting up incremental genbank updates are here:
# http://www.soe.ucsc.edu/~markd/genbank-update/doc/initial-load.html
# Added
static char *sacCerNames[] = {"Saccharomyces cerevisiae", NULL};
and
{"sacCer", sacCerNames, NULL},
to appropriate parts of src/hg/makeDb/genbank/src/lib/gbGenome.c
# Then make and make install
cd src/hg/make/makeDb/genbank
make PREFIX=/cluster/store5/genbank install
# Edit /cluster/data/genbank/etc/genbank.conf and add:
# sacCer1
sacCer1.genome = /cluster/bluearc/sacCer/sacCer1/nib/chr*.nib
sacCer1.lift = no
sacCer1.genbank.mrna.xeno.load = no
sacCer1.genbank.est.xeno.load = no
sacCer1.downloadDir = sacCer1
# Do the alignments
ssh eieio
cd /cluster/data/genbank
nice bin/gbAlignStep -iserver=no \
-clusterRootDir=/cluster/bluearc/genbank \
-verbose=1 -initial sacCer1
# Load the results from the above
ssh hgwdev
cd /cluster/data/genbank
nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad sacCer1
# DOING MULTIPLE ALIGNMENT WITH OTHER YEAST (Done Jim)
# Grab sequence from all six yeast and massage
# it so that fasta records all start with sacXXX
# in the directory /cluster/data/sacCer1/bed/otherYeast/align
ssh kkr1u00
cd /cluster/data/sacCer1/bed/otherYeast/align
# Create directory full of size info
mkdir sizes
foreach i (sac*)
faSize $i -detailed > sizes/$i
end
# Create some working directories
mkdir lav axtAll axtBest mafIn mafRawOut mafOut
# Create little shell script to do blastz alignments
cat > bz << end
#!/bin/csh -ef
blastz \$1 \$2 > \$3
end
chmod a+x bz
# Create job spec to do all blastz alignments
ls -1 ../../../*/chr*.fa > sacCer.lst
ls -1 sac??? > other.lst
cat > gsub << end
#LOOP
bz \$(path1) \$(path2) {check out line+ lav/\$(root1).\$(root2)}
#ENDLOOP
end
gensub2 sacCer.lst other.lst gsub spec
# Do parasol blastz run
para create spec
para try
# Do some para checks and if all is well
para push
#Completed: 102 of 102 jobs
#CPU time in finished jobs: 43279s 721.31m 12.02h 0.50d 0.001 y
#IO & Wait Time: 540s 9.00m 0.15h 0.01d 0.000 y
#Average job time: 430s 7.16m 0.12h 0.00d
#Longest job: 1458s 24.30m 0.41h 0.02d
#Submission to last job: 3994s 66.57m 1.11h 0.05d
# Convert from lav to axt
cd lav
foreach i (sacPar sacMik sacKud sacBay sacCas sacKlu)
foreach j (*.$i)
lavToAxt $j ../../../../nib ../$i ../axtAll/$j -fa
end
echo done $i
end
cd ..
# Run axtBest
cd axtAll
foreach i (sacPar sacMik sacKud sacBay sacCas sacKlu)
foreach j (*.$i)
axtBest $j $j:r ../axtBest/$j
end
echo done $i
end
cd ..
# Convert to maf
cd axtBest
foreach i (sacPar sacMik sacKud sacBay sacCas sacKlu)
foreach j (*.$i)
axtToMaf $j ../../../../chrom.sizes ../sizes/$i ../mafIn/$j -tPrefix=sacCer1.
end
echo done $i
end
cd ..
# Run multiz
cd mafIn
set mz = /cluster/bin/penn/tba.9.13/multiz
foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M)
set c = chr$i
$mz $c.sacPar $c.sacMik - > ../mafRawOut/$c.maf
echo done $c.sacMik
foreach s (sacKud sacBay sacCas sacKlu)
$mz $c.$s ../mafRawOut/$c.maf - > ../mafRawOut/tmp
mv ../mafRawOut/tmp ../mafRawOut/$c.maf
echo done $c.$s
end
end
cd ..
#Load into database
ssh hgwdev
cd /cluster/data/sacCer1/bed/otherYeast/align
ln -s /cluster/store6/sacCer1/bed/otherYeast/align/mafOut /gbdb/sacCer1/multizYeast
hgLoadMaf sacCer1 multizYeast
# Clean up
cd mafRawOut
# Filter each raw multiz maf down to alignments scoring >= 100.
# (The original transcript was missing the foreach's closing "end",
# and the cleanup "rm -r" sat inside the loop body, so it would have
# deleted mafRawOut while still iterating over it.)
foreach i (*.maf)
mafFilter -minScore=100 $i > ../mafOut/$i
end
# Remove intermediates only after all filtered mafs are written.
cd ..
rm -r axtAll axtBest lav err mafRawOut multizYeast.tab
# ADDING LOCALIZATION AND ABUNDANCE INFO FROM SGD AND
# http://yeastgfp.ucsf.edu (DONE 2003-11-24 - Jim)
ssh hgwdev
cd /cluster/data/sacCer1/bed
mkdir sgdLocalization
cd sgdLocalization
hgSgdGfp sacCer1 ../../download/systematic_results/localization/OS*.tab .
# UPDATE GO DATABASE (DONE -Jim)
ssh hgwdev
cd /cluster/store1/geneOntology
mkdir 20031202
cd 20031202
wget ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest/go_200311-termdb-data.gz
wget ftp://ftp.geneontology.org/pub/go/gene-associations/gene_association.goa_sptr.gz
hgsql mysql <<end
drop database go;
create database go;
end
zcat go_*data.gz | hgsql go
zcat gene_association.goa_sptr.gz | hgGoAssociation go goaPart stdin -taxon=9606,10090,10116,6239,7227,4932
######## MAKING FAMILY BROWSER TABLES #######
# GETTING STARTED WITH FAMILY BROWSER (DONE 2003-11-29 Jim)
# Making tables to cluster splice variants
ssh hgwdev
hgClusterGenes sacCer1 sgdGene sgdIsoforms sgdCanonical
# Make self mapping table for expression.
echo 'create table sgdToSgd ( \
name varchar(10) not null, \
value varchar(10) not null, \
PRIMARY KEY(name), \
UNIQUE (value));' | hgsql sacCer1
echo 'select name from sgdGene;' | hgsql -N sacCer1 \
| awk '{printf("%s\t%s\n", $1, $1);}' > sgdToSgd.tab
echo 'load data local infile "sgdToSgd.tab" into table sgdToSgd' \
| hgsql sacCer1
# Make expression similarity table.
cd ~/src/hg/near/hgExpDistance
hgExpDistance sacCer1 hgFixed.yeastChoCellCycle hgFixed.yeastChoCellCycleExps choExpDistance
#DOING SELF-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
# Extract peptides from genome database into fasta file
# and create a blast database out of them.
mkdir -p /cluster/data/sacCer1/bed/blastp
cd /cluster/data/sacCer1/bed/blastp
pepPredToFa sacCer1 sgdPep sgdPep.faa
formatdb -i sgdPep.faa -t sgdPep -n sgdPep
cd ..
# Copy over database to /scratch
ssh kk
mkdir -p /scratch/sacCer1/blastp
cp /cluster/data/sacCer1/bed/blastp/sgdPep.p* /scratch/sacCer1/blastp
sudo /cluster/install/utilities/updateLocal
# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/sacCer1/bed/blastp
mkdir split
faSplit sequence sgdPep.faa 6000 split/kg
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/sacCer1/bed/blastp/self
cd /cluster/data/sacCer1/bed/blastp/self
mkdir run
cd run
mkdir out
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /scratch/blast/data
/scratch/blast/blastall -p blastp -d /scratch/sacCer1/blastp/sgdPep -i \$1 -o \$2 -e 0.01 -m 8 -b 1000
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub spec
para create spec
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
# This should finish in 1 minute if the cluster is free.
#Completed: 5743 of 5743 jobs
#CPU time in finished jobs: 3570s 59.50m 0.99h 0.04d 0.000 y
#IO & Wait Time: 14973s 249.55m 4.16h 0.17d 0.000 y
#Average job time: 3s 0.05m 0.00h 0.00d
#Longest job: 12s 0.20m 0.00h 0.00d
#Submission to last job: 55s 0.92m 0.02h 0.00d
# Load into database. This takes a minute
ssh hgwdev
cd /cluster/data/sacCer1/bed/blastp/self/run/out
hgLoadBlastTab sacCer1 sgdBlastTab *.tab
#Scanning through 5743 files
#Loading database with 52725 rows
#DOING C.ELEGANS-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/sacCer1/bed/blastp/ce1
cd /cluster/data/sacCer1/bed/blastp/ce1
mkdir run
cd run
mkdir out
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/scratch/blast/blastall -p blastp -d /iscratch/i/ce1/blastp/wormPep -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub spec
para create spec
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
#Completed: 5743 of 5743 jobs
#CPU time in finished jobs: 9397s 156.62m 2.61h 0.11d 0.000 y
#IO & Wait Time: 21849s 364.15m 6.07h 0.25d 0.001 y
#Average job time: 5s 0.09m 0.00h 0.00d
#Longest job: 26s 0.43m 0.01h 0.00d
#Submission to last job: 75s 1.25m 0.02h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/sacCer1/bed/blastp/ce1/run/out
hgLoadBlastTab sacCer1 ceBlastTab -maxPer=1 *.tab
#DOING MOUSE-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
# Make mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to iscratch/i
# if it doesn't exist already
cd /cluster/data/mm4/bed
mkdir blastp
cd blastp
pepPredToFa mm4 knownGenePep known.faa
formatdb -i known.faa -t known -n known
ssh kkr1u00
if (-e /iscratch/i/mm4/blastp) then
rm -r /iscratch/i/mm4/blastp
endif
mkdir -p /iscratch/i/mm4/blastp
cp /cluster/data/mm4/bed/blastp/known.p?? /iscratch/i/mm4/blastp
iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/sacCer1/bed/blastp/mm4
cd /cluster/data/sacCer1/bed/blastp/mm4
mkdir run
cd run
mkdir out
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/scratch/blast/blastall -p blastp -d /iscratch/i/mm4/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub spec
para create spec
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
#Completed: 5743 of 5743 jobs
#CPU time in finished jobs: 13913s 231.88m 3.86h 0.16d 0.000 y
#IO & Wait Time: 27267s 454.45m 7.57h 0.32d 0.001 y
#Average job time: 7s 0.12m 0.00h 0.00d
#Longest job: 40s 0.67m 0.01h 0.00d
#Submission to last job: 80s 1.33m 0.02h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/sacCer1/bed/blastp/mm4/run/out
hgLoadBlastTab sacCer1 mmBlastTab -maxPer=1 *.tab
#DOING ZEBRAFISH-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to iscratch/i
# if it doesn't exist already
cd /cluster/data/dr1/bed
mkdir blastp
cd blastp
wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH2.pep.fa.gz
zcat Dan*.pep.fa.gz > ensembl.faa
echo "Translation:" > subs.in
subs -e ensembl.faa > /dev/null
formatdb -i ensembl.faa -t ensembl -n ensembl
ssh kkr1u00
if (-e /iscratch/i/dr1/blastp) then
rm -r /iscratch/i/dr1/blastp
endif
mkdir -p /iscratch/i/dr1/blastp
cp /cluster/data/dr1/bed/blastp/ensembl.p?? /iscratch/i/dr1/blastp
iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/sacCer1/bed/blastp/dr1
cd /cluster/data/sacCer1/bed/blastp/dr1
mkdir run
cd run
mkdir out
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/scratch/blast/blastall -p blastp -d /iscratch/i/dr1/blastp/ensembl -i \$1 -o \$2 -e 0.005 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub spec
para create spec
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
#Completed: 5743 of 5743 jobs
#CPU time in finished jobs: 11135s 185.58m 3.09h 0.13d 0.000 y
#IO & Wait Time: 23501s 391.69m 6.53h 0.27d 0.001 y
#Average job time: 6s 0.10m 0.00h 0.00d
#Longest job: 40s 0.67m 0.01h 0.00d
#Submission to last job: 71s 1.18m 0.02h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/sacCer1/bed/blastp/dr1/run/out
hgLoadBlastTab sacCer1 drBlastTab -maxPer=1 *.tab
#DOING FRUITFLY-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
# Make Drosophila melanogaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to iscratch/i
# if it doesn't exist already
cd /cluster/data/dm1/bed
mkdir blastp
cd blastp
pepPredToFa dm1 bdgpGenePep bdgp.faa
formatdb -i bdgp.faa -t bdgp -n bdgp
ssh kkr1u00
if (-e /iscratch/i/dm1/blastp) then
rm -r /iscratch/i/dm1/blastp
endif
mkdir -p /iscratch/i/dm1/blastp
cp /cluster/data/dm1/bed/blastp/bdgp.p?? /iscratch/i/dm1/blastp
iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/sacCer1/bed/blastp/dm1
cd /cluster/data/sacCer1/bed/blastp/dm1
mkdir run
cd run
mkdir out
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/scratch/blast/blastall -p blastp -d /iscratch/i/dm1/blastp/bdgp -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub spec
para create spec
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
#Completed: 5743 of 5743 jobs
#CPU time in finished jobs: 9749s 162.49m 2.71h 0.11d 0.000 y
#IO & Wait Time: 22247s 370.78m 6.18h 0.26d 0.001 y
#Average job time: 6s 0.09m 0.00h 0.00d
#Longest job: 28s 0.47m 0.01h 0.00d
#Submission to last job: 64s 1.07m 0.02h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/sacCer1/bed/blastp/dm1/run/out
hgLoadBlastTab sacCer1 dmBlastTab -maxPer=1 *.tab
#DOING HUMAN-PROTEIN ALIGNMENTS AND LOADING (DONE 2003-12-8 Jim)
# Make human ortholog column using blastp on human known genes.
# First make human protein database and copy it to iscratch/i
# if it doesn't exist already
cd /cluster/data/hg16/bed
mkdir blastp
cd blastp
pepPredToFa hg16 knownGenePep known.faa
formatdb -i known.faa -t known -n known
ssh kkr1u00
if (-e /iscratch/i/hg16/blastp) then
rm -r /iscratch/i/hg16/blastp
endif
mkdir -p /iscratch/i/hg16/blastp
cp /cluster/data/hg16/bed/blastp/known.p?? /iscratch/i/hg16/blastp
iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/sacCer1/bed/blastp/hg16
cd /cluster/data/sacCer1/bed/blastp/hg16
mkdir run
cd run
mkdir out
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/scratch/blast/blastall -p blastp -d /iscratch/i/hg16/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# << this line makes emacs coloring happy
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub spec
para create spec
para try
# Wait a couple of minutes, and do a para check, if all is good
# then do a
para push
#Completed: 5743 of 5743 jobs
#CPU time in finished jobs: 16096s 268.27m 4.47h 0.19d 0.001 y
#IO & Wait Time: 29943s 499.05m 8.32h 0.35d 0.001 y
#Average job time: 8s 0.13m 0.00h 0.00d
#Longest job: 65s 1.08m 0.02h 0.00d
#Submission to last job: 100s 1.67m 0.03h 0.00d
# Load into database.
ssh hgwdev
cd /cluster/data/sacCer1/bed/blastp/hg16/run/out
hgLoadBlastTab sacCer1 hgBlastTab -maxPer=1 *.tab
# LOADING ERAN SEGAL'S REGULATORY MODULE STUFF (DONE 2004-9-22 -Jim)
cd /cluster/data/sacCer1/bed
mkdir eranSegal
cd eranSegal
# Copy module_assignments.tab, motif_attributes.tab, motif_modules.tab
# from email from <eran@cs.stanford.edu> into this directory.
echo "select name,chrom,txStart,txEnd,strand,500,0 from sgdGene;" | \
hgsql -N sacCer1 > gene_position.tab
echo "select name,chrom,chromStart,chromEnd,strand,500,0 from sgdOther;" | \
hgsql -N sacCer1 >> gene_position.tab
hgLoadEranModules sacCer1 module_assignments.tab motif_modules.tab \
modules_motifs.gxm modules_motif_positions.tab gene_position.tab
# LOADING REGULATORY CODE STUFF (In Progress 2004-9-22 -Jim)
cd /cluster/data/sacCer1/bed
mkdir harbisonGordon
cd harbisonGordon
# Create growthCondition.tab by editing by hand
# http://jura.wi.mit.edu/young_public/regulatory_code/GrowthEnvironments.html
# and then load:
hgsql sacCer1 < $HOME/kent/src/hg/lib/growthCondition.sql
hgsql sacCer1 -e 'load data local infile "growthCondition.tab" into table growthCondition'
# Get GFF files and motif file from downloads section of
# http://jura.wi.mit.edu/fraenkel/regulatory_map
# Also get v24_probes_041004.GFF from Ben Gordon dbgordon@wi.mit.edu via email.
# Get Conditions_Summary.txt also from Ben from email.
sort v24_probes_041004.GFF | uniq > probes.GFF
hgYeastRegCode motifGff Final_InTableS2_v24.motifs probes.GFF Conditions_Summary.txt\
transRegCode.bed transRegCodeMotif.tab transRegCodeProbe.bed transRegCodeCondition.tab
hgLoadBed sacCer1 transRegCode transRegCode.bed -sqlTable=$HOME/kent/src/hg/lib/transRegCode.sql
hgLoadBed sacCer1 transRegCodeProbe transRegCodeProbe.bed -sqlTable=$HOME/kent/src/hg/lib/transRegCodeProbe.sql -tab
hgsql sacCer1 < $HOME/kent/src/hg/lib/transRegCodeCondition.sql
hgsql sacCer1 -e 'load data local infile "transRegCodeCondition.tab" into table transRegCodeCondition'
hgsql sacCer1 < $HOME/kent/src/hg/lib/transRegCodeMotif.sql
hgsql sacCer1 -e 'load data local infile "transRegCodeMotif.tab" into table transRegCodeMotif'
# Get rid of some crufty motif placements Ben Gordon doesn't like anymore that aren't in the
# transRegCodeMotif table.
hgsql sacCer1 -e 'delete from transRegCode where name="CRZ1" or name="ECM22" or name="SFL1"'
# Trim some motif columns
fixHarbisonMotifs sacCer1
#CREATING DOWNLOADS DIRECTORY
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath
mkdir sacCer1
cd sacCer1
mkdir bigZips database
cd bigZips
zip -j chromFa.zip /cluster/data/sacCer1/*/chr*.fa
zip -j multizYeast.zip /gbdb/sacCer1/multizYeast/*.maf
# Create a README.txt file here.
# MITOPRED DATA FOR HGGENE (DONE 7/30/04 angie)
ssh hgwdev
mkdir /cluster/data/sacCer1/bed/mitopred
cd /cluster/data/sacCer1/bed/mitopred
wget http://mitopred.sdsc.edu/data/yst_30.out
perl -wpe 's/^(\S+)\s+\S+\s+(.*)/$1\t$2/' yst_30.out > mitopred.tab
cat > mitopred.sql << '_EOF_'
# Prediction of nuclear-encoded mito. proteins from http://mitopred.sdsc.edu/
CREATE TABLE mitopred (
name varchar(10) not null, # SwissProt ID
confidence varchar(8) not null, # Confidence level
#Indices
PRIMARY KEY(name(6))
);
'_EOF_'
# << this line makes emacs coloring happy
hgsql sacCer1 < mitopred.sql
hgsql sacCer1 -e 'load data local infile "mitopred.tab" into table mitopred'
# MAKE Human Proteins track (DONE 2004-08-25 braney)
ssh kksilo
mkdir -p /cluster/data/sacCer1/blastDb
cd /cluster/data/sacCer1/blastDb
for i in ../*/*.fa; do ln -s $i .; done
for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
rm *.fa *.log
ssh kkr1u00
mkdir -p /iscratch/i/sacCer1/blastDb
cp /cluster/data/sacCer1/blastDb/* /iscratch/i/sacCer1/blastDb
(~kent/bin/iSync) 2>&1 > sync.out
mkdir -p /cluster/data/sacCer1/bed/tblastn.hg16KG
cd /cluster/data/sacCer1/bed/tblastn.hg16KG
ls -1S /iscratch/i/sacCer1/blastDb/*.nsq | sed "s/\.nsq//" > yeast.lst
exit
# back to kksilo
cd /cluster/data/sacCer1/bed/tblastn.hg16KG
mkdir kgfa
# NOTE(review): this lists the Ciona (ci1) kg fasta splits even though
# the section is tblastn.hg16KG and kgfa/ was just created above --
# looks copy-pasted from another build doc; confirm the hg16 known-gene
# protein split directory was the real input here.
ls -1S /iscratch/i/squirt/ci1/kgfas/*.fa > kg.lst
mkdir blastOut
# One output directory per query chunk (kg??), named after the fasta file.
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg16/bed/blat.hg16KG/protein.lft warn $f.2
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2
exit 0
fi
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'
chmod +x blastSome
gensub2 yeast.lst kg.lst blastGsub blastSpec
ssh kk
cd /cluster/data/sacCer1/bed/tblastn.hg16KG
para create blastSpec
para try, push
cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=7500 stdin ../c.`basename $1`.psl)
'_EOF_'
chmod +x chainSome
ls -1dS `pwd`/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
ssh kki
cd /cluster/data/sacCer1/bed/tblastn.hg16KG
para create chainSpec
para push
exit
# back to kksilo
cd /cluster/data/sacCer1/bed/tblastn.hg16KG/blastOut
for i in kg??
do
awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n | uniq > ../blastHg16KG.psl
ssh hgwdev
cd /cluster/data/sacCer1/bed/tblastn.hg16KG
hgLoadPsl sacCer1 blastHg16KG.psl
exit
# back to kksilo
rm -rf blastOut
# End tblastn
# BLAT SGD predicted sacCer1 proteins against sacCer1 (DONE 2004-08-31 braney)
ssh hgwdev
cd /cluster/data/sacCer1/bed
mkdir blat.sacCer1SG
cd blat.sacCer1SG
pepPredToFa sacCer1 sgdPep sacCer1SG.fa
hgPepPred sacCer1 generic blastSGPep00 sacCer1SG.fa
ssh kk
cd /cluster/data/sacCer1/bed/blat.sacCer1SG
cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
# << keep emacs happy
chmod +x blatSome
ls -1S /cluster/bluearc/sacCer/sacCer1/nib/*.nib > yeast.lst
mkdir sgfas
cd sgfas
faSplit sequence ../sacCer1SG.fa 1000 sg
cd ..
ls -1S sgfas/*.fa > sg.lst
cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
# << keep emacs happy
gensub2 yeast.lst sg.lst blatGsub blatSpec
mkdir psl
cd psl
foreach i (`cat ../yeast.lst`)
mkdir `basename $i .nib`
end
cd ..
para create blatSpec
para push
# Completed: 16490 of 16490 jobs
# CPU time in finished jobs: 52796s 879.94m 14.67h 0.61d 0.002 y
# IO & Wait Time: 44955s 749.25m 12.49h 0.52d 0.001 y
# Average job time: 6s 0.10m 0.00h 0.00d
# Longest job: 22s 0.37m 0.01h 0.00d
# Submission to last job: 714s 11.90m 0.20h 0.01d
ssh kksilo
cd /cluster/data/sacCer1/bed/blat.sacCer1SG
pslSort dirs raw.psl /tmp psl/*
pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cov90.psl /dev/null
pslMaxMap -maxMap=1 cov90.psl sacCer1SG.psl
pslxToFa sacCer1SG.psl sacCer1SG_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
sgName sacCer1 sacCer1SG.psl blastSGRef00
ssh hgwdev
cd /cluster/data/sacCer1/bed/blat.sacCer1SG
hgsql sacCer1 < ~/kent/src/hg/lib/blastRef.sql
echo "rename table blastRef to blastSGRef00" | hgsql sacCer1
echo "load data local infile 'blastSGRef00' into table blastSGRef00" | hgsql sacCer1
# PHASTCONS AND PHASTCONS ELEMENTS (acs, 9/15/04)
ssh hgwdev
cd /cluster/data/sacCer1/bed/otherYeast
mkdir phastCons
cd phastCons
# split up alignments; no need to use cluster here
MAF=/cluster/data/sacCer1/bed/otherYeast/align/mafOut
FA=/cluster/data/sacCer1
CHROMS="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M"
mkdir -p SS
for i in $CHROMS ; do \
echo chr${i} ; \
msa_view $MAF/chr${i}.maf -M $FA/$i/chr${i}.fa -i MAF -o SS -O sacCer1,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu > SS/chr${i}.ss ; \
done
# produce rough starting model
msa_view -i SS --aggregate sacCer1,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu -o SS -z SS/*.ss > all.ss
phyloFit -i SS all.ss --tree "((((((sacCer1,sacPar),sacMik),sacKud),sacBay),sacCas),sacKlu)" -o starting-tree
# (cheat the branch lengths up a bit, since this represents an
# average of conserved and nonconserved sites; we want to
# initialize for nonconserved)
tree_doctor --scale 2 starting-tree.mod > tmp.mod
mv tmp.mod starting-tree.mod
# also get average GC content of aligned regions
msa_view -i SS all.ss --summary-only
#descrip. A C G T G+C length all_gaps some_gaps
#all.ss 0.3052 0.1954 0.1951 0.3042 0.3906 13251474 0 2260153
# save some I/O and space
gzip SS/*.ss
# now set up cluster job to estimate model parameters. See
# makeHg17.doc for add'l details
cat << 'EOF' > doEstimate.sh
#!/bin/sh
zcat $1 | /cluster/bin/phast/phastCons - starting-tree.mod --gc 0.3906 --nrates 1,1 --no-post-probs --ignore-missing --expected-lengths 75 --target-coverage 0.5 --quiet --log $2 --estimate-trees $3
EOF
chmod u+x doEstimate.sh
rm -fr LOG TREES
mkdir -p LOG TREES
rm -f jobs.lst
for f in SS/*.ss.gz ; do
root=`basename $f .ss.gz` ;\
echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobs.lst ;\
done
# run cluster job
ssh kk9... para create ...
# (takes about 15 minutes)
# Now combine parameter estimates.
ls TREES/*.cons.mod > cons.txt
phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
ls TREES/*.noncons.mod > noncons.txt
phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt
# Now set up cluster job for computing conservation scores and
# predicted elements
cat << 'EOF' > doPhastCons.sh
#!/bin/sh
mkdir -p POSTPROBS ELEMENTS
pref=`basename $1 .ss.gz`
chr=`echo $pref | awk -F\. '{print $1}'`
tmpfile=/scratch/phastCons.$$
zcat $1 | /cluster/bin/phast/phastCons - ave.cons.mod,ave.noncons.mod --expected-lengths 75 --target-coverage 0.5 --quiet --seqname $chr --idpref $pref --viterbi ELEMENTS/$pref.bed --score --require-informative 0 > $tmpfile
gzip -c $tmpfile > POSTPROBS/$pref.pp.gz
rm $tmpfile
EOF
chmod u+x doPhastCons.sh
rm -fr POSTPROBS ELEMENTS
rm -f jobs2.lst
for f in SS/*.ss.gz ; do echo doPhastCons.sh $f >> jobs2.lst ; done
# run cluster job
ssh kk9... para create, ...
# (takes about 30 sec)
# set up phastConsElements track
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' ELEMENTS/*.bed > all.raw.bed
/cluster/bin/scripts/lodToBedScore all.raw.bed > all.bed
hgLoadBed sacCer1 phastConsElements all.bed
featureBits sacCer1 phastConsElements
# 7517116 bases of 12156302 (61.837%) in intersection
# set up wiggle
mkdir -p wib
for i in $CHROMS ; do \
wigAsciiToBinary -chrom=chr${i} -wibFile=wib/chr${i}_phastCons POSTPROBS/chr${i}.pp.gz ;\
done
# Changed this path 2004-09-27 so a new set of data could go out
# under the same table name - Hiram, was simply:
# /gbdb/sacCer1/wib/chr*_phastCons.wib
hgLoadWiggle sacCer1 phastCons -pathPrefix=/gbdb/sacCer1/wib/phastCons wib/chr*_phastCons.wig
mkdir -p /gbdb/sacCer1/wib/phastCons
rm -f /gbdb/sacCer1/wib/phastCons/chr*_phastCons.wib
ln -s /cluster/data/sacCer1/bed/otherYeast/phastCons/wib/*.wib /gbdb/sacCer1/wib/phastCons
chmod 775 . wib /gbdb/sacCer1/wib/phastCons /gbdb/sacCer1/wib/phastCons/*.wib
chmod 664 wib/*.wib
# set up full alignment/conservation track
rm -rf pwMaf /gbdb/sacCer1/pwMaf
mkdir -p pwMaf /gbdb/sacCer1/pwMaf
cd pwMaf ;\
for org in sacBay sacCas sacKlu sacKud sacMik sacPar ; do \
mkdir -p $org ; \
cp /cluster/data/sacCer1/bed/otherYeast/align/mafIn/chr*.$org $org ; \
for f in $org/* ; do chr=`basename $f .$org` ; mv $f $org/$chr.maf ; done ;\
ln -s /cluster/data/sacCer1/bed/otherYeast/phastCons/pwMaf/$org /gbdb/sacCer1/pwMaf/${org}_pwMaf ; \
hgLoadMaf sacCer1 -warn ${org}_pwMaf -pathPrefix=/gbdb/sacCer1/pwMaf/${org}_pwMaf ;\
done
cd ..
chmod 755 pwMaf pwMaf/* /gbdb/sacCer1/pwMaf /gbdb/sacCer1/pwMaf/*
chmod 644 pwMaf/*/*.maf
# trackDb entries:
#track multizYeast
#shortLabel Conservation
#longLabel Seven Species of Saccharomyces, Alignments & Conservation
#group compGeno
#priority 100
#visibility pack
#type wigMaf 0.0 1.0
#maxHeightPixels 100:40:11
#wiggle phastCons
#yLineOnOff Off
-#autoScaleDefault Off
+#autoScale Off
#pairwise pwMaf
#speciesOrder sacPar sacMik sacKud sacBay sacCas sacKlu
#track phastConsElements
#shortLabel Most Conserved
#longLabel PhastCons Conserved Elements (Seven Species of Saccharomyces)
#group compGeno
#priority 105
#visibility hide
#spectrum on
#exonArrows off
#showTopScorers 200
#type bed 5 .
# MAF COVERAGE FIGURES FOR ADAM (DONE 10/18/04 angie)
# First, get ranges of target coverage:
ssh kksilo
mkdir /cluster/data/sacCer1/bed/otherYeast/align/coverage
cd /cluster/data/sacCer1/bed/otherYeast/align/coverage
cat /cluster/data/sacCer1/bed/otherYeast/align/mafOut/*.maf \
| nice mafRanges -notAllOGap stdin sacCer1 sacCer1.mafRanges.bed
# Get pairwise coverage as well.
foreach other (sacBay sacCas sacKlu sacKud sacMik sacPar)
cat /cluster/data/sacCer1/bed/otherYeast/align/mafIn/chr*.$other \
| nice mafRanges -notAllOGap stdin sacCer1 sacCer1.$other.mafRanges.bed
end
ssh hgwdev
cd /cluster/data/sacCer1/bed/otherYeast/align/coverage
# To make subsequent intersections a bit quicker, output a bed with
# duplicate/overlapping ranges collapsed:
nice featureBits sacCer1 sacCer1.mafRanges.bed \
-bed=sacCer1.mafRangesCollapsed.bed
#11718319 bases of 12156302 (96.397%) in intersection
# mafCoverage barfs on chr15 currently, so pass on this for now:
#cat /cluster/data/sacCer1/bed/otherYeast/align/mafOut/*.maf \
#| nice mafCoverage -count=2 sacCer1 stdin > sacCer1.mafCoverage
# Intersect maf target coverage with gene regions:
nice featureBits sacCer1 -enrichment sgdGene:cds \
sacCer1.mafRangesCollapsed.bed \
-bed=sacCer1.mafCds.bed
#sgdGene:cds 69.491%, sacCer1.mafRangesCollapsed.bed 96.397%, both 69.114%, cover 99.46%, enrich 1.03x
# No UTR info for yeast, so can't intersect.
# Intersect pairwise target coverages with gene regions:
foreach other (sacBay sacCas sacKlu sacKud sacMik sacPar)
nice featureBits sacCer1 -enrichment sgdGene:cds \
sacCer1.$other.mafRanges.bed -bed=sacCer1.${other}Cds.bed \
|& grep -v "table gap doesn't exist"
end
#sgdGene:cds 69.491%, sacCer1.sacBay.mafRanges.bed 89.478%, both 66.581%, cover 95.81%, enrich 1.07x
#sgdGene:cds 69.491%, sacCer1.sacCas.mafRanges.bed 63.359%, both 55.560%, cover 79.95%, enrich 1.26x
#sgdGene:cds 69.491%, sacCer1.sacKlu.mafRanges.bed 56.086%, both 49.245%, cover 70.87%, enrich 1.26x
#sgdGene:cds 69.491%, sacCer1.sacKud.mafRanges.bed 89.684%, both 64.873%, cover 93.35%, enrich 1.04x
#sgdGene:cds 69.491%, sacCer1.sacMik.mafRanges.bed 92.989%, both 67.178%, cover 96.67%, enrich 1.04x
#sgdGene:cds 69.491%, sacCer1.sacPar.mafRanges.bed 96.550%, both 68.464%, cover 98.52%, enrich 1.02x
# CREATE TABLES TO SUPPORT SHOWING SAM-T02 RESULTS (DONE 1/4/05, Fan)
# Kevin just did his monthly update, so rebuild ... (REBUILT 1/7/05 Fan)
ssh hgwdev
cd /cluster/data/sacCer1/bed
mkdir sam
cd sam
# create script to process 1 SAM subdirectory
cat << '_EOF_' >do1Subdir
ls /projects/compbio/experiments/protein-predict/yeast/$1/*/summary.html >j1.tmp
cat j1.tmp |sed -e 's/yeast\// /g' |sed -e 's/\/summary/ /g' >j2.tmp
cat j2.tmp | awk '{print $2}' |sed -e 's/\// /g ' | awk '{print "do1 " $1 " " $2}' >> doall
'_EOF_'
chmod +x do1Subdir
# create high level script to parse all SAM results
ls -l /projects/compbio/experiments/protein-predict/yeast | grep drw >j1
cp j1 j2
#edit j2 to remove non SAM result subdirectories
vi j2
cat j2 |awk '{print "do1Subdir " $9}' >doAllSubdir
chmod +x doAllSubdir
rm j1 j2
rm doall
doAllSubdir
rm j1.tmp j2.tmp
# create data needed for the samSubdir table and load them
cat doall |awk '{print $3"\t"$2}' >samSubdir.lis
hgsql sacCer1 -e "drop table samSubdir"
hgsql sacCer1 < ~/src/hg/lib/samSubdir.sql
hgsql sacCer1 -e 'load data local infile "samSubdir.lis" into table samSubdir'
# create script to parse SAM output results in one subdirectory
cat << '_EOF_' >do1
echo processing $2
samHit $2 /projects/compbio/experiments/protein-predict/yeast/$1/$2/$2.t2k.best-scores.rdb >$2.tab
'_EOF_'
chmod +x do1
chmod +x doall
# run the top level script to parse all SAM output results
rm *.tab
doall
# collect all results
cat *.tab |sort -u >protHomolog.all
# load the results into the protHomolog table
hgsql sacCer1 -e "drop table protHomolog"
hgsql sacCer1 < ~/src/hg/lib/protHomolog.sql
hgsql sacCer1 -e 'load data local infile "protHomolog.all" into table protHomolog'
# remove all intermediate .tab files
rm *.tab
# COPY PART OF SAM-T02 RESULTS TO UCSC GB SERVER (DONE 1/10/05, Fan)
# Kevin is going to do monthly updates on SAM-T02 results.
# To prevent data inconsistency problems, we now will copy
# over some key files and host them on our own server.
ssh hgwdev
cd /cluster/data/sacCer1/bed/sam
mkdir yeast050107
cd yeast050107
# create script to process 1 SAM subdirectory
cat << '_EOF_' >do1Subdir
mkdir -p yeast/$1
'_EOF_'
chmod +x do1Subdir
cp -p ../doAllSubdir .
mkdir yeast
doAllSubdir
cp -p ../doall .
# create script doall to copy over needed SAM output result files
cat << '_EOF_' >do1
echo processing $2
mkdir yeast/$1/$2
cp -f /projects/compbio/experiments/protein-predict/yeast/$1/$2/*.jpg yeast/$1/$2
cp -f /projects/compbio/experiments/protein-predict/yeast/$1/$2/$2.t2k.w0.5-logo.pdf yeast/$1/$2
cp -f /projects/compbio/experiments/protein-predict/yeast/$1/$2/$2.t2k.dssp-ehl2-logo.pdf yeast/$1/$2
cp -f /projects/compbio/experiments/protein-predict/yeast/$1/$2/$2.t2k.undertaker-align.pdb.gz yeast/$1/$2
'_EOF_'
chmod +x do1
doall
ln -s ./yeast /usr/local/apache/htdocs/goldenPath/sacCer1/sam
# REBUILD MOUSE-PROTEIN ALIGNMENTS AND LOADING (DONE 2005-12-16 Fan)
# Update mouse ortholog column using blastp on mouse known genes.
# First make mouse protein database and copy it to iscratch/i
# if it doesn't exist already
cd /cluster/data/mm7/bed
mkdir blastp
cd blastp
pepPredToFa mm7 knownGenePep known.faa
formatdb -i known.faa -t known -n known
ssh kkr1u00
if (-e /iscratch/i/mm7/blastp) then
rm -r /iscratch/i/mm7/blastp
endif
mkdir -p /iscratch/i/mm7/blastp
cp -p /cluster/data/mm7/bed/blastp/known.p?? /iscratch/i/mm7/blastp
iSync
# Make parasol run directory
ssh kk
mkdir -p /cluster/data/sacCer1/bed/blastp/mm7
cd /cluster/data/sacCer1/bed/blastp/mm7
mkdir run
cd run
mkdir out
# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/scratch/blast/blastall -p blastp -d /iscratch/i/mm7/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
chmod a+x blastSome
# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub spec
para create spec
para try,push,check ...
# Completed: 5743 of 5743 jobs
# CPU time in finished jobs: 11476s 191.27m 3.19h 0.13d 0.000 y
# IO & Wait Time: 15126s 252.10m 4.20h 0.18d 0.000 y
# Average job time: 5s 0.08m 0.00h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 29s 0.48m 0.01h 0.00d
# Submission to last job: 700s 11.67m 0.19h 0.01d
# Load into database.
ssh hgwdev
cd /cluster/data/sacCer1/bed/blastp/mm7/run/out
hgLoadBlastTab sacCer1 mmBlastTab -maxPer=1 *.tab
##########################################################################
# HGNEAR PROTEIN BLAST TABLES (DONE 5/22/06 angie)
ssh hgwdev
mkdir /cluster/data/sacCer1/bed/hgNearBlastp
cd /cluster/data/sacCer1/bed/hgNearBlastp
# Re-fetch current sgdPep because the one in ../blastz/sgdPep.faa
# has been overwritten with more recent data from SGD. That's OK
# for other organisms who will be linking straight to SGD instead
# of to sacCer1, but sacCer1 *BlastTab should jibe with sgdGene.
pepPredToFa sacCer1 sgdPep sgdPep.faa
cat << _EOF_ > config.ra
# Yeast vs. other Gene Sorter orgs that have recently been updated:
# human, mouse, fly
targetGenesetPrefix sgd
targetDb sacCer1
queryDbs hg18 mm8 dm2
sacCer1Fa /cluster/data/sacCer1/bed/hgNearBlastp/sgdPep.faa
hg18Fa /cluster/data/hg18/bed/blastp/known.faa
mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa
dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa
buildDir /cluster/data/sacCer1/bed/hgNearBlastp
scratchDir /san/sanvol1/scratch/sacCer1HgNearBlastp
_EOF_
doHgNearBlastp.pl -noSelf -targetOnly config.ra >& do.log & tail -f do.log
# *** hgBlastTab mmBlastTab dmBlastTab
##########################################################################
# RELOAD GENBANK (DONE 2006-09-06 markd)
# reloaded due to xenos track being dropped confusing joinerCheck
nice bin/gbDbLoadStep -drop -initialLoad sacCer1
#########################################################################
# ORegAnno - Open Regulatory Annotations
# update Oct 26, 2007; update July 7, 2008
# loaded by Belinda Giardine, in same manner as hg18 ORegAnno track
#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2009-02-04 - Hiram)
# repMatch = 1024 * sizeof(sacCer1)/sizeof(hg18)
# 4.32 = 1024 * (12156302/2881515245)
# use 10 to be very conservative
ssh hgwdev
cd /hive/data/genomes/sacCer1
blat sacCer1.2bit /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc \
-repMatch=10
# Wrote 3145 overused 11-mers to 11.ooc
# copy this to scratch data
cp -p 11.ooc /hive/data/staging/data/sacCer1/sacCer1.11.ooc
# make push request to kluster nodes /scratch/data/
#############################################################################
# LIFTOVER TO SacCer2 (DONE - 2009-02-04 - Hiram )
mkdir /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04
cd /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04
# -debug run to create run dir, preview scripts...
doSameSpeciesLiftOver.pl -debug sacCer1 sacCer2
# Real run:
time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \
-bigClusterHub=pk -dbHost=hgwdev -workhorse=hgwdev \
sacCer1 sacCer2 > do.log 2>&1
# real 3m21.840s
#############################################################################
# uwFootprints: (2009-04-04 markd)
# obtained from Noble lab
# William Stafford Noble <noble@gs.washington.edu>
# Xiaoyu Chen <xchen@cs.washington.edu>
mkdir /hive/data/genomes/sacCer1/bed/uwFootprints
cd /hive/data/genomes/sacCer1/bed/uwFootprints
wget http://noble.gs.washington.edu/proj/footprinting/yeast.dnaseI.tagCounts.wig
wget http://noble.gs.washington.edu/proj/footprinting/yeast.mappability.bed
wget http://noble.gs.washington.edu/proj/footprinting/yeast.footprints.bed
chmod a-w yeast.*
wigEncode yeast.dnaseI.tagCounts.wig uwFootprintsTagCounts.wig uwFootprintsTagCounts.wib
# Converted yeast.dnaseI.tagCounts.wig, upper limit 13798.00, lower limit 1.00
ln -sf /hive/data/genomes/sacCer1/bed/uwFootprints/uwFootprintsTagCounts.wib /gbdb/sacCer1/wib
hgLoadWiggle -tmpDir=/data/tmp sacCer1 uwFootprintsTagCounts uwFootprintsTagCounts.wig
hgLoadBed -tab -tmpDir=/data/tmp sacCer1 uwFootprintsMappability yeast.mappability.bed
# drop yeast.footprints.bed and truncate name to three decimal places
tawk 'NR>1{print $1,$2,$3,sprintf("%0.3f",$4)}' yeast.footprints.bed | hgLoadBed -tmpDir=/data/tmp sacCer1 uwFootprintsPrints stdin
# lift counts to sacCer2 and give back to UW
wget http://noble.gs.washington.edu/proj/footprinting/yeast.dnaseI.tagCounts.bed
liftOver yeast.dnaseI.tagCounts.bed /hive/data/genomes/sacCer1/bed/blat.sacCer2.2009-02-04/sacCer1ToSacCer2.over.chain.gz /cluster/home/markd/public_html/tmp/yeast/yeast.dnaseI.tagCounts.sacCer2.bed /cluster/home/markd/public_html/tmp/yeast/yeast.dnaseI.tagCounts.sacCer2.nolift.bed
#############################################################################