src/hg/makeDb/doc/pyrAer1.txt 1.3
1.3 2009/11/25 18:29:19 hiram
change defaultViewLimits to viewLimits
Index: src/hg/makeDb/doc/pyrAer1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/pyrAer1.txt,v
retrieving revision 1.2
retrieving revision 1.3
diff -b -B -U 1000000 -r1.2 -r1.3
--- src/hg/makeDb/doc/pyrAer1.txt 26 Jul 2006 16:59:58 -0000 1.2
+++ src/hg/makeDb/doc/pyrAer1.txt 25 Nov 2009 18:29:19 -0000 1.3
@@ -1,322 +1,322 @@
# for emacs: -*- mode: sh; -*-
# This file describes building the browser database for the archaeal
# species Pyrobaculum aerophilum.
# DOWNLOAD SEQUENCE FROM GENBANK (DONE 01/07/04)
ssh eieio
mkdir /cluster/store5/archae/pyrAer1
ln -s /cluster/store5/archae/pyrAer1 /cluster/data/pyrAer1
cd /cluster/data/pyrAer1
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Pyrobaculum_aerophilum/NC_003364.fna
mv NC_003364.fna chr1.fa
# Edit header of chr1.fa to '> pyrAer1'
# CREATE DATABASES AND A BUNCH OF INITIAL STUFF (DONE 01/12/04)
ssh hgwdev
echo 'create database pyrAer1' | hgsql ''
cd /cluster/data/pyrAer1
hgNibSeq pyrAer1 /cluster/data/pyrAer1/nib chr1.fa
faSize -detailed chr1.fa > chrom.sizes
mkdir -p /gbdb/pyrAer1/nib
echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \
| hgsql pyrAer1
echo 'INSERT INTO dbDb \
(name, description, nibPath, organism, \
defaultPos, active, orderKey, genome, scientificName, \
htmlPath, hgNearOk) values \
("pyrAer1", "Pyrobaculum aerophilum", "/gbdb/pyrAer1/nib", "P. aerophilum", \
"chr1:500000-550000", 1, 85, "Archae", \
"Pyrobaculum aerophilum", "/gbdb/pyrAer1/html/description.html", \
0);' \
| hgsql -h genome-testdb hgcentraltest
echo 'INSERT INTO defaultDb (genome, name) values ("Archae", "pyrAer1");' \
| hgsql -h genome-testdb hgcentraltest
cd ~/kent/src/hg/makeDb/trackDb
# edit the trackDb makefile
# add the trackDb directories
mkdir -p archae/pyrAer1
cvs add archae
cvs add archae/pyrAer1
cvs commit
# GC PERCENT TRACK (DONE 01/12/04)
ssh hgwdev
mkdir -p /cluster/data/pyrAer1/bed/gcPercent
cd /cluster/data/pyrAer1/bed/gcPercent
hgsql pyrAer1 < ~/kent/src/hg/lib/gcPercent.sql
hgGcPercent -win=2000 pyrAer1 ../../nib
# Edit ~/kent/src/hg/makeDb/trackDb/archae/trackDb.ra and add an entry for
# GC percent that describes 2,000-base windows (simply cut and paste the
# entry from ~/kent/src/hg/makeDb/trackDb/trackDb.ra and adjust it).
# GC 20 BASE WIGGLE TRACK (DONE 8/20)
mkdir /cluster/data/pyrAer1/bed/gc20Base
cd /cluster/data/pyrAer1/bed/gc20Base
mkdir wigData20 dataLimits20
hgGcPercent -chr=chr -file=stdout -win=20 -overlap=19 pyrAer1 ../../nib | grep -w GC | \
awk '
{
bases = $3 - $2
perCent = $5/10.0
printf "%d\t%.1f\n", $2+1, perCent
}' | wigAsciiToBinary -dataSpan=1 -chrom=chr \
-wibFile=wigData20/gc20Base_1 -name=1 stdin > dataLimits20/chr
hgLoadWiggle pyrAer1 gc20Base wigData20/*.wig
mkdir /gbdb/pyrAer1/wib
ln -s `pwd`/wigData20/*.wib /gbdb/pyrAer1/wib
# the trackDb entry
track gc20Base
shortLabel GC Percent
longLabel GC Percent in 20 base windows
group map
priority 1.7
visibility hide
autoScaleDefault Off
maxHeightPixels 128:36:16
graphTypeDefault Bar
gridDefault OFF
windowingFunction Mean
color 0,128,255
altColor 255,128,0
-defaultViewLimits 30:70
+viewLimits 30:70
type wig 0 100
# CONTIG TRACK
# reformat is a Todd Lowe program
for num in `seq 746 946`; do
curl -o ${num}.gbk "http://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?txt=on&val=AE009${num}.1"
reformat fasta ${num}.gbk >> contigs.fa
done
blat /cluster/data/pyrAer1/chr1.fa contigs.fa -minIdentity=100 contigs.psl
perfectBlatBed4 contigs.psl contigs.bed
mkdir /cluster/data/pyrAer1/bed/pyrAer1Contigs
cp contigs.bed /cluster/data/pyrAer1/bed/pyrAer1Contigs
cd /cluster/data/pyrAer1/bed/pyrAer1Contigs
hgLoadBed pyrAer1 pyrAer1Contigs contigs.bed
# the trackDb entry:
track pyrAer1Contigs
shortLabel Contigs
longLabel Contigs deposited in Genbank
group map
priority 0.5
visibility pack
url http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=nucleotide&cmd=search&term=$$
type bed 4 .
# TANDEM REPEAT MASKER (NOT DONE YET)
ssh hgwdev
mkdir -p /cluster/data/pyrAer1/bed/simpleRepeat
cd /cluster/data/pyrAer1
trfBig chr1.fa /dev/null -bedAt=/cluster/data/pyrAer1/bed/simpleRepeat/chr1.bed
cd /cluster/data/pyrAer1/bed/simpleRepeat
hgLoadBed pyrAer1 simpleRepeat *.bed -sqlTable=~kent/src/hg/lib/simpleRepeat.sql
# Weird: I get a seg fault here. Need to ask Hiram or somebody else.
# DESCRIPTION PAGE (DONE 01/12/04)
ssh hgwdev
# Write ~/kent/src/hg/makeDb/trackDb/archae/pyrAer1/description.html
chmod a+r ~/kent/src/hg/makeDb/trackDb/archae/pyrAer1/description.html
# Check it in.
mkdir /gbdb/pyrAer1/html
ln -s /cluster/data/pyrAer1/html/description.html /gbdb/pyrAer1/html/
# GENBANK PROTEIN-CODING GENES (DONE 01/14/04)
ssh hgwdev
mkdir /cluster/data/pyrAer1/genbank
cd /cluster/data/pyrAer1/genbank
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Bacteria/Pyrobaculum_aerophilum/NC_003364.gbk
mv NC_003364.gbk pyrAer1.gbk
# Create 3 parameter files to assist parsing of the GenBank file
# 1. for a bed file
echo 'chr1
start
end
gene
1000
strand' > pyrAer1-params-bed.txt
# 2. for the peptide parts
echo 'gene
translation' > pyrAer1-params-pep.txt
# 3. for the other gene information
echo 'gene
gene
product
note
protein_id
db_xref
EC_number
pseudo' > pyrAer1-params-xra.txt
# Now extract the genes and information:
gbArchaeGenome pyrAer1.gbk pyrAer1-params-bed.txt pyrAer1-genbank-cds.bed
gbArchaeGenome pyrAer1.gbk pyrAer1-params-pep.txt pyrAer1-genbank-cds.pep
gbArchaeGenome pyrAer1.gbk pyrAer1-params-xra.txt pyrAer1-genbank-cds.xra
hgsql pyrAer1 < ~/kent/src/hg/lib/pepPred.sql
hgsql pyrAer1 < ~/kent/src/hg/lib/minGeneInfo.sql
echo rename table pepPred to gbProtCodePep | hgsql pyrAer1
echo rename table minGeneInfo to gbProtCodeXra | hgsql pyrAer1
echo load data local infile \'pyrAer1-genbank-cds.pep\' into table gbProtCodePep | hgsql pyrAer1
echo load data local infile \'pyrAer1-genbank-cds.xra\' into table gbProtCodeXra | hgsql pyrAer1
csh
/cluster/bin/scripts/tawk '{print $1,$2,$3,$4,$5,$6,$2,$3,0,1,$3-$2,0}' pyrAer1-genbank-cds.bed | bedToGenePred stdin tmp.gp
# NOTE: the substr($1,4,4) expression below must be edited so that it returns a numeric field.
/cluster/bin/scripts/tawk '{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,substr($1,4,4),name2,"cmpl","cmpl",0}' tmp.gp > tmp2.gp
# The join delimiter between the quotes below must be a hard tab: type Ctrl-V, then press Tab.
join -t " " -o 1.1,1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.2 1.13 1.14 1.15 tmp2.gp pyrAer1-genbank-cds.xra > pyrAer1.gp
ldHgGene pyrAer1 refSeq pyrAer1.gp -predTab -genePredExt
# GENBANK rRNA GENES (NOT QUITE DONE)
ssh hgdev
cd /cluster/data/pyrAer1/genbank
gbArchaeGenome -kind=rRNA pyrAer1.gbk pyrAer1-params-bed.txt pyrAer1-rrnas.bed
echo 'gene
product
NA' > pyrAer1-params-rrna-xra.txt
gbArchaeGenome -kind=rRNA pyrAer1.gbk pyrAer1-params-rrna-xra.txt pyrAer1-rrnas-xra.txt
hgLoadBed pyrAer1 gbRRNA pyrAer1-rrnas.bed
hgsql pyrAer1 < ~/kent/src/hg/lib/minGeneInfo.sql
echo rename table minGeneInfo to gbRRNAXra | hgsql pyrAer1
echo load data local infile \'pyrAer1-rrnas-xra.txt\' into table gbRRNAXra | hgsql pyrAer1
# COG STUFF
# Cut and paste http://www.ncbi.nlm.nih.gov/cgi-bin/COG/palox into emacs (COG list)
# and save as cogpage.txt
awk '{printf("%s\t%s\n",$6,$5)}' < cogpage.txt | sed -e 's/\[//' -e 's/\]//' > cogs.txt
rm cogpage.txt
# Now we have the basic list of cogs and the letter code for each one.
# TODD LOWE tRNA GENES (DONE 01/15/04)
# This one is a bed 6+ file of 46 tRNAs and 1 pseudo tRNA, created by hand by Todd
# Lowe. See ~/kent/src/hg/lib/loweTrnaGene.as for a description of the fields.
# **Showing the tRNAscan-SE instructions would be nice in the future.
ssh hgwdev
mkdir /cluster/data/pyrAer1/bed/loweTrnaGene
cd /cluster/data/pyrAer1/bed/loweTrnaGene
hgLoadBed -tab pyrAer1 loweTrnaGene pyrAer1-lowe-trnas.bed -sqlTable=~/kent/src/hg/lib/loweTrnaGene.sql
# TODD LOWE snoRNA GENES (DONE 01/16/04)
# This is a bed 6 file created by hand.
ssh hgwdev
mkdir /cluster/data/pyrAer1/bed/loweSnoGene
cd /cluster/data/pyrAer1/bed/loweSnoGene
hgLoadBed -tab pyrAer1 loweSnoGene pyrAer1-snos.bed
# TIGR GENES (DONE 02/09/04)
# First go to http://www.tigr.org/tigr-scripts/CMR2/gene_attribute_form.dbi
# and fill out the web form as follows:
# - Pick "Retrieve attributes for the specified DNA feature within a specific
# organism and/or a specific role category".
# * Pick "Pyrobaculum aerophilum IM2", and "Primary and TIGR annotation ORFs"
# from the 1st and 3rd box.
# * Select everything from "Choose TIGR Annotation Gene Attributes"
# * Select "Primary Locus Name" from "Choose Primary Annotation Gene Attributes"
# * Select everything from "Choose Other Gene Attributes"
# - Click submit, and click save as tab-delimited file.
ssh hgwdev
mkdir /cluster/data/pyrAer1/bed/tigrCmrORFs
cp pyrAer1-tigr.tab /cluster/data/pyrAer1/bed/tigrCmrORFs
cd /cluster/data/pyrAer1/bed/tigrCmrORFs
tigrCmrToBed pyrAer1-tigr.tab pyrAer1-tigr.bed
hgLoadBed -tab pyrAer1 tigrCmrGene pyrAer1-tigr.bed -sqlTable=~/kent/src/hg/lib/tigrCmrGene.sql
echo RENAME TABLE tigrCmrGene to tigrCmrORFs | hgsql pyrAer1
# Lowe Lab Microarrays (DONE 02/09/04)
ssh hgwdev
mkdir /cluster/data/pyrAer1/bed/llaPaePrintA
cp Pae-arrays.{bed,exps} /cluster/data/pyrAer1/bed/llaPaePrintA
cd /cluster/data/pyrAer1/bed/llaPaePrintA
hgLoadBed pyrAer1 llaPaePrintA Pae-arrays.bed
hgsql pyrAer1 < ~/kent/src/hg/lib/expRecord.sql
echo RENAME TABLE expRecord to llaPaePrintAExps | hgsql pyrAer1
echo LOAD DATA LOCAL INFILE \'Pae-arrays.exps\' INTO TABLE llaPaePrintAExps | hgsql pyrAer1
# For now the experiment table is loaded into both hgFixed and the pyrAer1 database... that will change.
hgsql hgFixed < ~/kent/src/hg/lib/expRecord.sql
echo RENAME TABLE expRecord to llaPaePrintAExps | hgsql hgFixed
echo LOAD DATA LOCAL INFILE \'Pae-arrays.exps\' INTO TABLE llaPaePrintAExps | hgsql hgFixed
# AORF TRACK
# This is another hand-created file, originally provided by Sorel Fitz-Gibbon and massaged into bed format.
ssh hgwdev
mkdir /cluster/data/pyrAer1/bed/primAORF
cp aorfs.bed /cluster/data/pyrAer1/bed/primAORF
cd /cluster/data/pyrAer1/bed/primAORF
hgLoadBed pyrAer1 primAORF aorfs.bed
# CHANGE "chr1" to "chr"
ssh hgdev
mv /cluster/data/pyrAer1/nib/chr1.nib /cluster/data/pyrAer1/nib/chr.nib
rm /gbdb/pyrAer1/nib/chr1.nib
ln -s /cluster/data/pyrAer1/nib/chr.nib /gbdb/pyrAer1/nib/chr.nib
# a quick script (changeCh.sh) to replace chr1 with chr in every file named
# on the command line
#!/bin/bash
# xargs (see the find command that follows) hands this script many file names
# per invocation, so every argument must be processed, not just $1.
replace_chr1() {
  local target scratch
  for target in "$@"; do
    # Unique scratch file instead of a fixed, predictable /tmp name.
    scratch=$(mktemp) || return 1
    if sed 's/chr1/chr/g' -- "$target" > "$scratch"; then
      # Write back in place so the target keeps its permissions and owner.
      cat -- "$scratch" > "$target" || { rm -f -- "$scratch"; return 1; }
      rm -f -- "$scratch"
    else
      rm -f -- "$scratch"
      return 1
    fi
  done
}
replace_chr1 "$@"
cd /cluster/data/pyrAer1/bed
find -name '*.bed' | xargs changeCh.sh
# Now change the DB
cd /tmp
hgsqldump pyrAer1 | sed 's/chr1/chr/g' > paeNew.sql
echo drop database pyrAer1 | hgsql test
echo create database pyrAer1 | hgsql test
hgsql pyrAer1 < paeNew.sql
rm paeNew.sql
echo 'update dbDb set defaultPos="chr:550000-580000" where name="pyrAer1"' | hgsql -h genome-testdb hgcentraltest
# Pae promoter track
cd /cluster/data/pyrAer1/wiggle
mkdir promoterScanPos
mkdir promoterScanNeg
cd promoterScanPos
cp prom.neg.gz .
wigAsciiToBinary -chrom=chr -wibFile=promoterScanPos prom.pos.gz
hgLoadWiggle pyrAer1 promoterScanPos promoterScanPos.wig
cd ../promoterScanNeg
cp prom.neg.gz .
wigAsciiToBinary -chrom=chr -wibFile=promoterScanNeg prom.neg.gz
hgLoadWiggle pyrAer1 promoterScanNeg promoterScanNeg.wig
cd /gbdb/pyrAer1/wib
ln -s /cluster/data/pyrAer1/wiggle/promoterScanPos/promoterScanPos.wib promoterScanPos.wib
ln -s /cluster/data/pyrAer1/wiggle/promoterScanNeg/promoterScanNeg.wib promoterScanNeg.wib
# shine DG track
cd /cluster/data/pyrAer1/wiggle
mkdir shineDGPos
mkdir shineDGNeg
cd shineDGPos
cp shine.pos.gz .
wigAsciiToBinary -chrom=chr -wibFile=shineDGPos shine.pos.gz
hgLoadWiggle pyrAer1 shineDGPos shineDGPos.wig
cd ../shineDGNeg
cp shine.neg.gz .
wigAsciiToBinary -chrom=chr -wibFile=shineDGNeg shine.neg.gz
hgLoadWiggle pyrAer1 shineDGNeg shineDGNeg.wig
cd /gbdb/pyrAer1/wib
ln -s /cluster/data/pyrAer1/wiggle/shineDGPos/shineDGPos.wib shineDGPos.wib
ln -s /cluster/data/pyrAer1/wiggle/shineDGNeg/shineDGNeg.wib shineDGNeg.wib
# RNA genes
cd /cluster/data/pyrAer1/bed
mkdir rnaGenes
cp paeAllRNA.bed .
cp ~/kent/src/hg/lib/rnaGenes.sql .
hgLoadBed -sqlTable=rnaGenes.sql pyrAer1 rnaGenes paeAllRNA.bed