src/hg/makeDb/doc/braFlo1.txt 1.19
1.19 2009/11/25 21:48:38 hiram
change autoScaleDefault to autoScale
Index: src/hg/makeDb/doc/braFlo1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/braFlo1.txt,v
retrieving revision 1.18
retrieving revision 1.19
diff -b -B -U 1000000 -r1.18 -r1.19
--- src/hg/makeDb/doc/braFlo1.txt 17 Oct 2008 01:06:30 -0000 1.18
+++ src/hg/makeDb/doc/braFlo1.txt 25 Nov 2009 21:48:38 -0000 1.19
@@ -1,1527 +1,1527 @@
# for emacs: -*- mode: sh; -*-
# This file describes browser build for the Lancelet
# genome, Branchiostoma floridae, March 2006, v.1.0 from JGI
#
# "$Id$"
#
##########################################################################
### Fetch sequence (DONE - 2007-03-26 - Hiram)
ssh kkstore05
mkdir /cluster/store12/braFlo1
ln -s /cluster/store12/braFlo1 /cluster/data/braFlo1
mkdir /cluster/data/braFlo1/downloads
cd /cluster/data/braFlo1/downloads
wget --timestamping \
'ftp://ftp.jgi-psf.org/pub/JGI_data/Branchiostoma_floridae/v1.0/Branchiostoma.fasta.gz'
wget --timestamping \
'ftp://ftp.jgi-psf.org/pub/JGI_data/Branchiostoma_floridae/v1.0/proteins.Brafl1.fasta.gz'
wget --timestamping \
'ftp://ftp.jgi-psf.org/pub/JGI_data/Branchiostoma_floridae/v1.0/transcripts.Brafl1.fasta.gz'
gunzip Branchiostoma.fasta.gz
scaffoldFaToAgp Branchiostoma.fasta
##########################################################################
## Set up initial database (DONE - 2007-03-26 - Hiram)
cd /cluster/data/braFlo1
cat << '_EOF_' > braFlo1.config.ra
# Config parameters for makeGenomeDb.pl:
db braFlo1
clade deuterostome
genomeCladePriority 20
scientificName Branchiostoma floridae
commonName Lancelet
assemblyDate Mar. 2006
assemblyLabel JGI v.1.0 (March 2006)
orderKey 799
mitoAcc 5881414
fastaFiles /cluster/data/braFlo1/downloads/Branchiostoma.fasta
agpFiles /cluster/data/braFlo1/downloads/Branchiostoma.agp
# qualFiles none
dbDbSpeciesDir lancelet
'_EOF_'
time nice -n +19 makeGenomeDb.pl braFlo1.config.ra > makeGenomeDb.out 2>&1
hgsql -e 'INSERT INTO chrM_gold
(bin, chrom, chromStart, chromEnd, ix, type, frag, fragStart, fragEnd, strand)
VALUES (585, "chrM", 0, 15083, 1, "F", "NC_000834.1", 0, 15083, "+");' braFlo1
## done with this sequence
cd downloads
gzip Branchiostoma.fasta
##########################################################################
## RepeatMasker (DONE - 2007-03-26 - Hiram)
ssh kkstore05
mkdir /cluster/data/braFlo1/bed/RepeatMasker
cd /cluster/data/braFlo1/bed/RepeatMasker
time nice -n +10 doRepeatMasker.pl braFlo1 \
-buildDir=/cluster/data/braFlo1/bed/RepeatMasker > do.log 2>&1 &
# about 11 minutes
cd /cluster/data/braFlo1
twoBitToFa braFlo1.rmsk.2bit stdout | faSize -showPercent stdin
# 926386587 bases (95184565 N's 831202022 real
# 806238402 upper 24963620 lower) in 2 sequences in 1 files
# twoBitToFa braFlo1.rmsk.2bit stdout | faSize -showPercent stdin
# %2.69 masked total, %3.00 masked real
#########################################################################
# SIMPLE REPEATS (TRF) (DONE 2007-01-22 - Hiram)
ssh kolossus
mkdir /cluster/data/braFlo1/bed/simpleRepeat
cd /cluster/data/braFlo1/bed/simpleRepeat
for C in chrM chrUn
do
mkdir /scratch/tmp/braFlo1.trf
twoBitToFa -seq="${C}" ../../braFlo1.unmasked.2bit \
/scratch/tmp/braFlo1.trf/${C}.fa
time nice -n 19 trfBig -trf=/cluster/bin/i386/trf \
/scratch/tmp/braFlo1.trf/${C}.fa /dev/null \
-bedAt=${C}.bed -tempDir=/scratch/tmp/braFlo1.trf > ${C}.log 2>&1
rm -fr /scratch/tmp/braFlo1.trf
echo ${C} done
done
# real 187m42.772s
# no repeats found on chrM
# Make a filtered version for sequence masking:
awk '{if ($5 <= 12) print;}' chrUn.bed > trfMask.bed
# Load unfiltered repeats into the database:
ssh hgwdev
hgLoadBed braFlo1 simpleRepeat \
/cluster/data/braFlo1/bed/simpleRepeat/chrUn.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
################################################
## WINDOWMASKER (DONE - 2007-03-26 - Hiram)
ssh kkstore05
mkdir /cluster/data/braFlo1/bed/WindowMasker
cd /cluster/data/braFlo1/bed/WindowMasker
time nice -n +19 ~/kent/src/hg/utils/automation/doWindowMasker.pl braFlo1 \
-buildDir=/cluster/data/braFlo1/bed/WindowMasker \
-workhorse=kolossus > wmRun.log 2>&1 &
# real 82m55.387s
# Masking statistics
twoBitToFa braFlo1.wmsk.sdust.2bit stdout | faSize stdin
# 926386587 bases (95184565 N's 831202022 real
# 611909619 upper 219292403 lower) in 2 sequences in 1 files
# %23.67 masked total, %26.38 masked real
ssh hgwdev
time nice -n +19 \
hgLoadBed -strict braFlo1 windowmaskerSdust windowmasker.sdust.bed.gz
# Loaded 4263263 elements of size 3
# eliminate the gaps from the masking (WM bug)
featureBits braFlo1 -not gap -bed=notGap.bed
# 831202022 bases of 831202022 (100.000%) in intersection
time nice -n +19 featureBits braFlo1 windowmaskerSdust notGap.bed \
-bed=stdout | gzip -c > cleanWMask.bed.gz
# 219292403 bases of 831202022 (26.383%) in intersection
# reload track to get it clean
hgLoadBed braFlo1 windowmaskerSdust cleanWMask.bed.gz
# Loaded 4255796 elements of size 4
featureBits braFlo1 windowmaskerSdust
# 219292403 bases of 831202022 (26.383%) in intersection
##############################################################################
## combine trf mask with windowmasker (DONE - 2008-05-23 - Hiram)
ssh kkstore05
cd /cluster/data/braFlo1
zcat bed/WindowMasker/cleanWMask.bed.gz \
| twoBitMask braFlo1.unmasked.2bit stdin \
-type=.bed braFlo1.cleanWMSdust.2bit
cat bed/simpleRepeat/trfMask.bed \
| twoBitMask -add -type=.bed braFlo1.cleanWMSdust.2bit stdin braFlo1.2bit
# can safely ignore the warning about BED file >= 13 fields
# check it
twoBitToFa braFlo1.2bit stdout | faSize stdin
# 926386587 bases (95184565 N's 831202022 real 611655540 upper
# 219546482 lower) in 2 sequences in 1 files
# %23.70 masked total, %26.41 masked real
# create gbdb symlink
ssh hgwdev
ln -s /cluster/data/braFlo1/braFlo1.2bit /gbdb/braFlo1
#########################################################################
## Create masked scaffolds-only 2bit file (DONE - 2007-03-26 - Hiram)
ssh kkstore05
mkdir /cluster/data/braFlo1/scaffolds
cp -p /cluster/data/oryLat1/jkStuff/lft2BitToFa.pl \
/cluster/data/braFlo1/jkStuff
cp -p /cluster/data/oryLat1/jkStuff/agpToLift.pl \
/cluster/data/braFlo1/jkStuff
cd /cluster/data/braFlo1/scaffolds
ln -s ../Un/chrUn.agp .
../jkStiff/agpToLift.pl chrUn.agp > braFlo1.lift
../jkStuff/lft2BitToFa.pl ../braFlo1.2bit braFlo1.lift \
> chrUn.scaffolds.fa
twoBitToFa ../braFlo1.2bit chrM.fa
faToTwoBit chrUn.scaffolds.fa chrM.fa braFlo1UnScaffolds.2bit
twoBitInfo braFlo1UnScaffolds.2bit stdout | sort -k2nr \
> braFlo1UnScaffolds.sizes
cp -p braFlo1UnScaffolds.sizes braFlo1UnScaffolds.2bit braFlo1.lift \
/san/sanvol1/scratch/braFlo1
#########################################################################
## Reload rmsk to take care of empty chrM (DONE - 2007-03-26 - Hiram)
ssh hgwdev
cd /cluster/data/braFlo1/bed/RepeatMasker
hgLoadOut -table=rmsk -nosplit braFlo1 braFlo1.fa.out
hgsql -e "drop table chrUn_rmsk;" braFlo1
# before:
featureBits braFlo1 rmsk
# 24973684 bases of 923355587 (2.705%) in intersection
# after:
featureBits braFlo1 rmsk
# 24973684 bases of 923355587 (2.705%) in intersection
#########################################################################
## BLASTZ HUMAN HG18 (DONE - 2007-03-26 - Hiram)
# measurement on hg18
ssh kkstore02
cd /cluster/data/hg18/bed/blastz.braFlo1.2007-03-26
cat fb.hg18.chainBraFlo1Link.txt
# 26455595 bases of 2881515245 (0.918%) in intersection
# and now the swap
mkdir /cluster/data/braFlo1/bed/blastz.hg18.swap
cd /cluster/data/braFlo1/bed/blastz.hg18.swap
time doBlastzChainNet.pl -chainMinScore=2000 -chainLinearGap=loose \
/cluster/data/hg18/bed/blastz.braFlo1.2007-03-26/DEF \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -verbose=2 \
-swap > swap.log 2>&1 &
# real 83m46.258s
cat fb.braFlo1.chainHg18Link.txt
# 30912893 bases of 923355587 (3.348%) in intersection
# create reciprocal best chains/nets for 5-way maf alignments
ssh hgwdev
cd /cluster/data/braFlo1/bed/blastz.hg18.swap
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl braFlo1 hg18 \
time nice -n +19 $HOME/kent/src/hg/utils/automation/doRecipBest.pl \
braFlo1 hg18 > rbest.log 2>&1 &
# real 34m14.843s
# real 32m11.695s
# This did not work right. Something went haywire somewhere:
# Error: braFlo1 rbest net coverage 14190116 != hg18 14270477
#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2007-01-18 - Hiram)
# Use -repMatch=328 (based on size -- for human we use 1024, and
# lancelet size is ~32% of human judging by gapless braFlo1 vs. hg18
# genome sizes from featureBits.
ssh kolossus
cd /cluster/data/braFlo1
blat braFlo1.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=jkStuff/11.ooc -repMatch=328
# Wrote 9536 overused 11-mers to jkStuff/11.ooc
cp -p jkStuff/11.ooc /san/sanvol1/scratch/braFlo1
#########################################################################
# GENBANK AUTO UPDATE (DONE - 2007-03-26,27 - Hiram)
# align with latest genbank process.
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add braFlo1 just after gasAcu1
# braFlo1 (Branchiostoma floridae - Lancelet)
braFlo1.serverGenome = /cluster/data/braFlo1/braFlo1.2bit
braFlo1.clusterGenome = /san/sanvol1/scratch/braFlo1/braFlo1.2bit
braFlo1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
braFlo1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
braFlo1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
braFlo1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
braFlo1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
braFlo1.ooc = /san/sanvol1/scratch/braFlo1/11.ooc
braFlo1.lift = /cluster/data/braFlo1/jkStuff/liftAll.lft
braFlo1.refseq.mrna.native.load = no
braFlo1.refseq.mrna.xeno.load = yes
braFlo1.genbank.mrna.xeno.load = yes
braFlo1.genbank.est.native.load = yes
braFlo1.downloadDir = braFlo1
braFlo1.perChromTables = no
cvs ci -m "Added braFlo1." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
# Edit src/lib/gbGenome.c to add new species.
cvs ci -m "Added Branchiostoma floridae (lancelet)." src/lib/gbGenome.c
make install-server
ssh kkstore05
cd /cluster/data/genbank
time nice -n +19 bin/gbAlignStep -initial braFlo1 &
# logFile: var/build/logs/2007.03.26-21:58:34.braFlo1.initalign.log
# real 230m8.123s
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad braFlo1
# logFile: var/dbload/hgwdev/logs/2007.03.27-07:52:23.dbload.log
# real 6m57.557s
## re-load to get the est track after adding that to the .conf file
# var/dbload/hgwdev/logs/2007.03.29-17:09:35.dbload.log
# real 19m2.168s
# enable daily alignment and update of hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add braFlo1 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added braFlo1 - Branchiostoma floridae (lancelet)" \
etc/align.dbs etc/hgwdev.dbs
make etc-update
############################################################################
## Adding a photograph (DONE - 2007-03-28 - Hiram)
## From Hector Escriva: hector.escriva@obs-banyuls.fr
# Dr. Clawson
# All the european teams working with amphioxus have changed to B.
# lanceolatum species and none of us is still working with B. floridae. We
# are able here to induce spawnings everyday (on B. lanceolatum) and this
# is impossible with the american species which limits very much the
# amount of experiments that one can perform. So I have pictures of B.
# lanceolatum, which is morphologically the same as floridae (even me I
# cannot distinghish them). So I send you here a picture of a B.
# lanceolatum juvenile in which we clearly see the main morphological
# aspects of this animal. You should however write the correct name of the
# species in order to be serious.
# I would also ask you to write in "Photo courtesy of", the name of my
# technicien who did the picture and myself. Michael Fuentes and Hector Escriva.
# Let me know if you agree with this picture.
# Sincerely Hector Escriva
mkdir /cluster/data/braFlo1/photograph
cd /cluster/data/braFlo1/photograph
convert -sharpen 0 -normalize -geometry 350x200 \
PostmetamorphicLarvae.jpg Branchiostoma_lanceolatum.jpg
############################################################################
# BLATSERVERS ENTRY (DONE - 2007-04-04 - Hiram)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("braFlo1", "blat11", "17784", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("braFlo1", "blat11", "17785", "0", "1");' \
hgcentraltest
# test it with some sequence
############################################################################
## Default position set at IFG-1 (DONE - 2007-04-09 - Hiram)
ssh hgwdev
hgsql -e 'update dbDb set defaultPos="chrUn:619013791-619054719"
where name="braFlo1";' hgcentraltest
############################################################################
# BLASTZ petMar1 Lamprey (DONE - 2008-04-11 - Hiram)
ssh kkstore05
screen # use a screen to control this multi-day job
mkdir /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11
cd /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11
cat << '_EOF_' > DEF
# Lancelet vs Lamprey
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Lancelet braFlo1 - largest chunk big enough for largest scaffold
# Largest scaffold 7,200,735 - 3032 scaffolds + chrM
SEQ1_DIR=/san/sanvol1/scratch/braFlo1/braFlo1.2bit
SEQ1_LEN=/san/sanvol1/scratch/braFlo1/chrom.sizes
SEQ1_CTGDIR=/san/sanvol1/scratch/braFlo1/braFlo1UnScaffolds.2bit
SEQ1_CTGLEN=/san/sanvol1/scratch/braFlo1/braFlo1UnScaffolds.sizes
SEQ1_LIFT=/san/sanvol1/scratch/braFlo1/braFlo1.lift
SEQ1_CHUNK=10000000
SEQ1_LIMIT=50
SEQ1_LAP=10000
# QUERY: Lamprey petMar1
SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit
SEQ2_LEN=/cluster/data/petMar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=200
SEQ2_LAP=0
BASE=/cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time nice -n +19 doBlastzChainNet.pl \
/cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=kk -verbose=2 > do.log 2>&1 &
# Completed: 78590 of 78590 jobs
# CPU time in finished jobs: 10056466s 167607.76m 2793.46h 116.39d 0.319 y
# IO & Wait Time: 6633099s 110551.65m 1842.53h 76.77d 0.210 y
# Average job time: 212s 3.54m 0.06h 0.00d
# Longest finished job: 4572s 76.20m 1.27h 0.05d
# Submission to last job: 107694s 1794.90m 29.91h 1.25d
# continuing after various interruptions and finished the batch manually
time nice -n +19 doBlastzChainNet.pl \
/cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-continue=cat -bigClusterHub=kk -verbose=2 > cat.log 2>&1 &
# real 12m43.792s
cat fb.braFlo1.chainPetMar1Link.txt
# 27418217 bases of 923355587 (2.969%) in intersection
# create reciprocal best chains/nets for 5-way maf alignments
ssh hgwdev
cd /cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl braFlo1 petMar1 \
> rbest.log 2>&1 &
# real 30m11.990s
# and the swap
mkdir /cluster/data/petMar1/bed/blastz.braFlo1.swap
cd /cluster/data/petMar1/bed/blastz.braFlo1.swap
time nice -n +19 doBlastzChainNet.pl \
/cluster/data/braFlo1/bed/blastzPetMar1.2008-04-11/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
# real 194m53.514s
cat fb.petMar1.chainBraFlo1Link.txt
# 27373264 bases of 831696438 (3.291%) in intersection
##########################################################################
## BLASTZ Chicken GalGal3 swap (DONE - 2008-04-16 - Hiram)
# the original
ssh kkstore03
cd /cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15
cat fb.galGal3.chainBraFlo1Link.txt
# 19795687 bases of 1042591351 (1.899%) in intersection
# and the swap
mkdir /cluster/data/braFlo1/bed/blastz.galGal3.swap
cd /cluster/data/braFlo1/bed/blastz.galGal3.swap
time nice -n +19 doBlastzChainNet.pl \
/cluster/data/galGal3/bed/blastzBraFlo1.2008-04-15/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -bigClusterHub=pk -verbose=2 > swap.log 2>&1 &
# real 4m45.054s
cat fb.braFlo1.chainGalGal3Link.txt
# 30287175 bases of 923355587 (3.280%) in intersection
# create reciprocal best chains/nets for 5-way maf alignments
ssh hgwdev
cd /cluster/data/braFlo1/bed/blastz.galGal3.swap
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl braFlo1 galGal3 \
> rbest.log 2>&1 &
# real 24m30.834s
###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2008-04-21)
# (auto) establish native host of /cluster/data/<assembly>
df /cluster/data/braFlo1
ssh kkstore05
# bash if not using bash shell already
mkdir /cluster/data/braFlo1/blastDb
cd /cluster/data/braFlo1
awk '{if ($2 > 1000000) print $1}' scaffolds/braFlo1UnScaffolds.sizes > 1meg.lst
if test -s 1meg.lst; then
twoBitToFa -seqList=1meg.lst scaffolds/braFlo1UnScaffolds.2bit temp.fa
faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
rm temp.fa
fi
rm 1meg.lst
awk '{if ($2 <= 1000000) print $1}' scaffolds/braFlo1UnScaffolds.sizes > less1meg.lst
if test -s less1meg.lst; then
twoBitToFa -seqList=less1meg.lst scaffolds/braFlo1UnScaffolds.2bit temp.fa
faSplit about temp.fa 1000000 blastDb/y
rm temp.fa
fi
rm less1meg.lst
cd blastDb
for i in *.fa
do
/cluster/bluearc/blast229/formatdb -i $i -p F
done
rm *.fa
ls *.nsq | wc -l
# 1063
mkdir -p /san/sanvol1/scratch/braFlo1/blastDb
cd /cluster/data/braFlo1/blastDb
for i in nhr nin nsq;
do
echo $i
cp *.$i /san/sanvol1/scratch/braFlo1/blastDb
done
mkdir -p /cluster/data/braFlo1/bed/tblastn.hg18KG
cd /cluster/data/braFlo1/bed/tblastn.hg18KG
echo /san/sanvol1/scratch/braFlo1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 1063 query.lst
# we want around 100,000 jobs per gig (0.0001 jobs per base)
numJobs=`awk '{sum += $2} END {print sum * 0.0001 }' /cluster/data/braFlo1/chrom.sizes`
# we want around numJobs jobs
numGenes=`wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`
numQueries=`wc query.lst | awk '{print $1}'`
numKGFiles=`awk -v ng=$numGenes -v nq=$numQueries -v nj=$numJobs 'BEGIN {printf "%d", ng/(nj/nq);exit}' `
mkdir -p /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/kgfa
split -l $numKGFiles /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/kgfa/kg
ln -s /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/kgfa kgfa
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/braFlo1/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
if test -s /cluster/data/braFlo1/blastDb.lft
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/braFlo1/blastDb.lft carry $f.2
else
mv $f.2 $f.3
fi
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
exit
ssh pk
cd /cluster/data/braFlo1/bed/tblastn.hg18KG
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 93544 of 93544 jobs
# CPU time in finished jobs: 8499504s 141658.40m 2360.97h 98.37d 0.270 y
# IO & Wait Time: 669830s 11163.84m 186.06h 7.75d 0.021 y
# Average job time: 98s 1.63m 0.03h 0.00d
# Longest finished job: 376s 6.27m 0.10h 0.00d
# Submission to last job: 44890s 748.17m 12.47h 0.52d
ssh kkstore05
cd /cluster/data/braFlo1/bed/tblastn.hg18KG
mkdir chainRun
cd chainRun
tcsh
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
ssh memk
cd /cluster/data/braFlo1/bed/tblastn.hg18KG/chainRun
para create chainSpec
para try, check, push, check etc.
# Completed: 88 of 88 jobs
# CPU time in finished jobs: 906s 15.09m 0.25h 0.01d 0.000 y
# IO & Wait Time: 19435s 323.92m 5.40h 0.22d 0.001 y
# Average job time: 231s 3.85m 0.06h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 322s 5.37m 0.09h 0.00d
# Submission to last job: 812s 13.53m 0.23h 0.01d
ssh kkstore05
cd /cluster/data/braFlo1/bed/tblastn.hg18KG/blastOut
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/braFlo1/bed/tblastn.hg18KG/preblastHg18KG.psl
cd ..
pslCheck preblastHg18KG.psl
liftUp -nosort -type=".psl" -nohead blastHg18KG.psl /cluster/data/braFlo1/scaffolds/braFlo1.lift carry preblastHg18KG.psl
# load table
ssh hgwdev
cd /cluster/data/braFlo1/bed/tblastn.hg18KG
hgLoadPsl braFlo1 blastHg18KG.psl
# check coverage
featureBits braFlo1 blastHg18KG
# 12239766 bases of 923355587 (1.326%) in intersection
featureBits braFlo1 xenoRefGene:cds blastHg18KG -enrichment
# xenoRefGene:cds 1.206%, blastHg18KG 1.326%, both 0.621%, cover 51.51%, enrich 38.86x
ssh kkstore05
rm -rf /cluster/data/braFlo1/bed/tblastn.hg18KG/blastOut
rm -rf /cluster/bluearc/braFlo1/bed/tblastn.hg18KG/blastOut
#end tblastn
## BLASTZ Mouse mm9 swap (DONE - 2008-04-17 - Hiram)
# the original
ssh kkstore06
cd /cluster/data/mm9/bed/blastzBraFlo1.2008-04-14
cat fb.mm9.chainBraFlo1Link.txt
# 26725980 bases of 2620346127 (1.020%) in intersection
# That is OK, now for the swap:
mkdir /cluster/data/braFlo1/bed/blastz.mm9.swap
cd /cluster/data/braFlo1/bed/blastz.mm9.swap
time doBlastzChainNet.pl -verbose=2 -swap \
/cluster/data/mm9/bed/blastzBraFlo1.2008-04-14/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust -bigClusterHub=kk > swap.log 2>&1 &
# real 12m23.402s
cat fb.braFlo1.chainMm9Link.txt
# 31517169 bases of 923355587 (3.413%) in intersection
# create reciprocal best chains/nets for 5-way maf alignments
ssh hgwdev
cd /cluster/data/braFlo1/bed/blastz.mm9.swap
time nice -n +19 /cluster/bin/scripts/doRecipBest.pl braFlo1 mm9 \
> rbest.log 2>&1 &
# real 35m15.178s
#############################################################################
## 5-Way Multiz (DONE - 2008-04-17 - Hiram)
##
ssh hgwdev
mkdir /cluster/data/braFlo1/bed/multiz5way
cd /cluster/data/braFlo1/bed/multiz5way
# take the 6-way tree from mm9 and eliminate genomes not in
# this alignment. Manually add in petMar1 and braFlo1
# arbitrarily somewhere outside of the fishes
# rearrange to get braFlo1 on the top of the graph
# paste this tree into the on-line phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to create the image for the tree diagram
# select the 5 organisms from the 6-way on Lamprey petMar1
/cluster/bin/phast/tree_doctor \
--prune-all-but \
Human_hg18,Mouse_mm9,Chicken_galGal3,Lamprey_petMar1,Lanclet_braFlo1 \
/cluster/data/petMar1/bed/multiz6way/petMar1.6-way.nh \
> 5-way.fullNames.nh
# looks something like this:
((Lamprey_petMar1:1.600000,(Chicken_galGal3:0.488824,
(Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757):0.725954):0.400000,
Lanclet_braFlo1:2.000000);
# rearrange to get Lanclet at the top, this leaves us with:
cat << '_EOF_' > braFlo1.5-way.nh
(Lanclet_braFlo1:2.0,(Lamprey_petMar1:1.600000,(Chicken_galGal3:0.488824,
(Mouse_mm9:0.325818,Human_hg18:0.126901):0.470757):0.725954):0.400000);
'_EOF_'
# << happy emacs
# create a species list from that file:
sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' braFlo1.5-way.nh \
| sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
| sed -e "s/.*_//; s/:.*//" | sort > species.list
# verify that has 5 db names in it
# create a stripped down nh file for use in autoMZ run
echo \
`sed 's/[a-zA-Z0-9]*_//g; s/:[012].[0-9]*//g; s/[,;]/ /g' braFlo1.5-way.nh \
| sed -e "s/ / /g"` > tree.5.nh
# that looks like, as a single line:
# (braFlo1 (petMar1 (galGal3 (mm9 hg18))))
# verify all blastz's exists
cat << '_EOF_' > listMafs.csh
#!/bin/csh -fe
cd /cluster/data/braFlo1/bed/multiz5way
foreach db (`grep -v braFlo1 species.list`)
set bdir = /cluster/data/braFlo1/bed/blastz.$db
if (-e $bdir/mafRBestNet/chrUn.maf.gz) then
echo "$db mafRBestNet"
else if (-e $bdir/mafSynNet/chrUn.maf.gz) then
echo "$db mafSynNet"
else if (-e $bdir/mafNet/chrUn.maf.gz) then
echo "$db mafNet"
else
echo "$db mafs not found"
endif
end
'_EOF_'
# << happy emacs
chmod +x ./listMafs.csh
# see what it says, these should all be mafRBestNet
./listMafs.csh
# galGal3 mafRBestNet
# hg18 mafRBestNet
# mm9 mafRBestNet
# petMar1 mafRBestNet
/cluster/bin/phast/all_dists braFlo1.5-way.nh > 5way.distances.txt
grep -i braflo 5way.distances.txt | sort -k3,3n
Lanclet_braFlo1 Chicken_galGal3 3.614778
Lanclet_braFlo1 Human_hg18 3.723612
Lanclet_braFlo1 Mouse_mm9 3.922529
Lanclet_braFlo1 Lamprey_petMar1 4.000000
# use the calculated
# distances in the table below to order the organisms and check
# the button order on the browser. Zebrafish ends up before
# tetraodon and fugu on the browser despite its distance.
# And if you can fill in the table below entirely, you have
# succeeded in finishing all the alignments required.
#
# featureBits chainLink measures
# chainBraFlo1Link chain linearGap
# distance on BraFlo1 on other minScore
# 1 3.614778 Chicken_galGal3 (% 3.280) (% 1.899) 5000 loose
# 2 3.723612 Human_hg18 (% 3.348) (% 0.918) 2000 loose
# 3 3.922529 Mouse_mm9 (% 3.413) (% 1.020) 5000 loose
# 4 4.000000 Lamprey_petMar1 (% 2.969) (% 3.291) 5000 loose
# copy net mafs to cluster-friendly storage, splitting chroms
# XXX - no need to split, there is only chrM and chrUn
mkdir mafLinks
cd mafLinks
for D in galGal3 petMar1 mm9 hg18
do
mkdir ${D}
ln -s /cluster/data/braFlo1/bed/blastz.${D}/mafRBestNet/*.maf.gz ./${D}
done
# copy to kluster friendly san for multiz run
ssh kkstore05
mkdir -p /san/sanvol1/scratch/braFlo1/multiz5way/mafSrc
cd /cluster/data/braFlo1/bed/multiz5way/mafLinks
rsync -a --progress -L ./ /san/sanvol1/scratch/braFlo1/multiz5way/mafSrc
# ready for the multiz run
ssh memk
mkdir /cluster/data/braFlo1/bed/multiz5way/cons
cd /cluster/data/braFlo1/bed/multiz5way/cons
mkdir -p maf run
cd run
mkdir penn
# use latest penn utilities
P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
cp -p $P/{autoMZ,multiz,maf_project} penn
# set the db and pairs directories here
cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef
set db = braFlo1
set c = $1
set result = $2
set resultDir = $result:h
set run = `pwd`
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz5way/mafSrc
rm -fr $tmp
mkdir -p $tmp
mkdir -p $resultDir
cp ../../tree.5.nh ../../species.list $tmp
pushd $tmp
foreach s (`grep -v $db species.list`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.5.nh`" $db.*.sing.maf $c.maf
popd
rm -f $result
cp $tmp/$c.maf $result
rm -fr $tmp
rmdir --ignore-fail-on-non-empty /scratch/tmp/$db
'_EOF_'
# << happy emacs
chmod +x autoMultiz.csh
cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/braFlo1/bed/multiz5way/cons/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << emacs
echo "chrM.maf" > mafList
echo "chrUn.maf" >> mafList
gensub2 mafList single template jobList
para create jobList
para try ... check ... push ... etc
# Completed: 2 of 2 jobs
# CPU time in finished jobs: 375s 6.25m 0.10h 0.00d 0.000 y
# IO & Wait Time: 17s 0.28m 0.00h 0.00d 0.000 y
# Average job time: 196s 3.27m 0.05h 0.00d
# Longest finished job: 385s 6.42m 0.11h 0.00d
# Submission to last job: 385s 6.42m 0.11h 0.00d
# load tables for a look
ssh hgwdev
mkdir -p /gbdb/braFlo1/multiz5way/maf
ln -s /cluster/data/braFlo1/bed/multiz5way/cons/maf/*.maf \
/gbdb/braFlo1/multiz5way/maf
# this generates an immense multiz5way.tab file in the directory
# where it is running. Best to run this over in scratch.
cd /scratch/tmp
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/braFlo1/multiz5way/maf braFlo1 multiz5way
# Loaded 220560 mafs in 2 files from /gbdb/braFlo1/multiz5way/maf
# real 0m4.151s
# load summary table
time nice -n +19 cat /gbdb/braFlo1/multiz5way/maf/*.maf \
| hgLoadMafSummary braFlo1 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz5waySummary stdin
# Created 87678 summary blocks from 366073 components and 220521 mafs from stdin
# real 0m4.911s
# Gap Annotation
# prepare bed files with gap info
ssh kkstore05
mkdir /cluster/data/braFlo1/bed/multiz5way/anno
cd /cluster/data/braFlo1/bed/multiz5way/anno
mkdir maf run
# these actually already all exist from previous multiple alignments
# if there are done you will see an ls of them
# or else the twoBitInfo command will be echoed, run it if you want
for DB in `cat ../species.list`
do
CDIR="/cluster/data/${DB}"
if [ ! -f ${CDIR}/${DB}.N.bed ]; then
echo "creating ${DB}.N.bed"
echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
else
ls -og ${CDIR}/${DB}.N.bed
fi
done
cd run
rm -f nBeds sizes
for DB in `grep -v braFlo1 ../../species.list`
do
echo "${DB} "
ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
echo ${DB}.len >> sizes
done
ssh memk
cd /cluster/data/braFlo1/bed/multiz5way/anno/run
cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
set dir = /cluster/data/braFlo1/bed/multiz5way/cons
set c = $1
cat $dir/maf/${c}.maf | nice \
mafAddIRows -nBeds=nBeds stdin /cluster/data/braFlo1/braFlo1.2bit $2
'_EOF_'
# << happy emacs
chmod +x doAnno.csh
cat << '_EOF_' > template
#LOOP
./doAnno.csh $(root1) {check out line+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
cut -f1 /cluster/data/braFlo1/chrom.sizes > chrom.list
gensub2 chrom.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 2 of 2 jobs
# CPU time in finished jobs: 7240s 120.66m 2.01h 0.08d 0.000 y
# IO & Wait Time: 9s 0.16m 0.00h 0.00d 0.000 y
# Average job time: 3625s 60.41m 1.01h 0.04d
# Longest finished job: 7241s 120.68m 2.01h 0.08d
# Submission to last job: 7241s 120.68m 2.01h 0.08d
# load the results
ssh hgwdev
cd /cluster/data/braFlo1/bed/multiz5way/anno
mkdir -p /gbdb/braFlo1/multiz5way/anno
ln -s `pwd`/maf/*.maf /gbdb/braFlo1/multiz5way/anno
# by loading this into the table multiz5way, it will replace the
# previously loaded table with the unannotated mafs
# huge temp files are made, do them on local disk
cd /scratch/tmp
time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/braFlo1/multiz5way/anno \
braFlo1 multiz5way
# Loaded 311293 mafs in 2 files from /gbdb/braFlo1/multiz5way/anno
# real 0m7.505s
# normally filter this for chrom size > 1,000,000 and only load
# those chroms. But this is a scaffold assembly, load everything.
time nice -n +19 cat /gbdb/braFlo1/multiz5way/anno/*.maf \
| hgLoadMafSummary braFlo1 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 multiz5waySummary stdin
# Created 87678 summary blocks from 366073 components and 311253 mafs from stdin
# real 0m6.998s
# by loading this into the table multiz5waySummary, it will replace
# the previously loaded table with the unannotated mafs
# remove the multiz5way*.tab files in this /scratch/tmp directory
rm multiz5way*.tab
# And, you can remove the previously loaded non-annotated maf file link:
rm /gbdb/braFlo1/multiz5way/maf/*.maf
rmdir /gbdb/braFlo1/multiz5way/maf
###########################################################################
## Annotate 5-way multiple alignment with gene annotations
## (DONE - 2008-05-01 - Hiram)
# Gene frames
## following pattern used in Lamprey petMar1
# hg18 is in a bit of transition at the moment, so use ensGene
# mm9 can use knownGene
# xenoMrna for braFlo1, petMar1
# ensGene v49 for galGal3
ssh hgwdev
mkdir /cluster/data/braFlo1/bed/multiz5way/frames
cd /cluster/data/braFlo1/bed/multiz5way/frames
mkdir genes
# knownGene
for DB in mm9
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# ensGene
for DB in hg18 galGal3
do
hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/${DB}.tmp.gz
mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
echo "${DB} done"
done
# use xenoMrna for petMar1, braFlo1
for DB in petMar1 braFlo1
do
tmpExt=`mktemp temp.XXXXXX`
tmpMrnaCds=${DB}.mrna-cds.${tmpExt}
tmpMrna=${DB}.mrna.${tmpExt}
tmpCds=${DB}.cds.${tmpExt}
hgsql -N -e 'select xenoMrna.qName,cds.name,xenoMrna.* \
from xenoMrna,gbCdnaInfo,cds \
where (xenoMrna.qName = gbCdnaInfo.acc) and \
(gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
$DB > ${tmpMrnaCds}
cut -f 1-2 ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100 ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz
rm -f $tmpExt
echo "${DB} done"
done
ls -og genes
# -rw-rw-r-- 1 1040639 May 1 09:53 braFlo1.gp.gz
# -rw-rw-r-- 1 1557911 May 1 09:50 galGal3.gp.gz
# -rw-rw-r-- 1 2151412 May 1 09:50 hg18.gp.gz
# -rw-rw-r-- 1 1965274 May 1 09:50 mm9.gp.gz
# -rw-rw-r-- 1 596023 May 1 09:51 petMar1.gp.gz
ssh kkstore05
cd /cluster/data/braFlo1/bed/multiz5way/frames
# hg18 is in a bit of transition at the moment, so use ensGene
# mm9 can use knownGene
# xenoMrna for braFlo1, braFlo1
# ensGene v49 for galGal3, oryLat1
# anything to annotate is in a pair, e.g.: braFlo1 genes/braFlo1.gp.gz
time (cat ../anno/maf/*.maf | nice -n +19 genePredToMafFrames braFlo1 stdin stdout braFlo1 genes/braFlo1.gp.gz hg18 genes/hg18.gp.gz mm9 genes/mm9.gp.gz galGal3 genes/galGal3.gp.gz petMar1 genes/petMar1.gp.gz | gzip > multiz5way.mafFrames.gz) > frames.log 2>&1
# see what it looks like in terms of number of annotations per DB:
zcat multiz5way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
# 44216 petMar1
# 71268 braFlo1
# 107089 galGal3
# 107857 hg18
# 111452 mm9
# load the resulting file
ssh hgwdev
cd /cluster/data/braFlo1/bed/multiz5way/frames
time nice -n +19 hgLoadMafFrames braFlo1 multiz5wayFrames \
multiz5way.mafFrames.gz
# real 0m7.902s
# enable the trackDb entries:
# frames multiz5wayFrames
# irows on
#############################################################################
# phastCons 5-way (DONE - 2008-05-01 - Hiram)
# split 5way mafs into 10M chunks and generate sufficient statistics
# files for # phastCons
ssh memk
mkdir /cluster/data/braFlo1/bed/multiz5way/msa.split
cd /cluster/data/braFlo1/bed/multiz5way/msa.split
mkdir -p /san/sanvol1/scratch/braFlo1/multiz5way/cons/ss
cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set MAFS = /cluster/data/braFlo1/bed/multiz5way/anno/maf
set WINDOWS = /san/sanvol1/scratch/braFlo1/multiz5way/cons/ss
pushd $WINDOWS
set c = $1
rm -f $c.out
twoBitToFa -seq=$c /scratch/data/braFlo1/braFlo1.2bit /scratch/tmp/braFlo1.$c.fa
/cluster/bin/phast/$MACHTYPE/msa_split $MAFS/$c.maf -i MAF \
-M /scratch/tmp/braFlo1.$c.fa \
-o SS -r $c -w 10000000,0 -I 1000 -B 5000
rm -f /scratch/tmp/braFlo1.$c.fa
popd
date > $c.out
'_EOF_'
# << happy emacs
chmod +x doSplit.csh
cat << '_EOF_' > template
#LOOP
doSplit.csh $(root1) {check out line+ $(root1).out}
#ENDLOOP
'_EOF_'
# << happy emacs
# create list of maf files:
(cd ../anno/maf; find . -type f) | sed -e "s#^./##" > maf.list
gensub2 maf.list single template jobList
para create jobList
para try ... check ... etc
# Completed: 2 of 2 jobs
# CPU time in finished jobs: 337s 5.61m 0.09h 0.00d 0.000 y
# IO & Wait Time: 40s 0.67m 0.01h 0.00d 0.000 y
# Average job time: 189s 3.14m 0.05h 0.00d
# Longest finished job: 375s 6.25m 0.10h 0.00d
# Submission to last job: 375s 6.25m 0.10h 0.00d
# take the cons and noncons trees from the Lamprey 6-way
# Estimates are not easy to make, probably more correctly,
# take the 6-way .mod file, and re-use it here.
ssh hgwdev
cd /cluster/data/braFlo1/bed/multiz5way
cp -p /cluster/data/petMar1/bed/multiz6way/cons/all/all.mod \
./petMar1.6way.mod
# Run phastCons
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
ssh memk
mkdir -p /cluster/data/braFlo1/bed/multiz5way/cons/run.cons
cd /cluster/data/braFlo1/bed/multiz5way/cons/run.cons
# there are going to be several different phastCons runs using
# this same script. They trigger off of the current working directory
# $cwd:t which is the "grp" in this script. It is one of:
# all gliers placentals
# Well, that's what it was when used in the Mm9 30-way,
# in this instance, there is only the directory "all"
cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin
set f = $1
set c = $1:r
set len = $2
set cov = $3
set rho = $4
set grp = $cwd:t
set tmp = /scratch/tmp/$f
set cons = /cluster/data/braFlo1/bed/multiz5way/cons
mkdir -p $tmp
set san = /san/sanvol1/scratch/braFlo1/multiz5way/cons
if (-s $cons/$grp/$grp.non-inf) then
cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
cp -p $san/ss/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
else
cp -p $cons/$grp/$grp.mod $tmp
cp -p $san/ss/$f.ss $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--not-informative `cat $grp.non-inf` \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
$PHASTBIN/phastCons $f.ss $grp.mod \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p $san/$grp/pp $san/$grp/bed
sleep 4
touch $san/$grp/pp $san/$grp/bed
rm -f $san/$grp/pp/$f.pp
rm -f $san/$grp/bed/$f.bed
mv $tmp/$f.pp $san/$grp/pp
mv $tmp/$f.bed $san/$grp/bed
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod a+x doPhast.csh
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/braFlo1/multiz5way/cons/all/pp/$(file1).pp}
#ENDLOOP
'_EOF_'
# << happy emacs
# Create parasol batch and run it
pushd /san/sanvol1/scratch/braFlo1/multiz5way/cons
find ./ss -type f -name "*.ss" | sed -e "s#^./##; s/.ss$//" \
> /cluster/data/braFlo1/bed/multiz5way/cons/ss.list
popd
# run for all species
cd ..
mkdir -p all run.cons/all
cd all
/cluster/bin/phast.build/cornellCVS/phast.2007-05-04/bin/tree_doctor \
../../petMar1.6way.mod \
--prune-all-but=braFlo1,hg18,galGal3,mm9,petMar1 \
> all.mod
cd ../run.cons/all
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create template file for "all" run
cat << '_EOF_' > template
#LOOP
../doPhast.csh $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/braFlo1/multiz5way/cons/all/pp/$(file1).pp}
#ENDLOOP
'_EOF_'
# << happy emacs
gensub2 ../../ss.list single template jobList
para create jobList
para try ... check ... push ... etc.
# Completed: 94 of 94 jobs
# CPU time in finished jobs: 1945s 32.42m 0.54h 0.02d 0.000 y
# IO & Wait Time: 810s 13.49m 0.22h 0.01d 0.000 y
# Average job time: 29s 0.49m 0.01h 0.00d
# Longest finished job: 34s 0.57m 0.01h 0.00d
# Submission to last job: 201s 3.35m 0.06h 0.00d
# create Most Conserved track
ssh kolossus
cd /san/sanvol1/scratch/braFlo1/multiz5way/cons/all
find ./bed -type f -name "chr*.bed" | xargs cat \
| sort -k1,1 -k2,2n | \
awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
/cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
cp -p mostConserved.bed /cluster/data/braFlo1/bed/multiz5way/cons/all
# load into database
ssh hgwdev
cd /cluster/data/braFlo1/bed/multiz5way/cons/all
time nice -n +19 hgLoadBed braFlo1 phastConsElements5way mostConserved.bed
# Loaded 122683 elements of size 5
# real 0m2.299s
# Try for 5% overall cov, and 70% CDS cov
# We don't have any gene tracks to compare CDS coverage
# --rho .31 --expected-length 45 --target-coverage .3
featureBits braFlo1 phastConsElements5way
# 35581251 bases of 923355587 (3.853%) in intersection
# Create merged posterier probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
# sort by chromName, chromStart so that items are in numerical order
# for wigEncode
cd /san/sanvol1/scratch/braFlo1/multiz5way/cons/all
mkdir phastCons5wayScores
cat pp/chrM*.pp | gzip > phastCons5wayScores/chrM.data.gz
# this is a bit of a cheat on the sorting order here, but it works.
ls pp/chrUn*.pp | sort -t. -k2n | xargs cat \
| gzip > phastCons5wayScores/chrUn.data.gz
# copy the phastCons5wayScores to:
mkdir -p \
/cluster/data/braFlo1/bed/multiz5way/downloads/phastCons5way
cp -p phastCons5wayScores/chr*.data.gz \
/cluster/data/braFlo1/bed/multiz5way/downloads/phastCons5way
# for hgdownload downloads
# Create merged posterier probability file and wiggle track data files
# currently doesn't matter where this is performed, the san is the same
# network distance from all machines.
cd /san/sanvol1/scratch/braFlo1/multiz5way/cons/all
zcat phastCons5wayScores/chr*.data.gz \
| wigEncode stdin phastCons5way.wig phastCons5way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
time nice -n +19 cp -p *.wi? /cluster/data/braFlo1/bed/multiz5way/cons/all
# real 0m1.216s
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/braFlo1/bed/multiz5way/cons/all
ln -s `pwd`/phastCons5way.wib /gbdb/braFlo1/multiz5way/phastCons5way.wib
time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/braFlo1/multiz5way braFlo1 \
phastCons5way phastCons5way.wig
# real 0m2.166s
# remove garbage
rm wiggle.tab
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/braFlo1/bed/multiz5way/cons/all
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=braFlo1 phastCons5way > histogram.data 2>&1
# real 5m0.608s
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Lancelet BraFlo1 Histogram phastCons5way track"
set xlabel " phastCons5way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
# it is a bit lopsided toward the highly conserved end, but that
# is because there is so little bits being used
# These trackDb entries turn on the wiggle phastCons data track:
# type wigMaf 0.0 1.0
# maxHeightPixels 100:40:11
# wiggle phastCons5way
# spanList 1
- # autoScaleDefault Off
+ # autoScale Off
# windowingFunction mean
# pairwiseHeight 12
# yLineOnOff Off
#############################################################################
# Downloads (DONE - 2008-05-01 - Hiram)
# Let's see if the downloads will work
ssh hgwdev
cd /cluster/data/braFlo1
# expecting to find repeat masker .out file here:
ln -s bed/RepeatMasker/braFlo1.fa.out .
cd bed/simpleRepeat
mkdir trfMaskChrom
splitFileByColumn trfMask.bed trfMaskChrom
touch trfMaskChrom/chrM.bed
cd /cluster/data/braFlo1
time nice -n +19 /cluster/bin/scripts/makeDownloads.pl \
-workhorse=hgwdev braFlo1 > jkStuff/downloads.log 2>&1
# real 11m53.475s
# failed making upstream sequences:
# featureBits calJac1 mgcGenes:upstream:1000 -fa=stdout
# setpriority: Permission denied.
# the 'nice' from my bash shell causes trouble inside the csh
# script which uses nice. Finish off the install step manually
# with the mgcGenes upstreams ...
#############################################################################
# PushQ entries (DONE - 2008-05-01 - Hiram)
ssh hgwdev
/cluster/data/braFlo1
/cluster/bin/scripts/makePushQSql.pl braFlo1 > jkStuff/pushQ.sql
# output warnings:
# braFlo1 does not have seq
# Could not tell (from trackDb, all.joiner and hardcoded lists of supporting
# and genbank tables) which tracks to assign these tables to:
# gbWarn
# multiz5wayFrames
# phastCons5way
scp -p jkStuff/pushQ.sql hgwbeta:/tmp/braFlo1.sql
ssh hgwbeta
cd /tmp
hgsql qapushq < braFlo1.sql
#############################################################################
# Create 5-way downloads (DONE - 2008-05-01 - Hiram)
ssh hgwdev
mkdir -p /cluster/data/braFlo1/bed/multiz5way/downloads/phastCons5way
cd /cluster/data/braFlo1/bed/multiz5way/downloads/phastCons5way
# if these haven't been copied here from before:
cp -p \
/san/sanvol1/scratch/braFlo1/multiz5way/cons/all/phastCons5wayScores/* .
ln -s ../../cons/all/all.mod ./5way.mod
cp /cluster/data/petMar1/bed/multiz6way/downloads/phastCons6way/README.txt .
# edit that README.txt to be correct for this 5-way alignment
cd ..
mkdir multiz5way
cd multiz5way
cp -p /cluster/data/petMar1/bed/multiz6way/downloads/multiz6way/README.txt .
# edit that README.txt to be correct for this 5-way alignment
ssh kkstore05
cd /cluster/data/braFlo1/bed/multiz5way/downloads/multiz5way
ln -s ../../braFlo1.5-way.nh ./5way.nh
cp -p ../../anno/maf/chr*.maf .
gzip chr*.maf
ssh hgwdev
cd /cluster/data/braFlo1/bed/multiz5way/downloads/multiz5way
# creating upstream files from xenoRefGene, bash script:
cat << '_EOF_' > mkUpstream.sh
#!/bin/bash
DB=braFlo1
GENE=xenoRefGene
NWAY=multiz5way
export DB GENE
for S in 1000 2000 5000
do
echo "making upstream${S}.maf"
featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
| $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags ${DB} ${NWAY} \
stdin stdout \
-orgs=/cluster/data/${DB}/bed/${NWAY}/species.list \
| gzip -c > upstream${S}.maf.gz
echo "done upstream${S}.maf.gz"
done
'_EOF_'
# << happy emacs
chmod +x ./mkUpstream.sh
time nice -n +19 ./mkUpstream.sh
# real 3m1.228s
# -rw-rw-r-- 1 126900 May 1 16:51 upstream1000.maf.gz
# -rw-rw-r-- 1 224320 May 1 16:52 upstream2000.maf.gz
# -rw-rw-r-- 1 501687 May 1 16:53 upstream5000.maf.gz
# check the names in these upstream files to ensure sanity:
zcat upstream1000.maf.gz | grep "^s " | awk '{print $2}' \
| sort | uniq -c | sort -rn | less
# should be a list of the other 4 species with a high count,
# then xenoRefGene names, e.g.:
# 364 petMar1
# 364 mm9
# 364 hg18
# 364 galGal3
# 7 NM_069006
# 7 NM_054045
# 7 NM_021059
ssh kkstore05
cd /cluster/data/braFlo1/bed/multiz5way/downloads/multiz5way
md5sum *.maf.gz > md5sum.txt
cd ../phastCons5way
md5sum *.data.gz *.mod > md5sum.txt
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/braFlo1/multiz5way
mkdir /usr/local/apache/htdocs/goldenPath/braFlo1/phastCons5way
cd /cluster/data/braFlo1/bed/multiz5way/downloads/multiz5way
ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/braFlo1/multiz5way
rm /usr/local/apache/htdocs/goldenPath/braFlo1/multiz5way/mkUpstream.sh
cd ../phastCons5way
ln -s `pwd`/* /usr/local/apache/htdocs/goldenPath/braFlo1/phastCons5way
# if your ln -s `pwd`/* made extra links to files you don't want there,
# check the goldenPath locations and remove those extra links
#############################################################################
# add strings of N's to the gap track (DONE - 2008-05-09 - Hiram)
ssh kkstore05
mkdir /cluster/data/braFlo1/bed/gap
cd /cluster/data/braFlo1/bed/gap
time nice -n +19 findMotif -strand=+ -verbose=4 -motif=gattaca \
../../braFlo1.2bit > braFlo1.gaps.txt 2>&1
# real 0m16.659s
grep "^#GAP chrUn" braFlo1.gaps.txt | sed -e "s/#GAP //" > chrUn.gap.bed
# what are their sizes like:
ave -col=5 chrUn.gap.bed
Q1 50.000000
median 60.000000
Q3 704.000000
average 1004.236543
min 1.000000
max 144292.000000
count 94786
total 95187565.000000
standard deviation 4497.943141
# don't let this overlap with the gap track already
ssh hgwdev
cd /cluster/data/braFlo1/bed/gap
# already existing gap track
featureBits braFlo1 gap
# 3031000 bases of 923355587 (0.328%) in intersection
# prove that it is a complete subset of this new set
featureBits braFlo1 gap chrUn.gap.bed
# 3031000 bases of 923355587 (0.328%) in intersection
# let's make sure none of the regular gaps run-on into these new
# found gaps
hgsql -e "select chrom,chromStart,chromEnd from chrUn_gap;" braFlo1 \
| sort -k1,1 -k2,2n > gap.bed
awk '{printf "%s\t%s\t%s\n", $1, $2, $3}' chrUn.gap.bed \
| sort -k1,1 -k2,2n > new.bed
# after working with that for a while, indeed there is a problem
# found in the existing gap track. There are "contig" gaps in the
# the track which claim to be 1000 bases, but the DNA sequence
# has 1001 N's in that location. Let's see if that is because the
# scaffold itself ends in a single N. Yes, after some investigation,
# it is found there are scaffolds which either begin with Ns, or
# or end with N's, and it looks like they are single N's.
# there are 30 gap locations where this happens.
# So, let's make up a proper exclusive or of the new gaps:
# intersect the new gaps with everything that is not in the current
# gap track. Get the regions that are not in the gap track:
featureBits -not -countGaps braFlo1 gap -chrom=chrUn -bed=notGap.bed
# then, find only those bits in the new gaps that intersect that:
featureBits -countGaps braFlo1 notGap.bed new.bed -bed=newGaps.bed
# and profile their sizes:
awk '{print $3-$2}' newGaps.bed | ave stdin
Q1 50.000000
median 51.000000
Q3 591.000000
average 1004.037315
min 1.000000
max 144292.000000
count 91785
total 92155565.000000
standard deviation 4570.916383
# make the new gaps into a properly formatted gap table
# currently, the last index number in the gap track is:o
hgsql -e "select max(ix) from chrUn_gap;" braFlo1
# 6062
awk '
BEGIN{ ix=6063 }
{
printf "%s\t%d\t%d\t%d\tN\t%d\tfragment\tyes\n", $1, $2, $3, ix, $3-$2
++ix
}' newGaps.bed > newGaps.gap.table
# findMotif has made an error on the last one, there is an extra one
# at the end that is not legitimate, take it off.
# let's get the SQL definition:
mkdir dump
cd dump
hgsqldump -c --tab=. braFlo1 chrUn_gap
cd ..
# and, make the bin column
hgLoadBed -noLoad braFlo1 xyt newGaps.gap.table
# count before:
hgsql -e "select count(*) from chrUn_gap;" braFlo1
# 3031
# then, add this business to the existing table:
hgLoadSqlTab -append braFlo1 chrUn_gap dump/chrUn_gap.sql bed.tab
# and after
hgsql -e "select count(*) from chrUn_gap;" braFlo1
# 94815
# that is the sum of old and new:
wc -l gap.bed bed.tab
# 3031 gap.bed
# 91784 bed.tab
# 94815 total
#############################################################################
# GENBANK reload due to problems found by gbSanity (2008-05-09 markd)
ssh hgwdev
cd /cluster/data/genbank
nice ./bin/gbDbLoadStep3 -drop -initialLoad braFlo1&
#############################################################################
################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
braFlo1.upstreamGeneTbl = xenoRefGene
braFlo1.upstreamMaf = multiz5way /hive/data/genomes/braFlo1/bed/multiz5way/species.list