src/hg/makeDb/doc/gasAcu1.txt 1.42
1.42 2010/01/09 00:28:46 hiram
finishing off tetNig2 and into the pushQ
Index: src/hg/makeDb/doc/gasAcu1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/gasAcu1.txt,v
retrieving revision 1.41
retrieving revision 1.42
diff -b -B -U 1000000 -r1.41 -r1.42
--- src/hg/makeDb/doc/gasAcu1.txt 23 Dec 2009 19:09:24 -0000 1.41
+++ src/hg/makeDb/doc/gasAcu1.txt 9 Jan 2010 00:28:46 -0000 1.42
@@ -1,2810 +1,2817 @@
# for emacs: -*- mode: sh; -*-
# Gasterosteus aculeatus - Broad Institute 1.0
# Jeff McKinnon gave permission for photo 8/16/06 and asked for notification
# when the browser is public --
# e-mail: mckinnoj@uww.edu http://facstaff.uww.edu/mckinnoj/mckinnon.html
#########################################################################
# DOWNLOAD SEQUENCE (DONE 8/15/06 angie)
ssh kkstore05
mkdir /cluster/store12/gasAcu1
ln -s /cluster/store12/gasAcu1 /cluster/data/gasAcu1
mkdir /cluster/data/gasAcu1/downloads
cd /cluster/data/gasAcu1/downloads
foreach f (Stickleback1.0.agp \
Stickleback1.0.agp.chromosome.fasta.gz \
Stickleback1.0.agp.chromosome.qual.gz \
assembly.agp \
assembly.bases.gz \
assembly.ps)
wget --timestamping \
ftp://ftp.broad.mit.edu/pub/assemblies/fish/stickleback/gasAcu1/$f
end
faSize Stickleback1.0.agp.chromosome.fasta.gz
#463338706 bases (16726587 N's 446612119 real 446612119 upper 0 lower) in 105 sequences in 1 files
#Total size: mean 4412749.6 sd 1314228.9 min 83130 (XIII.20000001-20083130) max 5000000 (I.1-5000000) median 5000000
#N count: mean 159300.8 sd 213368.9
#U count: mean 4253448.8 sd 1311287.4
#L count: mean 0.0 sd 0.0
# Doh! Their chromosome fasta is chunked into 5M pieces. Not so
# useful for us.
# AGP and FA use different names, too. AGP has "groupI" and FA has "I".
# A lot of our software really wants to see "chr" at the beginning.
# So make AGP with substituted names and use that to construct the
# chrom FA.
sed -e 's/^group/chr/' Stickleback1.0.agp > UCSC.gasAcu1.agp
# Their chromosome qual is chunked into 5M pieces too. (There is also
# an assembly.agp.qual.gz that has *scaffolds* chunked.) Unchunk,
# relying on the fact that chunks are in order in the file:
zcat Stickleback1.0.agp.chromosome.qual.gz \
| perl -wpe 's/^>(\S+)\.1-\d+.*/>chr$1/; s/^>\S+\.\d+-\d+.*\n$//;' \
> UCSC.gasAcu1.qual
#########################################################################
# MAKE GENOME DB (DONE 8/17/06 angie)
ssh kkstore05
cd /cluster/data/gasAcu1
cat > gasAcu1.config.ra <<EOF
# Config parameters for makeGenomeDb.pl:
db gasAcu1
clade vertebrate
genomeCladePriority 120
scientificName Gasterosteus aculeatus
commonName Stickleback
assemblyDate Feb. 2006
assemblyLabel Broad Institute gasAcu1 (1.0)
orderKey 40
mitoAcc 16356995
fastaFiles /cluster/data/gasAcu1/downloads/assembly.bases.gz
agpFiles /cluster/data/gasAcu1/downloads/UCSC.gasAcu1.agp
qualFiles /cluster/data/gasAcu1/downloads/UCSC.gasAcu1.qual
dbDbSpeciesDir stickleback
EOF
# Run with -debug to see what it would do / check for issues:
~kent/src/utils/makeGenomeDb.pl gasAcu1.config.ra -debug -verbose=3
# Run it for real:
~kent/src/utils/makeGenomeDb.pl gasAcu1.config.ra \
>& makeGenomeDb.log & tail -f makeGenomeDb.log
# Follow the directions at the end of the log after
#NOTES -- STUFF THAT YOU WILL HAVE TO DO --
#########################################################################
# REPEATMASKER (DONE (but might need better libs??) 8/16/06 angie)
ssh kkstore05
# Run -debug to create the dir structure and preview the scripts:
~/kent/src/utils/doRepeatMasker.pl gasAcu1 -verbose 3 -debug
# Run it for real and tail the log:
cd /cluster/data/gasAcu1/bed/RepeatMasker.2006-08-16
~/kent/src/utils/doRepeatMasker.pl gasAcu1 -verbose 3 \
>& do.log & tail -f do.log
# RM version info from do.log:
#grep version of RepeatMasker$ /cluster/bluearc/RepeatMasker/RepeatMasker
## March 20 2006 (open-3-1-5) version of RepeatMasker
#grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl
#CC RELEASE 20060315; *
# The cluster job crashed because no repeats were found on chrM
# so check out line+ failed. NBD. Manually made run.cluster/run.time
# and continued:
~/kent/src/utils/doRepeatMasker.pl gasAcu1 -verbose 3 -continue cat \
>>& do.log & tail -f do.log
ssh hgwdev
featureBits gasAcu1 rmsk
#11005259 bases of 446627861 (2.464%) in intersection
# Wow, that is awfully skimpy. Stickleback is a ~675M vertebrate genome,
# probably pretty compact, but still...
# RepeatMaskerLib.embl has only 2 stickleback-specific repeats.
#########################################################################
# REPEATMASKER - FUGU LIBS (DONE 8/17/06 angie)
# RepeatMasker with stickleback as the species had very low coverage.
# I found a paper on stickleback sex determination where they studied
# repeats and they said they used fugu libs, so give that a try...
# PMID: 15324658
# http://www.sciencedirect.com/science?_ob=ArticleURL&_udi=B6VRT-4D56PX0-N&_coverDate=08%2F24%2F2004&_alid=435874407&_rdoc=1&_fmt=&_orig=search&_qd=1&_cdi=6243&_sort=d&view=c&_acct=C000059601&_version=1&_urlVersion=0&_userid=4428&md5=e5d62090265bd7b92b95f8f4cda5e079
ssh kkstore05
# Run -debug to create the dir structure and preview the scripts:
~/kent/src/utils/doRepeatMasker.pl gasAcu1 -species fugu -verbose 3 -debug
# Run it for real and tail the log:
cd /cluster/data/gasAcu1/bed/RepeatMasker.2006-08-17
~/kent/src/utils/doRepeatMasker.pl gasAcu1 -species fugu -verbose 3 \
>& do.log & tail -f do.log
# RM version info from do.log:
#grep version of RepeatMasker$ /cluster/bluearc/RepeatMasker/RepeatMasker
## March 20 2006 (open-3-1-5) version of RepeatMasker
#grep RELEASE /cluster/bluearc/RepeatMasker/Libraries/RepeatMaskerLib.embl
#CC RELEASE 20060315; *
featureBits gasAcu1 rmsk
#16563793 bases of 446627861 (3.709%) in intersection
# Still seems rather slim, but it's an improvement.
#########################################################################
# SIMPLE REPEATS (TRF) (DONE 8/16/06 angie)
ssh kolossus
nice tcsh
mkdir /cluster/data/gasAcu1/bed/simpleRepeat
cd /cluster/data/gasAcu1/bed/simpleRepeat
twoBitToFa ../../gasAcu1.unmasked.2bit stdout \
| trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
-bedAt=simpleRepeat.bed -tempDir=/tmp \
>& trf.log & tail -f trf.log
# ~40 minutes
# Make a filtered version for sequence masking:
awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
splitFileByColumn trfMask.bed trfMaskChrom
# Load unfiltered repeats into the database:
ssh hgwdev
hgLoadBed gasAcu1 simpleRepeat \
/cluster/data/gasAcu1/bed/simpleRepeat/simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
featureBits gasAcu1 simpleRepeat
#11062299 bases of 446627861 (2.477%) in intersection
#########################################################################
# MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 8/16/06 angie)
ssh kolossus
cd /cluster/data/gasAcu1
time twoBitMask gasAcu1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed gasAcu1.2bit
# This warning is OK -- the extra fields are not block coordinates.
#Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks).
#0.372u 0.716s 0:04.34 24.8% 0+0k 0+0io 0pf+0w
# Link to it from /gbdb:
ssh hgwdev
ln -s /cluster/data/gasAcu1/gasAcu1.2bit /gbdb/gasAcu1/gasAcu1.2bit
#########################################################################
# MAKE DOWNLOADABLE / GOLDENPATH FILES (DONE 8/17/06 angie)
cd /cluster/data/gasAcu1
~/kent/src/utils/makeDownloads.pl gasAcu1 -verbose=2 \
>& jkStuff/downloads.log & tail -f jkStuff/downloads.log
# Edit README.txt files when done:
# *** Edit each README.txt to resolve any notes marked with "***":
# /cluster/data/gasAcu1/goldenPath/database/README.txt
# /cluster/data/gasAcu1/goldenPath/bigZips/README.txt
# /cluster/data/gasAcu1/goldenPath/chromosomes/README.txt
#########################################################################
# BLASTZ/CHAIN/NET DANRER4 (DONE 10/17/06 angie)
ssh kkstore05
mkdir /cluster/data/gasAcu1/bed/blastz.danRer4.2006-10-16
cd /cluster/data/gasAcu1/bed/blastz.danRer4.2006-10-16
cat << '_EOF_' > DEF
# Stickleback vs. Zebrafish
# Try "human-fugu" (more distant, less repeat-killed than mammal) params +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Stickleback gasAcu1
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/gasAcu1/chrom.sizes
# QUERY: Zebrafish danRer4
SEQ2_DIR=/san/sanvol1/scratch/danRer4/danRer4.2bit
SEQ2_CTGDIR=/san/sanvol1/scratch/danRer4/danRer4ChrUnNAScafs.2bit
SEQ2_LIFT=/san/sanvol1/scratch/danRer4/liftNAandUnScaffoldsToChrom.lft
SEQ2_LEN=/cluster/data/danRer4/chrom.sizes
SEQ2_CTGLEN=/san/sanvol1/scratch/danRer4/chromsUnNAScafs.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/cluster/data/gasAcu1/bed/blastz.danRer4.2006-10-16
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore=5000 -chainLinearGap=loose \
-bigClusterHub pk \
-blastzOutRoot /cluster/bluearc/gasAcu1danRer4 >& do.log &
tail -f do.log
ln -s blastz.danRer4.2006-10-16 /cluster/data/gasAcu1/bed/blastz.danRer4
nice featureBits gasAcu1 -chrom=chrI chainDanRer4Link
#9296061 bases of 27557028 (33.734%) in intersection
time nice -n 19 featureBits gasAcu1 chainDanRer4Link \
> fb.gasAcu1.chainDanRer4Link.txt 2>&1
# 157049518 bases of 446627861 (35.163%) in intersection
# I'm curious what a swap would look like on this one
# (DONE - 2006-12-08 - Hiram)
ssh kkstore04
mkdir /cluster/data/danRer4/bed/blastz.gasAcu1.swap
cd /cluster/data/danRer4/bed/blastz.gasAcu1.swap
time doBlastzChainNet.pl \
/cluster/data/gasAcu1/bed/blastz.danRer4.2006-10-16/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-verbose=2 -swap -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/gasAcu1danRer4 > swap.log 2>&1 &
# real 38m51.527s
ssh hgwdev
cd /cluster/data/danRer4/bed/blastz.gasAcu1.swap
time nice -n 19 featureBits danRer4 chainGasAcu1Link \
> fb.danRer4.chainGasAcu1Link.txt 2>&1
# 213083380 bases of 1626093931 (13.104%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET TETNIG1 (DONE 10/17/06 angie)
ssh kkstore05
mkdir /cluster/data/gasAcu1/bed/blastz.tetNig1.2006-10-17
cd /cluster/data/gasAcu1/bed/blastz.tetNig1.2006-10-17
cat << '_EOF_' > DEF
# Stickleback vs. Tetraodon
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Stickleback gasAcu1
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/gasAcu1/chrom.sizes
# QUERY: Tetraodon tetNig1
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit
SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/chromsAndScafs/tetNig1ChromsRandomScafs.2bit
SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.lft
SEQ2_LEN=/cluster/data/tetNig1/chrom.sizes
SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/chromsAndScafs/chromsAndScafs.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/cluster/data/gasAcu1/bed/blastz.tetNig1.2006-10-17
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore=5000 -chainLinearGap=loose \
-bigClusterHub pk \
-blastzOutRoot /cluster/bluearc/gasAcu1tetNig1 >& do.log &
tail -f do.log
ln -s blastz.tetNig1.2006-10-17 /cluster/data/gasAcu1/bed/blastz.tetNig1
nice featureBits gasAcu1 -chrom=chrI chainTetNig1Link
#11175314 bases of 27557028 (40.553%) in intersection
time nice -n 19 featureBits gasAcu1 chainTetNig1Link \
> fb.gasAcu1.chainTetNig1Link.txt 2>&1 &
# 187779775 bases of 446627861 (42.044%) in intersection
# I'm curious what a swap would look like on this one
# (DONE - 2006-12-08 - Hiram)
ssh kkstore04
mkdir /cluster/data/tetNig1/bed/blastz.gasAcu1.swap
cd /cluster/data/tetNig1/bed/blastz.gasAcu1.swap
doBlastzChainNet.pl \
/cluster/data/gasAcu1/bed/blastz.tetNig1.2006-10-17/DEF \
-chainMinScore=5000 -chainLinearGap=loose \
-verbose=2 -swap -bigClusterHub pk \
-blastzOutRoot /cluster/bluearc/gasAcu1tetNig1 > swap.log 2>&1 &
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastz.gasAcu1.swap
time nice -n 19 featureBits tetNig1 chainGasAcu1Link \
> fb.tetNig1.chainGasAcu1Link.txt 2>&1 &
# 171216660 bases of 342403326 (50.004%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET HG18 (DONE 10/17/06 angie)
ssh kkstore05
mkdir /cluster/data/gasAcu1/bed/blastz.hg18.2006-10-17
cd /cluster/data/gasAcu1/bed/blastz.hg18.2006-10-17
cat << '_EOF_' > DEF
# Stickleback vs. Human
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Stickleback gasAcu1
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/gasAcu1/chrom.sizes
# QUERY: Human hg18
SEQ2_DIR=/scratch/hg/hg18/hg18.2bit
SEQ2_LEN=/cluster/data/hg18/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/cluster/data/gasAcu1/bed/blastz.hg18.2006-10-17
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
-bigClusterHub pk \
-blastzOutRoot /cluster/bluearc/gasAcu1hg18 >& do.log &
tail -f do.log
ln -s blastz.hg18.2006-10-17 /cluster/data/gasAcu1/bed/blastz.hg18
nice featureBits gasAcu1 -chrom=chrI chainHg18Link
#2973001 bases of 27557028 (10.789%) in intersection
ssh hgwdev
cd /cluster/data/gasAcu1/bed/blastz.hg18.2006-10-17
time nice -n 19 featureBits gasAcu1 chainHg18Link \
> fb.gasAcu1.chainHg18Link.txt 2>&1 &
# 50705450 bases of 446627861 (11.353%) in intersection
# Been swapped
ssh hgwdev
cd /cluster/data/hg18/bed/blastz.gasAcu1.swap
time nice -n 19 featureBits hg18 chainGasAcu1Link \
> fb.hg18.chainGasAcu1Link.txt 2>&1 &
# 55424609 bases of 2881515245 (1.923%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET MM8 (DONE 10/17/06 angie)
ssh kkstore05
mkdir /cluster/data/gasAcu1/bed/blastz.mm8.2006-10-17
cd /cluster/data/gasAcu1/bed/blastz.mm8.2006-10-17
cat << '_EOF_' > DEF
# Stickleback vs. Mouse
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Stickleback gasAcu1
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.2bit
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/gasAcu1/chrom.sizes
# QUERY: Mouse mm8
SEQ2_DIR=/scratch/hg/mm8/mm8.2bit
SEQ2_LEN=/cluster/data/mm8/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LAP=0
SEQ2_LIMIT=100
BASE=/cluster/data/gasAcu1/bed/blastz.mm8.2006-10-17
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
-bigClusterHub pk \
-blastzOutRoot /cluster/bluearc/gasAcu1mm8 >& do.log &
tail -f do.log
ln -s blastz.mm8.2006-10-17 /cluster/data/gasAcu1/bed/blastz.mm8
nice featureBits gasAcu1 -chrom=chrI chainMm8Link
#2981604 bases of 27557028 (10.820%) in intersection
ssh hgwdev
cd /cluster/data/gasAcu1/bed/blastz.mm8.2006-10-17
time nice -n 19 featureBits gasAcu1 chainMm8Link \
> fb.gasAcu1.chainMm8Link.txt 2>&1 &
#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE 11/14/06 angie)
# Use -repMatch=200 (based on size -- for human we use 1024, and
# stickleback size is ~15% of human judging by gapless gasAcu1 vs. hg18
# genome sizes from featureBits. Bump it up a bit to be conservative.
ssh kolossus
blat /cluster/data/gasAcu1/gasAcu1.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=/cluster/bluearc/gasAcu1/11.ooc -repMatch=200
#Wrote 5700 overused 11-mers to /cluster/bluearc/gasAcu1/11.ooc
#########################################################################
# GENBANK AUTO UPDATE (DONE 11/15/06 angie)
# Make a liftAll.lft that specifies 5M chunks for genbank:
ssh kkstore05
cd /cluster/data/gasAcu1
simplePartition.pl gasAcu1.2bit 5000000 /tmp/gasAcu1P
cat /tmp/gasAcu1P/*/*.lft > jkStuff/liftAll.lft
rm -r /tmp/gasAcu1P
# align with latest genbank process.
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add gasAcu1 just after dm2...
# gasAcu1 (G. Aculeatus)
gasAcu1.serverGenome = /cluster/data/gasAcu1/gasAcu1.2bit
gasAcu1.clusterGenome = /iscratch/i/gasAcu1/gasAcu1.2bit
gasAcu1.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter}
gasAcu1.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
gasAcu1.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
gasAcu1.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
gasAcu1.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter}
gasAcu1.ooc = /cluster/bluearc/gasAcu1/11.ooc
gasAcu1.lift = /cluster/data/gasAcu1/jkStuff/liftAll.lft
gasAcu1.genbank.mrna.xeno.load = yes
gasAcu1.downloadDir = gasAcu1
cvs ci -m "Added gasAcu1." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
# Edit src/lib/gbGenome.c to add new species.
cvs ci -m "Added G. aculeatus (stickleback)." src/lib/gbGenome.c
make install-server
ssh kkstore02
cd /cluster/data/genbank
nice bin/gbAlignStep -initial gasAcu1 &
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
nice ./bin/gbDbLoadStep -drop -initialLoad gasAcu1 &
# enable daily alignment and update of hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add gasAcu1 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added gasAcu1." etc/align.dbs etc/hgwdev.dbs
make etc-update
#########################################################################
# EXTRACT CHROM TO SCAFFOLD LIFTUP (DONE 3/6/07 angie)
# Ensembl genes use scaffold_* instead of chrUn. Broad doesn't seem to
# provide a scaffold->chrom mapping in their download files, but here's
# a way to extract one from ctg->chrom and ctg->scaffold agp...
cd /cluster/data/gasAcu1/downloads
# Distill contig coordinates to a single field that will be the same
# for ctg->chrom and ctg->scaf maps:
awk '$5 != "N" {print $6 ":" $7 ":" $8 "\t" $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $9;}' \
UCSC.gasAcu1.agp \
| sort > chrom.tmp
awk '$5 != "N" {print $6 ":" $7 ":" $8 "\t" $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $9;}' \
assembly.agp \
| sort > scaf.tmp
# Join the maps into what is AGP except it has a bunch of lines for
# each scaffold that must be collapsed into one line per scaffold:
join -t " " -o '1.2 1.3 1.4 1.5 1.6 2.2 2.3 2.4 1.7' chrom.tmp scaf.tmp \
| sort -k 1,1 -k 2n,2n \
> tmp.agp
# Make sure the join went right (same #lines in all):
wc -l chrom.tmp scaf.tmp tmp.agp
# 16967 chrom.tmp
# 16967 scaf.tmp
# 16967 tmp.agp
# Now collapse the many per-contig lines for each scaffold into a single
# line per scaffold. This produces gapless AGP with sequence numbers
# that aren't consecutive (but still monotonically increasing):
perl -we 'while (<>) { \
chomp; @w = split; \
if (! defined $lastScaf) { \
($chr, $chrS, $chrE, $seq, $type, \
$lastScaf, $firstScafS, $firstScafE, $strand) = @w; \
$lastScafS = $firstScafS; $lastScafE = $firstScafE; \
} elsif ($lastScaf ne $w[5]) { \
$scafS = ($firstScafS < $lastScafS) ? $firstScafS : $lastScafS;\
$scafE = ($firstScafE > $lastScafE) ? $firstScafE : $lastScafE;\
print "$chr\t$chrS\t$chrE\t$seq\t$type\t" . \
"$lastScaf\t$scafS\t$scafE\t$strand\n"; \
($chr, $chrS, $chrE, $seq, $type, \
$lastScaf, $firstScafS, $firstScafE, $strand) = @w; \
$lastScafS = $firstScafS; $lastScafE = $firstScafE; \
} else { \
($thisChr,undef,$chrE,undef,undef, \
undef,$lastScafS,$lastScafE,undef) = @w; \
die "$lastScaf $lastScafE: $thisChr != $chr" \
if ($thisChr ne $chr); \
} \
} \
$scafS = ($firstScafS < $lastScafS) ? $firstScafS : $lastScafS;\
$scafE = ($firstScafE > $lastScafE) ? $firstScafE : $lastScafE;\
print "$chr\t$chrS\t$chrE\t$seq\t$type\t" . \
"$lastScaf\t$scafS\t$scafE\t$strand\n"; \
' tmp.agp \
> UCSC.chromToScaffoldSansGaps.agp
# fortunately agpToLift doesn't need gap lines in its input, just frags:
agpToLift -revStrand < UCSC.chromToScaffoldSansGaps.agp \
> ../jkStuff/UCSC.chromToScaffoldSansGaps.lft
#########################################################################
# ENSEMBL GENES (DONE 11/27/06 angie - RELOADED 3/8/07 angie)
ssh hgwdev
mkdir /cluster/data/gasAcu1/bed/ensembl
cd /cluster/data/gasAcu1/bed/ensembl
# Go to www.ensembl.org and click around their evolving interface
# to get the following types of data:
# 1. a tab-separated file that relates Ensembl gene, transcript and
# protein IDs. Save as ensGtp.txt.gz
# 2. peptide fasta with transcript IDs in header -> ensemblPep.fa.gz
# They have started dumping GTF files so we can download that directly:
wget 'ftp://ftp.ensembl.org/pub/current_gasterosteus_aculeatus/data/gtf/*'
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software.
# Also need to lift scaffold coords up to chrUn:
gunzip -c Gasterosteus_aculeatus.BROADS1.43.gtf.gz \
| sed -e 's/^group/chr/; s/^MT/chrM/;' \
| liftUp -type=.gtf stdout ../../jkStuff/UCSC.chromToScaffoldSansGaps.lft \
carry stdin \
> ensGene.gtf
ldHgGene -gtf -genePredExt gasAcu1 ensGene \
/cluster/data/gasAcu1/bed/ensembl/ensGene.gtf
# strip header line:
gunzip -c ensGtp.txt.gz | tail +2 > ensGtp.txt
hgLoadSqlTab gasAcu1 ensGtp ~/kent/src/hg/lib/ensGtp.sql ensGtp.txt
gunzip -c ensemblPep.fa.gz \
| perl -wpe 's/\|.*//' \
> ensPep.fa
# ONE TIME ONLY... seven protein sequences were chopped up in the file
# from BioMart and the fragments appeared as competing duplicate
# sequences. Ensembl helpdesk is still looking at it but sent the
# correct whole sequences for the seven, so I'm manually editing those
# in. The ID's are ENSGACT00000000021, ENSGACT00000000176,
# ENSGACT00000017572, ENSGACT00000011000, ENSGACT00000000042,
# ENSGACT00000000087, ENSGACT00000022185.
hgPepPred gasAcu1 generic ensPep ensPep.fa
#########################################################################
#########################################################################
#########################################################################
# BLASTZ/CHAIN/NET FR1 (DONE - 2006-12-20 - Hiram)
## no chrUn in gasAcu1, and align to fr1 scaffolds,
## results lifted to fr1 chrUn coordinates
ssh kkstore04
mkdir /cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08
cd /cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08
cat << '_EOF_' > DEF
# Stickleback vs. Fugu
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Stippleback gasAcu1, no chrUn in this run
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LIMIT=30
SEQ1_LAP=0
# QUERY: Fugu fr1
# Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates
SEQ2_DIR=/san/sanvol1/scratch/fr1/chrUn.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr1/fr1.UnScaffolds.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr1/scaffold.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr1/UnScaffolds/ordered.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/gasAcu1Fr1 > do.log 2>&1 &
# real 272m33.395s
# user 0m0.189s
# sys 0m0.163s
ssh hgwdev
cd /cluster/data/gasAcu1/bed
ln -s blastz.fr1.2006-12-08 blastz.fr1
cd blastz.fr1
time nice -n 19 featureBits gasAcu1 chainFr1Link \
> fb.gasAcu1.chainFr1Link.txt 2>&1 &
# 146655780 bases of 446627861 (32.836%) in intersection
mkdir /cluster/data/fr1/bed/blastz.gasAcu1.swap
cd /cluster/data/fr1/bed/blastz.gasAcu1.swap
time doBlastzChainNet.pl \
/cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -bigClusterHub=pk > do.log 2>&1 &
time doBlastzChainNet.pl \
/cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08/DEF \
-continue=net -chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -bigClusterHub=pk > net.log 2>&1 &
# real 24m33.761s
# user 0m0.125s
# sys 0m0.080s
ssh hgwdev
cd /cluster/data/fr1/bed/blastz.gasAcu1.swap
time nice -n 19 featureBits fr1 chainGasAcu1Link \
> fb.fr1.chainGasAcu1Link.txt 2>&1 &
# 148005715 bases of 315518167 (46.909%) in intersection
#######################################################################
## Swap of oryLat1 alignments to gasAcu1 (DONE - 2006-12-08 - Hiram)
## This sequence duplicated in oryLat1.txt too
ssh kkstore04
cd /cluster/data/gasAcu1/bed
rm blastz.oryLat1.swap
mkdir /cluster/data/gasAcu1/bed/blastz.oryLat1.swap
cd /cluster/data/gasAcu1/bed/blastz.oryLat1.swap
time $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \
/cluster/data/oryLat1/bed/blastz.gasAcu1.2006-12-08/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=repeats -qRepeats=windowmaskerSdust -bigClusterHub=pk \
-swap > swap.log 2>&1 &
# real 26m32.549s
ssh hgwdev
cd /cluster/data/gasAcu1/bed/blastz.oryLat1.swap
nice -n 19 featureBits -noRandom gasAcu1 chainOryLat1Link \
>fb.gasAcu1.oryLat1.txt 2>&1 &
# 148335558 bases of 391398261 (37.899%) in intersection
###########################################################################
## checking measurements to determine which fish is close to which fish
## 2006-12-08 - Hiram
# featureBits chainLink measures
# chainGasAcu1Link chain linearGap
# distance on gasAcu1 on other minScore
# 1 0.xxxx - tetraodon tetNig1 (% 42.044) (% 50.004) 5000 loose
# 3 0.xxxx - zebrafish danRer4 (% 35.163) (% 13.104) 5000 loose
# 4 0.xxxx - fugu fr1 (% 32.836) (% 46.909) 2000 loose
# 5 0.xxxx - human hg18 (% 11.353) (% 1.923) 2000 loose
# 6 0.xxxx - mouse mm8 (% 11.070) (% x.923) 2000
# from oryLat1.txt:
# 2 0.xxxx - medaka oryLat1 (% 37.899) (% 28.110) 2000 loose
######
danRer4 # 157049518 bases of 446627861 (35.163%) in intersection
tetNig1 # 187779775 bases of 446627861 (42.044%) in intersection
hg18 # 50705450 bases of 446627861 (11.353%) in intersection
on hg18 # 55424609 bases of 2881515245 (1.923%) in intersection
on tetNig1 # 171216660 bases of 342403326 (50.004%) in intersection
on danRer4 # 213083380 bases of 1626093931 (13.104%) in intersection
fr1 # 146655780 bases of 446627861 (32.836%) in intersection
on fr1 # 148005715 bases of 315518167 (46.909%) in intersection
#######################################################################
## BLASTZ oryLat1 chrUn contigs/scaffolds only (DONE - 2006-12-20 - Hiram)
## This was an experiment, not used
ssh kkstore05
mkdir /cluster/data/gasAcu1/bed/blastz.oryLat1.Un.2006-12-20
cd /cluster/data/gasAcu1/bed/blastz.oryLat1.Un.2006-12-20
cat << '_EOF_' > DEF
# Stickleback vs. Medaka, both chrUn random contigs/scaffolds only
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Stickleback gasAcu1 chrUn only
# chrUn in contigs for this alignment run
# The largest is 418,000 bases and there are 5,000 of them.
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUn.sdTrf.sizes
SEQ1_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUnContigsOnly.sdTrf.2bit
SEQ1_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUnContigsOnly.sdTrf.sizes
SEQ1_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift
SEQ1_CHUNK=1000000
SEQ1_LIMIT=100
SEQ1_LAP=0
# QUERY: Medaka oryLat1 chrUn only
# chrUn in scaffolds for this alignment run
# The largest scaffold is only 76,000 bases, there are 36,000 of them
# and a total size of 119 million.
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.chrUn.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/oryLat1.chrUn.sdTrf.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1.UnScaffoldsOnly.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1.UnScaffoldsOnly.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=1000000
SEQ2_LIMIT=100
SEQ2_LAP=0
BASE=/cluster/data/gasAcu1/bed/blastz.oryLat1.Un.2006-12-20
TMPDIR=/scratch/tmp
'_EOF_'
time doBlastzChainNet.pl -verbose=2 DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-blastzOutRoot /cluster/bluearc/gasAcu1OryLat1 \
-bigClusterHub=kk > do.log 2>&1 &
# real 106m42.410s
# user 0m0.087s
# sys 0m0.062s
# Had some difficulty with the kk kluster, continuing after getting the
# jobs completed
time doBlastzChainNet.pl -verbose=2 DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-blastzOutRoot /cluster/bluearc/gasAcu1OryLat1 \
-continue=cat -bigClusterHub=kk > cat.log 2>&1 &
#######################################################################
## BLASTZ Stickleback chroms vs Medaka random contigs
## (DONE - 2006-12-20 - Hiram)
## This was an experiment, not used
ssh kkstore05
mkdir /cluster/data/gasAcu1/bed/blastz.oryLat1.ChrUn.2006-12-20
cd /cluster/data/gasAcu1/bed/blastz.oryLat1.ChrUn.2006-12-20
cat << '_EOF_' > DEF
# Stickleback chroms vs. Medaka random contigs
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Stickleback gasAcu1 no chrUn sequence, only chroms
# The largest is 32 million bases, there are 22 of them
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000
# QUERY: Medaka oryLat1 chrUn only
# chrUn in scaffolds for this alignment run
# The largest scaffold is only 76,000 bases, there are 36,000 of them
# and a total size of 119 million.
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.chrUn.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/oryLat1.chrUn.sdTrf.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1.UnScaffoldsOnly.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1.UnScaffoldsOnly.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=1000000
SEQ2_LIMIT=40
SEQ2_LAP=0
BASE=/cluster/data/gasAcu1/bed/blastz.oryLat1.ChrUn.2006-12-20
TMPDIR=/scratch/tmp
'_EOF_'
time doBlastzChainNet.pl -verbose=2 DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-blastzOutRoot /cluster/bluearc/gasAcu1ChrOryLat1Un \
-bigClusterHub=kk > do.log 2>&1 &
#######################################################################
## BLASTZ Stickleback chroms vs Medaka chroms and random contigs
## (DONE - 2006-12-22 - Hiram)
ssh kkstore05
mkdir /cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22
cd /cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22
cat << '_EOF_' > DEF
# Stickleback chroms vs. Medaka random contigs
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Stickleback gasAcu1 no chrUn sequence, only chroms
# The largest is 32 million bases, there are 22 of them
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000
# QUERY: Medaka oryLat1 chroms and chrUn in scaffolds
# The largest scaffold is only 76,000 bases, there are 36,000 of them
# and a total size of 119 million.
# The biggest chrom is just under 40 million
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=10000000
SEQ2_LIMIT=20
SEQ2_LAP=0
BASE=/cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-blastzOutRoot /cluster/bluearc/gasAcu1ChrOryLat1ChrUn \
-bigClusterHub=pk > do.log 2>&1 &
# real 132m54.403s
# user 0m0.213s
# sys 0m0.138s
ssh hgwdev
cd /cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22
time featureBits gasAcu1 chainOryLat1Link \
> fb.gasAcu1.chainOryLat1Link.txt 2>&1
# 164155231 bases of 446627861 (36.754%) in intersection
#########################################################################
## Reorder Fish organisms (DONE - 2006-12-22 - Hiram)
hgsql -h genome-testdb hgcentraltest \
-e "update dbDb set orderKey = 470 where name = 'gasAcu1';"
#########################################################################
## 7-Way Multiz (DONE - 2006-12-22 - Hiram)
## replaced by 8-Way done below
ssh hgwdev
mkdir /cluster/data/gasAcu1/bed/multiz7way
cd /cluster/data/gasAcu1/bed/multiz7way
# Constructed this tree by pruining the 17-way tree from mm8
# multiz17way, and then adding in gasAcu1 and oryLat1 branch
# Arbitrarily set 0.2 distances for this added branch
# All other distances remain as specified in the 17way.nh
cat << '_EOF_' > 7way.nh
((human_hg18:0.054922,mouse_mm8:0.412790):0.754413,
(
((tetraodon_tetNig1:0.199381,fugu_fr1:0.239894):0.2,
(stickleback_gasAcu1:0.2,medaka_oryLat1:0.2):0.2):0.292961,
zebrafish_danRer4:0.782561):0.156067);
'_EOF_'
# << happy emacs
/cluster/bin/phast/draw_tree 7way.nh > 7way.ps
/cluster/bin/phast/all_dists 7way.nh > 7way.distances.txt
# Use this output to create the table below
grep -y gasAcu1 7way.distances.txt | sort -k3,3n
#
# If you can fill in all the numbers in this table, you are ready for
# the multiple alignment procedure
#
# featureBits chainLink measures
# chainGasAcu1Link chain linearGap
# distance on gasAcu1 on other minScore
# 1 0.4000 - medaka oryLat1 (% 38.754) (% 27.044) 2000 loose
# 2 0.7994 - tetraodon tetNig1 (% 42.044) (% 50.004) 5000 loose
# 3 0.8399 - fugu fr1 (% 32.836) (% 49.909) 2000 loose
# 4 1.4755 - zebrafish danRer4 (% 35.163) (% 13.104) 5000 loose
# 6 1.6584 - human hg18 (% 11.353) (% 1.923) 2000 loose
# 7 2.0162 - mouse mm8 (% 11.070) (% 2.056) 2000
cd /cluster/data/gasAcu1/bed/multiz7way
# bash shell syntax here ...
export H=/cluster/data/gasAcu1/bed
mkdir mafLinks
for G in oryLat1 tetNig1 fr1 danRer4 hg18 mm8
do
mkdir mafLinks/$G
if [ ! -d ${H}/blastz.${G}/mafNet ]; then
echo "missing directory blastz.${G}/mafNet"
exit 255
fi
ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G
done
# Copy MAFs to some appropriate NFS server for kluster run
ssh kkstore05
mkdir /san/sanvol1/scratch/gasAcu1/multiz7way
cd /san/sanvol1/scratch/gasAcu1/multiz7way
time rsync -a --copy-links --progress \
/cluster/data/gasAcu1/bed/multiz7way/mafLinks/ .
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
# the autoMultiz cluster run
ssh pk
cd /cluster/data/gasAcu1/bed/multiz7way/
# create species list and stripped down tree for autoMZ
sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
7way.nh > tmp.nh
echo `cat tmp.nh` > tree-commas.nh
echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
mkdir run maf
cd run
# NOTE: you need to set the db and multiz dirname properly in this script
cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = gasAcu1
set c = $1
set maf = $2
set binDir = /san/sanvol1/scratch/$db/multiz7way/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz7way
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz
cat << '_EOF_' > template
#LOOP
autoMultiz $(root1) {check out line+ /cluster/data/gasAcu1/bed/multiz7way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
awk '{print $1}' /cluster/data/gasAcu1/chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
# 23 jobs
para try ... check ... push ... etc ...
# Completed: 23 of 23 jobs
# CPU time in finished jobs: 9086s 151.43m 2.52h 0.11d 0.000 y
# IO & Wait Time: 194s 3.24m 0.05h 0.00d 0.000 y
# Average job time: 403s 6.72m 0.11h 0.00d
# Longest finished job: 707s 11.78m 0.20h 0.01d
# Submission to last job: 707s 11.78m 0.20h 0.01d
# combine results into a single file for loading and gbdb reference
ssh kkstore05
cd /cluster/data/gasAcu1/bed/multiz7way
time catDir maf > multiz7way.maf
# real 0m6.467s
# makes an 1.5 Gb file:
# -rw-rw-r-- 1 1577409197 Dec 22 14:49 multiz7way.maf
# Create per-chrom individual maf files for downloads
ssh kkstore05
cd /cluster/data/gasAcu1/bed/multiz7way
mkdir mafDownloads
time for M in maf/chr*.maf
do
B=`basename $M`
cp -p ${M} mafDownloads/${B}
gzip mafDownloads/${B}
echo ${B} done
done
# real 4m58.923s
# user 4m41.178s
# sys 0m14.387s
# deliver to downloads
ssh hgwdev
ln -s /cluster/data/gasAcu1/bed/multiz7way/mafDownloads \
/usr/local/apache/htdocs/goldenPath/gasAcu1/multiz7way
# Load into database
ssh hgwdev
cd /cluster/data/gasAcu1/bed/multiz7way
mkdir /gbdb/gasAcu1/multiz7way
ln -s /cluster/data/gasAcu1/bed/multiz7way/multiz7way.maf \
/gbdb/gasAcu1/multiz7way
time nice -n +19 hgLoadMaf gasAcu1 multiz7way
# Loaded 1925835 mafs in 1 files from /gbdb/gasAcu1/multiz7way
# real 0m45.970s
# user 0m25.438s
# sys 0m8.453s
time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
-maxSize=50000 gasAcu1 multiz7waySummary multiz7way.maf
# Created 684283 summary blocks from 5182575 components
# and 1925835 mafs from multiz7way.maf
# real 0m54.289s
# user 0m48.004s
# sys 0m1.779s
# Create tree image for details page
# You can get a better image from the phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
cp 7way.nh treeImage.nh
# Edit treeImage.nh to make the names as desired
# Submit this treeImage.nh to the phyloGif program, or:
/cluster/bin/phast/draw_tree -b -s treeImage.nh > treeImage.ps
############################################################################
# ANNOTATE MULTIZ7WAY MAF AND LOAD TABLES (DONE - 2006-12-22 - Hiram)
## replaced by 8-Way done below
ssh kolossus
mkdir /cluster/data/gasAcu1/bed/multiz7way/anno
cd /cluster/data/gasAcu1/bed/multiz7way/anno
mkdir maf run
cd run
rm -f sizes nBeds
ln -s /cluster/data/gasAcu1/chrom.sizes gasAcu1.len
twoBitInfo -nBed /cluster/data/gasAcu1/gasAcu1.sdTrf.{2bit,N.bed}
ln -s /cluster/data/gasAcu1/gasAcu1.sdTrf.N.bed gasAcu1.bed
for DB in `grep -v gasAcu1 /cluster/data/gasAcu1/bed/multiz7way/species.lst`
do
ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
echo ${DB}.len >> sizes
echo $DB
done
echo '#!/bin/csh -ef' > jobs.csh
echo date >> jobs.csh
# do smaller jobs first so you can see some progress immediately:
for F in `ls -1rS ../../maf/*.maf`
do
echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $F \
/cluster/data/gasAcu1/gasAcu1.sdTrf.2bit ../maf/`basename $F` >> jobs.csh
echo "echo $F" >> jobs.csh
done
echo date >> jobs.csh
chmod +x jobs.csh
time ./jobs.csh > jobs.log 2>&1 &
# to watch progress;
tail -f jobs.log
# real 19m20.705s
# user 7m11.818s
# sys 11m31.508s
# Load anno/maf
ssh hgwdev
cd /cluster/data/gasAcu1/bed/multiz7way/anno/maf
mkdir -p /gbdb/gasAcu1/multiz7way/anno/maf
ln -s /cluster/data/gasAcu1/bed/multiz7way/anno/maf/*.maf \
/gbdb/gasAcu1/multiz7way/anno/maf
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/gasAcu1/multiz7way/anno/maf gasAcu1 multiz7way
# Loaded 2269386 mafs in 23 files from /gbdb/gasAcu1/multiz7way/anno/maf
# real 0m52.053s
# user 0m35.608s
# sys 0m3.762s
# Do the computation-intensive part of hgLoadMafSummary on a workhorse
# machine and then load on hgwdev:
ssh kolossus
cd /cluster/data/gasAcu1/bed/multiz7way/anno/maf
time cat *.maf | \
nice -n +19 hgLoadMafSummary gasAcu1 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 -test multiz7waySummary stdin
#######################################################################
## BLASTZ Stickleback chroms vs chicken chroms and random contigs
## (DONE - 2006-12-28 - 2007-01-05 - Hiram)
ssh kkstore05
mkdir /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28
cd /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28
cat << '_EOF_' > DEF
# Stickleback chroms vs. Chicken random contigs
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Stickleback gasAcu1 no chrUn sequence, only chroms
# The largest is 32 million bases, there are 22 of them
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000
# QUERY: Chicken galGal3 chroms and randoms in contigs
# The largest chrom is 200M bases, the largest contig 122,228
# The smallest chrom is 1028 bases, smallest contig 54 bases
# there are 25,991 chroms and contig pieces
# total size 1,089,690,673 bases
# 47107538 N's 1042583135 real 834274605 upper 208308530 lower
SEQ2_DIR=/san/sanvol1/scratch/galGal3/galGal3.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/galGal3/galGal3.sdTrf.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/galGal3/galGal3.randomContigs.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/galGal3/galGal3.randomContigs.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/galGal3/randomContigs.lft
SEQ2_CHUNK=10000000
SEQ2_LIMIT=20
SEQ2_LAP=0
BASE=/cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 DEF \
-smallClusterHub=pk \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-blastzOutRoot /cluster/bluearc/gasAcu1ChrGalGal3ChrUn \
-bigClusterHub=pk > do.log 2>&1 &
# real 53m30.713s
# failed during the load step due to difficulties on hgwdev
# continuing 2007-01-02
cd axtChain
./loadUp.csh
cd ..
time doBlastzChainNet.pl -verbose=2 DEF \
-smallClusterHub=pk \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-blastzOutRoot /cluster/bluearc/gasAcu1ChrGalGal3ChrUn \
-continue=download -bigClusterHub=pk > download.log 2>&1 &
# And, to swap:
mkdir /cluster/data/galGal3/bed/blastz.gasAcu1.swap
cd /cluster/data/galGal3/bed/blastz.gasAcu1.swap
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/DEF \
-smallClusterHub=pk \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -bigClusterHub=pk > swap.log 2>&1 &
# failed on the net step:
# HgStepManager: executing step 'net'.
# netChains: looks like previous stage was not successful
# (can't find [galGal3.gasAcu1.]all.chain[.gz]).
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/DEF \
-smallClusterHub=pk \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-continue=net -swap -bigClusterHub=pk > net_swap.log 2>&1 &
# real 7m11.286s
featureBits gasAcu1 chainGalGal3Link > fb.gasAcu1.chainGalGal2Link.txt
# 37154459 bases of 446627861 (8.319%) in intersection
featureBits galGal3 chainGasAcu1Link > fb.galGal3.chainGasAcu1Link.txt
# 32781658 bases of 1042591351 (3.144%) in intersection
# Adding the chrUn result from the galGal3 swapped alignments
ssh kkstore05
cd /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/axtChain
chainSplit chain gasAcu1.galGal3.all.chain.gz
cp -p ../../blastz.galGal3.swap/axtChain/chain/chrUn.chain chain/chrUn.chain
ssh hgwdev
cd /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/axtChain/chain
hgLoadChain gasAcu1 chrUn_chainGalGal3 chrUn.chain
Loading 13904 chains into gasAcu1.chrUn_chainGalGal3
ssh kkstore05
cd /cluster/data/gasAcu1/bed/blastz.galGal3.2006-12-28/axtChain/chain
chainMergeSort *.chain | gzip -c > ../gasAcu1.galGal3.all.chain.gz
# This changes the IDs, but nothing else
# reusing the existing netChains with some alterations
cp netChains.csh netChainsUn.csh
# fixup the reference to chrom.sizes and the 2bit file in this script
# and running it after moving existing items out of the way
#########################################################################
## 8-Way Multiz (DONE - 2007-01-02 - 2007-01-05 - Hiram)
## re-done with new chrUn.maf 2007-01-13 - Hiram
## re-done with fr2 instead of fr1 - 2007-02-02 - Hiram
ssh hgwdev
mkdir /cluster/data/gasAcu1/bed/multiz8way
cd /cluster/data/gasAcu1/bed/multiz8way
/cluster/bin/phast/tree_doctor --prune chimp_panTro2,macaque_rheMac2,rat_rn4,rabbit_oryCun1,cow_bosTau2,dog_canFam2,armadillo_dasNov1,elephant_loxAfr1,tenrec_echTel1,monodelphis_monDom4,xenopus_xenTro2 17way.nh
# use the output of that to manually construct this tree.
# Arbitrarily set 0.2 distances for this added branch
# All other distances remain as specified in the 17way.nh
cat << '_EOF_' > 8way.nh
(((human_hg18:0.054922,mouse_mm8:0.412790):0.475049,
chicken_galGal3:0.454691):0.279364,
(((tetraodon_tetNig1:0.199381,fugu_fr2:0.239894):0.2,
(stickleback_gasAcu1:0.2,medaka_oryLat1:0.2):0.2):0.292961,
zebrafish_danRer4:0.782561):0.156067);
'_EOF_'
# << happy emacs
# Use this specification in the phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to obtain a gif image for htdocs/images/phylo/gasAcu1_8way.gif
/cluster/bin/phast/all_dists 8way.nh > 8way.distances.txt
# Use this output to create the table below
grep -y gasAcu1 8way.distances.txt | sort -k3,3n
#
# If you can fill in all the numbers in this table, you are ready for
# the multiple alignment procedure
#
# featureBits chainLink measures
# chainGasAcu1Link chain linearGap
# distance on gasAcu1 on other minScore
# 1 0.4000 - medaka oryLat1 (% 38.308) (% 27.044) 2000 loose
# 2 0.7994 - tetraodon tetNig1 (% 42.044) (% 50.004) 5000 loose
# 3 0.8399 - fugu fr2 (% 37.574) (% 40.269) 2000 loose
# 3 0.8399 - fugu fr1 (% 35.925) (% 46.909) 2000 loose
# 4 1.4755 - zebrafish danRer4 (% 35.163) (% 13.104) 5000 loose
# 6 1.5831 - chicken galGal3 (% 9.181) (% 3.144) 2000 loose
# 7 1.6584 - human hg18 (% 11.353) (% 1.923) 2000 loose
# 8 2.0162 - mouse mm8 (% 11.070) (% 2.056) 2000 loose
cd /cluster/data/gasAcu1/bed/multiz8way
# bash shell syntax here ...
export H=/cluster/data/gasAcu1/bed
mkdir mafLinks
for G in oryLat1 tetNig1 fr2 danRer4 galGal3 hg18 mm8
do
mkdir mafLinks/$G
if [ ! -d ${H}/blastz.${G}/mafNet ]; then
echo "missing directory blastz.${G}/mafNet"
exit 255
fi
ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G
done
# Copy MAFs to some appropriate NFS server for kluster run
ssh kkstore05
mkdir /san/sanvol1/scratch/gasAcu1/multiz8way
cd /san/sanvol1/scratch/gasAcu1/multiz8way
time rsync -a --copy-links --progress \
/cluster/data/gasAcu1/bed/multiz8way/mafLinks/ .
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
# the autoMultiz cluster run
ssh pk
cd /cluster/data/gasAcu1/bed/multiz8way/
# create species list and stripped down tree for autoMZ
sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
8way.nh > tmp.nh
echo `cat tmp.nh` > tree-commas.nh
echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
mkdir run maf
cd run
# NOTE: you need to set the db and multiz dirname properly in this script
cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = gasAcu1
set c = $1
set maf = $2
set binDir = /san/sanvol1/scratch/$db/multiz8way/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz8way
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz
cat << '_EOF_' > template
#LOOP
autoMultiz $(root1) {check out line+ /cluster/data/gasAcu1/bed/multiz8way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
awk '{print $1}' /cluster/data/gasAcu1/chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
# 23 jobs
para try ... check ... push ... etc ...
# Completed: 23 of 23 jobs
# CPU time in finished jobs: 11075s 184.58m 3.08h 0.13d 0.000 y
# IO & Wait Time: 155s 2.58m 0.04h 0.00d 0.000 y
# Average job time: 488s 8.14m 0.14h 0.01d
# Longest finished job: 1123s 18.72m 0.31h 0.01d
# Submission to last job: 1123s 18.72m 0.31h 0.01d
# combine results into a single file for loading and gbdb reference
ssh kkstore05
cd /cluster/data/gasAcu1/bed/multiz8way
time nice -n +19 catDir maf > multiz8way.maf
# real 0m7.656s
# makes an 1.7 Gb file:
# -rw-rw-r-- 1 1742307109 Feb 2 16:58 multiz8way.maf
# Create per-chrom individual maf files for downloads
# NOT NECESSARY HERE - DONE LATER WITH THE ANNOTATED MAFS
ssh kkstore05
cd /cluster/data/gasAcu1/bed/multiz8way
mkdir mafDownloads
time for M in maf/chr*.maf
do
B=`basename $M`
cp -p ${M} mafDownloads/${B}
gzip mafDownloads/${B}
echo ${B} done
done
# real 5m9.273
# deliver to downloads *!* NOT NECESSARY HERE - DONE LATER WITH
# THE ANNOTATED MAFS
ssh hgwdev
ln -s /cluster/data/gasAcu1/bed/multiz8way/mafDownloads \
/usr/local/apache/htdocs/goldenPath/gasAcu1/multiz8way
# Load into database
ssh hgwdev
cd /cluster/data/gasAcu1/bed/multiz8way
mkdir /gbdb/gasAcu1/multiz8way
ln -s /cluster/data/gasAcu1/bed/multiz8way/multiz8way.maf \
/gbdb/gasAcu1/multiz8way
time nice -n +19 hgLoadMaf gasAcu1 multiz8way
# Loaded 1989046 mafs in 1 files from /gbdb/gasAcu1/multiz8way
# real 0m47.121s
time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
-maxSize=50000 gasAcu1 multiz8waySummary multiz8way.maf
# Created 775687 summary blocks from 6070904 components
# and 2047322 mafs from multiz8way.maf
# real 1m0.331s
# Create tree image for details page
# You can get a better image from the phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# cp 8way.nh treeImage.nh
# Edit treeImage.nh to make the names as desired
# Submit this treeImage.nh to the phyloGif program, or:
# /cluster/bin/phast/draw_tree -b -s treeImage.nh > treeImage.ps
############################################################################
# ANNOTATE MULTIZ8WAY MAF AND LOAD TABLES (DONE - 2007-01-05 - Hiram)
## re-done with new chrUn.maf 2007-01-13 - Hiram
## re-done with fr2 instead of fr1 - 2007-02-02 - Hiram
## re-done with correct nBeds and sizes files, 2007-03-27 - Hiram
ssh kolossus
mkdir /cluster/data/gasAcu1/bed/multiz8way/anno
cd /cluster/data/gasAcu1/bed/multiz8way/anno
mkdir maf run
cd run
rm -f sizes nBeds
ln -s /cluster/data/gasAcu1/chrom.sizes gasAcu1.len
# This were done before during the 7-way
# twoBitInfo -nBed /cluster/data/gasAcu1/gasAcu1.sdTrf.{2bit,N.bed}
ln -s /cluster/data/gasAcu1/gasAcu1.sdTrf.N.bed gasAcu1.bed
for DB in \
`cat /cluster/data/gasAcu1/bed/multiz8way/species.lst | sed -e "s/ gasAcu1//"`
do
ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
echo ${DB}.len >> sizes
echo $DB
done
echo '#!/bin/csh -ef' > jobs.csh
echo date >> jobs.csh
# do smaller jobs first so you can see some progress immediately:
for F in `ls -1rS ../../maf/*.maf`
do
echo mafAddIRows -nBeds=nBeds -sizes=sizes $F \
/cluster/data/gasAcu1/gasAcu1.sdTrf.2bit ../maf/`basename $F` >> jobs.csh
echo "echo $F" >> jobs.csh
done
echo date >> jobs.csh
chmod +x jobs.csh
time nice -n +19 ./jobs.csh > jobs.log 2>&1 &
# to watch progress;
tail -f jobs.log
# this was run on titan (can not do this for larger genomes on titan)
# real 48m40.965s
# Load anno/maf
ssh hgwdev
cd /cluster/data/gasAcu1/bed/multiz8way/anno/maf
mkdir -p /gbdb/gasAcu1/multiz8way/anno/maf
ln -s /cluster/data/gasAcu1/bed/multiz8way/anno/maf/*.maf \
/gbdb/gasAcu1/multiz8way/anno/maf
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/gasAcu1/multiz8way/anno/maf gasAcu1 multiz8way
# Loaded 2395584 mafs in 23 files from /gbdb/gasAcu1/multiz8way/anno/maf
# real 2m16.918s
# Do the computation-intensive part of hgLoadMafSummary on a workhorse
# machine and then load on hgwdev:
ssh kolossus
cd /cluster/data/gasAcu1/bed/multiz8way/anno/maf
time cat *.maf | \
nice -n +19 hgLoadMafSummary gasAcu1 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 -test multiz8waySummary stdin
# Created 322344 summary blocks from 6111763 components
# and 2395584 mafs from stdin
# real 23m49.073s
# -rw-rw-r-- 1 15063781 Mar 27 12:02 multiz8waySummary.tab
ssh hgwdev
cd /cluster/data/gasAcu1/bed/multiz8way/anno/maf
sed -e 's/mafSummary/multiz8waySummary/' ~/kent/src/hg/lib/mafSummary.sql \
> /tmp/multiz8waySummary.sql
time nice -n +19 hgLoadSqlTab gasAcu1 multiz8waySummary \
~/kent/src/hg/lib/mafSummary.sql multiz8waySummary.tab
# real 0m4.525
#########################################################################
# Adding automatic generation of upstream files (DONE - 2009-08-13 - Hiram)
# edit src/hg/makeDb/genbank/genbank.conf to add:
gasAcu1.upstreamGeneTbl = ensGene
gasAcu1.upstreamMaf = multiz8way /hive/data/genomes/gasAcu1/bed/multiz8way/species.lst
#########################################################################
# MULTIZ8WAY DOWNLOADABLES (DONE - 2007-01-05 - Hiram)
## re-done with new chrUn.maf 2007-01-13 - Hiram
## re-done with fr2 in place of fr1 - 2007-02-03 - Hiram
# Annotated MAF is now documented, so use anno/maf for downloads/
ssh hgwdev
mkdir /cluster/data/gasAcu1/bed/multiz8way/mafDownloads
cd /cluster/data/gasAcu1/bed/multiz8way
# upstream mafs
# rebuilt 2007-12-21 to fix difficulty in mafFrags when species.lst
# did not have mm8 as the first one
# There isn't any refGene table, using ensGene instead
for S in 1000 2000 5000
do
echo "making upstream${S}.maf"
nice -n +19 $HOME/bin/$MACHTYPE/featureBits -verbose=2 gasAcu1 \
ensGene:upstream:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
| $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags gasAcu1 multiz8way \
stdin stdout -orgs=species.lst \
| gzip -c > mafDownloads/ensGene.upstream${S}.maf.gz
echo "done ensGene.upstream${S}.maf.gz"
done
## re-done after correction to anno/maf files 2007-03-27 - Hiram
ssh kkstore05
cd /cluster/data/gasAcu1/bed/multiz8way
time for M in anno/maf/chr*.maf
do
B=`basename $M`
nice -n +19 gzip -c ${M} > mafDownloads/${B}.gz
echo ${B}.gz done
done
# real 6m1.454
cd mafDownloads
nice -n +19 md5sum *.maf.gz > md5sum.txt
# Make a README.txt
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/gasAcu1/multiz8way
cd /usr/local/apache/htdocs/goldenPath/gasAcu1/multiz8way
ln -s /cluster/data/gasAcu1/bed/multiz8way/mafDownloads/{*.gz,*.txt} .
#######################################################################
# MULTIZ8WAY MAF FRAMES (DONE - 2007-01-10 - Hiram)
## re-done with new chrUn.maf 2007-01-13 - Hiram
## re-donw with fr2 in place of fr1 - 2007-02-03 - Hiram
ssh hgwdev
mkdir /cluster/data/gasAcu1/bed/multiz8way/frames
cd /cluster/data/gasAcu1/bed/multiz8way/frames
mkdir genes
# The following is adapted from the mm8 sequence
#------------------------------------------------------------------------
# get the genes for all genomes
# mRNAs with CDS. single select to get cds+psl, then split that up and
# create genePred
# using ensGene for gasAcu1 oryLat1
for qDB in danRer4 galGal3
do
geneTbl=refGene
echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB}
hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-100 \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$qDB.tmp.gz
mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
rm -f $tmpExt
done
# using knownGene for mm8 hg18
# using genscan for tetNig1
# using ensGene for gasAcu1, oryLat1 and fr2
# genePreds; (must keep only the first 10 columns for knownGene)
for qDB in mm8 hg18 gasAcu1 oryLat1 fr2 tetNig1
do
if [ $qDB = "gasAcu1" -o $qDB = "oryLat1" -o $qDB = "fr2" ]; then
geneTbl=ensGene
elif [ $qDB = "tetNig1" ]; then
geneTbl=genscan
else
geneTbl=knownGene
fi
echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB}
hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$qDB.tmp.gz
mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
rm -f $tmpExt
done
###
ssh kkstore05
cd /cluster/data/gasAcu1/bed/multiz8way/frames
time cat ../maf/*.maf | nice -n +19 genePredToMafFrames gasAcu1 stdin stdout gasAcu1 genes/gasAcu1.gp.gz oryLat1 genes/oryLat1.gp.gz fr2 genes/fr2.gp.gz tetNig1 genes/tetNig1.gp.gz danRer4 genes/danRer4.gp.gz galGal3 genes/galGal3.gp.gz mm8 genes/mm8.gp.gz hg18 genes/hg18.gp.gz | gzip > multiz8way.mafFrames.gz
# real 0m55.706
ssh hgwdev
cd /cluster/data/gasAcu1/bed/multiz8way/frames
time nice -n +19 hgLoadMafFrames gasAcu1 multiz8wayFrames \
multiz8way.mafFrames.gz
# real 0m32.220s
############################################################################
# CREATE CONSERVATION WIGGLE WITH PHASTCONS
# (DONE - 2007-01-10 - 2007-01-17 - Hiram)
## re-done with fr2 in place of fr1 - 2007-02-03 - Hiram
# Estimate phastCons parameters
ssh kkstore05
mkdir /cluster/data/gasAcu1/bed/multiz8way/cons
cd /cluster/data/gasAcu1/bed/multiz8way/cons
twoBitToFa -seq=chrIV ../../../gasAcu1.sdTrf.2bit chrIV.fa
# Create a starting-tree.mod based on chrIV (the largest one)
time nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chrIV.maf \
--refseq ./chrIV.fa --in-format MAF \
--windows 100000000,1000 --out-format SS \
--between-blocks 5000 --out-root s1
# 26 seconds
time nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
--tree "(((hg18,mm8),galGal3),(((tetNig1,fr2),(gasAcu1,oryLat1)),danRer4))" \
--out-root starting-tree
# Fitting tree model to s1.1-32632948.ss using REV ...
# Writing model to starting-tree.mod ...
# real 3m53.084
rm s1.*.ss
# add up the C and G:
grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
# 0.449
# This 0.449 is used in the --gc argument below
## the fa files are needed for the sequence and they are created during
# this loop if they haven't been done before
# Create SS files on san filesystem
# Using a size of 10,000,000 to slow down the phastCons pk jobs
ssh kkstore05
mkdir -p /san/sanvol1/scratch/gasAcu1/cons/ss
cd /san/sanvol1/scratch/gasAcu1/cons/ss
time for C in `awk '{print $1}' /cluster/data/gasAcu1/chrom.sizes`
do
if [ -s /cluster/data/gasAcu1/bed/multiz8way/maf/${C}.maf ]; then
mkdir -p ${C}
mkdir -p /cluster/data/gasAcu1/bed/multiz8way/cons/${C}
echo msa_split $C
if [ ! -s /cluster/data/gasAcu1/bed/multiz8way/cons/${C}/${C}.fa ]; then
twoBitToFa -seq=${C} /cluster/data/gasAcu1/gasAcu1.sdTrf.2bit \
/cluster/data/gasAcu1/bed/multiz8way/cons/${C}/${C}.fa
fi
chrN=${C/chr/}
chrN=${chrN/_random/}
nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split \
/cluster/data/gasAcu1/bed/multiz8way/maf/${C}.maf \
--refseq /cluster/data/gasAcu1/bed/multiz8way/cons/${C}/${C}.fa \
--in-format MAF --windows 10000000,0 --between-blocks 5000 \
--out-format SS --out-root ${C}/${C}
fi
done &
# real 6m31.721
# Create a random list of 50 1 mb regions (do not use the _randoms)
cd /san/sanvol1/scratch/gasAcu1/cons/ss
ls -1l chr*/chr*.ss | grep -v Un | \
awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list
# Set up parasol directory to calculate trees on these 50 regions
ssh pk
mkdir /san/sanvol1/scratch/gasAcu1/cons/treeRun6
cd /san/sanvol1/scratch/gasAcu1/cons/treeRun6
mkdir tree log
# Tuning this loop should come back to here to recalculate
# Create little script that calls phastCons with right arguments
# --target-coverage of 0.20 is about right for mouse, will be
# tuned exactly below
cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
set C=$1:h
mkdir -p log/${C} tree/${C}
/cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \
/cluster/data/gasAcu1/bed/multiz8way/cons/starting-tree.mod \
--gc 0.449 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-length 10 --target-coverage 0.20 \
--quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
# << happy emacs
chmod a+x makeTree.csh
# Create gensub file
cat > template << '_EOF_'
#LOOP
./makeTree.csh $(path1)
#ENDLOOP
'_EOF_'
# << happy emacs
# Make cluster job and run it
gensub2 ../randomSs.list single template jobList
para create jobList
para try/push/check/etc
# Completed: 47 of 47 jobs
# CPU time in finished jobs: 78013s 1300.22m 21.67h 0.90d 0.002 y
# IO & Wait Time: 224s 3.73m 0.06h 0.00d 0.000 y
# Average job time: 1665s 27.74m 0.46h 0.02d
# Longest finished job: 2123s 35.38m 0.59h 0.02d
# Submission to last job: 2123s 35.38m 0.59h 0.02d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ssh pk
cd /san/sanvol1/scratch/gasAcu1/cons/treeRun6
ls -1 tree/chr*/*.cons.mod > cons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
--output-average ave.cons.mod > cons_summary.txt 2>&1 &
ls -1 tree/chr*/*.noncons.mod > noncons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
--output-average ave.noncons.mod > noncons_summary.txt
cp -p ave.*.mod ..
cd ..
cp -p ave.*.mod /cluster/data/gasAcu1/bed/multiz8way/cons
# measuring entropy
# consEntopy <target coverage> <expected lengths>
# ave.cons.mod ave.noncons.mod --NH 9.78
# never stops with the --NH argument
time /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.7834 \
0.20 10 ave.{cons,noncons}.mod
## 0.20 10 with fr2
( Solving for new omega: 10.000000 12.945684 12.467518 12.456325 )
Transition parameters: gamma=0.200000, omega=10.000000, mu=0.100000, nu=0.025000
Relative entropy: H=1.364850 bits/site
Expected min. length: L_min=6.767655 sites
Expected max. length: L_max=4.721873 sites
Phylogenetic information threshold: PIT=L_min*H=9.236837 bits
Recommended expected length: omega=12.456325 sites (for L_min*H=9.783400)
## 0.20 10
( Solving for new omega: 10.000000 12.922391 12.450309 12.439360 )
Transition parameters: gamma=0.200000, omega=10.000000, mu=0.100000, nu=0.025000
Relative entropy: H=1.358227 bits/site
Expected min. length: L_min=6.803722 sites
Expected max. length: L_max=4.741215 sites
Phylogenetic information threshold: PIT=L_min*H=9.241002 bits
Recommended expected length: omega=12.439360 sites (for L_min*H=9.783400)
## 0.01 12 does not convert, trying without --NH
Transition parameters: gamma=0.010000, omega=12.000000, mu=0.083333, nu=0.000842
Relative entropy: H=1.012920 bits/site
Expected min. length: L_min=15.386530 sites
Expected max. length: L_max=10.103812 sites
Phylogenetic information threshold: PIT=L_min*H=15.585327 bits
## 0.10 12
# ( Solving for new omega: 12.000000 7.303911 5.603783 4.937054 4.700330 4.651391 4.648962 4.648956 )
# Transition parameters: gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259
# Relative entropy: H=1.200128 bits/site
# Expected min. length: L_min=9.375672 sites
# Expected max. length: L_max=6.463613 sites
# Phylogenetic information threshold: PIT=L_min*H=11.252009 bits
# Recommended expected length: omega=4.648956 sites (for L_min*H=9.783400)
## 0.17 12
# ( Solving for new omega: 12.000000 10.568744 10.445646 10.444628 )
# Trans parameters: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
# Relative entropy: H=1.267443 bits/site
# Expected min. length: L_min=7.976979 sites
# Expected max. length: L_max=5.664374 sites
# Phylogenetic information threshold: PIT=L_min*H=10.110368 bits
# Recommended expected length: omega=10.444628 sites (for L_min*H=9.783400)
/cluster/bin/phast/$MACHTYPE/consEntropy .17 12 \
ave.cons.mod ave.noncons.mod
Transition parameters: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
Relative entropy: H=1.253111 bits/site
Expected min. length: L_min=8.076184 sites
Expected max. length: L_max=5.736142 sites
Phylogenetic information threshold: PIT=L_min*H=10.120357 bits
#Transition parameters:gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259
# Relative entropy: H=1.454874 bits/site
# Required length: N=7.596943 sites
# Total entropy: NH=11.052595 bits
# SKIP to here passing by the tuning numbers
ssh pk
# Create cluster dir to do main phastCons run
mkdir /san/sanvol1/scratch/gasAcu1/cons/consRun5
cd /san/sanvol1/scratch/gasAcu1/cons/consRun5
mkdir ppRaw bed
# Create script to run phastCons with right parameters
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
cat > doPhast << '_EOF_'
#!/bin/csh -fe
mkdir /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ../ave.{cons,noncons}.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
/cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
--expected-length 10 --target-coverage 0.20 --quiet \
--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/ave.{cons,noncons}.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
# << happy emacs
chmod a+x doPhast
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
cat > template << '_EOF_'
#LOOP
./doPhast $(root1) $(file1)
#ENDLOOP
'_EOF_'
# << happy emacs
# Create parasol batch and run it
ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list
gensub2 in.list single template jobList
para create jobList
para try/check/push/etc.
# These jobs are very fast and very I/O intensive, even on the san
# they will hang it up as they work at full tilt.
# Completed: 58 of 58 jobs
# CPU time in finished jobs: 1739s 28.99m 0.48h 0.02d 0.000 y
# IO & Wait Time: 302s 5.03m 0.08h 0.00d 0.000 y
# Average job time: 35s 0.59m 0.01h 0.00d
# Longest finished job: 47s 0.78m 0.01h 0.00d
# Submission to last job: 77s 1.28m 0.02h 0.00d
# combine predictions and transform scores to be in 0-1000 interval
# it uses a lot of memory, so on kolossus:
ssh kolossus
cd /san/sanvol1/scratch/gasAcu1/cons/consRun5
# The sed's and the sort get the file names in chrom,start order
# You might like to verify it is correct by first looking at the
# list it produces:
find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | less
# if that looks right, then let it run:
find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
| /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved8Way.bed
# ~ 16 seconds
cp -p mostConserved8Way.bed /cluster/data/gasAcu1/bed/multiz8way
# Figure out how much is actually covered by the bed file as so:
# Get the non-n genome size from faSize on all chroms:
ssh kkstore05
cd /cluster/data/gasAcu1/bed/multiz8way/cons
faSize chr*/*.fa
# 463354448 bases (16726587 N's 446627861 real 348847599
# upper 97780262 lower) in 23 sequences in 23 files
cd /san/sanvol1/scratch/gasAcu1/cons/consRun5
# The 446627861 comes from the non-n genome as counted above.
awk '
{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/446627861\n",100.0*sum/446627861,sum}' \
mostConserved8Way.bed
# % 11.58 = 100.0*51703035/446627861 --exp-len 10 --tar-cov 0.20 fr2
# % 11.49 = 100.0*51326297/446627861 --exp-len 10 --tar-cov 0.20
# % 8.36 = 100.0*37347218/446627861 --exp-len 12 --tar-cov 0.01
# % 10.24 = 100.0*45733412/446627861 --exp-len 12 --tar-cov 0.10
# % 11.45 = 100.0*51136218/446627861 --exp-len 12 --tar-cov 0.17
# Aiming for %70 coverage in
# the following featureBits measurement on CDS:
# Beware of negative scores when too high. The logToBedScore
# will output an error on any negative scores.
HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits gasAcu1 \
-enrichment ensGene:cds mostConserved8Way.bed
# --expected-length 10 --target-coverage 0.20 fr2
# ensGene:cds 6.618%, mostConserved8Way.bed 11.576%, both 4.070%, cover
# 61.49%, enrich 5.31x
# --expected-length 10 --target-coverage 0.20
# ensGene:cds 6.618%, mostConserved8Way.bed 11.492%, both 4.041%, cover
# 61.07%, enrich 5.31x
# --expected-length 12 --target-coverage 0.01
# ensGene:cds 6.618%, mostConserved8Way.bed 8.362%, both 3.628%, cover
# 54.82%, enrich 6.56x
# --expected-length 12 --target-coverage 0.10
# ensGene:cds 6.618%, mostConserved8Way.bed 10.240%, both 3.920%, cover
# 59.24%, enrich 5.79x
# --expected-length 12 --target-coverage 0.17
# ensGene:cds 6.618%, mostConserved8Way.bed 11.449%, both 4.092%, cover
# 61.84%, enrich 5.40x
# Load most conserved track into database
ssh hgwdev
cd /cluster/data/gasAcu1/bed/multiz8way
# ended up using the set: --expected-length 10 --target-coverage 0.20
time nice -n +19 hgLoadBed -strict gasAcu1 phastConsElements8way \
mostConserved8Way.bed
# Loaded 1263039 elements of size 5
# real 0m29.697
# should measure the same as above
time nice -n +19 \
featureBits gasAcu1 -enrichment ensGene:cds phastConsElements8way
# At: --expected-length 10 --target-coverage 0.20 fr2
# ensGene:cds 6.618%, phastConsElements8way 11.576%, both 4.070%, cover
# 61.49%, enrich 5.31x
# At: --expected-length 10 --target-coverage 0.20
# ensGene:cds 6.618%, phastConsElements8way 11.492%, both 4.041%, cover
# 61.07%, enrich 5.31x
# At: --expected-length 12 --target-coverage 0.10 result:
# ensGene:cds 6.618%, phastConsElements8way 10.240%, both 3.920%, cover
# 59.24%, enrich 5.79x
# At: --expected-length 12 --target-coverage 0.17 result:
# ensGene:cds 6.618%, phastConsElements8way 11.449%,
# both 4.092%, cover 61.84%, enrich 5.40x
# Create merged posterier probability file and wiggle track data files
ssh pk
cd /san/sanvol1/scratch/gasAcu1/cons/consRun5
# the sed business gets the names sorted by chromName, chromStart
# so that everything goes in numerical order into wigEncode
# This was verified above to be correct
time nice -n +19 find ./ppRaw -type f \
| sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| $HOME/bin/$MACHTYPE/wigEncode -noOverlap stdin \
phastCons8.wig phastCons8.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 3m55.780
# -rw-rw-r-- 1 297832681 Feb 3 14:50 phastCons8.wib
# -rw-rw-r-- 1 49168315 Feb 3 14:50 phastCons8.wig
time nice -n +19 cp -p phastCons8.wi? /cluster/data/gasAcu1/bed/multiz8way/
# prepare compressed copy of ascii data values for downloads
ssh pk
cd /san/sanvol1/scratch/gasAcu1/cons/consRun5
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
mkdir -p phastCons8Scores
for D in ppRaw/chr*
do
C=${D/ppRaw\/}
out=phastCons8Scores/${C}.data.gz
echo "========================== ${C} ${D}"
find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
gzip > ${out}
done
'_EOF_'
# << happy emacs
chmod +x gzipAscii.sh
time nice -n +19 ./gzipAscii.sh
# real 5m31.855s
# copy them for downloads
ssh kkstore05
# this directory is actually a symlink from store9 to store8 to
# avoid the data full problem on store9
mkdir /cluster/data/gasAcu1/bed/multiz8way/phastCons8Scores
cd /cluster/data/gasAcu1/bed/multiz8way/phastCons8Scores
cp -p /san/sanvol1/scratch/gasAcu1/cons/consRun5/phastCons8Scores/* .
# make a README.txt file here, and an md5sum
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/gasAcu1/phastCons8Scores
cd /usr/local/apache/htdocs/goldenPath/gasAcu1/phastCons8Scores
ln -s /cluster/data/gasAcu1/bed/multiz8way/phastCons8Scores/* .
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/gasAcu1/bed/multiz8way
ln -s `pwd`/phastCons8.wib /gbdb/gasAcu1/wib/phastCons8.wib
# ended up using the set: --expected-length 10 --target-coverage 0.20
time nice -n +19 hgLoadWiggle gasAcu1 phastCons8 phastCons8.wig
# real 0m9.256s
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/gasAcu1/bed/multiz8way
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=gasAcu1 phastCons8 > histogram.data 2>&1
# real 0m30.744
# create plot of histogram:
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
xffffff x000000 x000000 x444444 xaa4400 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Stickleback gasAcu1 Histogram phastCons8 track"
set xlabel " phastCons8 score - --expected-length 10 --target-coverage 0.20"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
# The mostConserved can also be characterized by a histogram
awk '{print $3-$2}' mostConserved8Way.bed > mostCons.txt
textHistogram -verbose=2 -autoScale=1000 -pValues mostCons.txt \
> mostCons.histo.txt
cat << '_EOF_' | gnuplot > mostCons.png
set terminal png small color \
xffffff x000000 x000000 x444444 xaa4400 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Stickleback gasAcu1 histogram: lengths of mostConserved track elements"
set xlabel " mostConserved element length - --expected-length 10 --target-coverage 0.20"
set ylabel " # of elements at this length"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set xrange [0:200]
set y2tics
set boxwidth 2
set style fill solid
plot "mostCons.histo.txt" using 2:3 title " # of elements" with boxes, \
"mostCons.histo.txt" using 2:6 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
##############################################################################
## Medaka oryLat1 swap (DONE - 2007-01-12 - Hiram)
## And the swap
ssh kkstore05
mkdir /cluster/data/gasAcu1/bed/blastz.orylat1.swap
cd /cluster/data/gasAcu1/bed/blastz.orylat1.swap
time doBlastzChainNet.pl -verbose=2 -smallClusterHub=pk \
/cluster/data/oryLat1/bed/blastz.gasAcu1.2007-01-11/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -stop=net -bigClusterHub=pk > swap.log 2>&1 &
time doBlastzChainNet.pl -verbose=2 -smallClusterHub=pk \
/cluster/data/oryLat1/bed/blastz.gasAcu1.2007-01-11/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-continue=net -swap -stop=net -bigClusterHub=pk > net-swap.log 2>&1 &
# real 18m21.951s
## Taking this chrUn chain result over to the other global result
ssh kkstore05
# Note: previous result files temporarily moved out of the way
cd /cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22/axtChain
chainSplit chain gasAcu1.oryLat1.all.chain.gz
cp -p ../../blastz.orylat1.swap/axtChain/chain/chrUn.chain ./chain
cd chain
chainMergeSort *.chain | gzip -c > ../gasAcu1.oryLat1.all.chain.gz
cd ..
rm -fr chain
# they have new IDs
chainSplit chain gasAcu1.oryLat1.all.chain.gz
# And then reusing the netChains.csh script, fixing up references
# to the appropriate chrom.sizes and 2bit file for gasAcu1
cp netChains.csh netChainsUn.csh
time ./netChainsUn.csh
# real 30m9.385s
## reloading, reuse the load.csh script, fixup reference to correct
## chrom.sizes
ssh hgwdev
cd /cluster/data/gasAcu1/bed/blastz.oryLat1.2006-12-22/axtChain
###########################################################################
## SWAP FR1 BLASTZ (DONE - 2007-01-13 - Hiram)
ssh kkstore02
mkdir /cluster/data/gasAcu1/bed/blastz.fr1.swap
cd /cluster/data/gasAcu1/bed/blastz.fr1.swap
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/fr1/bed/blastz.gasAcu1.2007-01-11/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -stop=net -bigClusterHub=pk > swap.log 2>&1 &
# real 130m31.028s
## Take that chrUn result into the primary fr1 blastz result
cd /cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08/axtChain
chainSplit chain gasAcu1.fr1.all.chain.gz
cd chain
cp -p /cluster/data/gasAcu1/bed/blastz.fr1.swap/axtChain/chain/chrUn.chain .
chainMergeSort *.chain | gzip -c > ../gasAcu1.fr1.all.chain.gz
cd ..
chainSplit chain gasAcu1.fr1.all.chain.gz
# And then reusing the netChains.csh script, fixing up references
# to the appropriate chrom.sizes and 2bit file for gasAcu1
ssh hgwdev
cd /cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08/axtChain
cp netChains.csh netChainsUn.csh
time ./netChainsUn.csh
# real 122m36.719s
# And then reloading these, reusing the loadUp.csh script, fixup
# reference to chrom.sizes
cp loadUp.csh reLoad.csh
time ./reLoad.csh > reLoad.out 2>&1
# real 3m38.799s
# After moving previous stuff out of the way
./makeMd5sum.csh
./installDownloads.csh
#########################################################################
# BLASTZ/CHAIN/NET FR2 (DONE - 2007-01-23 - Hiram)
## no chrUn in gasAcu1, and align to fr2 scaffolds,
## results lifted to fr2 chrUn coordinates
ssh kkstore05
mkdir /cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23
cd /cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23
cat << '_EOF_' > DEF
# Stickleback vs. Fugu
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Stippleback gasAcu1, no chrUn in this run
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.noUn.sdTrf.sizes
SEQ1_CHUNK=40000000
SEQ1_LIMIT=30
SEQ1_LAP=0
# QUERY: Fugu fr2
# Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates
SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-verbose=2 -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/gasAcu1Fr2 > do.log 2>&1 &
# real 27m33.395s
# user 0m0.189s
# sys 0m0.163s
# Error because of inconsistent chrom sizes. We had attempted to modify
# our lift file to remove a gap inserted at the end of the entire
# sequence, which we should not have done.
time doBlastzChainNet.pl DEF -chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-verbose=2 -continue=chainRun -bigClusterHub=pk \
-blastzOutRoot /cluster/bluearc/gasAcu1Fr2 > chainRun.log 2>&1 &
######################################################
## Now that it's finished, let's see how it did. This already
## set up chain and net tracks for fr2 on gasAcu1, though a modification
## to ~/kent/src/hg/makeDb/trackDb/trackDb.ra needed to be done to make
## sure fr2 got displayed in the browser.
##
## track chainFr2
## shortLabel $o_db Chain
## longLabel $o_Organism ($o_date/$o_db) Chained Alignments
## group compGeno
## priority 140
## visibility hide
## color 100,50,0
## altColor 255,240,200
## matrix 16 91,-90,-25,-100,-90,100,-100,-25,-25,-100,100,-90,-100,-25,-90,91
## spectrum on
## type chain fr2
## otherDb fr2
## track netFr2
## shortLabel $o_db Net
## longLabel $o_Organism ($o_date/$o_db) Alignment Net
## group compGeno
## priority 140.1
## visibility hide
## spectrum on
## type netAlign fr2 chainFr2
## otherDb fr2
## To see how the matching did, let's use featureBits to find percentage of matching bases.
## This compares the gasAcu1 entries to only those in the Fr2 chain links that were created.
ssh hgwdev
cd /cluster/data/gasAcu1/bed
ln -s blastz.fr2.2007-01-23 blastz.fr2
cd blastz.fr2
time nice -n 19 featureBits gasAcu1 chainFr2Link \
> fb.gasAcu1.chainFr2Link.txt 2>&1 &
# 153570946 bases of 446627861 (34.385%) in intersection
## In addition, we want to put the chains and nets from gasAcu1 onto the
## fr2 browser, done below:
# This is going to make the link back on the fr2 browser to
# gasAcu1.
mkdir /cluster/data/fr2/bed/blastz.gasAcu1.swap
cd /cluster/data/fr2/bed/blastz.gasAcu1.swap
time doBlastzChainNet.pl \
/cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -bigClusterHub=pk > swap.log 2>&1 &
# real 24m33.761s
# user 0m0.125s
# sys 0m0.080s
## Now let's see how well these were matched back to Fr2.
ssh hgwdev
cd /cluster/data/fr2/bed/blastz.gasAcu1.swap
time nice -n 19 featureBits fr2 chainGasAcu1Link \
> fb.fr2.chainGasAcu1Link.txt 2>&1 &
# 158383996 bases of 393312790 (40.269%) in intersection
###########################################################################
# Manually fixing up the fr2 alignments to gasAcu1 to get chrUn results
## (DONE - 2007-01-31 - 2007-02-02 - Hiram)
## Ran the blastz on fr2 and aligned just the chrUn sequence to fr2
## swap back to gasAcu1
mkdir /cluster/data/gasAcu1/bed/blastz.fr2.swap
cd /cluster/data/gasAcu1/bed/blastz.fr2.swap
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -stop=chainMerge -bigClusterHub=pk > swap.log 2>&1 &
## Now, with that chain result in hand, place it manually back in with
## the full chroms chains and re-run the nets and so forth.
ssh kkstore05
cd /cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23/axtChain
# restore the chain directory which was removed by cleanUp.csh
chainSplit chain gasAcu1.fr2.all.chain.gz
# copy into here the chrUn.chain result
cd chain
cp -p \
/cluster/data/gasAcu1/bed/blastz.fr2.swap/axtChain/chain/chrUn.chain .
time nice -n +19 chainMergeSort *.chain \
| gzip -c > ../gasAcu1.fr2.all.chain.gz
cd ..
rm -fr chain
# they have new IDs
chainSplit chain gasAcu1.fr2.all.chain.gz
# And then reusing the netChains.csh script, fixing up references
# to the appropriate chrom.sizes and 2bit file for gasAcu1
cp netChains.csh netChainsUn.csh
time nice -n +19 ./netChainsUn.csh
# real 30m9.385s
## reloading, reuse the load.csh script, fixup reference to correct
## chrom.sizes
ssh hgwdev
cd /cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23/axtChain
time nice -n +19 ./reLoad.csh > reLoad.out 2>&1
# real 3m53.174s
nice -n +19 featureBits gasAcu1 chainFr2Link \
> fb.gasAcu1.chainFr2Link.txt 2>&1
# 167817405 bases of 446627861 (37.574%) in intersection
# without chrUn covered before, it was:
# 153570946 bases of 446627861 (34.385%) in intersection
# and fr2 is covered:
nice -n +19 featureBits fr2 chainGasAcu1Link \
> fb.fr2.chainGasAcu1Link.txt 2>&1
# 158383996 bases of 393312790 (40.269%) in intersection
###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-01-27)
ssh kkstore05
bash # if not using bash shell already
mkdir /cluster/data/gasAcu1/blastDb
cd /cluster/data/gasAcu1
awk '/chrUn/ {if ($5 == "W") print $6}' downloads/UCSC.gasAcu1.agp > chrUnContigs.lst
zcat downloads/assembly.bases.gz | faSomeRecords stdin chrUnContigs.lst chrUnContigs.fa
cat noUn/chr*fa > temp.fa
faSplit gap temp.fa 1000000 blastDb/
faSplit sequence chrUnContigs.fa 100 blastDb/y
rm temp.fa
cd blastDb
for i in *.fa
do
/cluster/bluearc/blast229/formatdb -i $i -p F
done
rm *.fa
mkdir -p /san/sanvol1/scratch/gasAcu1/blastDb
cd /cluster/data/gasAcu1/blastDb
for i in nhr nin nsq;
do
echo $i
cp *.$i /san/sanvol1/scratch/gasAcu1/blastDb
done
mkdir -p /cluster/data/gasAcu1/bed/tblastn.hg18KG
cd /cluster/data/gasAcu1/bed/tblastn.hg18KG
echo /san/sanvol1/scratch/gasAcu1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 609 query.lst
# we want around 200000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(200000/609) = 111.833715
mkdir -p /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/kgfa
split -l 112 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/kgfa/kg
ln -s /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/kgfa kgfa
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/gasAcu1/bed/tblastn.hg18KG
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/gasAcu1/blastDb.lft carry $f.2
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
exit # back to bash
ssh pk
cd /cluster/data/gasAcu1/bed/tblastn.hg18KG
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 199752 of 199752 jobs
# CPU time in finished jobs: 4730447s 78840.78m 1314.01h 54.75d 0.150 y
# IO & Wait Time: 929809s 15496.82m 258.28h 10.76d 0.029 y
# Average job time: 28s 0.47m 0.01h 0.00d
# Longest running job: 0s 0.00m 0.00h 0.00d
# Longest finished job: 519s 8.65m 0.14h 0.01d
# Submission to last job: 65891s 1098.18m 18.30h 0.76d
ssh kkstore05
cd /cluster/data/gasAcu1/bed/tblastn.hg18KG
tcsh
mkdir chainRun
cd chainRun
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=100000 stdin /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
ssh kk
cd /cluster/data/gasAcu1/bed/tblastn.hg18KG/chainRun
para create chainSpec
para maxNode 30
para try, check, push, check etc.
# Completed: 328 of 328 jobs
# CPU time in finished jobs: 25906s 431.77m 7.20h 0.30d 0.001 y
# IO & Wait Time: 77171s 1286.18m 21.44h 0.89d 0.002 y
# Average job time: 314s 5.24m 0.09h 0.00d
# Longest finished job: 942s 15.70m 0.26h 0.01d
# Submission to last job: 3630s 60.50m 1.01h 0.04d
ssh kkstore05
cd /cluster/data/gasAcu1/bed/tblastn.hg18KG/blastOut
bash # if using another shell
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/gasAcu1/bed/tblastn.hg18KG/unliftBlastHg18KG.psl
cd ..
pslCheck unliftBlastHg18KG.psl
liftUp blastHg18KG.psl ../../Un/chrUn.lft carry unliftBlastHg18KG.psl
# load table
ssh hgwdev
cd /cluster/data/gasAcu1/bed/tblastn.hg18KG
hgLoadPsl gasAcu1 blastHg18KG.psl
# check coverage
featureBits gasAcu1 blastHg18KG
# 20127551 bases of 446627861 (4.507%) in intersection
featureBits gasAcu1 ensGene:cds blastHg18KG -enrichment
# ensGene:cds 6.618%, blastHg18KG 4.507%, both 3.790%, cover 57.26%, enrich 12.71x
ssh kkstore05
rm -rf /cluster/data/gasAcu1/bed/tblastn.hg18KG/blastOut
rm -rf /cluster/bluearc/gasAcu1/bed/tblastn.hg18KG/blastOut
#end tblastn
#######################################################################
## BLASTZ LIZARD ANOCAR1 - (DONE - 2007-02-18 - Hiram)
ssh kkstore05
mkdir /cluster/data/gasAcu1/bed/blastz.anoCar1.2007-02-18
cd /cluster/data/gasAcu1/bed/blastz.anoCar1.2007-02-18
cat << '_EOF_' > DEF
# stickleback vs. lizard
# Use same params as used for gasAcu1-danRer4
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Stickleback gasAcu1 - largest chunk big enough
# for all chroms except chrUn
SEQ1_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.2bit
SEQ1_LEN=/cluster/data/gasAcu1/chrom.sizes
SEQ1_CHUNK=40000000
SEQ1_LAP=10000
# QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold
SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit
SEQ2_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/gasAcu1/bed/blastz.anoCar1.2007-02-18
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl -verbose=2 DEF \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-blastzOutRoot=/san/sanvol1/scratch/gasAcu1AnoCar1 > do.log 2>&1 &
time doBlastzChainNet.pl -verbose=2 DEF \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-blastzOutRoot=/san/sanvol1/scratch/gasAcu1AnoCar1 \
-continue=cleanup > cleanup.log 2>&1 &
## measuring
time nice -n +19 featureBits gasAcu1 chainAnoCar1Link \
> fb.gasAcu1.chainAnoCar1Link.txt 2>&1
# 56386298 bases of 446627861 (12.625%) in intersection
## swap documented in anoCar1.txt
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/gasAcu1/bed/blastz.anoCar1.2007-02-19/DEF \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-swap > swap.log 2>&1 &
time nice -n +19 featureBits anoCar1 chainGasAcu1Link \
> fb.anoCar1.chainGasAcu1Link.txt 2>&1
# real 1m14.245s
# 54464074 bases of 1741478929 (3.127%) in intersection
#######################################################################
# SELF CHAINS (DONE 3/14/07 angie)
# Requested by Gill.
ssh kkstore05
mkdir /cluster/data/gasAcu1/bed/blastz.gasAcu1.2007-03-13
cd /cluster/data/gasAcu1/bed/blastz.gasAcu1.2007-03-13
cat << '_EOF_' > DEF
# stickleback vs. self
BLASTZ_H=2000
BLASTZ_M=200
# TARGET
# Stickleback
SEQ1_DIR=/iscratch/i/gasAcu1/gasAcu1.noUn.sdTrf.2bit
SEQ1_LEN=/iscratch/i/gasAcu1/gasAcu1.noUn.sdTrf.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
# QUERY
# Stickleback
SEQ2_DIR=/iscratch/i/gasAcu1/gasAcu1.noUn.sdTrf.2bit
SEQ2_LEN=/iscratch/i/gasAcu1/gasAcu1.noUn.sdTrf.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
BASE=/cluster/data/gasAcu1/bed/blastz.gasAcu1.2007-03-13
TMPDIR=/scratch/tmp
'_EOF_'
# << emacs
doBlastzChainNet.pl DEF \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-chainMinScore=5000 -chainLinearGap=medium \
-blastzOutRoot=/cluster/bluearc/gasAcu1Self >& do.log &
tail -f do.log
# hgwdev can't see /iscratch, so the loadUp.csh script failed looking
# for SEQ1_LEN. So I edited DEF to use /cluster/data location and
# continued -load.
##########################################################################
# NSCAN track - (2007-06-05 markd)
# uses danRer4 as informant, no pseudogene masking
#
cd /cluster/data/gasAcu1/bed/nscan/
# obtainedf NSCAN predictions from michael brent's group
# at WUSTL
wget -nv http://mblab.wustl.edu/predictions/Gasterosteus_aculeatus/gasAcu1.gtf
wget -nv http://mblab.wustl.edu/predictions/Gasterosteus_aculeatus/gasAcu1.prot.fa
gzip -9 gasAcu1.*
chmod a-w *.gz
# load tracks. Note that these have *utr features, rather than
# exon features. currently ldHgGene creates separate genePred exons
# for these.
ldHgGene -bin -gtf -genePredExt gasAcu1 nscanGene gasAcu1.gtf.gz
hgPepPred gasAcu1 generic nscanPep gasAcu1.prot.fa.gz
rm -f *.tab
# update trackDb; need a gasAcu1-specific page to describe informants
stickleback/gasAcu1/nscanGene.html
stickleback/gasAcu1/trackDb.ra
#########################################################################
# BLASTZ SWAP from Mouse Mm9 (DONE - 2007-09-07 - Hiram)
# the original
cd /cluster/data/mm9/bed/blastzGasAcu1.2007-09-06
cat fb.mm9.chainGasAcu1Link.txt
# 48448585 bases of 2620346127 (1.849%) in intersection
# and the swap
mkdir /cluster/data/gasAcu1/bed/blastz.mm9.swap
cd /cluster/data/gasAcu1/bed/blastz.mm9.swap
time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \
/cluster/data/mm9/bed/blastzGasAcu1.2007-09-06/DEF \
-qRepeats=windowmaskerSdust -chainLinearGap=loose \
-swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
cat fb.gasAcu1.chainMm9Link.txt
# 43730193 bases of 446627861 (9.791%) in intersection
#########################################################################
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
# SWAP BLASTZ Medaka oryLat2 (DONE - 2008-08-27 - Hiram)
ssh kkstore02 # not too important since everything moved to hive
screen # use screen to control this job
cd /cluster/data/oryLat2/bed/blastz.gasAcu1.2008-08-26
cat fb.oryLat2.chainGasAcu1Link.txt
# 211589769 bases of 700386597 (30.210%) in intersection
# And, for the swap:
mkdir /cluster/data/gasAcu1/bed/blastz.oryLat2.swap
cd /cluster/data/gasAcu1/bed/blastz.oryLat2.swap
time doBlastzChainNet.pl -chainMinScore=3000 -chainLinearGap=medium -swap \
/cluster/data/oryLat2/bed/blastz.gasAcu1.2008-08-26/DEF \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-verbose=2 -smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 &
# real 31m44.711s
cat fb.gasAcu1.chainOryLat2Link.txt
# 189793612 bases of 446627861 (42.495%) in intersection
############################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram)
hgsql -e 'update dbDb set
sourceName="Broad Institute gasAcu1 (1.0) (NCBI project 13579, AANH01000000)" where name="gasAcu1";' hgcentraltest
###########################################################################
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
see doc/builds.txt for specific details.
############################################################################
# BLASTZ/CHAIN/NET TetNig2 (DONE - 2009-08-10,09-15 - Hiram)
# create contigs only sequence to align properly to gasAcu1 contigs
mkdir /hive/data/genomes/gasAcu1/nonBridged
cd /hive/data/genomes/gasAcu1/nonBridged
gapToLift -verbose=2 gasAcu1 gasAcu1.contigs.lift \
-bedFile=gasAcu1.contigs.bed
# chrom count: 23
# WARNING: gap at end of chromosome not telomere at
# chrUn:62549211-62550211, type: clone
# found 16945 gaps
# bed output requested to gasAcu1.contigs.bed
# no gaps on chrom: chrM, size: 15742
~/kent/src/hg/utils/lft2BitToFa.pl ../gasAcu1.2bit gasAcu1.contigs.lift \
| gzip -c > gasAcu1.contigs.fa.gz
# make sure nothing was destroyed:
faCount *.fa.gz > faCount.contigs.txt 2>&1
twoBitToFa ../gasAcu1.2bit stdout | faCount stdin > faCount.2bit.txt 2>&1
tail -1 faCount.contigs.txt
# total 461441448 123670916 99610982 99564587
# 123781376 14813587 14615136
tail -1 faCount.2bit.txt
# total 463354448 123670916 99610982 99564587
# 123781376 16726587 14615136
# only the total size and N count are different
faToTwoBit gasAcu1.contigs.fa.gz gasAcu1.contigs.2bit
twoBitInfo gasAcu1.contigs.2bit stdout | sort -k2nr > gasAcu1.contigs.sizes
cp -p gasAcu1.contigs.2bit gasAcu1.contigs.sizes gasAcu1.contigs.lift \
/hive/data/staging/data/gasAcu1
mkdir /hive/data/genomes/gasAcu1/bed/lastzTetNig2.2009-08-10
cd /hive/data/genomes/gasAcu1/bed/lastzTetNig2.2009-08-10
cat << '_EOF_' > DEF
# Stickleback vs. Tetraodon
# TARGET: Stickleback gasAcu1, chunk large enough to run largest piece
SEQ1_DIR=/scratch/data/gasAcu1/gasAcu1.2bit
SEQ1_LEN=/scratch/data/gasAcu1/chrom.sizes
SEQ1_CTGDIR=/scratch/data/gasAcu1/gasAcu1.contigs.2bit
SEQ1_CTGLEN=/scratch/data/gasAcu1/gasAcu1.contigs.sizes
SEQ1_LIFT=/scratch/data/gasAcu1/gasAcu1.contigs.lift
SEQ1_CHUNK=22000000
SEQ1_LAP=10000
SEQ1_LIMIT=50
# QUERY: Tetraodon TetNig2 - single chunk big enough to run single largest item
SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit
SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit
SEQ2_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes
SEQ2_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=50
BASE=/hive/data/genomes/gasAcu1/bed/lastzTetNig2.2009-08-10
TMPDIR=/scratch/tmp
'_EOF_'
# << this line keeps emacs coloring happy
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
`pwd`/DEF \
-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> do.log 2>&1 &
# about 72 minutes
# forgot to indicate type of repeats, continuing the load:
cd /hive/data/genomes/gasAcu1/bed/lastzTetNig2.2009-08-10/axtChain
netClass -tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-verbose=0 -noAr noClass.net gasAcu1 tetNig2 gasAcu1.tetNig2.net
netFilter -minGap=10 gasAcu1.tetNig2.net \
| hgLoadNet -verbose=0 gasAcu1 netTetNig2 stdin
cd ..
featureBits gasAcu1 chainTetNig2Link >&fb.gasAcu1.chainTetNig2Link.txt
cat fb.gasAcu1.chainTetNig2Link.txt
# 134497679 bases of 446627861 (30.114%) in intersection
+ # and finish the downloads
+ time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+ `pwd`/DEF \
+ -noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
+ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
+ -continue=download > downloads.log 2>&1
+
mkdir /hive/data/genomes/tetNig2/bed/blastz.gasAcu1.swap
cd /hive/data/genomes/tetNig2/bed/blastz.gasAcu1.swap
time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/gasAcu1/bed/lastzTetNig2.2009-08-10/DEF \
-swap -qRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-noLoadChainSplit -chainMinScore=2000 -chainLinearGap=medium \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
> swap.log 2>&1 &
############################################################################
# TRANSMAP vertebrate.2009-09-13 build (2009-09-20 markd)
vertebrate-wide transMap alignments were built Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
#########################################################################
# LASTZ/CHAIN/NET swap danRer6 (DONE - 2009-12-21 - Galt)
# original alignment to danRer6
cd /hive/data/genomes/danRer6/bed/lastzGasAcu1.2009-12-21
cat fb.danRer6.chainGasAcu1Link.txt
# 147479454 bases of 1506896106 (9.787%) in intersection
# running the swap - DONE - 2009-12-21
mkdir /hive/data/genomes/gasAcu1/bed/blastz.danRer6.swap
cd /hive/data/genomes/gasAcu1/bed/blastz.danRer6.swap
time nice +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/danRer6/bed/lastzGasAcu1.2009-12-21/DEF \
-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
-swap >& swap.log &
cat fb.gasAcu1.chainDanRer6Link.txt
# 114451338 bases of 446627861 (25.626%) in intersection
#######################################################################