src/hg/makeDb/doc/fr2.txt 1.23
1.23 2009/07/21 21:01:42 markd
added transmap update blurb
Index: src/hg/makeDb/doc/fr2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/fr2.txt,v
retrieving revision 1.22
retrieving revision 1.23
diff -b -B -U 1000000 -r1.22 -r1.23
--- src/hg/makeDb/doc/fr2.txt 28 Aug 2008 21:18:58 -0000 1.22
+++ src/hg/makeDb/doc/fr2.txt 21 Jul 2009 21:01:42 -0000 1.23
@@ -1,1715 +1,1724 @@
# for emacs: -*- mode: sh; -*-
# This file describes the browser build for the Fugu
# genome, Takifugu rubripes, October 2004, JGI V4.0 assembly
# from the DOE Joint Genome Institute (see fetch section below)
# NOTE(review): the links below were carried over from the medaka
# (oryLat1) doc header and do not apply to this Fugu build:
# NIG: http://dolphin.lab.nig.ac.jp/medaka/
# UTGB: http://medaka.utgenome.org/o
# Data release policy: http://medaka.utgenome.org/#policy
# Ensembl: http://www.ensembl.org/Oryzias_latipes/index.html
#
# "$Id$"
#
##########################################################################
### Fetch sequence (DONE - 2007-01-22 - Cory McLean and Hiram)
# build happens on the store5 file server; create the build directory
# and the canonical /cluster/data symlink used by all later steps
ssh kkstore02
mkdir /cluster/store5/fr2
ln -s /cluster/store5/fr2 /cluster/data/fr2
cd /cluster/data/fr2
mkdir jgi
cd jgi
# fetch.sh downloads the JGI v4.0 scaffold release and runs
# scaffoldFaToAgp to produce fugu.041029.scaffolds.agp (used by the
# makeGenomeDb.pl config below).  The quoted '_EOF_' delimiter keeps
# the shell from expanding anything inside the here-doc.
cat << '_EOF_' > fetch.sh
#!/bin/sh
wget --timestamping "ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/v4.0/*"
gunzip fugu.041029.scaffolds.fasta.gz
scaffoldFaToAgp fugu.041029.scaffolds.fasta
gzip fugu.041029.scaffolds.fasta
'_EOF_'
# << happy emacs
chmod +x fetch.sh
./fetch.sh
##########################################################################
# Run the makeGenomeDb.pl script (DONE - 2007-01-22 - Cory and Hiram)
# prepare for the makeGenomeDb.pl script:
ssh hgwdev
cd /cluster/data/fr2
# the config.ra file pretty much specifies everything makeGenomeDb.pl
# needs: assembly identity, input fasta/agp, and dbDb display fields
cat << '_EOF_' > config.ra
db fr2
# fixed: species epithet lowercased per binomial nomenclature
# (was "Takifugu Rubripes"); this string is shown verbatim in dbDb
scientificName Takifugu rubripes
assemblyDate Oct. 2004
assemblyLabel JGI V4.0
# orderKey = fr1.orderKey - 1
orderKey 464
# NC_004299.1
mitoAcc 23397366
fastaFiles /cluster/data/fr2/jgi/fugu.041029.scaffolds.fasta.gz
dbDbSpeciesDir fugu
agpFiles /cluster/data/fr2/jgi/fugu.041029.scaffolds.agp
commonName Fugu
clade Vertebrate
genomeCladePriority 110
'_EOF_'
# << happy emacs
makeGenomeDb.pl config.ra > mgdb.out 2>&1
# This sequence creates and loads the following files into the
# new database fr2
# chr*_gap, chr*_gold, chromInfo, gc5Base, grp
# And, when you follow the instructions it gives at the end to check in
# the trackDb files it creates, and you do a make in your trackDb
# hierarchy, you will then create the trackDb and hgFindSpec tables
# (with a 'make alpha') or your specific trackDb_logname
# hgFindSpec_logname with a simple 'make' with no arguments.
# The sequence also adds an entry to the dbDb table to turn on this
# organism in the drop-down menus
############################################
# Checked in trackDb fr2 files to the source tree. Instructions
# located in ~/kent/src/product/README.trackDb
ssh hgwdev
cd /gbdb/fr2
ln -s /cluster/data/fr2/fr2.unmasked.2bit ./fr2.2bit
################################################
## WINDOWMASKER (DONE - 2007-01-22 - Cory and Hiram)
cd /cluster/data/fr2/bed/
~/kent/src/hg/utils/automation/doWindowMasker.pl fr2 \
-workhorse=kolossus > wmRun.log 2>&1 &
# Save the log
mv wmRun.log WindowMasker.2007-01-22
# Masking statistics
cd WindowMasker.2007-01-22
twoBitToFa fr2.wmsk.2bit stdout | faSize stdin
# 400525790 bases (49313545 N's 351212245 real 284686886 upper
# 66525359 lower)
hgLoadBed -strict fr2 windowmaskerSdust windowmasker.sdust.bed.gz
# Loaded 1747418 elements of size 3
#########################################################################
# SIMPLE REPEATS (TRF) (DONE 2007-01-22 - Cory and Hiram)
ssh kolossus
mkdir /cluster/data/fr2/bed/simpleRepeat
cd /cluster/data/fr2/bed/simpleRepeat
# This missed chrM sequence
time nice -n 19 twoBitToFa ../../fr2.unmasked.2bit stdout \
| trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
-bedAt=simpleRepeat.bed -tempDir=/tmp > trf.log 2>&1 &
# ~31m
# Make a filtered version for sequence masking:
awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
splitFileByColumn trfMask.bed trfMaskChrom
# Load unfiltered repeats into the database:
ssh hgwdev
hgLoadBed fr2 simpleRepeat \
/cluster/data/fr2/bed/simpleRepeat/simpleRepeat.bed \
-sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
featureBits fr2 simpleRepeat
# 9915088 bases of 393312790 (2.521%) in intersection
featureBits fr1 simpleRepeat
# 6801339 bases of 315518167 (2.156%) in intersection
# recovery attempt to get chrM masked
time nice -n 19 twoBitToFa -seq=chrM ../../fr2.unmasked.2bit stdout \
| trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
-bedAt=chrM.simpleRepeat.bed -tempDir=/tmp > trf.log 2>&1 &
# It finds nothing !
# make an empty trfMaskChrom file:
touch trfMaskChrom/chrM.bed
#########################################################################
## Add TRF mask to WindowMasker masked sequence
ssh kkstore02
cd /cluster/data/fr2
twoBitMask bed/WindowMasker.2007-01-22/fr2.wmsk.sdust.2bit \
-add bed/simpleRepeat/trfMask.bed fr2.2bit
# Received ignorable warning:
# Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it
# might contain block coordinates, but this program uses only the first three
# fields (the entire span -- no support for blocks).
# Make this be the actual file that the browser's "get DNA" feature sees
ssh hgwdev
cd /gbdb/fr2/
rm fr2.2bit
ln -s /cluster/data/fr2/fr2.2bit fr2.2bit
#########################################################################
## Lift our .2bit file against the .lft file to create a scaffold fasta file
cd /cluster/data/fr2/jkStuff
cp ../jgi/fugu.041029.scaffolds.lft liftAll.lft
cp /cluster/data/tetNig1/jkStuff/lft2BitToFa.pl .
cd ..
mkdir noUn
cd noUn
time ../jkStuff/lft2BitToFa.pl ../fr2.2bit ../jkStuff/liftAll.lft \
> chrUn.scaffolds.fa
# real 5m4.520s
twoBitToFa -seq=chrM ../fr2.2bit chrM.fa
faToTwoBit *.fa fr2.scaffolds.2bit
twoBitInfo *.2bit stdout | sort -k2nr > fr2.scaffolds.sizes
##########################################################################
## Move the data out to the cluster (san scratch copies used by the
## kluster blastz runs referenced in the DEF files below)
cd /san/sanvol1/scratch/
mkdir fr2
cd fr2
# fixed: this cp was recorded without the '.' destination operand,
# which makes cp fail with "missing destination file operand"
cp -p /cluster/data/fr2/jkStuff/liftAll.lft .
cp -p /cluster/data/fr2/chrom.sizes .
cp -p /cluster/data/fr2/fr2.2bit .
cp -p /cluster/data/fr2/noUn/*2bit .
cp -p /cluster/data/fr2/noUn/*sizes .
## Edit the kent/src/hg/makeDb/doc/gasAcu1.txt doc to show what we're going to
## do there.
##
## To display the new chains and nets in the gasAcu1 browser, we had to edit the
## trackDb.ra file to include the new chain and net:
##
## track chainFr2
## shortLabel $o_db Chain
## longLabel $o_Organism ($o_date/$o_db) Chained Alignments
## group compGeno
## priority 140
## visibility hide
## color 100,50,0
## altColor 255,240,200
## matrix 16 91,-90,-25,-100,-90,100,-100,-25,-25,-100,100,-90,-100,-25,-90,91
## spectrum on
## type chain fr2
## otherDb fr2
## track netFr2
## shortLabel $o_db Net
## longLabel $o_Organism ($o_date/$o_db) Alignment Net
## group compGeno
## priority 140.1
## visibility hide
## spectrum on
## type netAlign fr2 chainFr2
## otherDb fr2
## Then we verified that the chain covered a higher percentage of gasAcu1 than fr1 did:
## Look into the gasAcu1.txt file to see the commands going on there.
## Then we needed to update the links to the detail pages of chains and nets:
## /cluster/home/cmclean/kent/src/hg/makeDb/trackDb/chainFr2.html and netFr2.html
###########################################################
#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2007-01-24 - Hiram and Cory)
# This will find repeats within the genome that should not be matched
# against. Uses 11-mers.
# Use -repMatch=128 (based on size -- for human we use 1024, and
# fugu size is ~12% of human judging by gapless fr2 vs. hg18
# genome sizes from featureBits.
featureBits hg18 gap
featureBits -countGaps hg18 gap
ssh kolossus
blat /cluster/data/fr2/fr2.2bit /dev/null /dev/null -tileSize=11 \
-makeOoc=/cluster/data/fr2/11.ooc -repMatch=128
# Wrote 8898 overused 11-mers to /cluster/data/fr2/11.ooc
cp -p /cluster/data/fr2/11.ooc /cluster/bluearc/fugu/fr2
cp -p /cluster/data/fr2/jkStuff/liftAll.lft /cluster/bluearc/fugu/fr2
#########################################################################
# GENBANK AUTO UPDATE (DONE - 2007-01-24 - Hiram and Cory)
# Make a liftAll.lft that specifies 5M chunks for genbank:
# Actually not necessary, since our chunks are small enough. If we had
# to, would look like this:
# ssh kkstore05
# cd /cluster/data/fr2
# simplePartition.pl fr2.2bit 5000000 /tmp/fr2
# cat /tmp/fr2/*/*.lft > jkStuff/liftAll.lft
# rm -r /tmp/fr2
# align with latest genbank process.
ssh hgwdev
cd ~/kent/src/hg/makeDb/genbank
cvsup
# edit etc/genbank.conf to add fr2 just after fr1
# fr2
fr2.serverGenome = /cluster/data/fr2/fr2.2bit
fr2.clusterGenome = /cluster/bluearc/fugu/fr2/fr2.2bit
fr2.ooc = /cluster/bluearc/fugu/fr2/11.ooc
fr2.align.unplacedChroms = chrUn
fr2.lift = /cluster/bluearc/fugu/fr2/liftAll.lft
fr2.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter}
fr2.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
fr2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
fr2.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
fr2.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter}
fr2.genbank.mrna.xeno.loadDesc = yes
fr2.refseq.mrna.native.load = no
fr2.refseq.mrna.xeno.load = no
cvs ci -m "Added fr2." etc/genbank.conf
# update /cluster/data/genbank/:
make etc-update
# Edit src/lib/gbGenome.c to add new species. Not necessary here since
# fugu already exists.
#
# cvs ci -m "Added Oryzias latipes (medaka)." src/lib/gbGenome.c
# make install-server
cd /cluster/data/genbank
screen
# This is a call to a script that will push our jobs out to the cluster
# since it's a big job.
nice -n +19 bin/gbAlignStep -initial fr2 &
# logFile: var/build/logs/2007.01.24-12:09:11.fr2.initalign.log
# We had an error because machine kkr4u02 was unable to ssh to. This
# happened in the middle of the -run subroutine.
para problems > problems.out 2>&1
grep host problems.out | sort | uniq -c | sort -n
# 62021 host: kkr4u02.kilokluster.ucsc.edu
parasol list machines | grep kkr4u02
# kkr4u02.kilokluster.ucsc.edu
parasol remove machine kkr4u02.kilokluster.ucsc.edu "unable to ssh"
para push -retries=5
# updated job database on disk
# Pushed Jobs: 14820
# Retried jobs: 14820
# We still need to finish the rest of the gbAlignStep since it failed
# because the -run subroutine did not finish correctly. We must manually
# call the rest of the routine.
nice -n 19 bin/gbAlignStep -continue=finish -initial fr2 &
# logFile: var/build/logs/2007.01.24-15:48:19.fr2.initalign.log
# load database when finished
ssh hgwdev
cd /cluster/data/genbank
time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad fr2
# enable daily alignment and update of hgwdev (DONE - 2007-02-05 - Hiram)
cd ~/kent/src/hg/makeDb/genbank
cvsup
# add fr2 to:
etc/align.dbs
etc/hgwdev.dbs
cvs ci -m "Added fr2." etc/align.dbs etc/hgwdev.dbs
make etc-update
#########################################################################
# ENSEMBL GENES (DONE - 2007-01-24 - Hiram and Cory)
# Good luck with the biomart interface. It seems to be different each time
# it is used.
mkdir /cluster/data/fr2/bed/ensembl
cd /cluster/data/fr2/bed/ensembl
# Get the ensembl gene data from
# http://www.biomart.org/biomart/martview/
#
# Follow this sequence through the pages:
# The default dataset will be Human. In the right side
# frame, if you scroll it up and down, you will come to a pull-down
# dataset menu where you can select the organism. There appear to be
# two pull-down menus in this frame, one for:
# Database: ENSEMBL 42 GENE (SANGER)
# and you can select the second:
# Dataset: Takifugu rubripes genes (FUGU4)
# After selecting the Dataset, in the left frame, click on the
# "Attributes" (Structures) label, now the right frame changes to radio
# buttons, Features, Sequences, Structures
# Click the "Structures" button, the three optional buttons can be
# expanded to select elements of these:
# REGION:
# GENE:
# EXON:
# REGION: has Chromosome checked
# GENE: has Ensembl Gene ID and Biotype selected
# EXON: has no selections
# In the GENE: menu:
# Unselect Biotype
# and Select
# Ensembl Gene ID
# Ensembl Transcript ID
# External Gene ID
# Click on the "Filters" section on the left-side frame.
# Under GENE in the right-side frame, select Gene type checkbox and
# highlight "protein_coding".
# Check: Click "Count" from top buttons, and 22,008/22,409 genes will
# be reported.
# Then, in the black menu bar above these frames, click the "Results"
# it shows the first ten rows. For this organism, there appear to be no
# External Gene ID in the HTML view. Change the "Display maximum"
# "rows as" pull-down
# to GFF, and use the "Export all results to" pull-down to
# Compressed web file (notify by email), press the "Go" button to download.
# After retrieving the URL where our data is located
# (http://www.biomart.org/biomart/martresults?file=martquery_0124222017_859.txt.gz),
# we get it from that place:
wget http://www.biomart.org/biomart/martresults?file=martquery_0124222017_859.txt.gz \
-O fr2.ensembl42.gff.gz
# Ensembl gives us coordinates on each scaffold relative to the beginning
# of *that scaffold.* We want to have them in chrUn coordinates instead.
# We use the liftUp program on the Ensembl data, with our liftAll.lft file
# we created a long time ago, to achieve this, and name it
# fr2.ensembl42.protein_coding.gff since we only took the protein coding
# genes.
zcat fr2.ensembl42.gff.gz | liftUp -type=.gff fr2.ensembl42.protein_coding.gff \
../../jkStuff/liftAll.lft error stdin
# On other files, sometimes we need to massage the input names to match
# the UCSC naming conventions. Below is an example, though it was not
# necessary today.
# Add "chr" to front of each line in the gene data gtf file to make
# it compatible with our software, and liftUp to get scaffolds onto chrUn
# The scaffolds and ultracontigs mentioned in this are not the scaffolds
# we have in our lift file for chrUn ... can't use them.
# zcat oryLat1.ensembl42.gff.gz | egrep -v "scaffold|ultracontig" \
# | sed -e "s/^\([0-9][0-9]*\)/chr\1/" | gzip -c > ensembl42.gff.gz
# Verify names OK:
# zcat ensembl42.gff.gz | awk '{print $1}' | sort | uniq -c
# 22938 chr1
# 15887 chr10
# 18645 chr11
# 20162 chr12
# 21474 chr13
# 21302 chr14
# 20148 chr15
# 22978 chr16
# 26164 chr17
# 13671 chr18
# 17109 chr19
# 10988 chr2
# 15705 chr20
# 21361 chr21
# 21030 chr22
# 12984 chr23
# 19009 chr24
# 21767 chr3
# 26204 chr4
# 24335 chr5
# 24300 chr6
# 24329 chr7
# 23323 chr8
# 27795 chr9
cd /cluster/data/fr2/bed/ensembl
ldHgGene -gtf -genePredExt fr2 ensGene fr2.ensembl42.protein_coding.gff
# Read 22102 transcripts in 407112 lines in 1 files
# 22102 groups 1 seqs 1 sources 4 feature types
# 22102 gene predictions
# The genome-test database will already populate this into our browser
# since Ensembl genes are a default track. However, the link to the
# Ensembl website is broken because it automatically assumes we are
# referencing the human genome. We need to edit the
# ~/kent/src/hg/makeDb/trackDb/fugu/trackDb.ra file in the ensGene track
# and update the URL to point to the correct spot. In this case, we want
# to change the incorrect URL:
# url http://www.ensembl.org/perl/transview?transcript=$$
# (NOTE(review): stray "track genscan" text removed here; it looked
# like a paste artifact from an adjacent trackDb stanza)
# to the correct URL:
# url http://dec2006.archive.ensembl.org/Takifugu_rubripes/transview?transcript=$$
# Also, we add the following lines at the end
# urlName gene
# archive dec2006
## Now we want to get the proteins that are derived from these genes as a link at
# the top of the detail browser.
# ensGtp associates geneId/transcriptId/proteinId for hgPepPred and
# hgKnownToSuper. Use ensMart to create it as above, except:
# for the Attributes, choose the "Features" box, and then In "GENE:"
# select Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.
# Results, choose txt output and a Compressed web file (notify by email).
# Save this as
# ensGtp42.txt.gz
wget http://www.biomart.org/biomart/martresults?file=martquery_0124225551_471.txt.gz \
-O ensGtp42.txt.gz
# Strip the first line, which is merely the column heading labels
zcat ensGtp42.txt.gz | headRest 1 stdin | sed -e "s/ /\t/g" > ensGtp.txt
# We want to load our genes, but unfortunately the gene names are larger
# than the standard ensGtp.sql file that the following command handles:
# hgLoadSqlTab fr2 ensGtp ~/kent/src/hg/lib/ensGtp.sql ensGtp.txt
# Instead, we make our own temporary .sql file to handle the insert.
sed -e "s/20/21/; s/18/21/" ~/kent/src/hg/lib/ensGtp.sql > ensGtp.bigcols.sql;
# And then perform the insert.
hgLoadSqlTab fr2 ensGtp ensGtp.bigcols.sql ensGtp.txt
rm ensGtp.bigcols.sql
hgsql -N -e "select count(*) from ensGtp;" fr2
# +-------+
# | 22102 |
# +-------+
wc -l ensGtp.txt
# 22102 ensGtp.txt
# Load Ensembl peptides:
# Get them from ensembl as above in the gene section except for
# for the Attributes, choose the "Sequences" box, and then
# SEQUENCES: Peptide and
# Header Information "Ensembl Transcript ID"
# Results output as FASTA
# Save file as ensPep.fa.gz
# XXX Still waiting for the proteins to be sent to us so that we can add
# this part.
# hgPepPred fr2 generic ensPep ensPep.fa
############################################################################
# BLATSERVERS ENTRY (DONE - 2007-01-25 - Hiram)
# After getting a blat server assigned by the Blat Server Gods,
ssh hgwdev
hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("fr2", "blat3", "17786", "1", "0"); \
INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
VALUES ("fr2", "blat3", "17787", "0", "1");' \
hgcentraltest
# test it with some sequence
#########################################################################
# BLASTZ/CHAIN/NET Hg18 (DONE - 2007-01-26 - Hiram)
## Swap back to fr2
mkdir /cluster/data/fr2/bed/blastz.hg18.swap
cd /cluster/data/fr2/bed/blastz.hg18.swap
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/hg18/bed/blastz.fr2.2007-01-24/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -swap > swap.log 2>&1 &
# real 47m14.554s
ssh hgwdev
cd /cluster/data/fr2/bed/blastz.hg18.swap
time nice -n +19 featureBits fr2 chainHg18Link \
> fb.fr2.chainHg18Link.txt 2>&1 &
# 42875664 bases of 393312790 (10.901%) in intersection
###########################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-01-26)
ssh kkstore02
bash # if not using bash shell already
mkdir /cluster/data/fr2/blastDb
cd /cluster/data/fr2
zcat jgi/fugu.041029.scaffolds.fasta.gz > temp.fa
faSplit sequence temp.fa 500 blastDb/
rm temp.fa
cd blastDb
for i in *.fa
do
/cluster/bluearc/blast229/formatdb -i $i -p F
done
rm *.fa
mkdir -p /san/sanvol1/scratch/fr2/blastDb
cd /cluster/data/fr2/blastDb
for i in nhr nin nsq;
do
echo $i
cp *.$i /san/sanvol1/scratch/fr2/blastDb
done
mkdir -p /cluster/data/fr2/bed/tblastn.hg18KG
cd /cluster/data/fr2/bed/tblastn.hg18KG
echo /san/sanvol1/scratch/fr2/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//" > query.lst
wc -l query.lst
# 495 query.lst
# we want around 100000 jobs
calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(100000/`wc query.lst | awk "{print \\\$1}"`\)
# 36727/(100000/495) = 181.798650
mkdir -p /cluster/bluearc/fr2/bed/tblastn.hg18KG/kgfa
split -l 180 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl /cluster/bluearc/fr2/bed/tblastn.hg18KG/kgfa/kg
ln -s /cluster/bluearc/fr2/bed/tblastn.hg18KG/kgfa kgfa
cd kgfa
for i in *; do
nice pslxToFa $i $i.fa;
rm $i;
done
cd ..
ls -1S kgfa/*.fa > kg.lst
mkdir -p /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut
ln -s /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut
for i in `cat kg.lst`; do mkdir blastOut/`basename $i .fa`; done
tcsh
cd /cluster/data/fr2/bed/tblastn.hg18KG
# blastGsub: gensub2 template -- one parasol job per (blastDb chunk,
# query fasta) pair; $(path1) is the blast database chunk, $(path2)
# the split hg18KG protein fasta.  The {check out exists} clause makes
# parasol treat a missing output psl as job failure.
cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
# blastSome: retry blastall with progressively stricter e-value cutoffs
# until one run completes successfully (presumably to bound output size
# on repeat-rich chunks -- TODO confirm), convert the report to psl,
# lift query coordinates back to whole proteins, and install the output
# only if pslCheck passes.  Note it exits 0 even when pslCheck fails;
# the {check out exists} in blastGsub is what flags the missing output.
# Exit 1 (all e-values failed) lets parasol retry the job.
cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
mv $f.8 $f.1
break;
fi
done
if test -f $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2
if pslCheck -prot $3.tmp
then
mv $3.tmp $3
rm -f $f.1 $f.2 $f.3 $f.4
fi
exit 0
fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
# << happy emacs
chmod +x blastSome
gensub2 query.lst kg.lst blastGsub blastSpec
exit # back to bash
ssh pk
cd /cluster/data/fr2/bed/tblastn.hg18KG
para create blastSpec
# para try, check, push, check etc.
para time
# Completed: 101475 of 101475 jobs
# CPU time in finished jobs: 3924977s 65416.29m 1090.27h 45.43d 0.124 y
# IO & Wait Time: 1118884s 18648.06m 310.80h 12.95d 0.035 y
# Average job time: 50s 0.83m 0.01h 0.00d
# Longest finished job: 3925s 65.42m 1.09h 0.05d
# Submission to last job: 46557s 775.95m 12.93h 0.54d
ssh kkstore04
cd /cluster/data/fr2/bed/tblastn.hg18KG
tcsh
mkdir chainRun
cd chainRun
cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
'_EOF_'
chmod +x chainOne
ls -1dS /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
gensub2 chain.lst single chainGsub chainSpec
# do the cluster run for chaining
ssh kk
cd /cluster/data/fr2/bed/tblastn.hg18KG/chainRun
para create chainSpec
para try, check, push, check etc.
# Completed: 205 of 205 jobs
# CPU time in finished jobs: 2262s 37.70m 0.63h 0.03d 0.000 y
# IO & Wait Time: 40989s 683.15m 11.39h 0.47d 0.001 y
# Average job time: 211s 3.52m 0.06h 0.00d
# Longest finished job: 360s 6.00m 0.10h 0.00d
# Submission to last job: 1492s 24.87m 0.41h 0.02d
ssh kkstore04
cd /cluster/data/fr2/bed/tblastn.hg18KG/blastOut
bash # if using another shell
for i in kg??
do
cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
awk "((\$1 / \$11) ) > 0.60 { print }" c60.$i.psl > m60.$i.psl
echo $i
done
sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/fr2/bed/tblastn.hg18KG/unliftBlastHg18KG.psl
cd ..
pslCheck unliftBlastHg18KG.psl
liftUp blastHg18KG.psl ../../jkStuff/liftAll.lft warn unliftBlastHg18KG.psl
# load table
ssh hgwdev
cd /cluster/data/fr2/bed/tblastn.hg18KG
hgLoadPsl fr2 blastHg18KG.psl
# check coverage
featureBits fr2 blastHg18KG
# 19761405 bases of 393312790 (5.024%) in intersection
featureBits fr2 refGene:cds blastHg18KG -enrichment
# ensGene:cds 8.216%, blastHg18KG 5.024%, both 4.401%, cover 53.57%, enrich 10.66x
# NOTE(review): the output reports ensGene:cds although the command
# above was recorded as refGene:cds -- presumably ensGene:cds was the
# command actually run (fr2 has Ensembl genes loaded, see above)
ssh kkstore04
rm -rf /cluster/data/fr2/bed/tblastn.hg18KG/blastOut
rm -rf /cluster/bluearc/fr2/bed/tblastn.hg18KG/blastOut
#end tblastn
#########################################################################
# BLASTZ/CHAIN/NET TetNig1 SWAP (DONE - 2007-01-29 - Hiram)
## Align to fr2 scaffolds,
## results lifted to fr2 chrUn coordinates
## Swap to fr2
mkdir /cluster/data/fr2/bed/blastz.tetNig1.swap
cd /cluster/data/fr2/bed/blastz.tetNig1.swap
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/tetNig1/bed/blastz.fr2.2007-01-25/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-bigClusterHub=pk -swap > swap.log 2>&1 &
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/tetNig1/bed/blastz.fr2.2007-01-25/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-continue=net -bigClusterHub=pk -swap > net_swap.log 2>&1 &
# real 40m40.471s
ssh hgwdev
cd /cluster/data/tetNig1/bed/blastz.fr2.2007-01-25
time nice -n +19 featureBits tetNig1 chainFr2Link \
> fb.tetNig1.chainFr2Link.txt 2>&1
# 246828605 bases of 342403326 (72.087%) in intersection
cd /cluster/data/fr2/bed/blastz.tetNig1.swap
time nice -n +19 featureBits fr2 chainTetNig1Link \
> fb.fr2.chainTetNig1.txt 2>&1
# 247086553 bases of 393312790 (62.822%) in intersection
#########################################################################
# BLASTZ/CHAIN/NET gasAcu1 swap (DONE - 2007-01-23 - Hiram)
## no chrUn in gasAcu1, and align to fr2 scaffolds,
## results lifted to fr2 chrUn coordinates
ssh kkstore05
mkdir /cluster/data/fr2/bed/blastz.gasAcu1.swap
cd /cluster/data/fr2/bed/blastz.gasAcu1.swap
time doBlastzChainNet.pl \
/cluster/data/gasAcu1/bed/blastz.fr2.2007-01-23/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -bigClusterHub=pk > swap.log 2>&1 &
# real 24m33.761s
ssh hgwdev
cd /cluster/data/fr2/bed/blastz.gasAcu1.swap
time nice -n 19 featureBits fr2 chainGasAcu1Link \
> fb.fr2.chainGasAcu1Link.txt 2>&1 &
# 158383996 bases of 393312790 (40.269%) in intersection
#########################################################################
## BLASTZ/CHAIN/NET to gasAcu1 chrUn - the above swap does not include
## gasAcu1 chrUn - thus its browser would be empty for any fr2 alignments.
## This procedure will get fr2 alignments added to the gasAcu1 browser
## for chrUn
ssh kkstore02
mkdir /cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31
cd /cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31
cat << '_EOF_' > DEF
# Stickleback vs. Fugu, Stickleback chrUn in contigs only, to fugu contigs
# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# TARGET: Fugu fr2
# Align to the scaffolds, results lifed up to chrUn.sdTrf
# coordinates
SEQ1_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ1_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ1_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ1_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ1_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ1_CHUNK=20000000
SEQ1_LIMIT=30
SEQ1_LAP=10000
# QUERY: Stickleback gasAcu1 chrUn only
# chrUn in contigs for this alignment run
# The largest is 418,000 bases and there are 5,000 of them.
SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUn.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUn.sdTrf.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUnContigsOnly.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.chrUnContigsOnly.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift
SEQ2_CHUNK=1000000
SEQ2_LIMIT=30
SEQ2_LAP=0
BASE=/cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31
TMPDIR=/scratch/tmp
'_EOF_'
# << happy emacs
time doBlastzChainNet.pl DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-blastzOutRoot /cluster/bluearc/fr2GasAcu1 \
-stop=net -bigClusterHub=pk > do.log 2>&1 &
# real 73m13.156s
## swap back to gasAcu1
mkdir /cluster/data/gasAcu1/bed/blastz.fr2.swap
cd /cluster/data/gasAcu1/bed/blastz.fr2.swap
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/fr2/bed/blastz.gasAcu1.2007-01-31/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -stop=chainMerge -bigClusterHub=pk > swap.log 2>&1 &
## Now, with that chain result in hand, place it manually back in with
## the full chroms chains and re-run the nets and so forth.
#########################################################################
## swap oryLat1 results back to fr2 (DONE - 2007-01-24 - Hiram)
mkdir /cluster/data/fr2/bed/blastz.oryLat1.swap
cd /cluster/data/fr2/bed/blastz.oryLat1.swap
# reuse the oryLat1-target DEF; -swap produces the fr2-target chains/nets
time doBlastzChainNet.pl -verbose=2 \
/cluster/data/oryLat1/bed/blastz.fr2.2007-01-24/DEF \
-chainMinScore=2000 -chainLinearGap=loose \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-swap -bigClusterHub=pk > swap.log 2>&1 &
ssh hgwdev
cd /cluster/data/oryLat1/bed/blastz.fr2.2007-01-24
time nice -n +19 featureBits oryLat1 chainFr2Link \
> fb.oryLat1.chainFr2Link.txt 2>&1
# 177508958 bases of 700386597 (25.344%) in intersection
# fixed: stray ']' removed from the end of this cd path
cd /cluster/data/fr2/bed/blastz.oryLat1.swap
time nice -n +19 featureBits fr2 chainOryLat1Link \
> fb.fr2.chainOryLat1Link.txt 2>&1
# 143996507 bases of 393312790 (36.611%) in intersection
#########################################################################
## 5-Way Multiz (DONE - 2007-02-03 - Hiram)
ssh hgwdev
mkdir /cluster/data/fr2/bed/multiz5way
cd /cluster/data/fr2/bed/multiz5way
cp /cluster/data/gasAcu1/bed/multiz8way/8way.nh .
/cluster/bin/phast/tree_doctor \
--prune human_hg18,mouse_mm8,chicken_galGal3 8way.nh
# use the output of that to manually construct this tree.
# Arbitrarily set 0.2 distances for this added branch
# All other distances remain as specified in the source 8way.nh
# (the "17way.nh" previously named here was stale copy-paste; the tree
# copied above is gasAcu1's 8way.nh)
cat << '_EOF_' > 5way.nh
(((tetraodon_tetNig1:0.199381,fugu_fr2:0.239894):0.2,
(stickleback_gasAcu1:0.2,medaka_oryLat1:0.2):0.2):0.292961,
zebrafish_danRer4:0.782561);
'_EOF_'
# << happy emacs
# Use this specification in the phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to obtain a gif image for htdocs/images/phylo/fr2_5way.gif
/cluster/bin/phast/all_dists 5way.nh > 5way.distances.txt
# Use this output to create the table below
grep -y fr2 5way.distances.txt | sort -k3,3n
#
# If you can fill in all the numbers in this table, you are ready for
# the multiple alignment procedure
#
# featureBits chainLink measures
# chainGasAcu1Link chain linearGap
# distance on fr2 on other minScore
# 1 0.4393 - tetraodon tetNig1 (% 62.822) (% 72.087) 2000 loose
# 2 0.8399 - medaka oryLat1 (% 36.611) (% 25.344) 2000 loose
# 3 0.8399 - stickleback gasAcu1 (% 40.269) (% 37.574) 2000 loose
# 4 1.5156 - zebrafish danRer4 (% 20.585) (% 8.543) 5000 loose
cd /cluster/data/fr2/bed/multiz5way
# bash shell syntax here ...
export H=/cluster/data/fr2/bed
mkdir mafLinks
for G in oryLat1 tetNig1 gasAcu1 danRer4
do
mkdir mafLinks/$G
if [ ! -d ${H}/blastz.${G}/mafNet ]; then
echo "missing directory blastz.${G}/mafNet"
exit 255
fi
ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G
done
# Copy MAFs to some appropriate NFS server for kluster run
ssh kkstore02
mkdir /san/sanvol1/scratch/fr2/multiz5way
cd /san/sanvol1/scratch/fr2/multiz5way
time rsync -a --copy-links --progress \
/cluster/data/fr2/bed/multiz5way/mafLinks/ .
mkdir penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
# the autoMultiz cluster run, there are only 2 jobs, kki is perfect
ssh kki
cd /cluster/data/fr2/bed/multiz5way/
# create species list and stripped down tree for autoMZ
sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
5way.nh > tmp.nh
echo `cat tmp.nh` > tree-commas.nh
echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
mkdir run maf
cd run
# NOTE: you need to set the db and multiz dirname properly in this script
cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = fr2
set c = $1
set maf = $2
set binDir = /san/sanvol1/scratch/$db/multiz5way/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz5way
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
set in = $pairs/$s/$c.maf
set out = $db.$s.sing.maf
if ($s == $db) then
continue
endif
if (-e $in.gz) then
zcat $in.gz > $out
else if (-e $in) then
cp $in $out
else
echo "##maf version=1 scoring=autoMZ" > $out
endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
# << happy emacs
chmod +x autoMultiz
cat << '_EOF_' > template
#LOOP
autoMultiz $(root1) {check out line+ /cluster/data/fr2/bed/multiz5way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
# << happy emacs
awk '{print $1}' /cluster/data/fr2/chrom.sizes > chrom.lst
gensub2 chrom.lst single template jobList
para create jobList
# 2 jobs
para try ... check ... push ... etc ...
# Completed: 2 of 2 jobs
# CPU time in finished jobs: 7246s 120.77m 2.01h 0.08d 0.000 y
# IO & Wait Time: 357s 5.94m 0.10h 0.00d 0.000 y
# Average job time: 3802s 63.36m 1.06h 0.04d
# Longest finished job: 7601s 126.68m 2.11h 0.09d
# Submission to last job: 7601s 126.68m 2.11h 0.09d
# combine results into a single file for loading and gbdb reference
ssh kkstore02
cd /cluster/data/fr2/bed/multiz5way
nice -n +19 catDir maf > multiz5way.maf
# makes a 1.3 Gb file:
# -rw-rw-r-- 1 1341786986 Feb 3 19:52 multiz5way.maf
############################################################################
# ANNOTATE MULTIZ5WAY MAF AND LOAD TABLES (DONE - 2007-02-03 - Hiram)
## re-done 2007-03-27 with corrected nBeds and sizes files - Hiram
ssh kolossus
mkdir /cluster/data/fr2/bed/multiz5way/anno
cd /cluster/data/fr2/bed/multiz5way/anno
mkdir maf run
cd run
rm -f sizes nBeds
twoBitInfo -nBed /cluster/data/fr2/fr2.{2bit,N.bed}
for DB in `cat /cluster/data/fr2/bed/multiz5way/species.lst`
do
ln -s /cluster/data/${DB}/chrom.sizes ${DB}.len
ln -s /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
echo ${DB}.bed >> nBeds
echo ${DB}.len >> sizes
echo $DB
done
echo '#!/bin/csh -ef' > jobs.csh
echo date >> jobs.csh
# do smaller jobs first so you can see some progress immediately:
for F in `ls -1rS ../../maf/*.maf`
do
echo mafAddIRows -nBeds=nBeds -sizes=sizes $F \
/cluster/data/fr2/fr2.2bit ../maf/`basename $F` >> jobs.csh
echo "echo $F" >> jobs.csh
done
echo date >> jobs.csh
chmod +x jobs.csh
time nice -n +19 ./jobs.csh > jobs.log 2>&1 &
# to watch progress:
tail -f jobs.log
# real 165m43.716s
# Load anno/maf
ssh hgwdev
cd /cluster/data/fr2/bed/multiz5way/anno/maf
mkdir -p /gbdb/fr2/multiz5way/anno/maf
ln -s /cluster/data/fr2/bed/multiz5way/anno/maf/*.maf \
/gbdb/fr2/multiz5way/anno/maf
time nice -n +19 hgLoadMaf \
-pathPrefix=/gbdb/fr2/multiz5way/anno/maf fr2 multiz5way
# Loaded 1469786 mafs in 2 files from /gbdb/fr2/multiz5way/anno/maf
# real 0m41.123s
# Do the computation-intensive part of hgLoadMafSummary on a workhorse
# machine and then load on hgwdev:
ssh kolossus
cd /cluster/data/fr2/bed/multiz5way/anno/maf
time cat *.maf | \
nice -n +19 hgLoadMafSummary fr2 -minSize=30000 -mergeGap=1500 \
-maxSize=200000 -test multiz5waySummary stdin
# Created 146928 summary blocks from 3159326 components
# and 1469786 mafs from stdin
# real 0m58.171s
ssh hgwdev
cd /cluster/data/fr2/bed/multiz5way/anno/maf
sed -e 's/mafSummary/multiz5waySummary/' ~/kent/src/hg/lib/mafSummary.sql \
> /tmp/multiz5waySummary.sql
time nice -n +19 hgLoadSqlTab fr2 multiz5waySummary \
~/kent/src/hg/lib/mafSummary.sql multiz5waySummary.tab
# real 0m1.941s
# NOTE(review): the sed-generated /tmp/multiz5waySummary.sql above appears
# unused -- hgLoadSqlTab was given the original mafSummary.sql; verify intent
#######################################################################
# MULTIZ5WAY MAF FRAMES (DONE - 2007-02-03 - Hiram)
ssh hgwdev
mkdir /cluster/data/fr2/bed/multiz5way/frames
cd /cluster/data/fr2/bed/multiz5way/frames
mkdir genes
# The following is adapted from the gasAcu1 sequence
#------------------------------------------------------------------------
# get the genes for all genomes
# mRNAs with CDS. single select to get cds+psl, then split that up and
# create genePred
# using refGene for danRer4
for qDB in danRer4
do
geneTbl=refGene
# echo the query for the record, then run it; cut -f 2-100 drops the
# first column (presumably bin -- confirm against table schema) and
# genePredSingleCover reduces to single-coverage genePreds for mafFrames
echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB}
hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-100 \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$qDB.tmp.gz
mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
# NOTE(review): $tmpExt is never set in this loop (leftover from the
# gasAcu1 doc this was adapted from), so this rm is a no-op
rm -f $tmpExt
done
# using genscan for tetNig1
# using ensGene for gasAcu1, oryLat1 and fr2
# genePreds; (must keep only the first 10 columns for knownGene)
for qDB in gasAcu1 oryLat1 fr2 tetNig1
do
# pick the gene table per assembly: ensGene where available, genscan
# for tetNig1; exit loudly on an unexpected db name
if [ $qDB = "gasAcu1" -o $qDB = "oryLat1" -o $qDB = "fr2" ]; then
geneTbl=ensGene
elif [ $qDB = "tetNig1" ]; then
geneTbl=genscan
else
exit 255
fi
# echo the query for the record, then run it; keep only the first 10
# genePred columns and reduce to single coverage for mafFrames input
echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB}
hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \
| genePredSingleCover stdin stdout | gzip -2c \
> /scratch/tmp/$qDB.tmp.gz
mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
# NOTE(review): $tmpExt is never set in this loop (leftover from the
# gasAcu1 doc this was adapted from), so this rm is a no-op
rm -f $tmpExt
done
###
ssh kkstore02
cd /cluster/data/fr2/bed/multiz5way/frames
time cat ../maf/*.maf | nice -n +19 genePredToMafFrames fr2 stdin stdout fr2 genes/fr2.gp.gz gasAcu1 genes/gasAcu1.gp.gz oryLat1 genes/oryLat1.gp.gz tetNig1 genes/tetNig1.gp.gz danRer4 genes/danRer4.gp.gz | gzip > multiz5way.mafFrames.gz
# real 0m52.606
ssh hgwdev
cd /cluster/data/fr2/bed/multiz5way/frames
time nice -n +19 hgLoadMafFrames fr2 multiz5wayFrames \
multiz5way.mafFrames.gz
# real 0m20.580s
#########################################################################
# MULTIZ5WAY DOWNLOADABLES (DONE - 2007-02-05 - Hiram)
ssh hgwdev
mkdir /cluster/data/fr2/bed/multiz5way/mafDownloads
cd /cluster/data/fr2/bed/multiz5way
# upstream mafs
# rebuilt 2007-12-21 to fix difficulty in mafFrags when species.lst
# did not have fr2 as the first one
# There isn't any refGene table, using ensGene instead
for S in 1000 2000 5000
do
echo "making upstream${S}.maf"
nice -n +19 $HOME/bin/$MACHTYPE/featureBits -verbose=2 fr2 \
ensGene:upstream:${S} -fa=/dev/null -bed=stdout \
| perl -wpe 's/_up[^\t]+/\t0/' \
| $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags fr2 multiz5way \
stdin stdout -orgs=species.lst \
| gzip -c > mafDownloads/ensGene.upstream${S}.maf.gz
echo "done ensGene.upstream${S}.maf.gz"
done
ssh kkstore05
cd /cluster/data/fr2/bed/multiz5way
## re-done 2007-03-27 after correction to annotation step - Hiram
time for M in anno/maf/chr*.maf
do
B=`basename $M`
nice -n +19 gzip -c ${M} > mafDownloads/${B}.gz
echo ${B}.gz done
done
# real 4m39.440s
cd mafDownloads
nice -n +19 md5sum *.maf.gz > md5sum.txt
# Make a README.txt
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/fr2/multiz5way
cd /usr/local/apache/htdocs/goldenPath/fr2/multiz5way
ln -s /cluster/data/fr2/bed/multiz5way/mafDownloads/{*.gz,*.txt} .
############################################################################
# CREATE CONSERVATION WIGGLE WITH PHASTCONS
# (DONE - 2007-02-05 - Hiram)
# Estimate phastCons parameters
ssh kkstore05
mkdir /cluster/data/fr2/bed/multiz5way/cons
cd /cluster/data/fr2/bed/multiz5way/cons
# Create a starting-tree.mod based on one 25,000,000 window of chrUn
time nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chrUn.maf \
--refseq ../../../Un/chrUn.fa --in-format MAF \
--windows 25000000,1000 --out-format SS \
--between-blocks 5000 --out-root s1
# real 4m27.989s
time nice -n +19 /cluster/bin/phast/$MACHTYPE/phyloFit -i SS \
s1.174992629-199991578.ss \
--tree "(((tetNig1,fr2),(gasAcu1,oryLat1)),danRer4)" \
--out-root starting-tree
# As an experiment, ran all of these ss files through this prediction
# and the resulting stats of the add up the C and G:
## min Q1 median Q3 max mean N sum stddev
# 0.45 0.457 0.463 0.469 0.479 0.461941 17 7.853 0.00759621
# Using the one closest to the mean: s1.174992629-199991578.ss
rm s1.*.ss
# add up the C and G:
grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
# 0.463
# This 0.463 is used in the --gc argument below
## the fa files are needed for the sequence and they are created during
# this loop if they haven't been done before
# Create SS files on san filesystem
ssh kkstore05
mkdir -p /san/sanvol1/scratch/fr2/cons/ss
cd /san/sanvol1/scratch/fr2/cons/ss
time for C in \
`awk '{print $1}' /cluster/data/fr2/chrom.sizes | sed -e "s/chr//"`
do
mkdir -p chr${C}
echo msa_split $C
nice -n +19 /cluster/bin/phast/$MACHTYPE/msa_split \
/cluster/data/fr2/bed/multiz5way/maf/chr${C}.maf \
--refseq /cluster/data/fr2/${C}/chr${C}.fa \
--in-format MAF --windows 2500000,0 --between-blocks 5000 \
--out-format SS --out-root chr${C}/chr${C}
done &
# real 4m2.736s
# Create a random list of 50 1 mb regions
cd /san/sanvol1/scratch/fr2/cons/ss
ls -1l chr*/chr*.ss \
| awk '$5 > 4000000 {print $9;}' \
| randomLines stdin 50 ../randomSs.list
# Set up parasol directory to calculate trees on these 50 regions
ssh pk
mkdir /san/sanvol1/scratch/fr2/cons/treeRun1
cd /san/sanvol1/scratch/fr2/cons/treeRun1
mkdir tree log
# Tuning this loop should come back to here to recalculate
# Create little script that calls phastCons with right arguments
# --target-coverage of 0.20 is about right for mouse, will be
# tuned exactly below
cat > makeTree.csh << '_EOF_'
#!/bin/csh -fe
# Estimate phastCons tree models for one SS window.
# $1 = chrN/filename.ss ; the :h modifier strips the file name, leaving
# the chrom subdirectory for the log/ and tree/ output paths.
set C=$1:h
mkdir -p log/${C} tree/${C}
# --gc 0.463 was measured from starting-tree.mod above; the coverage and
# expected-length values are the initial guesses this run is tuning
/cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \
/cluster/data/fr2/bed/multiz5way/cons/starting-tree.mod \
--gc 0.463 --nrates 1,1 --no-post-probs --ignore-missing \
--expected-length 10 --target-coverage 0.20 \
--quiet --log log/$1 --estimate-trees tree/$1
'_EOF_'
# << happy emacs
chmod a+x makeTree.csh
# Create gensub file
cat > template << '_EOF_'
#LOOP
./makeTree.csh $(path1)
#ENDLOOP
'_EOF_'
# << happy emacs
# Make cluster job and run it
gensub2 ../randomSs.list single template jobList
para create jobList
para try/push/check/etc
# Completed: 50 of 50 jobs
# CPU time in finished jobs: 5204s 86.74m 1.45h 0.06d 0.000 y
# IO & Wait Time: 204s 3.39m 0.06h 0.00d 0.000 y
# Average job time: 108s 1.80m 0.03h 0.00d
# Longest finished job: 138s 2.30m 0.04h 0.00d
# Submission to last job: 141s 2.35m 0.04h 0.00d
# Now combine parameter estimates. We can average the .mod files
# using phyloBoot. This must be done separately for the conserved
# and nonconserved models
ssh pk
cd /san/sanvol1/scratch/fr2/cons/treeRun1
ls -1 tree/chr*/*.cons.mod > cons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
--output-average ave.cons.mod > cons_summary.txt 2>&1 &
ls -1 tree/chr*/*.noncons.mod > noncons.list
/cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
--output-average ave.noncons.mod > noncons_summary.txt
cp -p ave.*.mod ..
cd ..
cp -p ave.*.mod /cluster/data/fr2/bed/multiz5way/cons
# measuring entropy
# consEntropy <target coverage> <expected lengths>
# ave.cons.mod ave.noncons.mod --NH 9.78
# never stops with the --NH argument
time /cluster/bin/phast/$MACHTYPE/consEntropy --NH 9.7834 \
0.20 10 ave.{cons,noncons}.mod
## 0.20 10
# ( Solving for new omega: 10.000000 10.467305 10.449210 10.449184 )
# Transition parameters: gamma=0.200000, omega=10.000000, mu=0.100000, nu=0.025000
# Relative entropy: H=0.903779 bits/site
# Expected min. length: L_min=10.726004 sites
# Expected max. length: L_max=6.735382 sites
# Phylogenetic information threshold: PIT=L_min*H=9.693936 bits
# Recommended expected length: omega=10.449184 sites (for L_min*H=9.783400)
ssh pk
# Create cluster dir to do main phastCons run
mkdir /san/sanvol1/scratch/fr2/cons/consRun1
cd /san/sanvol1/scratch/fr2/cons/consRun1
mkdir ppRaw bed
# Create script to run phastCons with right parameters
# This job is I/O intensive in its output files, thus it is all
# working over in /scratch/tmp/
cat > doPhast << '_EOF_'
#!/bin/csh -fe
# Run phastCons on one SS window; $1 = chrom, $2 = ss base name (no .ss).
# All work happens under node-local /scratch/tmp to spare the san the
# small-file I/O load.
mkdir /scratch/tmp/${2}
cp -p ../ss/${1}/${2}.ss ../ave.{cons,noncons}.mod /scratch/tmp/${2}
pushd /scratch/tmp/${2} > /dev/null
# produces ${2}.pp (posterior probabilities) and ${2}.bed (viterbi elements)
/cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss ave.cons.mod,ave.noncons.mod \
--expected-length 10 --target-coverage 0.20 --quiet \
--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
popd > /dev/null
# collect results per chrom, then dismantle the tmp directory
mkdir -p ppRaw/${1}
mkdir -p bed/${1}
mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
mv /scratch/tmp/${2}/${2}.bed bed/${1}
rm /scratch/tmp/${2}/ave.{cons,noncons}.mod
rm /scratch/tmp/${2}/${2}.ss
rmdir /scratch/tmp/${2}
'_EOF_'
# << happy emacs
chmod a+x doPhast
# root1 == chrom name, file1 == ss file name without .ss suffix
# Create gsub file
cat > template << '_EOF_'
#LOOP
./doPhast $(root1) $(file1)
#ENDLOOP
'_EOF_'
# << happy emacs
# Create parasol batch and run it
ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list
gensub2 in.list single template jobList
para create jobList
para try/check/push/etc.
# These jobs are very fast and very I/O intensive, even on the san
# they will hang it up as they work at full tilt.
# Completed: 162 of 162 jobs
# CPU time in finished jobs: 1501s 25.02m 0.42h 0.02d 0.000 y
# IO & Wait Time: 924s 15.40m 0.26h 0.01d 0.000 y
# Average job time: 15s 0.25m 0.00h 0.00d
# Longest finished job: 23s 0.38m 0.01h 0.00d
# Submission to last job: 44s 0.73m 0.01h 0.00d
# combine predictions and transform scores to be in 0-1000 interval
# it uses a lot of memory, so on kolossus:
ssh kolossus
cd /san/sanvol1/scratch/fr2/cons/consRun1
# The sed's and the sort get the file names in chrom,start order
# You might like to verify it is correct by first looking at the
# list it produces:
find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | less
# if that looks right, then let it run:
find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
| /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved5Way.bed
# ~ 16 seconds
cp -p mostConserved5Way.bed /cluster/data/fr2/bed/multiz5way
# Figure out how much is actually covered by the bed file as so:
# Get the non-n genome size from faSize on all chroms:
ssh kkstore05
cd /cluster/data/fr2
faSize Un/chrUn.fa M/chrM.fa
# 400525790 bases (49313545 N's 351212245 real 284435760
# upper 66776485 lower) in 2 sequences in 2 files
cd /san/sanvol1/scratch/fr2/cons/consRun1
# The 351212245 comes from the non-n genome as counted above.
awk '
{sum+=$3-$2}
END{printf "%% %.2f = 100.0*%d/351212245\n",100.0*sum/351212245,sum}' \
mostConserved5Way.bed
# % 15.82 = 100.0*55573943/351212245 --exp-len 10 --tar-cov 0.20
# Aiming for %70 coverage in
# the following featureBits measurement on CDS:
# Beware of negative scores when too high. The logToBedScore
# will output an error on any negative scores.
HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits fr2 \
-enrichment ensGene:cds mostConserved5Way.bed
# --expected-length 10 --target-coverage 0.20 fr2
# ensGene:cds 8.216%, mostConserved5Way.bed 14.130%, both 5.188%, cover
# 63.14%, enrich 4.47x
# Load most conserved track into database
ssh hgwdev
cd /cluster/data/fr2/bed/multiz5way
# ended up using the set: --expected-length 10 --target-coverage 0.20
time nice -n +19 hgLoadBed -strict fr2 phastConsElements5way \
mostConserved5Way.bed
# Loaded 1140341 elements of size 5
# real 0m28.545
# should measure the same as above
time nice -n +19 \
featureBits fr2 -enrichment ensGene:cds phastConsElements5way
# At: --expected-length 10 --target-coverage 0.20 fr2
# ensGene:cds 8.216%, phastConsElements5way 14.130%, both 5.188%, cover
# 63.14%, enrich 4.47x
# Create merged posterior probability file and wiggle track data files
ssh pk
cd /san/sanvol1/scratch/fr2/cons/consRun1
# the sed business gets the names sorted by chromName, chromStart
# so that everything goes in numerical order into wigEncode
# This was verified above to be correct
time nice -n +19 find ./ppRaw -type f \
| sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
| $HOME/bin/$MACHTYPE/wigEncode -noOverlap stdin \
phastCons5.wig phastCons5.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
# real 3m49.865s
# -rw-rw-r-- 1 288320445 Feb 5 13:51 phastCons5.wib
# -rw-rw-r-- 1 37122305 Feb 5 13:51 phastCons5.wig
time nice -n +19 cp -p phastCons5.wi? /cluster/data/fr2/bed/multiz5way/
# prepare compressed copy of ascii data values for downloads
ssh pk
cd /san/sanvol1/scratch/fr2/cons/consRun1
# gzip the concatenated per-chrom ascii score files for downloads
cat << '_EOF_' > gzipAscii.sh
#!/bin/sh
TOP=`pwd`
export TOP
mkdir -p phastCons5Scores
for D in ppRaw/chr*
do
# NOTE(review): ${D/ppRaw\/} substring replacement is a bashism; this
# worked because /bin/sh was presumably bash -- confirm before reuse
C=${D/ppRaw\/}
out=phastCons5Scores/${C}.data.gz
echo "========================== ${C} ${D}"
# the sed/sort/sed dance orders files by chrom name, then numeric start,
# so the gzipped stream is in position order (same trick as verified above)
find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
| sort -k7,7 -k9,9n \
| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
gzip > ${out}
done
'_EOF_'
# << happy emacs
chmod +x gzipAscii.sh
time nice -n +19 ./gzipAscii.sh
# real 5m21.400s
# copy them for downloads
ssh kkstore05
# this directory is actually a symlink from store9 to store8 to
# avoid the data full problem on store9
mkdir /cluster/data/fr2/bed/multiz5way/phastCons5Scores
cd /cluster/data/fr2/bed/multiz5way/phastCons5Scores
cp -p /san/sanvol1/scratch/fr2/cons/consRun1/phastCons5Scores/* .
# make a README.txt file here, and an md5sum
ssh hgwdev
mkdir /usr/local/apache/htdocs/goldenPath/fr2/phastCons5Scores
cd /usr/local/apache/htdocs/goldenPath/fr2/phastCons5Scores
ln -s /cluster/data/fr2/bed/multiz5way/phastCons5Scores/* .
# Load gbdb and database with wiggle.
ssh hgwdev
cd /cluster/data/fr2/bed/multiz5way
ln -s `pwd`/phastCons5.wib /gbdb/fr2/wib/phastCons5.wib
# ended up using the set: --expected-length 10 --target-coverage 0.20
time nice -n +19 hgLoadWiggle fr2 phastCons5 phastCons5.wig
# real 0m9.256s
# Create histogram to get an overview of all the data
ssh hgwdev
cd /cluster/data/fr2/bed/multiz5way
time nice -n +19 hgWiggle -doHistogram \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
-db=fr2 phastCons5 > histogram.data 2>&1
# real 0m30.744
# create plot of histogram:
# plot the score histogram with cumulative frequency on a second y axis.
# NB: fr2 is Fugu (Takifugu rubripes); the original title said
# "Stickleback", a leftover from the gasAcu1 doc this was adapted from.
cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
xffffff x000000 x000000 x444444 xaa4400 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Fugu fr2 Histogram phastCons5 track"
set xlabel " phastCons5 score - --expected-length 10 --target-coverage 0.20"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]
plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
display histo.png &
# The mostConserved can also be characterized by a histogram
awk '{print $3-$2}' mostConserved5Way.bed > mostCons.txt
textHistogram -verbose=2 -autoScale=1000 -pValues mostCons.txt \
> mostCons.histo.txt
# plot element-length histogram with cumulative frequency on a second
# y axis.  NB: fr2 is Fugu (Takifugu rubripes); the original title said
# "Stickleback", a leftover from the gasAcu1 doc this was adapted from.
cat << '_EOF_' | gnuplot > mostCons.png
set terminal png small color \
xffffff x000000 x000000 x444444 xaa4400 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Fugu fr2 histogram: lengths of mostConserved track elements"
set xlabel " mostConserved element length - --expected-length 10 --target-coverage 0.20"
set ylabel " # of elements at this length"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set xrange [0:200]
set y2tics
set boxwidth 2
set style fill solid
plot "mostCons.histo.txt" using 2:3 title " # of elements" with boxes, \
"mostCons.histo.txt" using 2:6 axes x1y2 title " CRF" with lines
'_EOF_'
# << happy emacs
############################################################################
## Set a more interesting default position, location of CABIN1 gene
## DONE - 2007-02-08 - Hiram
ssh hgwdev
hgsql -e \
'update dbDb set defaultPos="chrUn:29,305,920-29,330,760" where name = "fr2";' \
-h genome-testdb hgcentraltest
############################################################################
## DOWNLOADS - (DONE - 2007-02-12 - 2007-02-16 - Hiram)
ssh hgwdev
cd /cluster/data/fr2
~/kent/src/hg/utils/automation/makeDownloads.pl fr2 \
> makeDownloads.out 2>&1
# Doesn't work due to missing Repeat masker outputs
# Create WindowMasker separate files by chrom, for downloads
ssh kkstore05
cd /cluster/data/fr2/bed/WindowMasker.2007-01-22
# This name change here is so the names created by splitFileByColumn
# will be reasonable
zcat windowmasker.sdust.bed.gz > chr.fr2.WMSdust.bed
splitFileByColumn chr.fr2.WMSdust.bed chrWM
# Creating chrWM/chr1.fr2.WMSdust.bed
# Creating chrWM/chr2.fr2.WMSdust.bed
# ... etc ...
cd chrWM
tar cvzf ../chromWMSdust.bed.tar.gz *.bed
cd ..
# Verify this process didn't destroy anything:
cat chrWM/*.bed | awk '{sum += $3-$2}END{printf "total size: %d\n",sum}'
# total size: 115825307
zcat windowmasker.sdust.bed.gz \
| awk '{sum += $3-$2}END{printf "total size: %d\n",sum}'
# total size: 115825307
# deliver to bigZips
cd /cluster/data/fr2/goldenPath/bigZips
ln -s \
/cluster/data/fr2/bed/WindowMasker.2007-01-22/chromWMSdust.bed.tar.gz .
# remove the chromOut.tar.gz file and re-make the md5sum.txt
md5sum *gz > md5sum.txt
# go back to simpleRepeat and attempt to make a chrM.bed - turns out to
# be an empty result, so leave an empty file there. Then running this
# makeDownloads.pl again, create two empty .out files to get through
# that.
cd /cluster/data/fr2
touch Un/chrUn.fa.out
touch M/chrM.fa.out
~/kent/src/hg/utils/automation/makeDownloads.pl fr2 \
> makeDownloads.out 2>&1
cd goldenPath/bigZips
ln -s \
/cluster/data/fr2/bed/WindowMasker.2007-01-22/chromWMSdust.bed.tar.gz .
# get GenBank native mRNAs
ssh hgwdev
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=fr2 -native \
GenBank mrna /cluster/data/fr2/goldenPath/bigZips/mrna.fa
# get GenBank xeno mRNAs
./bin/i386/gbGetSeqs -db=fr2 -xeno \
GenBank mrna /cluster/data/fr2/goldenPath/bigZips/xenoMrna.fa
cd /cluster/data/fr2/goldenPath/bigZips
ssh kkstore05
gzip mrna.fa xenoMrna.fa
md5sum *.gz > md5sum.txt
# Edit the README.txt file to be correct
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/fr2/bigZips
ln -s /cluster/data/fr2/goldenPath/bigZips/* .
############################################################################
## Fugu Photograph - obtained from Byrappa Venkatesh in email 2007-02-15
## (DONE - 2007-02-16 - Hiram)
ssh hgwdev
mkdir /cluster/data/fr2/photograph
cd /cluster/data/fr2/photograph
## original: Byrappa.Venkatesh.Fugu.jpg
identify By*
# Byrappa.Venkatesh.Fugu.jpg JPEG 700x286 DirectClass 43kb 0.000u 0:01
convert -quality 80 -geometry 300x200 Byrappa.Venkatesh.Fugu.jpg \
Takifugu_rubripes.jpg
## check this file into the browser/images CVS source tree and
## copy to /usr/local/apache/htdocs/images
##########################################################################
## RepeatMasker run to cover all bases (DONE - 2007-03-07 - Hiram)
ssh kkstore02
mkdir /cluster/data/fr2/bed/RepeatMasker
cd /cluster/data/fr2/bed/RepeatMasker
time nice -n +19 doRepeatMasker.pl -verbose=2 -bigClusterHub=kk \
-buildDir=/cluster/data/fr2/bed/RepeatMasker fr2 > do.log 2>&1
###########################################################################
## Chicken/Fugu chain/net swap - (DONE - 2007-03-12 - Hiram)
mkdir /cluster/data/fr2/bed/blastz.galGal3.swap
cd /cluster/data/fr2/bed/blastz.galGal3.swap
time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \
/cluster/data/galGal3/bed/blastz.fr2.2007-03-09/DEF \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-swap > swap.log 2>&1 &
time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \
/cluster/data/galGal3/bed/blastz.fr2.2007-03-09/DEF \
-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
-continue=net -swap > swap_net.log 2>&1 &
# real 3m1.239s
cat fb.fr2.chainGalGal3Link.txt
# 36175581 bases of 393312790 (9.198%) in intersection
###########################################################################
# Create liftover fr1 to fr2 (DONE - 2007-04-09 - Hiram)
ssh kkstore02
mkdir /cluster/data/fr2/bed/blat.fr1
cd /cluster/data/fr2/bed/blat.fr1
time nice -n +19 doSameSpeciesLiftOver.pl fr1 fr2 -bigClusterHub pk \
-buildDir=/cluster/data/fr2/bed/blat.fr1 > do.log 2>&1
cp -p fr1ToFr2.over.chain.gz ../liftOver
cd ../liftOver
md5sum *.gz > ../../goldenPath/liftOver/md5sum.txt
ssh hgwdev
cd /usr/local/apache/htdocs/goldenPath/fr2/liftOver
ln -s /cluster/data/fr2/bed/liftOver/fr1ToFr2.over.chain.gz .
##############################################################################
# SWAP DANRER5 BLASTZ RESULT TO CREATE DANRER5 CHAINS AND NETS TRACKS,
# AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# (DONE, 2007-09-19 and 2007-09-22, hartera)
ssh kkstore02
mkdir /cluster/data/fr2/bed/blastz.swap.danRer5
cd /cluster/data/fr2/bed/blastz.swap.danRer5
# blastz parameters used to align fr2 as query to danRer5 as target:
# BLASTZ_H=2500
# BLASTZ_M=50
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# Results for fr2 blastz on danRer5 are in:
# /cluster/data/danRer5/bed/blastz.fr2.2007-09-18
time nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
-qRepeats=windowmaskerSdust -swap \
/cluster/data/danRer5/bed/blastz.fr2.2007-09-18/DEF \
>& swap.log &
# 0.139u 0.098s 14:09.59 0.0% 0+0k 0+0io 0pf+0w
ssh hgwdev
cat \
/cluster/data/fr2/bed/blastz.danRer5.swap/fb.fr2.chainDanRer5Link.txt
# 78259559 bases of 393312790 (19.898%) in intersection
# look at coverage of ensGene CDS, there is no native RefSeqs
# track for fugu, fr2.
featureBits fr2 ensGene:cds chainDanRer5Link -enrichment
# ensGene:cds 8.216%, chainDanRer5Link 19.898%, both 7.130%, cover 86.78%,
# enrich 4.36x
featureBits fr2 ensGene:cds chainDanRer4Link -enrichment
# ensGene:cds 8.216%, chainDanRer4Link 20.585%, both 7.030%, cover 85.56%,
# enrich 4.16x
featureBits fr2 ensGene:cds netDanRer5 -enrichment
# ensGene:cds 8.216%, netDanRer5 66.051%, both 7.845%, cover 95.48%,
# enrich 1.45x
featureBits fr2 ensGene:cds netDanRer4 -enrichment
# ensGene:cds 8.216%, netDanRer4 65.374%, both 7.766%, cover 94.51%,
# enrich 1.45x
# clean up a little (2007-09-22, hartera)
ssh kkstore02
cd /cluster/data/fr2/bed
mv ./blastz.swap.danRer5/swap.log ./blastz.danRer5.swap
rm -r blastz.swap.danRer5
ln -s blastz.danRer5.swap blastz.danRer5
############################################################################
# TRANSMAP vertebrate.2008-05-20 build (2008-05-24 markd)
vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build (2008-06-30 markd)
vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
see doc/builds.txt for specific details.
############################################################################
# SWAP BLASTZ Medaka oryLat2 (DONE - 2008-08-27 - Hiram)
ssh kkstore04 # not too important since everything moved to the hive
screen # use a screen to control this job
cd /cluster/data/oryLat2/bed/blastz.fr2.2008-08-25
cat fb.oryLat2.chainFr2Link.txt
# 180945351 bases of 700386597 (25.835%) in intersection
mkdir /cluster/data/fr2/bed/blastz.oryLat2.swap
cd /cluster/data/fr2/bed/blastz.oryLat2.swap
time doBlastzChainNet.pl -verbose=2 -swap \
/cluster/data/oryLat2/bed/blastz.fr2.2008-08-25/DEF \
-chainMinScore=3000 -chainLinearGap=medium \
-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
-smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 &
# real 24m32.826s
cat fb.fr2.chainOryLat2Link.txt
# 153621820 bases of 393312790 (39.058%) in intersection
############################################################################
############################################################################
# TRANSMAP vertebrate.2009-07-01 build (2009-07-21 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
  svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01

see doc/builds.txt for specific details.
############################################################################